]> git.saurik.com Git - redis.git/blame - redis.c
faster Set loading time from .rdb file resizing the hash table to the right size...
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
2 * Copyright (c) 2006-2009, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
b0d8747d 30#define REDIS_VERSION "1.3.3"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
c9468bcf 40#define __USE_POSIX199309
ed9b544e 41#include <signal.h>
fbf9bcdb 42
43#ifdef HAVE_BACKTRACE
c9468bcf 44#include <execinfo.h>
45#include <ucontext.h>
fbf9bcdb 46#endif /* HAVE_BACKTRACE */
47
ed9b544e 48#include <sys/wait.h>
49#include <errno.h>
50#include <assert.h>
51#include <ctype.h>
52#include <stdarg.h>
53#include <inttypes.h>
54#include <arpa/inet.h>
55#include <sys/stat.h>
56#include <fcntl.h>
57#include <sys/time.h>
58#include <sys/resource.h>
2895e862 59#include <sys/uio.h>
f78fd11b 60#include <limits.h>
a7866db6 61#include <math.h>
92f8e882 62#include <pthread.h>
0bc1b2f6 63
64#if defined(__sun)
5043dff3 65#include "solarisfixes.h"
66#endif
ed9b544e 67
c9468bcf 68#include "redis.h"
ed9b544e 69#include "ae.h" /* Event driven programming library */
70#include "sds.h" /* Dynamic safe strings */
71#include "anet.h" /* Networking the easy way */
72#include "dict.h" /* Hash tables */
73#include "adlist.h" /* Linked lists */
74#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 75#include "lzf.h" /* LZF compression library */
76#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
ed9b544e 77
78/* Error codes: generic success (0) / failure (-1) status values. */
79#define REDIS_OK 0
80#define REDIS_ERR -1
81
82/* Static server configuration: compile-time defaults and limits. */
83#define REDIS_SERVERPORT 6379 /* TCP port */
84#define REDIS_MAXIDLETIME (60*5) /* default client timeout (= 300; presumably seconds -- confirm) */
6208b3a7 85#define REDIS_IOBUF_LEN 1024
ed9b544e 86#define REDIS_LOADBUF_LEN 1024
93ea3759 87#define REDIS_STATIC_ARGS 4
ed9b544e 88#define REDIS_DEFAULT_DBNUM 16
89#define REDIS_CONFIGLINE_MAX 1024
90#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
91#define REDIS_MAX_SYNC_TIME 60 /* A slave can't take more than this to sync */
94754ccc 92#define REDIS_EXPIRELOOKUPS_PER_CRON 100 /* try to expire 100 keys/second */
6f376729 93#define REDIS_MAX_WRITE_PER_EVENT (1024*64) /* = 65536 */
2895e862 94#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command (256 MiB) */
95
96/* If more than REDIS_WRITEV_THRESHOLD write packets are pending, use writev */
97#define REDIS_WRITEV_THRESHOLD 3
98/* Max number of iovecs used for each writev call */
99#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 100
101/* Hash table parameters */
102#define REDIS_HT_MINFILL 10 /* Minimal hash table fill, as a percentage (10%) */
ed9b544e 103
104/* Command flags. These are distinct bits (1, 2, 4) and are OR-ed together
 * in the 'flags' member of the command table entries. */
3fd78bcd 105#define REDIS_CMD_BULK 1 /* Bulk write command */
106#define REDIS_CMD_INLINE 2 /* Inline command */
107/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
 this flag will return an error when the 'maxmemory' option is set in the
 config file and the server is using more than maxmemory bytes of memory.
 In short, these commands are denied on low memory conditions. */
111#define REDIS_CMD_DENYOOM 4
ed9b544e 112
113/* Object types (stored in the 'type' field of a redis object) */
114#define REDIS_STRING 0
115#define REDIS_LIST 1
116#define REDIS_SET 2
1812e024 117#define REDIS_ZSET 3
118#define REDIS_HASH 4
f78fd11b 119
942a3961 120/* Object encodings (stored in the 'encoding' field of a redis object) */
121#define REDIS_ENCODING_RAW 0 /* Raw representation */
122#define REDIS_ENCODING_INT 1 /* Encoded as integer */
123
f78fd11b 124/* Object types only used for dumping to disk. They sit at the top of the
 * 0..255 range (253..255), well away from the real object types above. */
bb32ede5 125#define REDIS_EXPIRETIME 253
ed9b544e 126#define REDIS_SELECTDB 254
127#define REDIS_EOF 255
128
f78fd11b 129/* Defines related to the dump file format. Storing 32 bit lengths for short
 * keys would waste a lot of space, so we check the two most significant bits
 * of the first byte to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => 01, the len is 14 bits: 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: a specially encoded object will follow. The six bit
 * number specifies the kind of object that follows.
 * See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte; most DB keys, and many
 * values, will fit inside.
 *
 * The four defines below are the possible values of the two MSB;
 * REDIS_RDB_LENERR is a sentinel equal to UINT_MAX. */
f78fd11b 142#define REDIS_RDB_6BITLEN 0
143#define REDIS_RDB_14BITLEN 1
144#define REDIS_RDB_32BITLEN 2
17be1a4a 145#define REDIS_RDB_ENCVAL 3
f78fd11b 146#define REDIS_RDB_LENERR UINT_MAX
147
a4d1ba9a 148/* When the length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * according to the following defines: */
151#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
152#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
153#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 154#define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF (see lzf.h) */
a4d1ba9a 155
75680a3c 156/* Virtual memory: values for the object->storage field. */
157#define REDIS_VM_MEMORY 0 /* The object is in memory */
158#define REDIS_VM_SWAPPED 1 /* The object is on disk */
159#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object to disk */
160#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
161
06224fec 162/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
164#define REDIS_VM_MAX_NEAR_PAGES 65536
165#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 166#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 167#define REDIS_THREAD_STACK_SIZE (1024*1024*4) /* = 4 MiB if expressed in bytes */
f6c0bba8 168/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
c953f24b 172#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 173
ed9b544e 174/* Client flags: distinct bits (1..32) that can be OR-ed together in the
 * 'flags' field of a client. */
d5d55fc3 175#define REDIS_SLAVE 1 /* This client is a slave server */
176#define REDIS_MASTER 2 /* This client is a master server */
177#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
178#define REDIS_MULTI 8 /* This client is in a MULTI context */
179#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
180#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
ed9b544e 181
40d224a9 182/* Slave replication state - slave side */
ed9b544e 183#define REDIS_REPL_NONE 0 /* No active replication */
184#define REDIS_REPL_CONNECT 1 /* Must connect to master */
185#define REDIS_REPL_CONNECTED 2 /* Connected to master */
186
40d224a9 187/* Slave replication state - from the point of view of the master.
 * Note that in the SEND_BULK and ONLINE states the slave receives new updates
 * in its output queue. In the WAIT_BGSAVE states instead the server is waiting
 * to start the next background saving in order to send updates to it.
 * The values continue the numbering of the slave-side states above (3..6),
 * so the two sets never overlap. */
191#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
192#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
193#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
194#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
195
ed9b544e 196/* List related stuff: which end of a list an operation refers to */
197#define REDIS_HEAD 0
198#define REDIS_TAIL 1
199
200/* Sort operations */
201#define REDIS_SORT_GET 0
443c6409 202#define REDIS_SORT_ASC 1
203#define REDIS_SORT_DESC 2
ed9b544e 204#define REDIS_SORTKEY_MAX 1024
205
206/* Log levels, numbered from REDIS_DEBUG (0) up to REDIS_WARNING (3) */
207#define REDIS_DEBUG 0
f870935d 208#define REDIS_VERBOSE 1
209#define REDIS_NOTICE 2
210#define REDIS_WARNING 3
ed9b544e 211
/* Anti-warning macro: evaluates V and discards the result, so an otherwise
 * unused parameter or variable does not trigger a compiler warning.
 * The argument is parenthesized so that the cast applies to the whole
 * expression (e.g. REDIS_NOTUSED(a + b)), not just to its first operand. */
#define REDIS_NOTUSED(V) ((void) (V))
214
/* Skiplist parameters (the skiplist is the structure backing ZSETs) */
6b47e12e 215#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
216#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 217
48f0308a 218/* Append only file fsync policies (presumably the values stored in
 * server.appendfsync -- confirm at the config parsing site) */
219#define APPENDFSYNC_NO 0
220#define APPENDFSYNC_ALWAYS 1
221#define APPENDFSYNC_EVERYSEC 2
222
dfc5e96c 223/* We can print the stacktrace, so our assert is defined this way:
 * when the expression _e is false, _redisAssert() is called with the
 * stringified expression, the source file name and the line number, and
 * then the process is terminated with _exit(1). When _e is true the macro
 * evaluates to (void)0. Unlike assert(), this is not disabled by NDEBUG. */
478c2c6f 224#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
6c96ba7d 225static void _redisAssert(char *estr, char *file, int line);
dfc5e96c 226
ed9b544e 227/*================================= Data types ============================== */
228
229/* A redis object, that is a type able to hold a string / list / set */
75680a3c 230
/* The VM object structure: describes where a value lives in the swap file.
 *
 * This is a pure type declaration. It is embedded as the last member of
 * struct redisObject; no file-scope object of this type is defined here,
 * so the identifier 'vm' is not taken at global scope. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
};
237
238/* The actual Redis Object */
ed9b544e 239typedef struct redisObject {
ed9b544e 240 void *ptr;
942a3961 241 unsigned char type; /* One of the REDIS_STRING, REDIS_LIST, ... object types */
242 unsigned char encoding; /* One of the REDIS_ENCODING_* values */
d894161b 243 unsigned char storage; /* If this object is a key, where is the value?
 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
245 unsigned char vtype; /* If this object is a key, and value is swapped out,
 * this is the type of the swapped out object. */
ed9b544e 247 int refcount;
75680a3c 248 /* VM fields: these are only allocated if VM is active, otherwise the
 * object allocation function will just allocate
 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
 * Redis without VM active will not have any overhead.
 * For this to work 'vm' must remain the LAST member of the struct. */
252 struct redisObjectVM vm;
ed9b544e 253} robj;
254
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85, introduced exactly in this way.
 *
 * _var is the robj lvalue to initialize (not a pointer), _ptr the value to
 * store in its 'ptr' field. The object becomes a raw-encoded string with a
 * refcount of 1; 'storage' is set to REDIS_VM_MEMORY only when
 * server.vm_enabled is true and is otherwise left untouched.
 *
 * The expansion is a single statement WITHOUT a trailing semicolon, so the
 * usual 'initStaticStringObject(o,p);' call form composes correctly with
 * if/else. Both arguments are parenthesized and _var is evaluated several
 * times: do not pass expressions with side effects. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
266
/* A single Redis database: the keyspace plus the per-key auxiliary tables. */
3305306f 267typedef struct redisDb {
4409877e 268 dict *dict; /* The keyspace for this DB */
269 dict *expires; /* Timeout of keys with a timeout set */
270 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 271 dict *io_keys; /* Keys with clients waiting for VM I/O */
3305306f 272 int id; /* presumably the index of this DB -- confirm at init site */
273} redisDb;
274
6e469882 275/* Client MULTI/EXEC state: one entry of the array held by multiState,
 * i.e. an argument vector together with the command it refers to. */
276typedef struct multiCmd {
277 robj **argv; /* Command arguments */
278 int argc; /* Number of elements in argv */
279 struct redisCommand *cmd; /* Command table entry */
280} multiCmd;
281
/* Per-client collection of multiCmd entries (see redisClient.mstate). */
282typedef struct multiState {
283 multiCmd *commands; /* Array of MULTI commands */
284 int count; /* Total number of MULTI commands */
285} multiState;
286
ed9b544e 287/* With multiplexing we need to keep per-client state.
 * Clients are kept in a linked list. */
289typedef struct redisClient {
290 int fd;
3305306f 291 redisDb *db;
ed9b544e 292 int dictid;
293 sds querybuf;
e8a74421 294 robj **argv, **mbargv;
295 int argc, mbargc;
40d224a9 296 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 297 int multibulk; /* multi bulk command format active */
ed9b544e 298 list *reply;
299 int sentlen;
300 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 301 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 302 int slaveseldb; /* slave selected db, if this client is a slave */
303 int authenticated; /* when requirepass is non-NULL */
304 int replstate; /* replication state if this is a slave */
305 int repldbfd; /* replication DB file descriptor */
6e469882 306 long repldboff; /* replication DB file offset.
 * NOTE(review): this is a 'long' while repldbsize
 * below is an off_t; the two types may differ in
 * width on some platforms -- confirm intended. */
40d224a9 307 off_t repldbsize; /* replication DB file size */
6e469882 308 multiState mstate; /* MULTI/EXEC state */
d5d55fc3 309 robj **blockingkeys; /* The keys we are waiting for in order to terminate
 * a blocking operation such as BLPOP. Otherwise NULL. */
b177fd30 311 int blockingkeysnum; /* Number of blocking keys */
4409877e 312 time_t blockingto; /* Blocking operation timeout. If UNIX current time
 * is >= blockingto then the operation timed out. */
92f8e882 314 list *io_keys; /* Keys this client is waiting to be loaded from the
 * swap file in order to continue. */
ed9b544e 316} redisClient;
317
/* One save point; the server keeps an array of these (server.saveparams).
 * NOTE(review): presumably "save if at least 'changes' changes happened
 * within 'seconds' seconds" -- confirm against the cron/save logic. */
318struct saveparam {
319 time_t seconds;
320 int changes;
321};
322
323/* Global server state structure */
324struct redisServer {
325 int port;
326 int fd;
3305306f 327 redisDb *db;
4409877e 328 dict *sharingpool; /* Pool used for object sharing */
10c43610 329 unsigned int sharingpoolsize;
ed9b544e 330 long long dirty; /* changes to DB from the last save */
331 list *clients;
87eca727 332 list *slaves, *monitors;
ed9b544e 333 char neterr[ANET_ERR_LEN];
334 aeEventLoop *el;
335 int cronloops; /* number of times the cron function ran */
336 list *objfreelist; /* A list of freed objects to avoid malloc() */
337 time_t lastsave; /* Unix time of the last successful save */
ed9b544e 338 /* Fields used only for stats */
339 time_t stat_starttime; /* server start time */
340 long long stat_numcommands; /* number of processed commands */
341 long long stat_numconnections; /* number of connections received */
342 /* Configuration */
343 int verbosity;
344 int glueoutputbuf;
345 int maxidletime;
346 int dbnum;
347 int daemonize;
44b38ef4 348 int appendonly;
48f0308a 349 int appendfsync;
350 time_t lastfsync;
44b38ef4 351 int appendfd;
352 int appendseldb;
ed329fcf 353 char *pidfile;
9f3c422c 354 pid_t bgsavechildpid;
9d65a1bb 355 pid_t bgrewritechildpid;
356 sds bgrewritebuf; /* buffer kept by the parent during append only rewrite */
ed9b544e 357 struct saveparam *saveparams;
358 int saveparamslen;
359 char *logfile;
360 char *bindaddr;
361 char *dbfilename;
44b38ef4 362 char *appendfilename;
abcb223e 363 char *requirepass;
10c43610 364 int shareobjects;
121f70cf 365 int rdbcompression;
ed9b544e 366 /* Replication related */
367 int isslave;
d0ccebcf 368 char *masterauth;
ed9b544e 369 char *masterhost;
370 int masterport;
40d224a9 371 redisClient *master; /* client that is master for this slave */
ed9b544e 372 int replstate;
285add55 373 unsigned int maxclients;
4ef8de8a 374 unsigned long long maxmemory;
d5d55fc3 375 unsigned int blpop_blocked_clients;
376 unsigned int vm_blocked_clients;
ed9b544e 377 /* Sort parameters - qsort_r() is only available under BSD so we
 * have to keep this state global, in order to pass it to sortCompare() */
379 int sort_desc;
380 int sort_alpha;
381 int sort_bypattern;
75680a3c 382 /* Virtual memory configuration */
383 int vm_enabled;
054e426d 384 char *vm_swap_file;
75680a3c 385 off_t vm_page_size;
386 off_t vm_pages;
4ef8de8a 387 unsigned long long vm_max_memory;
75680a3c 388 /* Virtual memory state */
389 FILE *vm_fp;
390 int vm_fd;
391 off_t vm_next_page; /* Next probably empty page */
392 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 393 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 394 time_t unixtime; /* Unix time sampled every second. */
92f8e882 395 /* Virtual memory I/O threads stuff */
92f8e882 396 /* An I/O thread processes an element taken from the io_newjobs queue and
 * puts the result of the operation in the io_processed list. While the
 * job is being processed, it's put on the io_processing queue. */
399 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
400 list *io_processing; /* List of VM I/O jobs being processed */
401 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 402 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 403 pthread_mutex_t io_mutex; /* lock to access the I/O job lists (the original
 * comment named io_jobs/io_done/io_thread_job;
 * presumably io_newjobs/io_processing/
 * io_processed above -- confirm) */
a5819310 404 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
405 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 406 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 407 int io_active_threads; /* Number of running I/O threads */
408 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 409 /* Our main thread is blocked on the event loop, looking for sockets ready
 * to be read or written, so when a threaded I/O operation is ready to be
 * processed by the main thread, the I/O thread will use a unix pipe to
 * awake the main thread. The following are the two pipe FDs. */
413 int io_ready_pipe_read;
414 int io_ready_pipe_write;
7d98e08c 415 /* Virtual memory stats */
416 unsigned long long vm_stats_used_pages;
417 unsigned long long vm_stats_swapped_objects;
418 unsigned long long vm_stats_swapouts;
419 unsigned long long vm_stats_swapins;
b9bc0eef 420 FILE *devnull;
ed9b544e 421};
422
/* Signature shared by all the command implementations: they receive the
 * client that issued the command and return nothing. */
423typedef void redisCommandProc(redisClient *c);
/* One entry of the command table (cmdTable). */
424struct redisCommand {
425 char *name; /* Command name, e.g. "get" */
426 redisCommandProc *proc; /* Function implementing the command */
427 int arity; /* Argument count; negative values also appear in the
 * table (presumably meaning "at least N" -- confirm
 * in processCommand) */
428 int flags; /* OR of REDIS_CMD_* flags */
429};
430
/* Pairs a name with an address stored as an unsigned long.
 * NOTE(review): presumably used to map function addresses back to symbol
 * names when printing a stack trace -- confirm at the use site. */
de96dbfe 431struct redisFunctionSym {
432 char *name;
56906eef 433 unsigned long pointer;
de96dbfe 434};
435
/* Element of the vector being sorted: the object itself plus the value it
 * is compared by, either a numeric score or another object.
 * NOTE(review): which union member is valid presumably depends on the
 * server.sort_alpha / sort_bypattern settings -- confirm in sortCompare(). */
ed9b544e 436typedef struct _redisSortObject {
437 robj *obj;
438 union {
439 double score;
440 robj *cmpobj;
441 } u;
442} redisSortObject;
443
/* An operation attached to a sort, with its pattern argument.
 * 'type' is presumably one of the REDIS_SORT_* values -- confirm. */
444typedef struct _redisSortOperation {
445 int type;
446 robj *pattern;
447} redisSortOperation;
448
6b47e12e 449/* ZSETs use a specialized version of Skiplists */
450
/* A skiplist node: an object with its score, a single backward pointer and
 * a dynamically allocated set of forward pointers (presumably one per level
 * of the node -- confirm in the node creation function). */
451typedef struct zskiplistNode {
452 struct zskiplistNode **forward;
e3870fab 453 struct zskiplistNode *backward;
6b47e12e 454 double score;
455 robj *obj;
456} zskiplistNode;
457
/* The skiplist itself: first and last node, plus length and level
 * bookkeeping. */
458typedef struct zskiplist {
e3870fab 459 struct zskiplistNode *header, *tail;
d13f767c 460 unsigned long length;
6b47e12e 461 int level;
462} zskiplist;
463
/* A sorted set is represented by two structures kept side by side:
 * a hash table and a skiplist. */
1812e024 464typedef struct zset {
465 dict *dict;
6b47e12e 466 zskiplist *zsl;
1812e024 467} zset;
468
6b47e12e 469/* Our shared "common" objects. The struct definition also defines the
 * single global instance, 'shared'. */
470
ed9b544e 471struct sharedObjectsStruct {
c937aa89 472 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 473 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 474 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
475 *outofrangeerr, *plus,
ed9b544e 476 *select0, *select1, *select2, *select3, *select4,
477 *select5, *select6, *select7, *select8, *select9;
478} shared;
479
a7866db6 480/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */
483
484static double R_Zero, R_PosInf, R_NegInf, R_Nan;
485
92f8e882 486/* VM threaded I/O request message */
b9bc0eef 487#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
488#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
489#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 490typedef struct iojob {
996cb5f7 491 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 492 redisDb *db;/* Redis database */
92f8e882 493 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 494 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
496 off_t page; /* Swap page where to read/write the object */
b9bc0eef 497 off_t pages; /* Swap pages needed to save the object. PREPARE_SWAP return val */
996cb5f7 498 int canceled; /* True if this command was canceled by blocking side of VM */
499 pthread_t thread; /* ID of the thread processing this entry */
500} iojob;
92f8e882 501
ed9b544e 502/*================================ Prototypes =============================== */
503
504static void freeStringObject(robj *o);
505static void freeListObject(robj *o);
506static void freeSetObject(robj *o);
507static void decrRefCount(void *o);
508static robj *createObject(int type, void *ptr);
509static void freeClient(redisClient *c);
f78fd11b 510static int rdbLoad(char *filename);
ed9b544e 511static void addReply(redisClient *c, robj *obj);
512static void addReplySds(redisClient *c, sds s);
513static void incrRefCount(robj *o);
f78fd11b 514static int rdbSaveBackground(char *filename);
ed9b544e 515static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 516static robj *dupStringObject(robj *o);
87eca727 517static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc);
44b38ef4 518static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 519static int syncWithMaster(void);
10c43610 520static robj *tryObjectSharing(robj *o);
942a3961 521static int tryObjectEncoding(robj *o);
9d65a1bb 522static robj *getDecodedObject(robj *o);
3305306f 523static int removeExpire(redisDb *db, robj *key);
524static int expireIfNeeded(redisDb *db, robj *key);
525static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 526static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 527static int deleteKey(redisDb *db, robj *key);
bb32ede5 528static time_t getExpire(redisDb *db, robj *key);
529static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 530static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 531static void freeMemoryIfNeeded(void);
de96dbfe 532static int processCommand(redisClient *c);
56906eef 533static void setupSigSegvAction(void);
a3b21203 534static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 535static void aofRemoveTempFile(pid_t childpid);
0ea663ea 536static size_t stringObjectLen(robj *o);
638e42ac 537static void processInputBuffer(redisClient *c);
6b47e12e 538static zskiplist *zslCreate(void);
fd8ccf44 539static void zslFree(zskiplist *zsl);
2b59cfdf 540static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 541static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 542static void initClientMultiState(redisClient *c);
543static void freeClientMultiState(redisClient *c);
544static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 545static void unblockClientWaitingData(redisClient *c);
4409877e 546static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 547static void vmInit(void);
a35ddf12 548static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 549static robj *vmLoadObject(robj *key);
7e69548d 550static robj *vmPreviewObject(robj *key);
a69a0c9c 551static int vmSwapOneObjectBlocking(void);
552static int vmSwapOneObjectThreaded(void);
7e69548d 553static int vmCanSwapOut(void);
a5819310 554static int tryFreeOneObjectFromFreelist(void);
996cb5f7 555static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
556static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
557static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 558static void lockThreadedIO(void);
559static void unlockThreadedIO(void);
560static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
561static void freeIOJob(iojob *j);
562static void queueIOJob(iojob *j);
a5819310 563static int vmWriteObjectOnSwap(robj *o, off_t page);
564static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 565static void waitEmptyIOJobsQueue(void);
566static void vmReopenSwapFile(void);
970e10bb 567static int vmFreePage(off_t page);
d5d55fc3 568static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
569static int dontWaitForSwappedKey(redisClient *c, robj *key);
570static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
571static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
572static struct redisCommand *lookupCommand(char *name);
573static void call(redisClient *c, struct redisCommand *cmd);
574static void resetClient(redisClient *c);
ed9b544e 575
abcb223e 576static void authCommand(redisClient *c);
ed9b544e 577static void pingCommand(redisClient *c);
578static void echoCommand(redisClient *c);
579static void setCommand(redisClient *c);
580static void setnxCommand(redisClient *c);
581static void getCommand(redisClient *c);
582static void delCommand(redisClient *c);
583static void existsCommand(redisClient *c);
584static void incrCommand(redisClient *c);
585static void decrCommand(redisClient *c);
586static void incrbyCommand(redisClient *c);
587static void decrbyCommand(redisClient *c);
588static void selectCommand(redisClient *c);
589static void randomkeyCommand(redisClient *c);
590static void keysCommand(redisClient *c);
591static void dbsizeCommand(redisClient *c);
592static void lastsaveCommand(redisClient *c);
593static void saveCommand(redisClient *c);
594static void bgsaveCommand(redisClient *c);
9d65a1bb 595static void bgrewriteaofCommand(redisClient *c);
ed9b544e 596static void shutdownCommand(redisClient *c);
597static void moveCommand(redisClient *c);
598static void renameCommand(redisClient *c);
599static void renamenxCommand(redisClient *c);
600static void lpushCommand(redisClient *c);
601static void rpushCommand(redisClient *c);
602static void lpopCommand(redisClient *c);
603static void rpopCommand(redisClient *c);
604static void llenCommand(redisClient *c);
605static void lindexCommand(redisClient *c);
606static void lrangeCommand(redisClient *c);
607static void ltrimCommand(redisClient *c);
608static void typeCommand(redisClient *c);
609static void lsetCommand(redisClient *c);
610static void saddCommand(redisClient *c);
611static void sremCommand(redisClient *c);
a4460ef4 612static void smoveCommand(redisClient *c);
ed9b544e 613static void sismemberCommand(redisClient *c);
614static void scardCommand(redisClient *c);
12fea928 615static void spopCommand(redisClient *c);
2abb95a9 616static void srandmemberCommand(redisClient *c);
ed9b544e 617static void sinterCommand(redisClient *c);
618static void sinterstoreCommand(redisClient *c);
40d224a9 619static void sunionCommand(redisClient *c);
620static void sunionstoreCommand(redisClient *c);
f4f56e1d 621static void sdiffCommand(redisClient *c);
622static void sdiffstoreCommand(redisClient *c);
ed9b544e 623static void syncCommand(redisClient *c);
624static void flushdbCommand(redisClient *c);
625static void flushallCommand(redisClient *c);
626static void sortCommand(redisClient *c);
627static void lremCommand(redisClient *c);
0f5f7e9a 628static void rpoplpushcommand(redisClient *c);
ed9b544e 629static void infoCommand(redisClient *c);
70003d28 630static void mgetCommand(redisClient *c);
87eca727 631static void monitorCommand(redisClient *c);
3305306f 632static void expireCommand(redisClient *c);
802e8373 633static void expireatCommand(redisClient *c);
f6b141c5 634static void getsetCommand(redisClient *c);
fd88489a 635static void ttlCommand(redisClient *c);
321b0e13 636static void slaveofCommand(redisClient *c);
7f957c92 637static void debugCommand(redisClient *c);
f6b141c5 638static void msetCommand(redisClient *c);
639static void msetnxCommand(redisClient *c);
fd8ccf44 640static void zaddCommand(redisClient *c);
7db723ad 641static void zincrbyCommand(redisClient *c);
cc812361 642static void zrangeCommand(redisClient *c);
50c55df5 643static void zrangebyscoreCommand(redisClient *c);
e3870fab 644static void zrevrangeCommand(redisClient *c);
3c41331e 645static void zcardCommand(redisClient *c);
1b7106e7 646static void zremCommand(redisClient *c);
6e333bbe 647static void zscoreCommand(redisClient *c);
1807985b 648static void zremrangebyscoreCommand(redisClient *c);
6e469882 649static void multiCommand(redisClient *c);
650static void execCommand(redisClient *c);
4409877e 651static void blpopCommand(redisClient *c);
652static void brpopCommand(redisClient *c);
f6b141c5 653
ed9b544e 654/*================================= Globals ================================= */
655
656/* Global vars. 'server' is file-local (static) and, having static storage
 * duration, starts out zero-initialized. */
657static struct redisServer server; /* server global state */
658static struct redisCommand cmdTable[] = {
659 {"get",getCommand,2,REDIS_CMD_INLINE},
3fd78bcd 660 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
661 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
5109cdff 662 {"del",delCommand,-2,REDIS_CMD_INLINE},
ed9b544e 663 {"exists",existsCommand,2,REDIS_CMD_INLINE},
3fd78bcd 664 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
665 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
70003d28 666 {"mget",mgetCommand,-2,REDIS_CMD_INLINE},
3fd78bcd 667 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
668 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
ed9b544e 669 {"rpop",rpopCommand,2,REDIS_CMD_INLINE},
670 {"lpop",lpopCommand,2,REDIS_CMD_INLINE},
b177fd30 671 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE},
672 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE},
ed9b544e 673 {"llen",llenCommand,2,REDIS_CMD_INLINE},
674 {"lindex",lindexCommand,3,REDIS_CMD_INLINE},
3fd78bcd 675 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
ed9b544e 676 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE},
677 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE},
678 {"lrem",lremCommand,4,REDIS_CMD_BULK},
b0d8747d 679 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
3fd78bcd 680 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
ed9b544e 681 {"srem",sremCommand,3,REDIS_CMD_BULK},
a4460ef4 682 {"smove",smoveCommand,4,REDIS_CMD_BULK},
ed9b544e 683 {"sismember",sismemberCommand,3,REDIS_CMD_BULK},
684 {"scard",scardCommand,2,REDIS_CMD_INLINE},
12fea928 685 {"spop",spopCommand,2,REDIS_CMD_INLINE},
2abb95a9 686 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE},
3fd78bcd 687 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
688 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
689 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
690 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
691 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
692 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
ed9b544e 693 {"smembers",sinterCommand,2,REDIS_CMD_INLINE},
fd8ccf44 694 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
7db723ad 695 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
1b7106e7 696 {"zrem",zremCommand,3,REDIS_CMD_BULK},
1807985b 697 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE},
752da584 698 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE},
80181f78 699 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE},
752da584 700 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE},
3c41331e 701 {"zcard",zcardCommand,2,REDIS_CMD_INLINE},
6e333bbe 702 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
3fd78bcd 703 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
704 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
f6b141c5 705 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
706 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
707 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
ed9b544e 708 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE},
709 {"select",selectCommand,2,REDIS_CMD_INLINE},
710 {"move",moveCommand,3,REDIS_CMD_INLINE},
711 {"rename",renameCommand,3,REDIS_CMD_INLINE},
712 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE},
321b0e13 713 {"expire",expireCommand,3,REDIS_CMD_INLINE},
802e8373 714 {"expireat",expireatCommand,3,REDIS_CMD_INLINE},
ed9b544e 715 {"keys",keysCommand,2,REDIS_CMD_INLINE},
716 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE},
abcb223e 717 {"auth",authCommand,2,REDIS_CMD_INLINE},
ed9b544e 718 {"ping",pingCommand,1,REDIS_CMD_INLINE},
719 {"echo",echoCommand,2,REDIS_CMD_BULK},
720 {"save",saveCommand,1,REDIS_CMD_INLINE},
721 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE},
9d65a1bb 722 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE},
ed9b544e 723 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE},
724 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE},
725 {"type",typeCommand,2,REDIS_CMD_INLINE},
6e469882 726 {"multi",multiCommand,1,REDIS_CMD_INLINE},
727 {"exec",execCommand,1,REDIS_CMD_INLINE},
ed9b544e 728 {"sync",syncCommand,1,REDIS_CMD_INLINE},
729 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE},
730 {"flushall",flushallCommand,1,REDIS_CMD_INLINE},
3fd78bcd 731 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
ed9b544e 732 {"info",infoCommand,1,REDIS_CMD_INLINE},
87eca727 733 {"monitor",monitorCommand,1,REDIS_CMD_INLINE},
fd88489a 734 {"ttl",ttlCommand,2,REDIS_CMD_INLINE},
321b0e13 735 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE},
7f957c92 736 {"debug",debugCommand,-2,REDIS_CMD_INLINE},
ed9b544e 737 {NULL,NULL,0,0}
738};
bcfc686d 739
ed9b544e 740/*============================ Utility functions ============================ */
741
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of the set; [^abc] negates the set, [a-z] is a
 *           range and \x inside the brackets takes x literally
 *   \x      the byte x taken literally
 *
 * When 'nocase' is non-zero, literal bytes and ranges are compared after
 * tolower() (a backslash-escaped byte inside brackets is always compared
 * as is). An unterminated '[' set is accepted and ends at the end of the
 * pattern.
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise.
 *
 * Both buffers are treated as length-delimited: for non-negative lengths no
 * byte at or past the given length is read, so they do not need to be NUL
 * terminated. */
int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of '*' into a single one. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match: trailing '*' eats the rest */
            /* Try to match the rest of the pattern at every suffix. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A set always consumes one byte of the subject. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen >= 2 && pattern[0] == '\\') {
                    /* Escaped byte: compare the next byte literally. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (patternLen == 0) {
                    /* Unterminated set: step back so that the common
                     * increment after the switch lands on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match: nothing left to compare with */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Subject exhausted: only trailing '*' may remain. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
864
56906eef 865static void redisLog(int level, const char *fmt, ...) {
ed9b544e 866 va_list ap;
867 FILE *fp;
868
869 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
870 if (!fp) return;
871
872 va_start(ap, fmt);
873 if (level >= server.verbosity) {
874 char *c = ".-*";
1904ecc1 875 char buf[64];
876 time_t now;
877
878 now = time(NULL);
6c9385e0 879 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 880 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 881 vfprintf(fp, fmt, ap);
882 fprintf(fp,"\n");
883 fflush(fp);
884 }
885 va_end(ap);
886
887 if (server.logfile) fclose(fp);
888}
889
890/*====================== Hash table type implementation ==================== */
891
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
895
/* Dict destructor callback that releases an entry with zfree().
 * 'privdata' is ignored. */
static void dictVanillaFree(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);
    zfree(val);
}
901
/* Dict destructor callback for entries that are lists: the entry is
 * released with listRelease(). 'privdata' is ignored. */
static void dictListDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);
    listRelease((list*)val);
}
907
ed9b544e 908static int sdsDictKeyCompare(void *privdata, const void *key1,
909 const void *key2)
910{
911 int l1,l2;
912 DICT_NOTUSED(privdata);
913
914 l1 = sdslen((sds)key1);
915 l2 = sdslen((sds)key2);
916 if (l1 != l2) return 0;
917 return memcmp(key1, key2, l1) == 0;
918}
919
/* Dict destructor callback for redis objects: drops one reference from the
 * object. NULL entries are skipped, as the value of a swapped out key is
 * set to NULL. 'privdata' is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
927
942a3961 928static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 929 const void *key2)
930{
931 const robj *o1 = key1, *o2 = key2;
932 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
933}
934
942a3961 935static unsigned int dictObjHash(const void *key) {
ed9b544e 936 const robj *o = key;
937 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
938}
939
942a3961 940static int dictEncObjKeyCompare(void *privdata, const void *key1,
941 const void *key2)
942{
9d65a1bb 943 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
944 int cmp;
942a3961 945
9d65a1bb 946 o1 = getDecodedObject(o1);
947 o2 = getDecodedObject(o2);
948 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
949 decrRefCount(o1);
950 decrRefCount(o2);
951 return cmp;
942a3961 952}
953
954static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 955 robj *o = (robj*) key;
942a3961 956
9d65a1bb 957 o = getDecodedObject(o);
958 unsigned int hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
959 decrRefCount(o);
960 return hash;
942a3961 961}
962
/* Sets type: keys are redis objects that are hashed and compared in their
 * decoded form, and there are no values. Besides the Set data type, this
 * type is also used for the shared objects pool (server.sharingpool). */
static dictType setDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
972
/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are redis objects hashed and compared in their decoded form; every
 * value is a separately allocated double that is released with zfree(). */
static dictType zsetDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictVanillaFree            /* val destructor of malloc(sizeof(double)) */
};
982
/* Db->dict: the main keyspace of a database. Both keys and values are
 * redis objects, and both are released (reference count decremented) when
 * an entry is removed. */
static dictType hashDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictRedisObjectDestructor  /* val destructor */
};
992
/* Db->expires: keys are redis objects, values need no destructor (the
 * expire time is stored directly in the value pointer). */
static dictType keyptrDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1002
/* Keylist hash table type has unencoded redis objects as keys and
 * lists as values. It's used for blocking operations (BLPOP) and to
 * map swapped keys to a list of clients waiting for these keys to be
 * loaded. The list stored as value is released together with the entry. */
static dictType keylistDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictListDestructor         /* val destructor */
};
1014
ed9b544e 1015/* ========================= Random utility functions ======================= */
1016
1017/* Redis generally does not try to recover from out of memory conditions
1018 * when allocating objects or strings, it is not clear if it will be possible
1019 * to report this condition to the client since the networking layer itself
1020 * is based on heap allocation for send buffers, so we simply abort.
1021 * At least the code will be simpler to read... */
1022static void oom(const char *msg) {
71c54b21 1023 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1024 sleep(1);
1025 abort();
1026}
1027
1028/* ====================== Redis server networking stuff ===================== */
56906eef 1029static void closeTimedoutClients(void) {
ed9b544e 1030 redisClient *c;
ed9b544e 1031 listNode *ln;
1032 time_t now = time(NULL);
c7df85a4 1033 listIter li;
ed9b544e 1034
c7df85a4 1035 listRewind(server.clients,&li);
1036 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1037 c = listNodeValue(ln);
f86a74e9 1038 if (server.maxidletime &&
1039 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1040 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
f86a74e9 1041 (now - c->lastinteraction > server.maxidletime))
1042 {
f870935d 1043 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1044 freeClient(c);
f86a74e9 1045 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1046 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1047 addReply(c,shared.nullmultibulk);
b0d8747d 1048 unblockClientWaitingData(c);
f86a74e9 1049 }
ed9b544e 1050 }
1051 }
ed9b544e 1052}
1053
12fea928 1054static int htNeedsResize(dict *dict) {
1055 long long size, used;
1056
1057 size = dictSlots(dict);
1058 used = dictSize(dict);
1059 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1060 (used*100/size < REDIS_HT_MINFILL));
1061}
1062
0bc03378 1063/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1064 * we resize the hash table to save memory */
56906eef 1065static void tryResizeHashTables(void) {
0bc03378 1066 int j;
1067
1068 for (j = 0; j < server.dbnum; j++) {
12fea928 1069 if (htNeedsResize(server.db[j].dict)) {
f870935d 1070 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
0bc03378 1071 dictResize(server.db[j].dict);
f870935d 1072 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
0bc03378 1073 }
12fea928 1074 if (htNeedsResize(server.db[j].expires))
1075 dictResize(server.db[j].expires);
0bc03378 1076 }
1077}
1078
9d65a1bb 1079/* A background saving child (BGSAVE) terminated its work. Handle this. */
1080void backgroundSaveDoneHandler(int statloc) {
1081 int exitcode = WEXITSTATUS(statloc);
1082 int bysignal = WIFSIGNALED(statloc);
1083
1084 if (!bysignal && exitcode == 0) {
1085 redisLog(REDIS_NOTICE,
1086 "Background saving terminated with success");
1087 server.dirty = 0;
1088 server.lastsave = time(NULL);
1089 } else if (!bysignal && exitcode != 0) {
1090 redisLog(REDIS_WARNING, "Background saving error");
1091 } else {
1092 redisLog(REDIS_WARNING,
1093 "Background saving terminated by signal");
1094 rdbRemoveTempFile(server.bgsavechildpid);
1095 }
1096 server.bgsavechildpid = -1;
1097 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1098 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1099 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1100}
1101
1102/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1103 * Handle this. */
1104void backgroundRewriteDoneHandler(int statloc) {
1105 int exitcode = WEXITSTATUS(statloc);
1106 int bysignal = WIFSIGNALED(statloc);
1107
1108 if (!bysignal && exitcode == 0) {
1109 int fd;
1110 char tmpfile[256];
1111
1112 redisLog(REDIS_NOTICE,
1113 "Background append only file rewriting terminated with success");
1114 /* Now it's time to flush the differences accumulated by the parent */
1115 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1116 fd = open(tmpfile,O_WRONLY|O_APPEND);
1117 if (fd == -1) {
1118 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1119 goto cleanup;
1120 }
1121 /* Flush our data... */
1122 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1123 (signed) sdslen(server.bgrewritebuf)) {
1124 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1125 close(fd);
1126 goto cleanup;
1127 }
b32627cd 1128 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1129 /* Now our work is to rename the temp file into the stable file. And
1130 * switch the file descriptor used by the server for append only. */
1131 if (rename(tmpfile,server.appendfilename) == -1) {
1132 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1133 close(fd);
1134 goto cleanup;
1135 }
1136 /* Mission completed... almost */
1137 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1138 if (server.appendfd != -1) {
1139 /* If append only is actually enabled... */
1140 close(server.appendfd);
1141 server.appendfd = fd;
1142 fsync(fd);
85a83172 1143 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1144 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1145 } else {
1146 /* If append only is disabled we just generate a dump in this
1147 * format. Why not? */
1148 close(fd);
1149 }
1150 } else if (!bysignal && exitcode != 0) {
1151 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1152 } else {
1153 redisLog(REDIS_WARNING,
1154 "Background append only file rewriting terminated by signal");
1155 }
1156cleanup:
1157 sdsfree(server.bgrewritebuf);
1158 server.bgrewritebuf = sdsempty();
1159 aofRemoveTempFile(server.bgrewritechildpid);
1160 server.bgrewritechildpid = -1;
1161}
1162
56906eef 1163static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1164 int j, loops = server.cronloops++;
ed9b544e 1165 REDIS_NOTUSED(eventLoop);
1166 REDIS_NOTUSED(id);
1167 REDIS_NOTUSED(clientData);
1168
3a66edc7 1169 /* We take a cached value of the unix time in the global state because
1170 * with virtual memory and aging there is to store the current time
1171 * in objects at every object access, and accuracy is not needed.
1172 * To access a global var is faster than calling time(NULL) */
1173 server.unixtime = time(NULL);
1174
0bc03378 1175 /* Show some info about non-empty databases */
ed9b544e 1176 for (j = 0; j < server.dbnum; j++) {
dec423d9 1177 long long size, used, vkeys;
94754ccc 1178
3305306f 1179 size = dictSlots(server.db[j].dict);
1180 used = dictSize(server.db[j].dict);
94754ccc 1181 vkeys = dictSize(server.db[j].expires);
c3cb078d 1182 if (!(loops % 5) && (used || vkeys)) {
f870935d 1183 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1184 /* dictPrintStats(server.dict); */
ed9b544e 1185 }
ed9b544e 1186 }
1187
0bc03378 1188 /* We don't want to resize the hash tables while a bacground saving
1189 * is in progress: the saving child is created using fork() that is
1190 * implemented with a copy-on-write semantic in most modern systems, so
1191 * if we resize the HT while there is the saving child at work actually
1192 * a lot of memory movements in the parent will cause a lot of pages
1193 * copied. */
9d65a1bb 1194 if (server.bgsavechildpid == -1) tryResizeHashTables();
0bc03378 1195
ed9b544e 1196 /* Show information about connected clients */
1197 if (!(loops % 5)) {
f870935d 1198 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
ed9b544e 1199 listLength(server.clients)-listLength(server.slaves),
1200 listLength(server.slaves),
b72f6a4b 1201 zmalloc_used_memory(),
3305306f 1202 dictSize(server.sharingpool));
ed9b544e 1203 }
1204
1205 /* Close connections of timedout clients */
d5d55fc3 1206 if ((server.maxidletime && !(loops % 10)) || server.blpop_blocked_clients)
ed9b544e 1207 closeTimedoutClients();
1208
9d65a1bb 1209 /* Check if a background saving or AOF rewrite in progress terminated */
1210 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1211 int statloc;
9d65a1bb 1212 pid_t pid;
1213
1214 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1215 if (pid == server.bgsavechildpid) {
1216 backgroundSaveDoneHandler(statloc);
ed9b544e 1217 } else {
9d65a1bb 1218 backgroundRewriteDoneHandler(statloc);
ed9b544e 1219 }
ed9b544e 1220 }
1221 } else {
1222 /* If there is not a background saving in progress check if
1223 * we have to save now */
1224 time_t now = time(NULL);
1225 for (j = 0; j < server.saveparamslen; j++) {
1226 struct saveparam *sp = server.saveparams+j;
1227
1228 if (server.dirty >= sp->changes &&
1229 now-server.lastsave > sp->seconds) {
1230 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1231 sp->changes, sp->seconds);
f78fd11b 1232 rdbSaveBackground(server.dbfilename);
ed9b544e 1233 break;
1234 }
1235 }
1236 }
94754ccc 1237
f2324293 1238 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1239 * will use few CPU cycles if there are few expiring keys, otherwise
1240 * it will get more aggressive to avoid that too much memory is used by
1241 * keys that can be removed from the keyspace. */
94754ccc 1242 for (j = 0; j < server.dbnum; j++) {
f2324293 1243 int expired;
94754ccc 1244 redisDb *db = server.db+j;
94754ccc 1245
f2324293 1246 /* Continue to expire if at the end of the cycle more than 25%
1247 * of the keys were expired. */
1248 do {
4ef8de8a 1249 long num = dictSize(db->expires);
94754ccc 1250 time_t now = time(NULL);
1251
f2324293 1252 expired = 0;
94754ccc 1253 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1254 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1255 while (num--) {
1256 dictEntry *de;
1257 time_t t;
1258
1259 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1260 t = (time_t) dictGetEntryVal(de);
1261 if (now > t) {
1262 deleteKey(db,dictGetEntryKey(de));
f2324293 1263 expired++;
94754ccc 1264 }
1265 }
f2324293 1266 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1267 }
1268
4ef8de8a 1269 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1270 * is enbled. Try to free objects from the free list first. */
7e69548d 1271 if (vmCanSwapOut()) {
1272 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1273 server.vm_max_memory)
1274 {
72e9fd40 1275 int retval;
1276
a5819310 1277 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1278 retval = (server.vm_max_threads == 0) ?
1279 vmSwapOneObjectBlocking() :
1280 vmSwapOneObjectThreaded();
1281 if (retval == REDIS_ERR && (loops % 30) == 0 &&
1282 zmalloc_used_memory() >
1283 (server.vm_max_memory+server.vm_max_memory/10))
1284 {
1285 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1286 }
72e9fd40 1287 /* Note that when using threade I/O we free just one object,
1288 * because anyway when the I/O thread in charge to swap this
1289 * object out will finish, the handler of completed jobs
1290 * will try to swap more objects if we are still out of memory. */
1291 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1292 }
1293 }
1294
ed9b544e 1295 /* Check if we should connect to a MASTER */
1296 if (server.replstate == REDIS_REPL_CONNECT) {
1297 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1298 if (syncWithMaster() == REDIS_OK) {
1299 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1300 }
1301 }
1302 return 1000;
1303}
1304
d5d55fc3 1305/* This function gets called every time Redis is entering the
1306 * main loop of the event driven library, that is, before to sleep
1307 * for ready file descriptors. */
1308static void beforeSleep(struct aeEventLoop *eventLoop) {
1309 REDIS_NOTUSED(eventLoop);
1310
1311 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1312 listIter li;
1313 listNode *ln;
1314
1315 listRewind(server.io_ready_clients,&li);
1316 while((ln = listNext(&li))) {
1317 redisClient *c = ln->value;
1318 struct redisCommand *cmd;
1319
1320 /* Resume the client. */
1321 listDelNode(server.io_ready_clients,ln);
1322 c->flags &= (~REDIS_IO_WAIT);
1323 server.vm_blocked_clients--;
1324 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1325 readQueryFromClient, c);
1326 cmd = lookupCommand(c->argv[0]->ptr);
1327 assert(cmd != NULL);
1328 call(c,cmd);
1329 resetClient(c);
1330 /* There may be more data to process in the input buffer. */
1331 if (c->querybuf && sdslen(c->querybuf) > 0)
1332 processInputBuffer(c);
1333 }
1334 }
1335}
1336
ed9b544e 1337static void createSharedObjects(void) {
1338 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1339 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1340 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1341 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1342 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1343 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1344 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1345 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1346 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1347 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1348 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1349 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1350 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1351 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1352 "-ERR no such key\r\n"));
ed9b544e 1353 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1354 "-ERR syntax error\r\n"));
c937aa89 1355 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1356 "-ERR source and destination objects are the same\r\n"));
1357 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1358 "-ERR index out of range\r\n"));
ed9b544e 1359 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1360 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1361 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1362 shared.select0 = createStringObject("select 0\r\n",10);
1363 shared.select1 = createStringObject("select 1\r\n",10);
1364 shared.select2 = createStringObject("select 2\r\n",10);
1365 shared.select3 = createStringObject("select 3\r\n",10);
1366 shared.select4 = createStringObject("select 4\r\n",10);
1367 shared.select5 = createStringObject("select 5\r\n",10);
1368 shared.select6 = createStringObject("select 6\r\n",10);
1369 shared.select7 = createStringObject("select 7\r\n",10);
1370 shared.select8 = createStringObject("select 8\r\n",10);
1371 shared.select9 = createStringObject("select 9\r\n",10);
1372}
1373
1374static void appendServerSaveParams(time_t seconds, int changes) {
1375 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1376 server.saveparams[server.saveparamslen].seconds = seconds;
1377 server.saveparams[server.saveparamslen].changes = changes;
1378 server.saveparamslen++;
1379}
1380
bcfc686d 1381static void resetServerSaveParams() {
ed9b544e 1382 zfree(server.saveparams);
1383 server.saveparams = NULL;
1384 server.saveparamslen = 0;
1385}
1386
1387static void initServerConfig() {
1388 server.dbnum = REDIS_DEFAULT_DBNUM;
1389 server.port = REDIS_SERVERPORT;
f870935d 1390 server.verbosity = REDIS_VERBOSE;
ed9b544e 1391 server.maxidletime = REDIS_MAXIDLETIME;
1392 server.saveparams = NULL;
1393 server.logfile = NULL; /* NULL = log on standard output */
1394 server.bindaddr = NULL;
1395 server.glueoutputbuf = 1;
1396 server.daemonize = 0;
44b38ef4 1397 server.appendonly = 0;
4e141d5a 1398 server.appendfsync = APPENDFSYNC_ALWAYS;
48f0308a 1399 server.lastfsync = time(NULL);
44b38ef4 1400 server.appendfd = -1;
1401 server.appendseldb = -1; /* Make sure the first time will not match */
ed329fcf 1402 server.pidfile = "/var/run/redis.pid";
ed9b544e 1403 server.dbfilename = "dump.rdb";
9d65a1bb 1404 server.appendfilename = "appendonly.aof";
abcb223e 1405 server.requirepass = NULL;
10c43610 1406 server.shareobjects = 0;
b0553789 1407 server.rdbcompression = 1;
21aecf4b 1408 server.sharingpoolsize = 1024;
285add55 1409 server.maxclients = 0;
d5d55fc3 1410 server.blpop_blocked_clients = 0;
3fd78bcd 1411 server.maxmemory = 0;
75680a3c 1412 server.vm_enabled = 0;
054e426d 1413 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1414 server.vm_page_size = 256; /* 256 bytes per page */
1415 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1416 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1417 server.vm_max_threads = 4;
d5d55fc3 1418 server.vm_blocked_clients = 0;
75680a3c 1419
bcfc686d 1420 resetServerSaveParams();
ed9b544e 1421
1422 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1423 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1424 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1425 /* Replication related */
1426 server.isslave = 0;
d0ccebcf 1427 server.masterauth = NULL;
ed9b544e 1428 server.masterhost = NULL;
1429 server.masterport = 6379;
1430 server.master = NULL;
1431 server.replstate = REDIS_REPL_NONE;
a7866db6 1432
1433 /* Double constants initialization */
1434 R_Zero = 0.0;
1435 R_PosInf = 1.0/R_Zero;
1436 R_NegInf = -1.0/R_Zero;
1437 R_Nan = R_Zero/R_Zero;
ed9b544e 1438}
1439
1440static void initServer() {
1441 int j;
1442
1443 signal(SIGHUP, SIG_IGN);
1444 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1445 setupSigSegvAction();
ed9b544e 1446
b9bc0eef 1447 server.devnull = fopen("/dev/null","w");
1448 if (server.devnull == NULL) {
1449 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1450 exit(1);
1451 }
ed9b544e 1452 server.clients = listCreate();
1453 server.slaves = listCreate();
87eca727 1454 server.monitors = listCreate();
ed9b544e 1455 server.objfreelist = listCreate();
1456 createSharedObjects();
1457 server.el = aeCreateEventLoop();
3305306f 1458 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
10c43610 1459 server.sharingpool = dictCreate(&setDictType,NULL);
ed9b544e 1460 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1461 if (server.fd == -1) {
1462 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1463 exit(1);
1464 }
3305306f 1465 for (j = 0; j < server.dbnum; j++) {
1466 server.db[j].dict = dictCreate(&hashDictType,NULL);
f2d9f50f 1467 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
4409877e 1468 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1469 if (server.vm_enabled)
1470 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1471 server.db[j].id = j;
1472 }
ed9b544e 1473 server.cronloops = 0;
9f3c422c 1474 server.bgsavechildpid = -1;
9d65a1bb 1475 server.bgrewritechildpid = -1;
1476 server.bgrewritebuf = sdsempty();
ed9b544e 1477 server.lastsave = time(NULL);
1478 server.dirty = 0;
ed9b544e 1479 server.stat_numcommands = 0;
1480 server.stat_numconnections = 0;
1481 server.stat_starttime = time(NULL);
3a66edc7 1482 server.unixtime = time(NULL);
d8f8b666 1483 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1484 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1485 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1486
1487 if (server.appendonly) {
71eba477 1488 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1489 if (server.appendfd == -1) {
1490 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1491 strerror(errno));
1492 exit(1);
1493 }
1494 }
75680a3c 1495
1496 if (server.vm_enabled) vmInit();
ed9b544e 1497}
1498
1499/* Empty the whole database */
ca37e9cd 1500static long long emptyDb() {
ed9b544e 1501 int j;
ca37e9cd 1502 long long removed = 0;
ed9b544e 1503
3305306f 1504 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1505 removed += dictSize(server.db[j].dict);
3305306f 1506 dictEmpty(server.db[j].dict);
1507 dictEmpty(server.db[j].expires);
1508 }
ca37e9cd 1509 return removed;
ed9b544e 1510}
1511
/* Convert a "yes" / "no" string (compared ignoring case) into an integer.
 * Returns 1 for "yes", 0 for "no", and -1 for any other string. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1517
ed9b544e 1518/* I agree, this is a very rudimental way to load a configuration...
1519 will improve later if the config gets more complex */
1520static void loadServerConfig(char *filename) {
c9a111ac 1521 FILE *fp;
ed9b544e 1522 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1523 int linenum = 0;
1524 sds line = NULL;
c9a111ac 1525
1526 if (filename[0] == '-' && filename[1] == '\0')
1527 fp = stdin;
1528 else {
1529 if ((fp = fopen(filename,"r")) == NULL) {
1530 redisLog(REDIS_WARNING,"Fatal error, can't open config file");
1531 exit(1);
1532 }
ed9b544e 1533 }
c9a111ac 1534
ed9b544e 1535 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1536 sds *argv;
1537 int argc, j;
1538
1539 linenum++;
1540 line = sdsnew(buf);
1541 line = sdstrim(line," \t\r\n");
1542
1543 /* Skip comments and blank lines*/
1544 if (line[0] == '#' || line[0] == '\0') {
1545 sdsfree(line);
1546 continue;
1547 }
1548
1549 /* Split into arguments */
1550 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1551 sdstolower(argv[0]);
1552
1553 /* Execute config directives */
bb0b03a3 1554 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1555 server.maxidletime = atoi(argv[1]);
0150db36 1556 if (server.maxidletime < 0) {
ed9b544e 1557 err = "Invalid timeout value"; goto loaderr;
1558 }
bb0b03a3 1559 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1560 server.port = atoi(argv[1]);
1561 if (server.port < 1 || server.port > 65535) {
1562 err = "Invalid port"; goto loaderr;
1563 }
bb0b03a3 1564 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1565 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1566 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1567 int seconds = atoi(argv[1]);
1568 int changes = atoi(argv[2]);
1569 if (seconds < 1 || changes < 0) {
1570 err = "Invalid save parameters"; goto loaderr;
1571 }
1572 appendServerSaveParams(seconds,changes);
bb0b03a3 1573 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1574 if (chdir(argv[1]) == -1) {
1575 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1576 argv[1], strerror(errno));
1577 exit(1);
1578 }
bb0b03a3 1579 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1580 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1581 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1582 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1583 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1584 else {
1585 err = "Invalid log level. Must be one of debug, notice, warning";
1586 goto loaderr;
1587 }
bb0b03a3 1588 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1589 FILE *logfp;
ed9b544e 1590
1591 server.logfile = zstrdup(argv[1]);
bb0b03a3 1592 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1593 zfree(server.logfile);
1594 server.logfile = NULL;
1595 }
1596 if (server.logfile) {
1597 /* Test if we are able to open the file. The server will not
1598 * be able to abort just for this problem later... */
c9a111ac 1599 logfp = fopen(server.logfile,"a");
1600 if (logfp == NULL) {
ed9b544e 1601 err = sdscatprintf(sdsempty(),
1602 "Can't open the log file: %s", strerror(errno));
1603 goto loaderr;
1604 }
c9a111ac 1605 fclose(logfp);
ed9b544e 1606 }
bb0b03a3 1607 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1608 server.dbnum = atoi(argv[1]);
1609 if (server.dbnum < 1) {
1610 err = "Invalid number of databases"; goto loaderr;
1611 }
285add55 1612 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1613 server.maxclients = atoi(argv[1]);
3fd78bcd 1614 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
d4465900 1615 server.maxmemory = strtoll(argv[1], NULL, 10);
bb0b03a3 1616 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1617 server.masterhost = sdsnew(argv[1]);
1618 server.masterport = atoi(argv[2]);
1619 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1620 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1621 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1622 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1623 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1624 err = "argument must be 'yes' or 'no'"; goto loaderr;
1625 }
bb0b03a3 1626 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
85dd2f3a 1627 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
10c43610 1628 err = "argument must be 'yes' or 'no'"; goto loaderr;
1629 }
121f70cf 1630 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1631 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1632 err = "argument must be 'yes' or 'no'"; goto loaderr;
1633 }
e52c65b9 1634 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1635 server.sharingpoolsize = atoi(argv[1]);
1636 if (server.sharingpoolsize < 1) {
1637 err = "invalid object sharing pool size"; goto loaderr;
1638 }
bb0b03a3 1639 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1640 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1641 err = "argument must be 'yes' or 'no'"; goto loaderr;
1642 }
44b38ef4 1643 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1644 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1645 err = "argument must be 'yes' or 'no'"; goto loaderr;
1646 }
48f0308a 1647 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1648 if (!strcasecmp(argv[1],"no")) {
48f0308a 1649 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1650 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1651 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1652 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1653 server.appendfsync = APPENDFSYNC_EVERYSEC;
1654 } else {
1655 err = "argument must be 'no', 'always' or 'everysec'";
1656 goto loaderr;
1657 }
bb0b03a3 1658 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1659 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1660 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
054e426d 1661 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1662 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
054e426d 1663 server.dbfilename = zstrdup(argv[1]);
75680a3c 1664 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1665 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1666 err = "argument must be 'yes' or 'no'"; goto loaderr;
1667 }
054e426d 1668 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1669 zfree(server.vm_swap_file);
054e426d 1670 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1671 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1672 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1673 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1674 server.vm_page_size = strtoll(argv[1], NULL, 10);
1675 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1676 server.vm_pages = strtoll(argv[1], NULL, 10);
92f8e882 1677 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1678 server.vm_max_threads = strtoll(argv[1], NULL, 10);
ed9b544e 1679 } else {
1680 err = "Bad directive or wrong number of arguments"; goto loaderr;
1681 }
1682 for (j = 0; j < argc; j++)
1683 sdsfree(argv[j]);
1684 zfree(argv);
1685 sdsfree(line);
1686 }
c9a111ac 1687 if (fp != stdin) fclose(fp);
ed9b544e 1688 return;
1689
1690loaderr:
1691 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1692 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1693 fprintf(stderr, ">>> '%s'\n", line);
1694 fprintf(stderr, "%s\n", err);
1695 exit(1);
1696}
1697
1698static void freeClientArgv(redisClient *c) {
1699 int j;
1700
1701 for (j = 0; j < c->argc; j++)
1702 decrRefCount(c->argv[j]);
e8a74421 1703 for (j = 0; j < c->mbargc; j++)
1704 decrRefCount(c->mbargv[j]);
ed9b544e 1705 c->argc = 0;
e8a74421 1706 c->mbargc = 0;
ed9b544e 1707}
1708
1709static void freeClient(redisClient *c) {
1710 listNode *ln;
1711
4409877e 1712 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 1713 * call, we have to set querybuf to NULL *before* to call
1714 * unblockClientWaitingData() to avoid processInputBuffer() will get
1715 * called. Also it is important to remove the file events after
1716 * this, because this call adds the READABLE event. */
4409877e 1717 sdsfree(c->querybuf);
1718 c->querybuf = NULL;
1719 if (c->flags & REDIS_BLOCKED)
b0d8747d 1720 unblockClientWaitingData(c);
4409877e 1721
ed9b544e 1722 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1723 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 1724 listRelease(c->reply);
1725 freeClientArgv(c);
1726 close(c->fd);
92f8e882 1727 /* Remove from the list of clients */
ed9b544e 1728 ln = listSearchKey(server.clients,c);
dfc5e96c 1729 redisAssert(ln != NULL);
ed9b544e 1730 listDelNode(server.clients,ln);
d5d55fc3 1731 /* Remove from the list of clients waiting for swapped keys */
1732 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1733 ln = listSearchKey(server.io_ready_clients,c);
1734 if (ln) {
1735 listDelNode(server.io_ready_clients,ln);
1736 server.vm_blocked_clients--;
1737 }
1738 }
1739 while (server.vm_enabled && listLength(c->io_keys)) {
1740 ln = listFirst(c->io_keys);
1741 dontWaitForSwappedKey(c,ln->value);
92f8e882 1742 }
b3e3d0d7 1743 listRelease(c->io_keys);
92f8e882 1744 /* Other cleanup */
ed9b544e 1745 if (c->flags & REDIS_SLAVE) {
6208b3a7 1746 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1747 close(c->repldbfd);
87eca727 1748 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1749 ln = listSearchKey(l,c);
dfc5e96c 1750 redisAssert(ln != NULL);
87eca727 1751 listDelNode(l,ln);
ed9b544e 1752 }
1753 if (c->flags & REDIS_MASTER) {
1754 server.master = NULL;
1755 server.replstate = REDIS_REPL_CONNECT;
1756 }
93ea3759 1757 zfree(c->argv);
e8a74421 1758 zfree(c->mbargv);
6e469882 1759 freeClientMultiState(c);
ed9b544e 1760 zfree(c);
1761}
1762
cc30e368 1763#define GLUEREPLY_UP_TO (1024)
ed9b544e 1764static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 1765 int copylen = 0;
1766 char buf[GLUEREPLY_UP_TO];
6208b3a7 1767 listNode *ln;
c7df85a4 1768 listIter li;
ed9b544e 1769 robj *o;
1770
c7df85a4 1771 listRewind(c->reply,&li);
1772 while((ln = listNext(&li))) {
c28b42ac 1773 int objlen;
1774
ed9b544e 1775 o = ln->value;
c28b42ac 1776 objlen = sdslen(o->ptr);
1777 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1778 memcpy(buf+copylen,o->ptr,objlen);
1779 copylen += objlen;
ed9b544e 1780 listDelNode(c->reply,ln);
c28b42ac 1781 } else {
1782 if (copylen == 0) return;
1783 break;
ed9b544e 1784 }
ed9b544e 1785 }
c28b42ac 1786 /* Now the output buffer is empty, add the new single element */
1787 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1788 listAddNodeHead(c->reply,o);
ed9b544e 1789}
1790
1791static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1792 redisClient *c = privdata;
1793 int nwritten = 0, totwritten = 0, objlen;
1794 robj *o;
1795 REDIS_NOTUSED(el);
1796 REDIS_NOTUSED(mask);
1797
2895e862 1798 /* Use writev() if we have enough buffers to send */
7ea870c0 1799 if (!server.glueoutputbuf &&
1800 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1801 !(c->flags & REDIS_MASTER))
2895e862 1802 {
1803 sendReplyToClientWritev(el, fd, privdata, mask);
1804 return;
1805 }
2895e862 1806
ed9b544e 1807 while(listLength(c->reply)) {
c28b42ac 1808 if (server.glueoutputbuf && listLength(c->reply) > 1)
1809 glueReplyBuffersIfNeeded(c);
1810
ed9b544e 1811 o = listNodeValue(listFirst(c->reply));
1812 objlen = sdslen(o->ptr);
1813
1814 if (objlen == 0) {
1815 listDelNode(c->reply,listFirst(c->reply));
1816 continue;
1817 }
1818
1819 if (c->flags & REDIS_MASTER) {
6f376729 1820 /* Don't reply to a master */
ed9b544e 1821 nwritten = objlen - c->sentlen;
1822 } else {
a4d1ba9a 1823 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 1824 if (nwritten <= 0) break;
1825 }
1826 c->sentlen += nwritten;
1827 totwritten += nwritten;
1828 /* If we fully sent the object on head go to the next one */
1829 if (c->sentlen == objlen) {
1830 listDelNode(c->reply,listFirst(c->reply));
1831 c->sentlen = 0;
1832 }
6f376729 1833 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 1834 * bytes, in a single threaded server it's a good idea to serve
6f376729 1835 * other clients as well, even if a very large request comes from
1836 * super fast link that is always able to accept data (in real world
12f9d551 1837 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 1838 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 1839 }
1840 if (nwritten == -1) {
1841 if (errno == EAGAIN) {
1842 nwritten = 0;
1843 } else {
f870935d 1844 redisLog(REDIS_VERBOSE,
ed9b544e 1845 "Error writing to client: %s", strerror(errno));
1846 freeClient(c);
1847 return;
1848 }
1849 }
1850 if (totwritten > 0) c->lastinteraction = time(NULL);
1851 if (listLength(c->reply) == 0) {
1852 c->sentlen = 0;
1853 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1854 }
1855}
1856
2895e862 1857static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1858{
1859 redisClient *c = privdata;
1860 int nwritten = 0, totwritten = 0, objlen, willwrite;
1861 robj *o;
1862 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1863 int offset, ion = 0;
1864 REDIS_NOTUSED(el);
1865 REDIS_NOTUSED(mask);
1866
1867 listNode *node;
1868 while (listLength(c->reply)) {
1869 offset = c->sentlen;
1870 ion = 0;
1871 willwrite = 0;
1872
1873 /* fill-in the iov[] array */
1874 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1875 o = listNodeValue(node);
1876 objlen = sdslen(o->ptr);
1877
1878 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1879 break;
1880
1881 if(ion == REDIS_WRITEV_IOVEC_COUNT)
1882 break; /* no more iovecs */
1883
1884 iov[ion].iov_base = ((char*)o->ptr) + offset;
1885 iov[ion].iov_len = objlen - offset;
1886 willwrite += objlen - offset;
1887 offset = 0; /* just for the first item */
1888 ion++;
1889 }
1890
1891 if(willwrite == 0)
1892 break;
1893
1894 /* write all collected blocks at once */
1895 if((nwritten = writev(fd, iov, ion)) < 0) {
1896 if (errno != EAGAIN) {
f870935d 1897 redisLog(REDIS_VERBOSE,
2895e862 1898 "Error writing to client: %s", strerror(errno));
1899 freeClient(c);
1900 return;
1901 }
1902 break;
1903 }
1904
1905 totwritten += nwritten;
1906 offset = c->sentlen;
1907
1908 /* remove written robjs from c->reply */
1909 while (nwritten && listLength(c->reply)) {
1910 o = listNodeValue(listFirst(c->reply));
1911 objlen = sdslen(o->ptr);
1912
1913 if(nwritten >= objlen - offset) {
1914 listDelNode(c->reply, listFirst(c->reply));
1915 nwritten -= objlen - offset;
1916 c->sentlen = 0;
1917 } else {
1918 /* partial write */
1919 c->sentlen += nwritten;
1920 break;
1921 }
1922 offset = 0;
1923 }
1924 }
1925
1926 if (totwritten > 0)
1927 c->lastinteraction = time(NULL);
1928
1929 if (listLength(c->reply) == 0) {
1930 c->sentlen = 0;
1931 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1932 }
1933}
1934
ed9b544e 1935static struct redisCommand *lookupCommand(char *name) {
1936 int j = 0;
1937 while(cmdTable[j].name != NULL) {
bb0b03a3 1938 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 1939 j++;
1940 }
1941 return NULL;
1942}
1943
1944/* resetClient prepare the client to process the next command */
1945static void resetClient(redisClient *c) {
1946 freeClientArgv(c);
1947 c->bulklen = -1;
e8a74421 1948 c->multibulk = 0;
ed9b544e 1949}
1950
6e469882 1951/* Call() is the core of Redis execution of a command */
1952static void call(redisClient *c, struct redisCommand *cmd) {
1953 long long dirty;
1954
1955 dirty = server.dirty;
1956 cmd->proc(c);
1957 if (server.appendonly && server.dirty-dirty)
1958 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
1959 if (server.dirty-dirty && listLength(server.slaves))
1960 replicationFeedSlaves(server.slaves,cmd,c->db->id,c->argv,c->argc);
1961 if (listLength(server.monitors))
1962 replicationFeedSlaves(server.monitors,cmd,c->db->id,c->argv,c->argc);
1963 server.stat_numcommands++;
1964}
1965
ed9b544e 1966/* If this function gets called we already read a whole
1967 * command, argments are in the client argv/argc fields.
1968 * processCommand() execute the command or prepare the
1969 * server for a bulk read from the client.
1970 *
1971 * If 1 is returned the client is still alive and valid and
1972 * and other operations can be performed by the caller. Otherwise
1973 * if 0 is returned the client was destroied (i.e. after QUIT). */
1974static int processCommand(redisClient *c) {
1975 struct redisCommand *cmd;
ed9b544e 1976
3fd78bcd 1977 /* Free some memory if needed (maxmemory setting) */
1978 if (server.maxmemory) freeMemoryIfNeeded();
1979
e8a74421 1980 /* Handle the multi bulk command type. This is an alternative protocol
1981 * supported by Redis in order to receive commands that are composed of
1982 * multiple binary-safe "bulk" arguments. The latency of processing is
1983 * a bit higher but this allows things like multi-sets, so if this
1984 * protocol is used only for MSET and similar commands this is a big win. */
1985 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
1986 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
1987 if (c->multibulk <= 0) {
1988 resetClient(c);
1989 return 1;
1990 } else {
1991 decrRefCount(c->argv[c->argc-1]);
1992 c->argc--;
1993 return 1;
1994 }
1995 } else if (c->multibulk) {
1996 if (c->bulklen == -1) {
1997 if (((char*)c->argv[0]->ptr)[0] != '$') {
1998 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
1999 resetClient(c);
2000 return 1;
2001 } else {
2002 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2003 decrRefCount(c->argv[0]);
2004 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2005 c->argc--;
2006 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2007 resetClient(c);
2008 return 1;
2009 }
2010 c->argc--;
2011 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2012 return 1;
2013 }
2014 } else {
2015 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2016 c->mbargv[c->mbargc] = c->argv[0];
2017 c->mbargc++;
2018 c->argc--;
2019 c->multibulk--;
2020 if (c->multibulk == 0) {
2021 robj **auxargv;
2022 int auxargc;
2023
2024 /* Here we need to swap the multi-bulk argc/argv with the
2025 * normal argc/argv of the client structure. */
2026 auxargv = c->argv;
2027 c->argv = c->mbargv;
2028 c->mbargv = auxargv;
2029
2030 auxargc = c->argc;
2031 c->argc = c->mbargc;
2032 c->mbargc = auxargc;
2033
2034 /* We need to set bulklen to something different than -1
2035 * in order for the code below to process the command without
2036 * to try to read the last argument of a bulk command as
2037 * a special argument. */
2038 c->bulklen = 0;
2039 /* continue below and process the command */
2040 } else {
2041 c->bulklen = -1;
2042 return 1;
2043 }
2044 }
2045 }
2046 /* -- end of multi bulk commands processing -- */
2047
ed9b544e 2048 /* The QUIT command is handled as a special case. Normal command
2049 * procs are unable to close the client connection safely */
bb0b03a3 2050 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2051 freeClient(c);
2052 return 0;
2053 }
d5d55fc3 2054
2055 /* Now lookup the command and check ASAP about trivial error conditions
2056 * such wrong arity, bad command name and so forth. */
ed9b544e 2057 cmd = lookupCommand(c->argv[0]->ptr);
2058 if (!cmd) {
2c14807b 2059 addReplySds(c,
2060 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2061 (char*)c->argv[0]->ptr));
ed9b544e 2062 resetClient(c);
2063 return 1;
2064 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2065 (c->argc < -cmd->arity)) {
454d4e43 2066 addReplySds(c,
2067 sdscatprintf(sdsempty(),
2068 "-ERR wrong number of arguments for '%s' command\r\n",
2069 cmd->name));
ed9b544e 2070 resetClient(c);
2071 return 1;
3fd78bcd 2072 } else if (server.maxmemory && cmd->flags & REDIS_CMD_DENYOOM && zmalloc_used_memory() > server.maxmemory) {
2073 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2074 resetClient(c);
2075 return 1;
ed9b544e 2076 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2077 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2078 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2079
2080 decrRefCount(c->argv[c->argc-1]);
2081 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2082 c->argc--;
2083 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2084 resetClient(c);
2085 return 1;
2086 }
2087 c->argc--;
2088 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2089 /* It is possible that the bulk read is already in the
8d0490e7 2090 * buffer. Check this condition and handle it accordingly.
2091 * This is just a fast path, alternative to call processInputBuffer().
2092 * It's a good idea since the code is small and this condition
2093 * happens most of the times. */
ed9b544e 2094 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2095 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2096 c->argc++;
2097 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2098 } else {
d5d55fc3 2099 /* Otherwise return... there is to read the last argument
2100 * from the socket. */
ed9b544e 2101 return 1;
2102 }
2103 }
10c43610 2104 /* Let's try to share objects on the command arguments vector */
2105 if (server.shareobjects) {
2106 int j;
2107 for(j = 1; j < c->argc; j++)
2108 c->argv[j] = tryObjectSharing(c->argv[j]);
2109 }
942a3961 2110 /* Let's try to encode the bulk object to save space. */
2111 if (cmd->flags & REDIS_CMD_BULK)
2112 tryObjectEncoding(c->argv[c->argc-1]);
2113
e63943a4 2114 /* Check if the user is authenticated */
2115 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2116 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2117 resetClient(c);
2118 return 1;
2119 }
2120
ed9b544e 2121 /* Exec the command */
6e469882 2122 if (c->flags & REDIS_MULTI && cmd->proc != execCommand) {
2123 queueMultiCommand(c,cmd);
2124 addReply(c,shared.queued);
2125 } else {
d5d55fc3 2126 if (server.vm_enabled && server.vm_max_threads > 0 &&
2127 blockClientOnSwappedKeys(cmd,c)) return 1;
6e469882 2128 call(c,cmd);
2129 }
ed9b544e 2130
2131 /* Prepare the client for the next command */
ed9b544e 2132 resetClient(c);
2133 return 1;
2134}
2135
87eca727 2136static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc) {
6208b3a7 2137 listNode *ln;
c7df85a4 2138 listIter li;
ed9b544e 2139 int outc = 0, j;
93ea3759 2140 robj **outv;
2141 /* (args*2)+1 is enough room for args, spaces, newlines */
2142 robj *static_outv[REDIS_STATIC_ARGS*2+1];
2143
2144 if (argc <= REDIS_STATIC_ARGS) {
2145 outv = static_outv;
2146 } else {
2147 outv = zmalloc(sizeof(robj*)*(argc*2+1));
93ea3759 2148 }
ed9b544e 2149
2150 for (j = 0; j < argc; j++) {
2151 if (j != 0) outv[outc++] = shared.space;
2152 if ((cmd->flags & REDIS_CMD_BULK) && j == argc-1) {
2153 robj *lenobj;
2154
2155 lenobj = createObject(REDIS_STRING,
682ac724 2156 sdscatprintf(sdsempty(),"%lu\r\n",
83c6a618 2157 (unsigned long) stringObjectLen(argv[j])));
ed9b544e 2158 lenobj->refcount = 0;
2159 outv[outc++] = lenobj;
2160 }
2161 outv[outc++] = argv[j];
2162 }
2163 outv[outc++] = shared.crlf;
2164
40d224a9 2165 /* Increment all the refcounts at start and decrement at end in order to
2166 * be sure to free objects if there is no slave in a replication state
2167 * able to be feed with commands */
2168 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2169 listRewind(slaves,&li);
2170 while((ln = listNext(&li))) {
ed9b544e 2171 redisClient *slave = ln->value;
40d224a9 2172
2173 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2174 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2175
2176 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2177 if (slave->slaveseldb != dictid) {
2178 robj *selectcmd;
2179
2180 switch(dictid) {
2181 case 0: selectcmd = shared.select0; break;
2182 case 1: selectcmd = shared.select1; break;
2183 case 2: selectcmd = shared.select2; break;
2184 case 3: selectcmd = shared.select3; break;
2185 case 4: selectcmd = shared.select4; break;
2186 case 5: selectcmd = shared.select5; break;
2187 case 6: selectcmd = shared.select6; break;
2188 case 7: selectcmd = shared.select7; break;
2189 case 8: selectcmd = shared.select8; break;
2190 case 9: selectcmd = shared.select9; break;
2191 default:
2192 selectcmd = createObject(REDIS_STRING,
2193 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2194 selectcmd->refcount = 0;
2195 break;
2196 }
2197 addReply(slave,selectcmd);
2198 slave->slaveseldb = dictid;
2199 }
2200 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2201 }
40d224a9 2202 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2203 if (outv != static_outv) zfree(outv);
ed9b544e 2204}
2205
638e42ac 2206static void processInputBuffer(redisClient *c) {
ed9b544e 2207again:
4409877e 2208 /* Before to process the input buffer, make sure the client is not
2209 * waitig for a blocking operation such as BLPOP. Note that the first
2210 * iteration the client is never blocked, otherwise the processInputBuffer
2211 * would not be called at all, but after the execution of the first commands
2212 * in the input buffer the client may be blocked, and the "goto again"
2213 * will try to reiterate. The following line will make it return asap. */
92f8e882 2214 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2215 if (c->bulklen == -1) {
2216 /* Read the first line of the query */
2217 char *p = strchr(c->querybuf,'\n');
2218 size_t querylen;
644fafa3 2219
ed9b544e 2220 if (p) {
2221 sds query, *argv;
2222 int argc, j;
2223
2224 query = c->querybuf;
2225 c->querybuf = sdsempty();
2226 querylen = 1+(p-(query));
2227 if (sdslen(query) > querylen) {
2228 /* leave data after the first line of the query in the buffer */
2229 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2230 }
2231 *p = '\0'; /* remove "\n" */
2232 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2233 sdsupdatelen(query);
2234
2235 /* Now we can split the query in arguments */
ed9b544e 2236 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2237 sdsfree(query);
2238
2239 if (c->argv) zfree(c->argv);
2240 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2241
2242 for (j = 0; j < argc; j++) {
ed9b544e 2243 if (sdslen(argv[j])) {
2244 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2245 c->argc++;
2246 } else {
2247 sdsfree(argv[j]);
2248 }
2249 }
2250 zfree(argv);
7c49733c 2251 if (c->argc) {
2252 /* Execute the command. If the client is still valid
2253 * after processCommand() return and there is something
2254 * on the query buffer try to process the next command. */
2255 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2256 } else {
2257 /* Nothing to process, argc == 0. Just process the query
2258 * buffer if it's not empty or return to the caller */
2259 if (sdslen(c->querybuf)) goto again;
2260 }
ed9b544e 2261 return;
644fafa3 2262 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2263 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2264 freeClient(c);
2265 return;
2266 }
2267 } else {
2268 /* Bulk read handling. Note that if we are at this point
2269 the client already sent a command terminated with a newline,
2270 we are reading the bulk data that is actually the last
2271 argument of the command. */
2272 int qbl = sdslen(c->querybuf);
2273
2274 if (c->bulklen <= qbl) {
2275 /* Copy everything but the final CRLF as final argument */
2276 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2277 c->argc++;
2278 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2279 /* Process the command. If the client is still valid after
2280 * the processing and there is more data in the buffer
2281 * try to parse it. */
2282 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2283 return;
2284 }
2285 }
2286}
2287
638e42ac 2288static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2289 redisClient *c = (redisClient*) privdata;
2290 char buf[REDIS_IOBUF_LEN];
2291 int nread;
2292 REDIS_NOTUSED(el);
2293 REDIS_NOTUSED(mask);
2294
2295 nread = read(fd, buf, REDIS_IOBUF_LEN);
2296 if (nread == -1) {
2297 if (errno == EAGAIN) {
2298 nread = 0;
2299 } else {
f870935d 2300 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2301 freeClient(c);
2302 return;
2303 }
2304 } else if (nread == 0) {
f870935d 2305 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2306 freeClient(c);
2307 return;
2308 }
2309 if (nread) {
2310 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2311 c->lastinteraction = time(NULL);
2312 } else {
2313 return;
2314 }
2315 processInputBuffer(c);
2316}
2317
ed9b544e 2318static int selectDb(redisClient *c, int id) {
2319 if (id < 0 || id >= server.dbnum)
2320 return REDIS_ERR;
3305306f 2321 c->db = &server.db[id];
ed9b544e 2322 return REDIS_OK;
2323}
2324
40d224a9 2325static void *dupClientReplyValue(void *o) {
2326 incrRefCount((robj*)o);
2327 return 0;
2328}
2329
ed9b544e 2330static redisClient *createClient(int fd) {
2331 redisClient *c = zmalloc(sizeof(*c));
2332
2333 anetNonBlock(NULL,fd);
2334 anetTcpNoDelay(NULL,fd);
2335 if (!c) return NULL;
2336 selectDb(c,0);
2337 c->fd = fd;
2338 c->querybuf = sdsempty();
2339 c->argc = 0;
93ea3759 2340 c->argv = NULL;
ed9b544e 2341 c->bulklen = -1;
e8a74421 2342 c->multibulk = 0;
2343 c->mbargc = 0;
2344 c->mbargv = NULL;
ed9b544e 2345 c->sentlen = 0;
2346 c->flags = 0;
2347 c->lastinteraction = time(NULL);
abcb223e 2348 c->authenticated = 0;
40d224a9 2349 c->replstate = REDIS_REPL_NONE;
6b47e12e 2350 c->reply = listCreate();
ed9b544e 2351 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2352 listSetDupMethod(c->reply,dupClientReplyValue);
92f8e882 2353 c->blockingkeys = NULL;
2354 c->blockingkeysnum = 0;
2355 c->io_keys = listCreate();
2356 listSetFreeMethod(c->io_keys,decrRefCount);
ed9b544e 2357 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2358 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2359 freeClient(c);
2360 return NULL;
2361 }
6b47e12e 2362 listAddNodeTail(server.clients,c);
6e469882 2363 initClientMultiState(c);
ed9b544e 2364 return c;
2365}
2366
2367static void addReply(redisClient *c, robj *obj) {
2368 if (listLength(c->reply) == 0 &&
6208b3a7 2369 (c->replstate == REDIS_REPL_NONE ||
2370 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2371 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2372 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2373
2374 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2375 obj = dupStringObject(obj);
2376 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2377 }
9d65a1bb 2378 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2379}
2380
2381static void addReplySds(redisClient *c, sds s) {
2382 robj *o = createObject(REDIS_STRING,s);
2383 addReply(c,o);
2384 decrRefCount(o);
2385}
2386
e2665397 2387static void addReplyDouble(redisClient *c, double d) {
2388 char buf[128];
2389
2390 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2391 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2392 (unsigned long) strlen(buf),buf));
e2665397 2393}
2394
942a3961 2395static void addReplyBulkLen(redisClient *c, robj *obj) {
2396 size_t len;
2397
2398 if (obj->encoding == REDIS_ENCODING_RAW) {
2399 len = sdslen(obj->ptr);
2400 } else {
2401 long n = (long)obj->ptr;
2402
e054afda 2403 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2404 len = 1;
2405 if (n < 0) {
2406 len++;
2407 n = -n;
2408 }
2409 while((n = n/10) != 0) {
2410 len++;
2411 }
2412 }
83c6a618 2413 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
942a3961 2414}
2415
ed9b544e 2416static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2417 int cport, cfd;
2418 char cip[128];
285add55 2419 redisClient *c;
ed9b544e 2420 REDIS_NOTUSED(el);
2421 REDIS_NOTUSED(mask);
2422 REDIS_NOTUSED(privdata);
2423
2424 cfd = anetAccept(server.neterr, fd, cip, &cport);
2425 if (cfd == AE_ERR) {
f870935d 2426 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2427 return;
2428 }
f870935d 2429 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2430 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2431 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2432 close(cfd); /* May be already closed, just ingore errors */
2433 return;
2434 }
285add55 2435 /* If maxclient directive is set and this is one client more... close the
2436 * connection. Note that we create the client instead to check before
2437 * for this condition, since now the socket is already set in nonblocking
2438 * mode and we can send an error for free using the Kernel I/O */
2439 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2440 char *err = "-ERR max number of clients reached\r\n";
2441
2442 /* That's a best effort error message, don't check write errors */
fee803ba 2443 if (write(c->fd,err,strlen(err)) == -1) {
2444 /* Nothing to do, Just to avoid the warning... */
2445 }
285add55 2446 freeClient(c);
2447 return;
2448 }
ed9b544e 2449 server.stat_numconnections++;
2450}
2451
2452/* ======================= Redis objects implementation ===================== */
2453
2454static robj *createObject(int type, void *ptr) {
2455 robj *o;
2456
a5819310 2457 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2458 if (listLength(server.objfreelist)) {
2459 listNode *head = listFirst(server.objfreelist);
2460 o = listNodeValue(head);
2461 listDelNode(server.objfreelist,head);
a5819310 2462 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2463 } else {
75680a3c 2464 if (server.vm_enabled) {
a5819310 2465 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2466 o = zmalloc(sizeof(*o));
2467 } else {
2468 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2469 }
ed9b544e 2470 }
ed9b544e 2471 o->type = type;
942a3961 2472 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2473 o->ptr = ptr;
2474 o->refcount = 1;
3a66edc7 2475 if (server.vm_enabled) {
1064ef87 2476 /* Note that this code may run in the context of an I/O thread
2477 * and accessing to server.unixtime in theory is an error
2478 * (no locks). But in practice this is safe, and even if we read
2479 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2480 o->vm.atime = server.unixtime;
2481 o->storage = REDIS_VM_MEMORY;
2482 }
ed9b544e 2483 return o;
2484}
2485
2486static robj *createStringObject(char *ptr, size_t len) {
2487 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2488}
2489
4ef8de8a 2490static robj *dupStringObject(robj *o) {
b9bc0eef 2491 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2492 return createStringObject(o->ptr,sdslen(o->ptr));
2493}
2494
ed9b544e 2495static robj *createListObject(void) {
2496 list *l = listCreate();
2497
ed9b544e 2498 listSetFreeMethod(l,decrRefCount);
2499 return createObject(REDIS_LIST,l);
2500}
2501
2502static robj *createSetObject(void) {
2503 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2504 return createObject(REDIS_SET,d);
2505}
2506
1812e024 2507static robj *createZsetObject(void) {
6b47e12e 2508 zset *zs = zmalloc(sizeof(*zs));
2509
2510 zs->dict = dictCreate(&zsetDictType,NULL);
2511 zs->zsl = zslCreate();
2512 return createObject(REDIS_ZSET,zs);
1812e024 2513}
2514
ed9b544e 2515static void freeStringObject(robj *o) {
942a3961 2516 if (o->encoding == REDIS_ENCODING_RAW) {
2517 sdsfree(o->ptr);
2518 }
ed9b544e 2519}
2520
2521static void freeListObject(robj *o) {
2522 listRelease((list*) o->ptr);
2523}
2524
2525static void freeSetObject(robj *o) {
2526 dictRelease((dict*) o->ptr);
2527}
2528
fd8ccf44 2529static void freeZsetObject(robj *o) {
2530 zset *zs = o->ptr;
2531
2532 dictRelease(zs->dict);
2533 zslFree(zs->zsl);
2534 zfree(zs);
2535}
2536
ed9b544e 2537static void freeHashObject(robj *o) {
2538 dictRelease((dict*) o->ptr);
2539}
2540
2541static void incrRefCount(robj *o) {
f2b8ab34 2542 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
ed9b544e 2543 o->refcount++;
2544}
2545
2546static void decrRefCount(void *obj) {
2547 robj *o = obj;
94754ccc 2548
970e10bb 2549 /* Object is a key of a swapped out value, or in the process of being
2550 * loaded. */
996cb5f7 2551 if (server.vm_enabled &&
2552 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2553 {
2554 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2555 redisAssert(o->refcount == 1);
2556 }
2557 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 2558 redisAssert(o->type == REDIS_STRING);
a35ddf12 2559 freeStringObject(o);
2560 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 2561 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 2562 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2563 !listAddNodeHead(server.objfreelist,o))
2564 zfree(o);
a5819310 2565 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 2566 server.vm_stats_swapped_objects--;
a35ddf12 2567 return;
2568 }
996cb5f7 2569 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 2570 if (--(o->refcount) == 0) {
996cb5f7 2571 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2572 vmCancelThreadedIOJob(obj);
ed9b544e 2573 switch(o->type) {
2574 case REDIS_STRING: freeStringObject(o); break;
2575 case REDIS_LIST: freeListObject(o); break;
2576 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 2577 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 2578 case REDIS_HASH: freeHashObject(o); break;
dfc5e96c 2579 default: redisAssert(0 != 0); break;
ed9b544e 2580 }
a5819310 2581 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2582 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2583 !listAddNodeHead(server.objfreelist,o))
2584 zfree(o);
a5819310 2585 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2586 }
2587}
2588
942a3961 2589static robj *lookupKey(redisDb *db, robj *key) {
2590 dictEntry *de = dictFind(db->dict,key);
3a66edc7 2591 if (de) {
55cf8433 2592 robj *key = dictGetEntryKey(de);
2593 robj *val = dictGetEntryVal(de);
3a66edc7 2594
55cf8433 2595 if (server.vm_enabled) {
996cb5f7 2596 if (key->storage == REDIS_VM_MEMORY ||
2597 key->storage == REDIS_VM_SWAPPING)
2598 {
2599 /* If we were swapping the object out, stop it, this key
2600 * was requested. */
2601 if (key->storage == REDIS_VM_SWAPPING)
2602 vmCancelThreadedIOJob(key);
55cf8433 2603 /* Update the access time of the key for the aging algorithm. */
2604 key->vm.atime = server.unixtime;
2605 } else {
d5d55fc3 2606 int notify = (key->storage == REDIS_VM_LOADING);
2607
55cf8433 2608 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 2609 redisAssert(val == NULL);
55cf8433 2610 val = vmLoadObject(key);
2611 dictGetEntryVal(de) = val;
d5d55fc3 2612
2613 /* Clients blocked by the VM subsystem may be waiting for
2614 * this key... */
2615 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 2616 }
2617 }
2618 return val;
3a66edc7 2619 } else {
2620 return NULL;
2621 }
942a3961 2622}
2623
2624static robj *lookupKeyRead(redisDb *db, robj *key) {
2625 expireIfNeeded(db,key);
2626 return lookupKey(db,key);
2627}
2628
2629static robj *lookupKeyWrite(redisDb *db, robj *key) {
2630 deleteIfVolatile(db,key);
2631 return lookupKey(db,key);
2632}
2633
2634static int deleteKey(redisDb *db, robj *key) {
2635 int retval;
2636
2637 /* We need to protect key from destruction: after the first dictDelete()
2638 * it may happen that 'key' is no longer valid if we don't increment
2639 * it's count. This may happen when we get the object reference directly
2640 * from the hash table with dictRandomKey() or dict iterators */
2641 incrRefCount(key);
2642 if (dictSize(db->expires)) dictDelete(db->expires,key);
2643 retval = dictDelete(db->dict,key);
2644 decrRefCount(key);
2645
2646 return retval == DICT_OK;
2647}
2648
10c43610 2649/* Try to share an object against the shared objects pool */
2650static robj *tryObjectSharing(robj *o) {
2651 struct dictEntry *de;
2652 unsigned long c;
2653
3305306f 2654 if (o == NULL || server.shareobjects == 0) return o;
10c43610 2655
dfc5e96c 2656 redisAssert(o->type == REDIS_STRING);
10c43610 2657 de = dictFind(server.sharingpool,o);
2658 if (de) {
2659 robj *shared = dictGetEntryKey(de);
2660
2661 c = ((unsigned long) dictGetEntryVal(de))+1;
2662 dictGetEntryVal(de) = (void*) c;
2663 incrRefCount(shared);
2664 decrRefCount(o);
2665 return shared;
2666 } else {
2667 /* Here we are using a stream algorihtm: Every time an object is
2668 * shared we increment its count, everytime there is a miss we
2669 * recrement the counter of a random object. If this object reaches
2670 * zero we remove the object and put the current object instead. */
3305306f 2671 if (dictSize(server.sharingpool) >=
10c43610 2672 server.sharingpoolsize) {
2673 de = dictGetRandomKey(server.sharingpool);
dfc5e96c 2674 redisAssert(de != NULL);
10c43610 2675 c = ((unsigned long) dictGetEntryVal(de))-1;
2676 dictGetEntryVal(de) = (void*) c;
2677 if (c == 0) {
2678 dictDelete(server.sharingpool,de->key);
2679 }
2680 } else {
2681 c = 0; /* If the pool is empty we want to add this object */
2682 }
2683 if (c == 0) {
2684 int retval;
2685
2686 retval = dictAdd(server.sharingpool,o,(void*)1);
dfc5e96c 2687 redisAssert(retval == DICT_OK);
10c43610 2688 incrRefCount(o);
2689 }
2690 return o;
2691 }
2692}
2693
724a51b1 2694/* Check if the nul-terminated string 's' can be represented by a long
2695 * (that is, is a number that fits into long without any other space or
2696 * character before or after the digits).
2697 *
2698 * If so, the function returns REDIS_OK and *longval is set to the value
2699 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 2700static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 2701 char buf[32], *endptr;
2702 long value;
2703 int slen;
2704
2705 value = strtol(s, &endptr, 10);
2706 if (endptr[0] != '\0') return REDIS_ERR;
2707 slen = snprintf(buf,32,"%ld",value);
2708
2709 /* If the number converted back into a string is not identical
2710 * then it's not possible to encode the string as integer */
f69f2cba 2711 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 2712 if (longval) *longval = value;
2713 return REDIS_OK;
2714}
2715
942a3961 2716/* Try to encode a string object in order to save space */
2717static int tryObjectEncoding(robj *o) {
2718 long value;
942a3961 2719 sds s = o->ptr;
3305306f 2720
942a3961 2721 if (o->encoding != REDIS_ENCODING_RAW)
2722 return REDIS_ERR; /* Already encoded */
3305306f 2723
942a3961 2724 /* It's not save to encode shared objects: shared objects can be shared
2725 * everywhere in the "object space" of Redis. Encoded objects can only
2726 * appear as "values" (and not, for instance, as keys) */
2727 if (o->refcount > 1) return REDIS_ERR;
3305306f 2728
942a3961 2729 /* Currently we try to encode only strings */
dfc5e96c 2730 redisAssert(o->type == REDIS_STRING);
94754ccc 2731
724a51b1 2732 /* Check if we can represent this string as a long integer */
2733 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
942a3961 2734
2735 /* Ok, this object can be encoded */
2736 o->encoding = REDIS_ENCODING_INT;
2737 sdsfree(o->ptr);
2738 o->ptr = (void*) value;
2739 return REDIS_OK;
2740}
2741
9d65a1bb 2742/* Get a decoded version of an encoded object (returned as a new object).
2743 * If the object is already raw-encoded just increment the ref count. */
2744static robj *getDecodedObject(robj *o) {
942a3961 2745 robj *dec;
2746
9d65a1bb 2747 if (o->encoding == REDIS_ENCODING_RAW) {
2748 incrRefCount(o);
2749 return o;
2750 }
942a3961 2751 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2752 char buf[32];
2753
2754 snprintf(buf,32,"%ld",(long)o->ptr);
2755 dec = createStringObject(buf,strlen(buf));
2756 return dec;
2757 } else {
dfc5e96c 2758 redisAssert(1 != 1);
942a3961 2759 }
3305306f 2760}
2761
d7f43c08 2762/* Compare two string objects via strcmp() or alike.
2763 * Note that the objects may be integer-encoded. In such a case we
2764 * use snprintf() to get a string representation of the numbers on the stack
1fd9bc8a 2765 * and compare the strings, it's much faster than calling getDecodedObject().
2766 *
2767 * Important note: if objects are not integer encoded, but binary-safe strings,
2768 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2769 * binary safe. */
724a51b1 2770static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 2771 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 2772 char bufa[128], bufb[128], *astr, *bstr;
2773 int bothsds = 1;
724a51b1 2774
e197b441 2775 if (a == b) return 0;
d7f43c08 2776 if (a->encoding != REDIS_ENCODING_RAW) {
2777 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2778 astr = bufa;
2779 bothsds = 0;
724a51b1 2780 } else {
d7f43c08 2781 astr = a->ptr;
724a51b1 2782 }
d7f43c08 2783 if (b->encoding != REDIS_ENCODING_RAW) {
2784 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2785 bstr = bufb;
2786 bothsds = 0;
2787 } else {
2788 bstr = b->ptr;
2789 }
2790 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 2791}
2792
0ea663ea 2793static size_t stringObjectLen(robj *o) {
dfc5e96c 2794 redisAssert(o->type == REDIS_STRING);
0ea663ea 2795 if (o->encoding == REDIS_ENCODING_RAW) {
2796 return sdslen(o->ptr);
2797 } else {
2798 char buf[32];
2799
2800 return snprintf(buf,32,"%ld",(long)o->ptr);
2801 }
2802}
2803
06233c45 2804/*============================ RDB saving/loading =========================== */
ed9b544e 2805
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 if the byte
 * could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
2810
/* Write the time 't' to 'fp' as a 32 bit signed integer (4 raw bytes, as
 * laid out in memory). Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t t32 = (int32_t) t;

    return (fwrite(&t32,4,1,fp) == 0) ? -1 : 0;
}
2816
e3566d4b 2817/* check rdbLoadLen() comments for more info */
f78fd11b 2818static int rdbSaveLen(FILE *fp, uint32_t len) {
2819 unsigned char buf[2];
2820
2821 if (len < (1<<6)) {
2822 /* Save a 6 bit len */
10c43610 2823 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 2824 if (fwrite(buf,1,1,fp) == 0) return -1;
2825 } else if (len < (1<<14)) {
2826 /* Save a 14 bit len */
10c43610 2827 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 2828 buf[1] = len&0xFF;
17be1a4a 2829 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 2830 } else {
2831 /* Save a 32 bit len */
10c43610 2832 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 2833 if (fwrite(buf,1,1,fp) == 0) return -1;
2834 len = htonl(len);
2835 if (fwrite(&len,4,1,fp) == 0) return -1;
2836 }
2837 return 0;
2838}
2839
e3566d4b 2840/* String objects in the form "2391" "-100" without any space and with a
2841 * range of values that can fit in an 8, 16 or 32 bit signed value can be
2842 * encoded as integers to save space */
56906eef 2843static int rdbTryIntegerEncoding(sds s, unsigned char *enc) {
e3566d4b 2844 long long value;
2845 char *endptr, buf[32];
2846
2847 /* Check if it's possible to encode this value as a number */
2848 value = strtoll(s, &endptr, 10);
2849 if (endptr[0] != '\0') return 0;
2850 snprintf(buf,32,"%lld",value);
2851
2852 /* If the number converted back into a string is not identical
2853 * then it's not possible to encode the string as integer */
2854 if (strlen(buf) != sdslen(s) || memcmp(buf,s,sdslen(s))) return 0;
2855
2856 /* Finally check if it fits in our ranges */
2857 if (value >= -(1<<7) && value <= (1<<7)-1) {
2858 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
2859 enc[1] = value&0xFF;
2860 return 2;
2861 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
2862 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
2863 enc[1] = value&0xFF;
2864 enc[2] = (value>>8)&0xFF;
2865 return 3;
2866 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
2867 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
2868 enc[1] = value&0xFF;
2869 enc[2] = (value>>8)&0xFF;
2870 enc[3] = (value>>16)&0xFF;
2871 enc[4] = (value>>24)&0xFF;
2872 return 5;
2873 } else {
2874 return 0;
2875 }
2876}
2877
774e3047 2878static int rdbSaveLzfStringObject(FILE *fp, robj *obj) {
2879 unsigned int comprlen, outlen;
2880 unsigned char byte;
2881 void *out;
2882
2883 /* We require at least four bytes compression for this to be worth it */
2884 outlen = sdslen(obj->ptr)-4;
2885 if (outlen <= 0) return 0;
3a2694c4 2886 if ((out = zmalloc(outlen+1)) == NULL) return 0;
774e3047 2887 comprlen = lzf_compress(obj->ptr, sdslen(obj->ptr), out, outlen);
2888 if (comprlen == 0) {
88e85998 2889 zfree(out);
774e3047 2890 return 0;
2891 }
2892 /* Data compressed! Let's save it on disk */
2893 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
2894 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
2895 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
2896 if (rdbSaveLen(fp,sdslen(obj->ptr)) == -1) goto writeerr;
2897 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 2898 zfree(out);
774e3047 2899 return comprlen;
2900
2901writeerr:
88e85998 2902 zfree(out);
774e3047 2903 return -1;
2904}
2905
e3566d4b 2906/* Save a string objet as [len][data] on disk. If the object is a string
2907 * representation of an integer value we try to safe it in a special form */
942a3961 2908static int rdbSaveStringObjectRaw(FILE *fp, robj *obj) {
2909 size_t len;
e3566d4b 2910 int enclen;
10c43610 2911
942a3961 2912 len = sdslen(obj->ptr);
2913
774e3047 2914 /* Try integer encoding */
e3566d4b 2915 if (len <= 11) {
2916 unsigned char buf[5];
2917 if ((enclen = rdbTryIntegerEncoding(obj->ptr,buf)) > 0) {
2918 if (fwrite(buf,enclen,1,fp) == 0) return -1;
2919 return 0;
2920 }
2921 }
774e3047 2922
2923 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 2924 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 2925 if (server.rdbcompression && len > 20) {
774e3047 2926 int retval;
2927
2928 retval = rdbSaveLzfStringObject(fp,obj);
2929 if (retval == -1) return -1;
2930 if (retval > 0) return 0;
2931 /* retval == 0 means data can't be compressed, save the old way */
2932 }
2933
2934 /* Store verbatim */
10c43610 2935 if (rdbSaveLen(fp,len) == -1) return -1;
2936 if (len && fwrite(obj->ptr,len,1,fp) == 0) return -1;
2937 return 0;
2938}
2939
942a3961 2940/* Like rdbSaveStringObjectRaw() but handle encoded objects */
2941static int rdbSaveStringObject(FILE *fp, robj *obj) {
2942 int retval;
942a3961 2943
f2d9f50f 2944 /* Avoid incr/decr ref count business when possible.
2945 * This plays well with copy-on-write given that we are probably
2946 * in a child process (BGSAVE). Also this makes sure key objects
2947 * of swapped objects are not incRefCount-ed (an assert does not allow
2948 * this in order to avoid bugs) */
2949 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 2950 obj = getDecodedObject(obj);
2951 retval = rdbSaveStringObjectRaw(fp,obj);
2952 decrRefCount(obj);
2953 } else {
996cb5f7 2954 retval = rdbSaveStringObjectRaw(fp,obj);
2955 }
9d65a1bb 2956 return retval;
942a3961 2957}
2958
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions (in which case no string follows):
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1; /* the special values take just the prefix byte */

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        char *text = (char*)buf+1;

        snprintf(text,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(text);
        len += buf[0];
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
2985
06233c45 2986/* Save a Redis object. */
2987static int rdbSaveObject(FILE *fp, robj *o) {
2988 if (o->type == REDIS_STRING) {
2989 /* Save a string value */
2990 if (rdbSaveStringObject(fp,o) == -1) return -1;
2991 } else if (o->type == REDIS_LIST) {
2992 /* Save a list value */
2993 list *list = o->ptr;
c7df85a4 2994 listIter li;
06233c45 2995 listNode *ln;
2996
06233c45 2997 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 2998 listRewind(list,&li);
2999 while((ln = listNext(&li))) {
06233c45 3000 robj *eleobj = listNodeValue(ln);
3001
3002 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3003 }
3004 } else if (o->type == REDIS_SET) {
3005 /* Save a set value */
3006 dict *set = o->ptr;
3007 dictIterator *di = dictGetIterator(set);
3008 dictEntry *de;
3009
3010 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3011 while((de = dictNext(di)) != NULL) {
3012 robj *eleobj = dictGetEntryKey(de);
3013
3014 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3015 }
3016 dictReleaseIterator(di);
3017 } else if (o->type == REDIS_ZSET) {
3018 /* Save a set value */
3019 zset *zs = o->ptr;
3020 dictIterator *di = dictGetIterator(zs->dict);
3021 dictEntry *de;
3022
3023 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3024 while((de = dictNext(di)) != NULL) {
3025 robj *eleobj = dictGetEntryKey(de);
3026 double *score = dictGetEntryVal(de);
3027
3028 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3029 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3030 }
3031 dictReleaseIterator(di);
3032 } else {
3033 redisAssert(0 != 0);
3034 }
3035 return 0;
3036}
3037
3038/* Return the length the object will have on disk if saved with
3039 * the rdbSaveObject() function. Currently we use a trick to get
3040 * this length with very little changes to the code. In the future
3041 * we could switch to a faster solution. */
b9bc0eef 3042static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3043 if (fp == NULL) fp = server.devnull;
06233c45 3044 rewind(fp);
3045 assert(rdbSaveObject(fp,o) != 1);
3046 return ftello(fp);
3047}
3048
06224fec 3049/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3050static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3051 off_t bytes = rdbSavedObjectLen(o,fp);
06224fec 3052
3053 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3054}
3055
ed9b544e 3056/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3057static int rdbSave(char *filename) {
ed9b544e 3058 dictIterator *di = NULL;
3059 dictEntry *de;
ed9b544e 3060 FILE *fp;
3061 char tmpfile[256];
3062 int j;
bb32ede5 3063 time_t now = time(NULL);
ed9b544e 3064
2316bb3b 3065 /* Wait for I/O therads to terminate, just in case this is a
3066 * foreground-saving, to avoid seeking the swap file descriptor at the
3067 * same time. */
3068 if (server.vm_enabled)
3069 waitEmptyIOJobsQueue();
3070
a3b21203 3071 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3072 fp = fopen(tmpfile,"w");
3073 if (!fp) {
3074 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3075 return REDIS_ERR;
3076 }
f78fd11b 3077 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3078 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3079 redisDb *db = server.db+j;
3080 dict *d = db->dict;
3305306f 3081 if (dictSize(d) == 0) continue;
ed9b544e 3082 di = dictGetIterator(d);
3083 if (!di) {
3084 fclose(fp);
3085 return REDIS_ERR;
3086 }
3087
3088 /* Write the SELECT DB opcode */
f78fd11b 3089 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3090 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3091
3092 /* Iterate this DB writing every entry */
3093 while((de = dictNext(di)) != NULL) {
3094 robj *key = dictGetEntryKey(de);
3095 robj *o = dictGetEntryVal(de);
bb32ede5 3096 time_t expiretime = getExpire(db,key);
3097
3098 /* Save the expire time */
3099 if (expiretime != -1) {
3100 /* If this key is already expired skip it */
3101 if (expiretime < now) continue;
3102 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3103 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3104 }
7e69548d 3105 /* Save the key and associated value. This requires special
3106 * handling if the value is swapped out. */
996cb5f7 3107 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3108 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3109 /* Save type, key, value */
3110 if (rdbSaveType(fp,o->type) == -1) goto werr;
3111 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3112 if (rdbSaveObject(fp,o) == -1) goto werr;
3113 } else {
996cb5f7 3114 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3115 robj *po;
7e69548d 3116 /* Get a preview of the object in memory */
3117 po = vmPreviewObject(key);
7e69548d 3118 /* Save type, key, value */
3119 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3120 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3121 if (rdbSaveObject(fp,po) == -1) goto werr;
3122 /* Remove the loaded object from memory */
3123 decrRefCount(po);
7e69548d 3124 }
ed9b544e 3125 }
3126 dictReleaseIterator(di);
3127 }
3128 /* EOF opcode */
f78fd11b 3129 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3130
3131 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3132 fflush(fp);
3133 fsync(fileno(fp));
3134 fclose(fp);
3135
3136 /* Use RENAME to make sure the DB file is changed atomically only
3137 * if the generate DB file is ok. */
3138 if (rename(tmpfile,filename) == -1) {
325d1eb4 3139 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3140 unlink(tmpfile);
3141 return REDIS_ERR;
3142 }
3143 redisLog(REDIS_NOTICE,"DB saved on disk");
3144 server.dirty = 0;
3145 server.lastsave = time(NULL);
3146 return REDIS_OK;
3147
3148werr:
3149 fclose(fp);
3150 unlink(tmpfile);
3151 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3152 if (di) dictReleaseIterator(di);
3153 return REDIS_ERR;
3154}
3155
f78fd11b 3156static int rdbSaveBackground(char *filename) {
ed9b544e 3157 pid_t childpid;
3158
9d65a1bb 3159 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3160 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3161 if ((childpid = fork()) == 0) {
3162 /* Child */
054e426d 3163 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3164 close(server.fd);
f78fd11b 3165 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3166 _exit(0);
ed9b544e 3167 } else {
478c2c6f 3168 _exit(1);
ed9b544e 3169 }
3170 } else {
3171 /* Parent */
5a7c647e 3172 if (childpid == -1) {
3173 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3174 strerror(errno));
3175 return REDIS_ERR;
3176 }
ed9b544e 3177 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3178 server.bgsavechildpid = childpid;
ed9b544e 3179 return REDIS_OK;
3180 }
3181 return REDIS_OK; /* unreached */
3182}
3183
/* Remove the temp file "temp-<pid>.rdb" (relative to the current working
 * directory) for the given pid. Errors from unlink() are ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3190
/* Read a one byte type from 'fp'. Returns the byte value (0-255), or -1 if
 * nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    if (fread(&type,1,1,fp) != 1) return -1;
    return type;
}
3196
/* Read a time saved as a 32 bit signed integer (4 raw bytes) from 'fp'.
 * Returns the time, or -1 if the four bytes could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t t32;

    if (fread(&t32,4,1,fp) != 1) return -1;
    return (time_t) t32;
}
3202
e3566d4b 3203/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3204 * of this file for a description of how this are stored on disk.
3205 *
3206 * isencoded is set to 1 if the readed length is not actually a length but
3207 * an "encoding type", check the above comments for more info */
c78a8ccc 3208static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3209 unsigned char buf[2];
3210 uint32_t len;
c78a8ccc 3211 int type;
f78fd11b 3212
e3566d4b 3213 if (isencoded) *isencoded = 0;
c78a8ccc 3214 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3215 type = (buf[0]&0xC0)>>6;
3216 if (type == REDIS_RDB_6BITLEN) {
3217 /* Read a 6 bit len */
3218 return buf[0]&0x3F;
3219 } else if (type == REDIS_RDB_ENCVAL) {
3220 /* Read a 6 bit len encoding type */
3221 if (isencoded) *isencoded = 1;
3222 return buf[0]&0x3F;
3223 } else if (type == REDIS_RDB_14BITLEN) {
3224 /* Read a 14 bit len */
3225 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3226 return ((buf[0]&0x3F)<<8)|buf[1];
3227 } else {
3228 /* Read a 32 bit len */
f78fd11b 3229 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3230 return ntohl(len);
f78fd11b 3231 }
f78fd11b 3232}
3233
e3566d4b 3234static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3235 unsigned char enc[4];
3236 long long val;
3237
3238 if (enctype == REDIS_RDB_ENC_INT8) {
3239 if (fread(enc,1,1,fp) == 0) return NULL;
3240 val = (signed char)enc[0];
3241 } else if (enctype == REDIS_RDB_ENC_INT16) {
3242 uint16_t v;
3243 if (fread(enc,2,1,fp) == 0) return NULL;
3244 v = enc[0]|(enc[1]<<8);
3245 val = (int16_t)v;
3246 } else if (enctype == REDIS_RDB_ENC_INT32) {
3247 uint32_t v;
3248 if (fread(enc,4,1,fp) == 0) return NULL;
3249 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3250 val = (int32_t)v;
3251 } else {
3252 val = 0; /* anti-warning */
dfc5e96c 3253 redisAssert(0!=0);
e3566d4b 3254 }
3255 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3256}
3257
c78a8ccc 3258static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3259 unsigned int len, clen;
3260 unsigned char *c = NULL;
3261 sds val = NULL;
3262
c78a8ccc 3263 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3264 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3265 if ((c = zmalloc(clen)) == NULL) goto err;
3266 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3267 if (fread(c,clen,1,fp) == 0) goto err;
3268 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3269 zfree(c);
88e85998 3270 return createObject(REDIS_STRING,val);
3271err:
3272 zfree(c);
3273 sdsfree(val);
3274 return NULL;
3275}
3276
c78a8ccc 3277static robj *rdbLoadStringObject(FILE*fp) {
e3566d4b 3278 int isencoded;
3279 uint32_t len;
f78fd11b 3280 sds val;
3281
c78a8ccc 3282 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3283 if (isencoded) {
3284 switch(len) {
3285 case REDIS_RDB_ENC_INT8:
3286 case REDIS_RDB_ENC_INT16:
3287 case REDIS_RDB_ENC_INT32:
3305306f 3288 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
88e85998 3289 case REDIS_RDB_ENC_LZF:
c78a8ccc 3290 return tryObjectSharing(rdbLoadLzfStringObject(fp));
e3566d4b 3291 default:
dfc5e96c 3292 redisAssert(0!=0);
e3566d4b 3293 }
3294 }
3295
f78fd11b 3296 if (len == REDIS_RDB_LENERR) return NULL;
3297 val = sdsnewlen(NULL,len);
3298 if (len && fread(val,len,1,fp) == 0) {
3299 sdsfree(val);
3300 return NULL;
3301 }
10c43610 3302 return tryObjectSharing(createObject(REDIS_STRING,val));
f78fd11b 3303}
3304
a7866db6 3305/* For information about double serialization check rdbSaveDoubleValue() */
3306static int rdbLoadDoubleValue(FILE *fp, double *val) {
3307 char buf[128];
3308 unsigned char len;
3309
3310 if (fread(&len,1,1,fp) == 0) return -1;
3311 switch(len) {
3312 case 255: *val = R_NegInf; return 0;
3313 case 254: *val = R_PosInf; return 0;
3314 case 253: *val = R_Nan; return 0;
3315 default:
3316 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3317 buf[len] = '\0';
a7866db6 3318 sscanf(buf, "%lg", val);
3319 return 0;
3320 }
3321}
3322
c78a8ccc 3323/* Load a Redis object of the specified type from the specified file.
3324 * On success a newly allocated object is returned, otherwise NULL. */
3325static robj *rdbLoadObject(int type, FILE *fp) {
3326 robj *o;
3327
3328 if (type == REDIS_STRING) {
3329 /* Read string value */
3330 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3331 tryObjectEncoding(o);
3332 } else if (type == REDIS_LIST || type == REDIS_SET) {
3333 /* Read list/set value */
3334 uint32_t listlen;
3335
3336 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3337 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 3338 /* It's faster to expand the dict to the right size asap in order
3339 * to avoid rehashing */
3340 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3341 dictExpand(o->ptr,listlen);
c78a8ccc 3342 /* Load every single element of the list/set */
3343 while(listlen--) {
3344 robj *ele;
3345
3346 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3347 tryObjectEncoding(ele);
3348 if (type == REDIS_LIST) {
3349 listAddNodeTail((list*)o->ptr,ele);
3350 } else {
3351 dictAdd((dict*)o->ptr,ele,NULL);
3352 }
3353 }
3354 } else if (type == REDIS_ZSET) {
3355 /* Read list/set value */
3356 uint32_t zsetlen;
3357 zset *zs;
3358
3359 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3360 o = createZsetObject();
3361 zs = o->ptr;
3362 /* Load every single element of the list/set */
3363 while(zsetlen--) {
3364 robj *ele;
3365 double *score = zmalloc(sizeof(double));
3366
3367 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3368 tryObjectEncoding(ele);
3369 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3370 dictAdd(zs->dict,ele,score);
3371 zslInsert(zs->zsl,*score,ele);
3372 incrRefCount(ele); /* added to skiplist */
3373 }
3374 } else {
3375 redisAssert(0 != 0);
3376 }
3377 return o;
3378}
3379
f78fd11b 3380static int rdbLoad(char *filename) {
ed9b544e 3381 FILE *fp;
f78fd11b 3382 robj *keyobj = NULL;
3383 uint32_t dbid;
bb32ede5 3384 int type, retval, rdbver;
3305306f 3385 dict *d = server.db[0].dict;
bb32ede5 3386 redisDb *db = server.db+0;
f78fd11b 3387 char buf[1024];
bb32ede5 3388 time_t expiretime = -1, now = time(NULL);
b492cf00 3389 long long loadedkeys = 0;
bb32ede5 3390
ed9b544e 3391 fp = fopen(filename,"r");
3392 if (!fp) return REDIS_ERR;
3393 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 3394 buf[9] = '\0';
3395 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 3396 fclose(fp);
3397 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3398 return REDIS_ERR;
3399 }
f78fd11b 3400 rdbver = atoi(buf+5);
c78a8ccc 3401 if (rdbver != 1) {
f78fd11b 3402 fclose(fp);
3403 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3404 return REDIS_ERR;
3405 }
ed9b544e 3406 while(1) {
3407 robj *o;
3408
3409 /* Read type. */
f78fd11b 3410 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 3411 if (type == REDIS_EXPIRETIME) {
3412 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3413 /* We read the time so we need to read the object type again */
3414 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3415 }
ed9b544e 3416 if (type == REDIS_EOF) break;
3417 /* Handle SELECT DB opcode as a special case */
3418 if (type == REDIS_SELECTDB) {
c78a8ccc 3419 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 3420 goto eoferr;
ed9b544e 3421 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 3422 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 3423 exit(1);
3424 }
bb32ede5 3425 db = server.db+dbid;
3426 d = db->dict;
ed9b544e 3427 continue;
3428 }
3429 /* Read key */
c78a8ccc 3430 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3431 /* Read value */
3432 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
ed9b544e 3433 /* Add the new object in the hash table */
f78fd11b 3434 retval = dictAdd(d,keyobj,o);
ed9b544e 3435 if (retval == DICT_ERR) {
f78fd11b 3436 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
ed9b544e 3437 exit(1);
3438 }
bb32ede5 3439 /* Set the expire time if needed */
3440 if (expiretime != -1) {
3441 setExpire(db,keyobj,expiretime);
3442 /* Delete this key if already expired */
3443 if (expiretime < now) deleteKey(db,keyobj);
3444 expiretime = -1;
3445 }
f78fd11b 3446 keyobj = o = NULL;
b492cf00 3447 /* Handle swapping while loading big datasets when VM is on */
3448 loadedkeys++;
3449 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3450 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 3451 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 3452 }
3453 }
ed9b544e 3454 }
3455 fclose(fp);
3456 return REDIS_OK;
3457
3458eoferr: /* unexpected end of file is handled here with a fatal exit */
e3566d4b 3459 if (keyobj) decrRefCount(keyobj);
f80dff62 3460 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 3461 exit(1);
3462 return REDIS_ERR; /* Just to avoid warning */
3463}
3464
3465/*================================== Commands =============================== */
3466
abcb223e 3467static void authCommand(redisClient *c) {
2e77c2ee 3468 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
3469 c->authenticated = 1;
3470 addReply(c,shared.ok);
3471 } else {
3472 c->authenticated = 0;
fa4c0aba 3473 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
3474 }
3475}
3476
ed9b544e 3477static void pingCommand(redisClient *c) {
3478 addReply(c,shared.pong);
3479}
3480
3481static void echoCommand(redisClient *c) {
942a3961 3482 addReplyBulkLen(c,c->argv[1]);
ed9b544e 3483 addReply(c,c->argv[1]);
3484 addReply(c,shared.crlf);
3485}
3486
3487/*=================================== Strings =============================== */
3488
3489static void setGenericCommand(redisClient *c, int nx) {
3490 int retval;
3491
333fd216 3492 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3305306f 3493 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3494 if (retval == DICT_ERR) {
3495 if (!nx) {
1b03836c 3496 /* If the key is about a swapped value, we want a new key object
3497 * to overwrite the old. So we delete the old key in the database.
3498 * This will also make sure that swap pages about the old object
3499 * will be marked as free. */
3500 if (deleteIfSwapped(c->db,c->argv[1]))
3501 incrRefCount(c->argv[1]);
3305306f 3502 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3503 incrRefCount(c->argv[2]);
3504 } else {
c937aa89 3505 addReply(c,shared.czero);
ed9b544e 3506 return;
3507 }
3508 } else {
3509 incrRefCount(c->argv[1]);
3510 incrRefCount(c->argv[2]);
3511 }
3512 server.dirty++;
3305306f 3513 removeExpire(c->db,c->argv[1]);
c937aa89 3514 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 3515}
3516
3517static void setCommand(redisClient *c) {
a4d1ba9a 3518 setGenericCommand(c,0);
ed9b544e 3519}
3520
3521static void setnxCommand(redisClient *c) {
a4d1ba9a 3522 setGenericCommand(c,1);
ed9b544e 3523}
3524
322fc7d8 3525static int getGenericCommand(redisClient *c) {
3305306f 3526 robj *o = lookupKeyRead(c->db,c->argv[1]);
3527
3528 if (o == NULL) {
c937aa89 3529 addReply(c,shared.nullbulk);
322fc7d8 3530 return REDIS_OK;
ed9b544e 3531 } else {
ed9b544e 3532 if (o->type != REDIS_STRING) {
c937aa89 3533 addReply(c,shared.wrongtypeerr);
322fc7d8 3534 return REDIS_ERR;
ed9b544e 3535 } else {
942a3961 3536 addReplyBulkLen(c,o);
ed9b544e 3537 addReply(c,o);
3538 addReply(c,shared.crlf);
322fc7d8 3539 return REDIS_OK;
ed9b544e 3540 }
3541 }
3542}
3543
322fc7d8 3544static void getCommand(redisClient *c) {
3545 getGenericCommand(c);
3546}
3547
f6b141c5 3548static void getsetCommand(redisClient *c) {
322fc7d8 3549 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 3550 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3551 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3552 } else {
3553 incrRefCount(c->argv[1]);
3554 }
3555 incrRefCount(c->argv[2]);
3556 server.dirty++;
3557 removeExpire(c->db,c->argv[1]);
3558}
3559
70003d28 3560static void mgetCommand(redisClient *c) {
70003d28 3561 int j;
3562
c937aa89 3563 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 3564 for (j = 1; j < c->argc; j++) {
3305306f 3565 robj *o = lookupKeyRead(c->db,c->argv[j]);
3566 if (o == NULL) {
c937aa89 3567 addReply(c,shared.nullbulk);
70003d28 3568 } else {
70003d28 3569 if (o->type != REDIS_STRING) {
c937aa89 3570 addReply(c,shared.nullbulk);
70003d28 3571 } else {
942a3961 3572 addReplyBulkLen(c,o);
70003d28 3573 addReply(c,o);
3574 addReply(c,shared.crlf);
3575 }
3576 }
3577 }
3578}
3579
6c446631 3580static void msetGenericCommand(redisClient *c, int nx) {
906573e7 3581 int j, busykeys = 0;
6c446631 3582
3583 if ((c->argc % 2) == 0) {
454d4e43 3584 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 3585 return;
3586 }
3587 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3588 * set nothing at all if at least one already key exists. */
3589 if (nx) {
3590 for (j = 1; j < c->argc; j += 2) {
906573e7 3591 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3592 busykeys++;
6c446631 3593 }
3594 }
3595 }
906573e7 3596 if (busykeys) {
3597 addReply(c, shared.czero);
3598 return;
3599 }
6c446631 3600
3601 for (j = 1; j < c->argc; j += 2) {
3602 int retval;
3603
17511391 3604 tryObjectEncoding(c->argv[j+1]);
6c446631 3605 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3606 if (retval == DICT_ERR) {
3607 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3608 incrRefCount(c->argv[j+1]);
3609 } else {
3610 incrRefCount(c->argv[j]);
3611 incrRefCount(c->argv[j+1]);
3612 }
3613 removeExpire(c->db,c->argv[j]);
3614 }
3615 server.dirty += (c->argc-1)/2;
3616 addReply(c, nx ? shared.cone : shared.ok);
3617}
3618
3619static void msetCommand(redisClient *c) {
3620 msetGenericCommand(c,0);
3621}
3622
3623static void msetnxCommand(redisClient *c) {
3624 msetGenericCommand(c,1);
3625}
3626
d68ed120 3627static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 3628 long long value;
3629 int retval;
3630 robj *o;
3631
3305306f 3632 o = lookupKeyWrite(c->db,c->argv[1]);
3633 if (o == NULL) {
ed9b544e 3634 value = 0;
3635 } else {
ed9b544e 3636 if (o->type != REDIS_STRING) {
3637 value = 0;
3638 } else {
3639 char *eptr;
3640
942a3961 3641 if (o->encoding == REDIS_ENCODING_RAW)
3642 value = strtoll(o->ptr, &eptr, 10);
3643 else if (o->encoding == REDIS_ENCODING_INT)
3644 value = (long)o->ptr;
3645 else
dfc5e96c 3646 redisAssert(1 != 1);
ed9b544e 3647 }
3648 }
3649
3650 value += incr;
3651 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
942a3961 3652 tryObjectEncoding(o);
3305306f 3653 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 3654 if (retval == DICT_ERR) {
3305306f 3655 dictReplace(c->db->dict,c->argv[1],o);
3656 removeExpire(c->db,c->argv[1]);
ed9b544e 3657 } else {
3658 incrRefCount(c->argv[1]);
3659 }
3660 server.dirty++;
c937aa89 3661 addReply(c,shared.colon);
ed9b544e 3662 addReply(c,o);
3663 addReply(c,shared.crlf);
3664}
3665
3666static void incrCommand(redisClient *c) {
a4d1ba9a 3667 incrDecrCommand(c,1);
ed9b544e 3668}
3669
3670static void decrCommand(redisClient *c) {
a4d1ba9a 3671 incrDecrCommand(c,-1);
ed9b544e 3672}
3673
3674static void incrbyCommand(redisClient *c) {
d68ed120 3675 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3676 incrDecrCommand(c,incr);
ed9b544e 3677}
3678
3679static void decrbyCommand(redisClient *c) {
d68ed120 3680 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3681 incrDecrCommand(c,-incr);
ed9b544e 3682}
3683
3684/* ========================= Type agnostic commands ========================= */
3685
3686static void delCommand(redisClient *c) {
5109cdff 3687 int deleted = 0, j;
3688
3689 for (j = 1; j < c->argc; j++) {
3690 if (deleteKey(c->db,c->argv[j])) {
3691 server.dirty++;
3692 deleted++;
3693 }
3694 }
3695 switch(deleted) {
3696 case 0:
c937aa89 3697 addReply(c,shared.czero);
5109cdff 3698 break;
3699 case 1:
3700 addReply(c,shared.cone);
3701 break;
3702 default:
3703 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",deleted));
3704 break;
ed9b544e 3705 }
3706}
3707
3708static void existsCommand(redisClient *c) {
3305306f 3709 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
ed9b544e 3710}
3711
3712static void selectCommand(redisClient *c) {
3713 int id = atoi(c->argv[1]->ptr);
3714
3715 if (selectDb(c,id) == REDIS_ERR) {
774e3047 3716 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 3717 } else {
3718 addReply(c,shared.ok);
3719 }
3720}
3721
3722static void randomkeyCommand(redisClient *c) {
3723 dictEntry *de;
3305306f 3724
3725 while(1) {
3726 de = dictGetRandomKey(c->db->dict);
ce7bef07 3727 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 3728 }
ed9b544e 3729 if (de == NULL) {
ce7bef07 3730 addReply(c,shared.plus);
ed9b544e 3731 addReply(c,shared.crlf);
3732 } else {
c937aa89 3733 addReply(c,shared.plus);
ed9b544e 3734 addReply(c,dictGetEntryKey(de));
3735 addReply(c,shared.crlf);
3736 }
3737}
3738
3739static void keysCommand(redisClient *c) {
3740 dictIterator *di;
3741 dictEntry *de;
3742 sds pattern = c->argv[1]->ptr;
3743 int plen = sdslen(pattern);
682ac724 3744 unsigned long numkeys = 0, keyslen = 0;
ed9b544e 3745 robj *lenobj = createObject(REDIS_STRING,NULL);
3746
3305306f 3747 di = dictGetIterator(c->db->dict);
ed9b544e 3748 addReply(c,lenobj);
3749 decrRefCount(lenobj);
3750 while((de = dictNext(di)) != NULL) {
3751 robj *keyobj = dictGetEntryKey(de);
3305306f 3752
ed9b544e 3753 sds key = keyobj->ptr;
3754 if ((pattern[0] == '*' && pattern[1] == '\0') ||
3755 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 3756 if (expireIfNeeded(c->db,keyobj) == 0) {
3757 if (numkeys != 0)
3758 addReply(c,shared.space);
3759 addReply(c,keyobj);
3760 numkeys++;
3761 keyslen += sdslen(key);
3762 }
ed9b544e 3763 }
3764 }
3765 dictReleaseIterator(di);
c937aa89 3766 lenobj->ptr = sdscatprintf(sdsempty(),"$%lu\r\n",keyslen+(numkeys ? (numkeys-1) : 0));
ed9b544e 3767 addReply(c,shared.crlf);
3768}
3769
3770static void dbsizeCommand(redisClient *c) {
3771 addReplySds(c,
3305306f 3772 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 3773}
3774
3775static void lastsaveCommand(redisClient *c) {
3776 addReplySds(c,
c937aa89 3777 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 3778}
3779
3780static void typeCommand(redisClient *c) {
3305306f 3781 robj *o;
ed9b544e 3782 char *type;
3305306f 3783
3784 o = lookupKeyRead(c->db,c->argv[1]);
3785 if (o == NULL) {
c937aa89 3786 type = "+none";
ed9b544e 3787 } else {
ed9b544e 3788 switch(o->type) {
c937aa89 3789 case REDIS_STRING: type = "+string"; break;
3790 case REDIS_LIST: type = "+list"; break;
3791 case REDIS_SET: type = "+set"; break;
412a8bce 3792 case REDIS_ZSET: type = "+zset"; break;
ed9b544e 3793 default: type = "unknown"; break;
3794 }
3795 }
3796 addReplySds(c,sdsnew(type));
3797 addReply(c,shared.crlf);
3798}
3799
3800static void saveCommand(redisClient *c) {
9d65a1bb 3801 if (server.bgsavechildpid != -1) {
05557f6d 3802 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
3803 return;
3804 }
f78fd11b 3805 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 3806 addReply(c,shared.ok);
3807 } else {
3808 addReply(c,shared.err);
3809 }
3810}
3811
3812static void bgsaveCommand(redisClient *c) {
9d65a1bb 3813 if (server.bgsavechildpid != -1) {
ed9b544e 3814 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
3815 return;
3816 }
f78fd11b 3817 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 3818 char *status = "+Background saving started\r\n";
3819 addReplySds(c,sdsnew(status));
ed9b544e 3820 } else {
3821 addReply(c,shared.err);
3822 }
3823}
3824
3825static void shutdownCommand(redisClient *c) {
3826 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
a3b21203 3827 /* Kill the saving child if there is a background saving in progress.
3828 We want to avoid race conditions, for instance our saving child may
3829 overwrite the synchronous saving did by SHUTDOWN. */
9d65a1bb 3830 if (server.bgsavechildpid != -1) {
9f3c422c 3831 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
3832 kill(server.bgsavechildpid,SIGKILL);
a3b21203 3833 rdbRemoveTempFile(server.bgsavechildpid);
9f3c422c 3834 }
ac945e2d 3835 if (server.appendonly) {
3836 /* Append only file: fsync() the AOF and exit */
3837 fsync(server.appendfd);
054e426d 3838 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 3839 exit(0);
ed9b544e 3840 } else {
ac945e2d 3841 /* Snapshotting. Perform a SYNC SAVE and exit */
3842 if (rdbSave(server.dbfilename) == REDIS_OK) {
3843 if (server.daemonize)
3844 unlink(server.pidfile);
3845 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
3846 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
054e426d 3847 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 3848 exit(0);
3849 } else {
3850 /* Ooops.. error saving! The best we can do is to continue operating.
3851 * Note that if there was a background saving process, in the next
3852 * cron() Redis will be notified that the background saving aborted,
3853 * handling special stuff like slaves pending for synchronization... */
3854 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
3855 addReplySds(c,sdsnew("-ERR can't quit, problems saving the DB\r\n"));
3856 }
ed9b544e 3857 }
3858}
3859
3860static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 3861 robj *o;
3862
3863 /* To use the same key as src and dst is probably an error */
3864 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 3865 addReply(c,shared.sameobjecterr);
ed9b544e 3866 return;
3867 }
3868
3305306f 3869 o = lookupKeyWrite(c->db,c->argv[1]);
3870 if (o == NULL) {
c937aa89 3871 addReply(c,shared.nokeyerr);
ed9b544e 3872 return;
3873 }
ed9b544e 3874 incrRefCount(o);
3305306f 3875 deleteIfVolatile(c->db,c->argv[2]);
3876 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 3877 if (nx) {
3878 decrRefCount(o);
c937aa89 3879 addReply(c,shared.czero);
ed9b544e 3880 return;
3881 }
3305306f 3882 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 3883 } else {
3884 incrRefCount(c->argv[2]);
3885 }
3305306f 3886 deleteKey(c->db,c->argv[1]);
ed9b544e 3887 server.dirty++;
c937aa89 3888 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 3889}
3890
3891static void renameCommand(redisClient *c) {
3892 renameGenericCommand(c,0);
3893}
3894
3895static void renamenxCommand(redisClient *c) {
3896 renameGenericCommand(c,1);
3897}
3898
3899static void moveCommand(redisClient *c) {
3305306f 3900 robj *o;
3901 redisDb *src, *dst;
ed9b544e 3902 int srcid;
3903
3904 /* Obtain source and target DB pointers */
3305306f 3905 src = c->db;
3906 srcid = c->db->id;
ed9b544e 3907 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 3908 addReply(c,shared.outofrangeerr);
ed9b544e 3909 return;
3910 }
3305306f 3911 dst = c->db;
3912 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 3913
3914 /* If the user is moving using as target the same
3915 * DB as the source DB it is probably an error. */
3916 if (src == dst) {
c937aa89 3917 addReply(c,shared.sameobjecterr);
ed9b544e 3918 return;
3919 }
3920
3921 /* Check if the element exists and get a reference */
3305306f 3922 o = lookupKeyWrite(c->db,c->argv[1]);
3923 if (!o) {
c937aa89 3924 addReply(c,shared.czero);
ed9b544e 3925 return;
3926 }
3927
3928 /* Try to add the element to the target DB */
3305306f 3929 deleteIfVolatile(dst,c->argv[1]);
3930 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 3931 addReply(c,shared.czero);
ed9b544e 3932 return;
3933 }
3305306f 3934 incrRefCount(c->argv[1]);
ed9b544e 3935 incrRefCount(o);
3936
3937 /* OK! key moved, free the entry in the source DB */
3305306f 3938 deleteKey(src,c->argv[1]);
ed9b544e 3939 server.dirty++;
c937aa89 3940 addReply(c,shared.cone);
ed9b544e 3941}
3942
3943/* =================================== Lists ================================ */
3944static void pushGenericCommand(redisClient *c, int where) {
3945 robj *lobj;
ed9b544e 3946 list *list;
3305306f 3947
3948 lobj = lookupKeyWrite(c->db,c->argv[1]);
3949 if (lobj == NULL) {
95242ab5 3950 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
3951 addReply(c,shared.ok);
3952 return;
3953 }
ed9b544e 3954 lobj = createListObject();
3955 list = lobj->ptr;
3956 if (where == REDIS_HEAD) {
6b47e12e 3957 listAddNodeHead(list,c->argv[2]);
ed9b544e 3958 } else {
6b47e12e 3959 listAddNodeTail(list,c->argv[2]);
ed9b544e 3960 }
3305306f 3961 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 3962 incrRefCount(c->argv[1]);
3963 incrRefCount(c->argv[2]);
3964 } else {
ed9b544e 3965 if (lobj->type != REDIS_LIST) {
3966 addReply(c,shared.wrongtypeerr);
3967 return;
3968 }
95242ab5 3969 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
3970 addReply(c,shared.ok);
3971 return;
3972 }
ed9b544e 3973 list = lobj->ptr;
3974 if (where == REDIS_HEAD) {
6b47e12e 3975 listAddNodeHead(list,c->argv[2]);
ed9b544e 3976 } else {
6b47e12e 3977 listAddNodeTail(list,c->argv[2]);
ed9b544e 3978 }
3979 incrRefCount(c->argv[2]);
3980 }
3981 server.dirty++;
3982 addReply(c,shared.ok);
3983}
3984
3985static void lpushCommand(redisClient *c) {
3986 pushGenericCommand(c,REDIS_HEAD);
3987}
3988
3989static void rpushCommand(redisClient *c) {
3990 pushGenericCommand(c,REDIS_TAIL);
3991}
3992
3993static void llenCommand(redisClient *c) {
3305306f 3994 robj *o;
ed9b544e 3995 list *l;
3996
3305306f 3997 o = lookupKeyRead(c->db,c->argv[1]);
3998 if (o == NULL) {
c937aa89 3999 addReply(c,shared.czero);
ed9b544e 4000 return;
4001 } else {
ed9b544e 4002 if (o->type != REDIS_LIST) {
c937aa89 4003 addReply(c,shared.wrongtypeerr);
ed9b544e 4004 } else {
4005 l = o->ptr;
c937aa89 4006 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(l)));
ed9b544e 4007 }
4008 }
4009}
4010
4011static void lindexCommand(redisClient *c) {
3305306f 4012 robj *o;
ed9b544e 4013 int index = atoi(c->argv[2]->ptr);
4014
3305306f 4015 o = lookupKeyRead(c->db,c->argv[1]);
4016 if (o == NULL) {
c937aa89 4017 addReply(c,shared.nullbulk);
ed9b544e 4018 } else {
ed9b544e 4019 if (o->type != REDIS_LIST) {
c937aa89 4020 addReply(c,shared.wrongtypeerr);
ed9b544e 4021 } else {
4022 list *list = o->ptr;
4023 listNode *ln;
4024
4025 ln = listIndex(list, index);
4026 if (ln == NULL) {
c937aa89 4027 addReply(c,shared.nullbulk);
ed9b544e 4028 } else {
4029 robj *ele = listNodeValue(ln);
942a3961 4030 addReplyBulkLen(c,ele);
ed9b544e 4031 addReply(c,ele);
4032 addReply(c,shared.crlf);
4033 }
4034 }
4035 }
4036}
4037
4038static void lsetCommand(redisClient *c) {
3305306f 4039 robj *o;
ed9b544e 4040 int index = atoi(c->argv[2]->ptr);
4041
3305306f 4042 o = lookupKeyWrite(c->db,c->argv[1]);
4043 if (o == NULL) {
ed9b544e 4044 addReply(c,shared.nokeyerr);
4045 } else {
ed9b544e 4046 if (o->type != REDIS_LIST) {
4047 addReply(c,shared.wrongtypeerr);
4048 } else {
4049 list *list = o->ptr;
4050 listNode *ln;
4051
4052 ln = listIndex(list, index);
4053 if (ln == NULL) {
c937aa89 4054 addReply(c,shared.outofrangeerr);
ed9b544e 4055 } else {
4056 robj *ele = listNodeValue(ln);
4057
4058 decrRefCount(ele);
4059 listNodeValue(ln) = c->argv[3];
4060 incrRefCount(c->argv[3]);
4061 addReply(c,shared.ok);
4062 server.dirty++;
4063 }
4064 }
4065 }
4066}
4067
4068static void popGenericCommand(redisClient *c, int where) {
3305306f 4069 robj *o;
4070
4071 o = lookupKeyWrite(c->db,c->argv[1]);
4072 if (o == NULL) {
c937aa89 4073 addReply(c,shared.nullbulk);
ed9b544e 4074 } else {
ed9b544e 4075 if (o->type != REDIS_LIST) {
c937aa89 4076 addReply(c,shared.wrongtypeerr);
ed9b544e 4077 } else {
4078 list *list = o->ptr;
4079 listNode *ln;
4080
4081 if (where == REDIS_HEAD)
4082 ln = listFirst(list);
4083 else
4084 ln = listLast(list);
4085
4086 if (ln == NULL) {
c937aa89 4087 addReply(c,shared.nullbulk);
ed9b544e 4088 } else {
4089 robj *ele = listNodeValue(ln);
942a3961 4090 addReplyBulkLen(c,ele);
ed9b544e 4091 addReply(c,ele);
4092 addReply(c,shared.crlf);
4093 listDelNode(list,ln);
4094 server.dirty++;
4095 }
4096 }
4097 }
4098}
4099
4100static void lpopCommand(redisClient *c) {
4101 popGenericCommand(c,REDIS_HEAD);
4102}
4103
4104static void rpopCommand(redisClient *c) {
4105 popGenericCommand(c,REDIS_TAIL);
4106}
4107
4108static void lrangeCommand(redisClient *c) {
3305306f 4109 robj *o;
ed9b544e 4110 int start = atoi(c->argv[2]->ptr);
4111 int end = atoi(c->argv[3]->ptr);
3305306f 4112
4113 o = lookupKeyRead(c->db,c->argv[1]);
4114 if (o == NULL) {
c937aa89 4115 addReply(c,shared.nullmultibulk);
ed9b544e 4116 } else {
ed9b544e 4117 if (o->type != REDIS_LIST) {
c937aa89 4118 addReply(c,shared.wrongtypeerr);
ed9b544e 4119 } else {
4120 list *list = o->ptr;
4121 listNode *ln;
4122 int llen = listLength(list);
4123 int rangelen, j;
4124 robj *ele;
4125
4126 /* convert negative indexes */
4127 if (start < 0) start = llen+start;
4128 if (end < 0) end = llen+end;
4129 if (start < 0) start = 0;
4130 if (end < 0) end = 0;
4131
4132 /* indexes sanity checks */
4133 if (start > end || start >= llen) {
4134 /* Out of range start or start > end result in empty list */
c937aa89 4135 addReply(c,shared.emptymultibulk);
ed9b544e 4136 return;
4137 }
4138 if (end >= llen) end = llen-1;
4139 rangelen = (end-start)+1;
4140
4141 /* Return the result in form of a multi-bulk reply */
4142 ln = listIndex(list, start);
c937aa89 4143 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
ed9b544e 4144 for (j = 0; j < rangelen; j++) {
4145 ele = listNodeValue(ln);
942a3961 4146 addReplyBulkLen(c,ele);
ed9b544e 4147 addReply(c,ele);
4148 addReply(c,shared.crlf);
4149 ln = ln->next;
4150 }
4151 }
4152 }
4153}
4154
4155static void ltrimCommand(redisClient *c) {
3305306f 4156 robj *o;
ed9b544e 4157 int start = atoi(c->argv[2]->ptr);
4158 int end = atoi(c->argv[3]->ptr);
4159
3305306f 4160 o = lookupKeyWrite(c->db,c->argv[1]);
4161 if (o == NULL) {
ab9d4cb1 4162 addReply(c,shared.ok);
ed9b544e 4163 } else {
ed9b544e 4164 if (o->type != REDIS_LIST) {
4165 addReply(c,shared.wrongtypeerr);
4166 } else {
4167 list *list = o->ptr;
4168 listNode *ln;
4169 int llen = listLength(list);
4170 int j, ltrim, rtrim;
4171
4172 /* convert negative indexes */
4173 if (start < 0) start = llen+start;
4174 if (end < 0) end = llen+end;
4175 if (start < 0) start = 0;
4176 if (end < 0) end = 0;
4177
4178 /* indexes sanity checks */
4179 if (start > end || start >= llen) {
4180 /* Out of range start or start > end result in empty list */
4181 ltrim = llen;
4182 rtrim = 0;
4183 } else {
4184 if (end >= llen) end = llen-1;
4185 ltrim = start;
4186 rtrim = llen-end-1;
4187 }
4188
4189 /* Remove list elements to perform the trim */
4190 for (j = 0; j < ltrim; j++) {
4191 ln = listFirst(list);
4192 listDelNode(list,ln);
4193 }
4194 for (j = 0; j < rtrim; j++) {
4195 ln = listLast(list);
4196 listDelNode(list,ln);
4197 }
ed9b544e 4198 server.dirty++;
e59229a2 4199 addReply(c,shared.ok);
ed9b544e 4200 }
4201 }
4202}
4203
4204static void lremCommand(redisClient *c) {
3305306f 4205 robj *o;
ed9b544e 4206
3305306f 4207 o = lookupKeyWrite(c->db,c->argv[1]);
4208 if (o == NULL) {
33c08b39 4209 addReply(c,shared.czero);
ed9b544e 4210 } else {
ed9b544e 4211 if (o->type != REDIS_LIST) {
c937aa89 4212 addReply(c,shared.wrongtypeerr);
ed9b544e 4213 } else {
4214 list *list = o->ptr;
4215 listNode *ln, *next;
4216 int toremove = atoi(c->argv[2]->ptr);
4217 int removed = 0;
4218 int fromtail = 0;
4219
4220 if (toremove < 0) {
4221 toremove = -toremove;
4222 fromtail = 1;
4223 }
4224 ln = fromtail ? list->tail : list->head;
4225 while (ln) {
ed9b544e 4226 robj *ele = listNodeValue(ln);
a4d1ba9a 4227
4228 next = fromtail ? ln->prev : ln->next;
724a51b1 4229 if (compareStringObjects(ele,c->argv[3]) == 0) {
ed9b544e 4230 listDelNode(list,ln);
4231 server.dirty++;
4232 removed++;
4233 if (toremove && removed == toremove) break;
4234 }
4235 ln = next;
4236 }
c937aa89 4237 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 4238 }
4239 }
4240}
4241
12f9d551 4242/* This is the semantic of this command:
0f5f7e9a 4243 * RPOPLPUSH srclist dstlist:
12f9d551 4244 * IF LLEN(srclist) > 0
4245 * element = RPOP srclist
4246 * LPUSH dstlist element
4247 * RETURN element
4248 * ELSE
4249 * RETURN nil
4250 * END
4251 * END
4252 *
4253 * The idea is to be able to get an element from a list in a reliable way
4254 * since the element is not just returned but pushed against another list
4255 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4256 */
0f5f7e9a 4257static void rpoplpushcommand(redisClient *c) {
12f9d551 4258 robj *sobj;
4259
4260 sobj = lookupKeyWrite(c->db,c->argv[1]);
4261 if (sobj == NULL) {
4262 addReply(c,shared.nullbulk);
4263 } else {
4264 if (sobj->type != REDIS_LIST) {
4265 addReply(c,shared.wrongtypeerr);
4266 } else {
4267 list *srclist = sobj->ptr;
4268 listNode *ln = listLast(srclist);
4269
4270 if (ln == NULL) {
4271 addReply(c,shared.nullbulk);
4272 } else {
4273 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4274 robj *ele = listNodeValue(ln);
4275 list *dstlist;
4276
e20fb74f 4277 if (dobj && dobj->type != REDIS_LIST) {
12f9d551 4278 addReply(c,shared.wrongtypeerr);
4279 return;
4280 }
e20fb74f 4281
4282 /* Add the element to the target list (unless it's directly
4283 * passed to some BLPOP-ing client */
4284 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4285 if (dobj == NULL) {
4286 /* Create the list if the key does not exist */
4287 dobj = createListObject();
4288 dictAdd(c->db->dict,c->argv[2],dobj);
4289 incrRefCount(c->argv[2]);
4290 }
4291 dstlist = dobj->ptr;
4292 listAddNodeHead(dstlist,ele);
4293 incrRefCount(ele);
4294 }
12f9d551 4295
4296 /* Send the element to the client as reply as well */
4297 addReplyBulkLen(c,ele);
4298 addReply(c,ele);
4299 addReply(c,shared.crlf);
4300
4301 /* Finally remove the element from the source list */
4302 listDelNode(srclist,ln);
4303 server.dirty++;
4304 }
4305 }
4306 }
4307}
4308
4309
ed9b544e 4310/* ==================================== Sets ================================ */
4311
4312static void saddCommand(redisClient *c) {
ed9b544e 4313 robj *set;
4314
3305306f 4315 set = lookupKeyWrite(c->db,c->argv[1]);
4316 if (set == NULL) {
ed9b544e 4317 set = createSetObject();
3305306f 4318 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 4319 incrRefCount(c->argv[1]);
4320 } else {
ed9b544e 4321 if (set->type != REDIS_SET) {
c937aa89 4322 addReply(c,shared.wrongtypeerr);
ed9b544e 4323 return;
4324 }
4325 }
4326 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4327 incrRefCount(c->argv[2]);
4328 server.dirty++;
c937aa89 4329 addReply(c,shared.cone);
ed9b544e 4330 } else {
c937aa89 4331 addReply(c,shared.czero);
ed9b544e 4332 }
4333}
4334
4335static void sremCommand(redisClient *c) {
3305306f 4336 robj *set;
ed9b544e 4337
3305306f 4338 set = lookupKeyWrite(c->db,c->argv[1]);
4339 if (set == NULL) {
c937aa89 4340 addReply(c,shared.czero);
ed9b544e 4341 } else {
ed9b544e 4342 if (set->type != REDIS_SET) {
c937aa89 4343 addReply(c,shared.wrongtypeerr);
ed9b544e 4344 return;
4345 }
4346 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4347 server.dirty++;
12fea928 4348 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
c937aa89 4349 addReply(c,shared.cone);
ed9b544e 4350 } else {
c937aa89 4351 addReply(c,shared.czero);
ed9b544e 4352 }
4353 }
4354}
4355
a4460ef4 4356static void smoveCommand(redisClient *c) {
4357 robj *srcset, *dstset;
4358
4359 srcset = lookupKeyWrite(c->db,c->argv[1]);
4360 dstset = lookupKeyWrite(c->db,c->argv[2]);
4361
4362 /* If the source key does not exist return 0, if it's of the wrong type
4363 * raise an error */
4364 if (srcset == NULL || srcset->type != REDIS_SET) {
4365 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4366 return;
4367 }
4368 /* Error if the destination key is not a set as well */
4369 if (dstset && dstset->type != REDIS_SET) {
4370 addReply(c,shared.wrongtypeerr);
4371 return;
4372 }
4373 /* Remove the element from the source set */
4374 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4375 /* Key not found in the src set! return zero */
4376 addReply(c,shared.czero);
4377 return;
4378 }
4379 server.dirty++;
4380 /* Add the element to the destination set */
4381 if (!dstset) {
4382 dstset = createSetObject();
4383 dictAdd(c->db->dict,c->argv[2],dstset);
4384 incrRefCount(c->argv[2]);
4385 }
4386 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4387 incrRefCount(c->argv[3]);
4388 addReply(c,shared.cone);
4389}
4390
ed9b544e 4391static void sismemberCommand(redisClient *c) {
3305306f 4392 robj *set;
ed9b544e 4393
3305306f 4394 set = lookupKeyRead(c->db,c->argv[1]);
4395 if (set == NULL) {
c937aa89 4396 addReply(c,shared.czero);
ed9b544e 4397 } else {
ed9b544e 4398 if (set->type != REDIS_SET) {
c937aa89 4399 addReply(c,shared.wrongtypeerr);
ed9b544e 4400 return;
4401 }
4402 if (dictFind(set->ptr,c->argv[2]))
c937aa89 4403 addReply(c,shared.cone);
ed9b544e 4404 else
c937aa89 4405 addReply(c,shared.czero);
ed9b544e 4406 }
4407}
4408
4409static void scardCommand(redisClient *c) {
3305306f 4410 robj *o;
ed9b544e 4411 dict *s;
4412
3305306f 4413 o = lookupKeyRead(c->db,c->argv[1]);
4414 if (o == NULL) {
c937aa89 4415 addReply(c,shared.czero);
ed9b544e 4416 return;
4417 } else {
ed9b544e 4418 if (o->type != REDIS_SET) {
c937aa89 4419 addReply(c,shared.wrongtypeerr);
ed9b544e 4420 } else {
4421 s = o->ptr;
682ac724 4422 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
3305306f 4423 dictSize(s)));
ed9b544e 4424 }
4425 }
4426}
4427
12fea928 4428static void spopCommand(redisClient *c) {
4429 robj *set;
4430 dictEntry *de;
4431
4432 set = lookupKeyWrite(c->db,c->argv[1]);
4433 if (set == NULL) {
4434 addReply(c,shared.nullbulk);
4435 } else {
4436 if (set->type != REDIS_SET) {
4437 addReply(c,shared.wrongtypeerr);
4438 return;
4439 }
4440 de = dictGetRandomKey(set->ptr);
4441 if (de == NULL) {
4442 addReply(c,shared.nullbulk);
4443 } else {
4444 robj *ele = dictGetEntryKey(de);
4445
942a3961 4446 addReplyBulkLen(c,ele);
12fea928 4447 addReply(c,ele);
4448 addReply(c,shared.crlf);
4449 dictDelete(set->ptr,ele);
4450 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4451 server.dirty++;
4452 }
4453 }
4454}
4455
2abb95a9 4456static void srandmemberCommand(redisClient *c) {
4457 robj *set;
4458 dictEntry *de;
4459
4460 set = lookupKeyRead(c->db,c->argv[1]);
4461 if (set == NULL) {
4462 addReply(c,shared.nullbulk);
4463 } else {
4464 if (set->type != REDIS_SET) {
4465 addReply(c,shared.wrongtypeerr);
4466 return;
4467 }
4468 de = dictGetRandomKey(set->ptr);
4469 if (de == NULL) {
4470 addReply(c,shared.nullbulk);
4471 } else {
4472 robj *ele = dictGetEntryKey(de);
4473
4474 addReplyBulkLen(c,ele);
4475 addReply(c,ele);
4476 addReply(c,shared.crlf);
4477 }
4478 }
4479}
4480
ed9b544e 4481static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4482 dict **d1 = (void*) s1, **d2 = (void*) s2;
4483
3305306f 4484 return dictSize(*d1)-dictSize(*d2);
ed9b544e 4485}
4486
682ac724 4487static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 4488 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4489 dictIterator *di;
4490 dictEntry *de;
4491 robj *lenobj = NULL, *dstset = NULL;
682ac724 4492 unsigned long j, cardinality = 0;
ed9b544e 4493
ed9b544e 4494 for (j = 0; j < setsnum; j++) {
4495 robj *setobj;
3305306f 4496
4497 setobj = dstkey ?
4498 lookupKeyWrite(c->db,setskeys[j]) :
4499 lookupKeyRead(c->db,setskeys[j]);
4500 if (!setobj) {
ed9b544e 4501 zfree(dv);
5faa6025 4502 if (dstkey) {
fdcaae84 4503 if (deleteKey(c->db,dstkey))
4504 server.dirty++;
0d36ded0 4505 addReply(c,shared.czero);
5faa6025 4506 } else {
4507 addReply(c,shared.nullmultibulk);
4508 }
ed9b544e 4509 return;
4510 }
ed9b544e 4511 if (setobj->type != REDIS_SET) {
4512 zfree(dv);
c937aa89 4513 addReply(c,shared.wrongtypeerr);
ed9b544e 4514 return;
4515 }
4516 dv[j] = setobj->ptr;
4517 }
4518 /* Sort sets from the smallest to largest, this will improve our
4519 * algorithm's performace */
4520 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4521
4522 /* The first thing we should output is the total number of elements...
4523 * since this is a multi-bulk write, but at this stage we don't know
4524 * the intersection set size, so we use a trick, append an empty object
4525 * to the output list and save the pointer to later modify it with the
4526 * right length */
4527 if (!dstkey) {
4528 lenobj = createObject(REDIS_STRING,NULL);
4529 addReply(c,lenobj);
4530 decrRefCount(lenobj);
4531 } else {
4532 /* If we have a target key where to store the resulting set
4533 * create this key with an empty set inside */
4534 dstset = createSetObject();
ed9b544e 4535 }
4536
4537 /* Iterate all the elements of the first (smallest) set, and test
4538 * the element against all the other sets, if at least one set does
4539 * not include the element it is discarded */
4540 di = dictGetIterator(dv[0]);
ed9b544e 4541
4542 while((de = dictNext(di)) != NULL) {
4543 robj *ele;
4544
4545 for (j = 1; j < setsnum; j++)
4546 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4547 if (j != setsnum)
4548 continue; /* at least one set does not contain the member */
4549 ele = dictGetEntryKey(de);
4550 if (!dstkey) {
942a3961 4551 addReplyBulkLen(c,ele);
ed9b544e 4552 addReply(c,ele);
4553 addReply(c,shared.crlf);
4554 cardinality++;
4555 } else {
4556 dictAdd(dstset->ptr,ele,NULL);
4557 incrRefCount(ele);
4558 }
4559 }
4560 dictReleaseIterator(di);
4561
83cdfe18
AG
4562 if (dstkey) {
4563 /* Store the resulting set into the target */
4564 deleteKey(c->db,dstkey);
4565 dictAdd(c->db->dict,dstkey,dstset);
4566 incrRefCount(dstkey);
4567 }
4568
40d224a9 4569 if (!dstkey) {
682ac724 4570 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 4571 } else {
682ac724 4572 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
03fd01c7 4573 dictSize((dict*)dstset->ptr)));
40d224a9 4574 server.dirty++;
4575 }
ed9b544e 4576 zfree(dv);
4577}
4578
4579static void sinterCommand(redisClient *c) {
4580 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4581}
4582
4583static void sinterstoreCommand(redisClient *c) {
4584 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4585}
4586
f4f56e1d 4587#define REDIS_OP_UNION 0
4588#define REDIS_OP_DIFF 1
4589
4590static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 4591 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4592 dictIterator *di;
4593 dictEntry *de;
f4f56e1d 4594 robj *dstset = NULL;
40d224a9 4595 int j, cardinality = 0;
4596
40d224a9 4597 for (j = 0; j < setsnum; j++) {
4598 robj *setobj;
4599
4600 setobj = dstkey ?
4601 lookupKeyWrite(c->db,setskeys[j]) :
4602 lookupKeyRead(c->db,setskeys[j]);
4603 if (!setobj) {
4604 dv[j] = NULL;
4605 continue;
4606 }
4607 if (setobj->type != REDIS_SET) {
4608 zfree(dv);
4609 addReply(c,shared.wrongtypeerr);
4610 return;
4611 }
4612 dv[j] = setobj->ptr;
4613 }
4614
4615 /* We need a temp set object to store our union. If the dstkey
4616 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4617 * this set object will be the resulting object to set into the target key*/
4618 dstset = createSetObject();
4619
40d224a9 4620 /* Iterate all the elements of all the sets, add every element a single
4621 * time to the result set */
4622 for (j = 0; j < setsnum; j++) {
51829ed3 4623 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 4624 if (!dv[j]) continue; /* non existing keys are like empty sets */
4625
4626 di = dictGetIterator(dv[j]);
40d224a9 4627
4628 while((de = dictNext(di)) != NULL) {
4629 robj *ele;
4630
4631 /* dictAdd will not add the same element multiple times */
4632 ele = dictGetEntryKey(de);
f4f56e1d 4633 if (op == REDIS_OP_UNION || j == 0) {
4634 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4635 incrRefCount(ele);
40d224a9 4636 cardinality++;
4637 }
f4f56e1d 4638 } else if (op == REDIS_OP_DIFF) {
4639 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4640 cardinality--;
4641 }
40d224a9 4642 }
4643 }
4644 dictReleaseIterator(di);
51829ed3
AG
4645
4646 if (op == REDIS_OP_DIFF && cardinality == 0) break; /* result set is empty */
40d224a9 4647 }
4648
f4f56e1d 4649 /* Output the content of the resulting set, if not in STORE mode */
4650 if (!dstkey) {
4651 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4652 di = dictGetIterator(dstset->ptr);
f4f56e1d 4653 while((de = dictNext(di)) != NULL) {
4654 robj *ele;
4655
4656 ele = dictGetEntryKey(de);
942a3961 4657 addReplyBulkLen(c,ele);
f4f56e1d 4658 addReply(c,ele);
4659 addReply(c,shared.crlf);
4660 }
4661 dictReleaseIterator(di);
83cdfe18
AG
4662 } else {
4663 /* If we have a target key where to store the resulting set
4664 * create this key with the result set inside */
4665 deleteKey(c->db,dstkey);
4666 dictAdd(c->db->dict,dstkey,dstset);
4667 incrRefCount(dstkey);
f4f56e1d 4668 }
4669
4670 /* Cleanup */
40d224a9 4671 if (!dstkey) {
40d224a9 4672 decrRefCount(dstset);
4673 } else {
682ac724 4674 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
03fd01c7 4675 dictSize((dict*)dstset->ptr)));
40d224a9 4676 server.dirty++;
4677 }
4678 zfree(dv);
4679}
4680
4681static void sunionCommand(redisClient *c) {
f4f56e1d 4682 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 4683}
4684
4685static void sunionstoreCommand(redisClient *c) {
f4f56e1d 4686 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4687}
4688
4689static void sdiffCommand(redisClient *c) {
4690 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4691}
4692
4693static void sdiffstoreCommand(redisClient *c) {
4694 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 4695}
4696
6b47e12e 4697/* ==================================== ZSets =============================== */
4698
4699/* ZSETs are ordered sets using two data structures to hold the same elements
4700 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4701 * data structure.
4702 *
4703 * The elements are added to an hash table mapping Redis objects to scores.
4704 * At the same time the elements are added to a skip list mapping scores
4705 * to Redis objects (so objects are sorted by scores in this "view"). */
4706
4707/* This skiplist implementation is almost a C translation of the original
4708 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4709 * Alternative to Balanced Trees", modified in three ways:
4710 * a) this implementation allows for repeated values.
4711 * b) the comparison is not just by key (our 'score') but by satellite data.
4712 * c) there is a back pointer, so it's a doubly linked list with the back
4713 * pointers being only at "level 1". This allows to traverse the list
4714 * from tail to head, useful for ZREVRANGE. */
4715
4716static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
4717 zskiplistNode *zn = zmalloc(sizeof(*zn));
4718
4719 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
4720 zn->score = score;
4721 zn->obj = obj;
4722 return zn;
4723}
4724
4725static zskiplist *zslCreate(void) {
4726 int j;
4727 zskiplist *zsl;
4728
4729 zsl = zmalloc(sizeof(*zsl));
4730 zsl->level = 1;
cc812361 4731 zsl->length = 0;
6b47e12e 4732 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
4733 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++)
4734 zsl->header->forward[j] = NULL;
e3870fab 4735 zsl->header->backward = NULL;
4736 zsl->tail = NULL;
6b47e12e 4737 return zsl;
4738}
4739
fd8ccf44 4740static void zslFreeNode(zskiplistNode *node) {
4741 decrRefCount(node->obj);
ad807e6f 4742 zfree(node->forward);
fd8ccf44 4743 zfree(node);
4744}
4745
4746static void zslFree(zskiplist *zsl) {
ad807e6f 4747 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 4748
ad807e6f 4749 zfree(zsl->header->forward);
4750 zfree(zsl->header);
fd8ccf44 4751 while(node) {
599379dd 4752 next = node->forward[0];
fd8ccf44 4753 zslFreeNode(node);
4754 node = next;
4755 }
ad807e6f 4756 zfree(zsl);
fd8ccf44 4757}
4758
6b47e12e 4759static int zslRandomLevel(void) {
4760 int level = 1;
4761 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
4762 level += 1;
4763 return level;
4764}
4765
4766static void zslInsert(zskiplist *zsl, double score, robj *obj) {
4767 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4768 int i, level;
4769
4770 x = zsl->header;
4771 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 4772 while (x->forward[i] &&
4773 (x->forward[i]->score < score ||
4774 (x->forward[i]->score == score &&
4775 compareStringObjects(x->forward[i]->obj,obj) < 0)))
6b47e12e 4776 x = x->forward[i];
4777 update[i] = x;
4778 }
6b47e12e 4779 /* we assume the key is not already inside, since we allow duplicated
4780 * scores, and the re-insertion of score and redis object should never
4781 * happpen since the caller of zslInsert() should test in the hash table
4782 * if the element is already inside or not. */
4783 level = zslRandomLevel();
4784 if (level > zsl->level) {
4785 for (i = zsl->level; i < level; i++)
4786 update[i] = zsl->header;
4787 zsl->level = level;
4788 }
4789 x = zslCreateNode(level,score,obj);
4790 for (i = 0; i < level; i++) {
4791 x->forward[i] = update[i]->forward[i];
4792 update[i]->forward[i] = x;
4793 }
bb975144 4794 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 4795 if (x->forward[0])
4796 x->forward[0]->backward = x;
4797 else
4798 zsl->tail = x;
cc812361 4799 zsl->length++;
6b47e12e 4800}
4801
50c55df5 4802/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 4803static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 4804 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4805 int i;
4806
4807 x = zsl->header;
4808 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 4809 while (x->forward[i] &&
4810 (x->forward[i]->score < score ||
4811 (x->forward[i]->score == score &&
4812 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 4813 x = x->forward[i];
4814 update[i] = x;
4815 }
4816 /* We may have multiple elements with the same score, what we need
4817 * is to find the element with both the right score and object. */
4818 x = x->forward[0];
50c55df5 4819 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
9d60e6e4 4820 for (i = 0; i < zsl->level; i++) {
4821 if (update[i]->forward[i] != x) break;
4822 update[i]->forward[i] = x->forward[i];
4823 }
4824 if (x->forward[0]) {
4825 x->forward[0]->backward = (x->backward == zsl->header) ?
4826 NULL : x->backward;
e197b441 4827 } else {
9d60e6e4 4828 zsl->tail = x->backward;
e197b441 4829 }
9d60e6e4 4830 zslFreeNode(x);
4831 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
4832 zsl->level--;
4833 zsl->length--;
4834 return 1;
4835 } else {
4836 return 0; /* not found */
e197b441 4837 }
4838 return 0; /* not found */
fd8ccf44 4839}
4840
1807985b 4841/* Delete all the elements with score between min and max from the skiplist.
4842 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
4843 * Note that this function takes the reference to the hash table view of the
4844 * sorted set, in order to remove the elements from the hash table too. */
4845static unsigned long zslDeleteRange(zskiplist *zsl, double min, double max, dict *dict) {
4846 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4847 unsigned long removed = 0;
4848 int i;
4849
4850 x = zsl->header;
4851 for (i = zsl->level-1; i >= 0; i--) {
4852 while (x->forward[i] && x->forward[i]->score < min)
4853 x = x->forward[i];
4854 update[i] = x;
4855 }
4856 /* We may have multiple elements with the same score, what we need
4857 * is to find the element with both the right score and object. */
4858 x = x->forward[0];
4859 while (x && x->score <= max) {
4860 zskiplistNode *next;
4861
4862 for (i = 0; i < zsl->level; i++) {
4863 if (update[i]->forward[i] != x) break;
4864 update[i]->forward[i] = x->forward[i];
4865 }
4866 if (x->forward[0]) {
4867 x->forward[0]->backward = (x->backward == zsl->header) ?
4868 NULL : x->backward;
4869 } else {
4870 zsl->tail = x->backward;
4871 }
4872 next = x->forward[0];
4873 dictDelete(dict,x->obj);
4874 zslFreeNode(x);
4875 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
4876 zsl->level--;
4877 zsl->length--;
4878 removed++;
4879 x = next;
4880 }
4881 return removed; /* not found */
4882}
4883
50c55df5 4884/* Find the first node having a score equal or greater than the specified one.
4885 * Returns NULL if there is no match. */
4886static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
4887 zskiplistNode *x;
4888 int i;
4889
4890 x = zsl->header;
4891 for (i = zsl->level-1; i >= 0; i--) {
4892 while (x->forward[i] && x->forward[i]->score < score)
4893 x = x->forward[i];
4894 }
4895 /* We may have multiple elements with the same score, what we need
4896 * is to find the element with both the right score and object. */
4897 return x->forward[0];
4898}
4899
fd8ccf44 4900/* The actual Z-commands implementations */
4901
7db723ad 4902/* This generic command implements both ZADD and ZINCRBY.
e2665397 4903 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 4904 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 4905static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 4906 robj *zsetobj;
4907 zset *zs;
4908 double *score;
4909
e2665397 4910 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 4911 if (zsetobj == NULL) {
4912 zsetobj = createZsetObject();
e2665397 4913 dictAdd(c->db->dict,key,zsetobj);
4914 incrRefCount(key);
fd8ccf44 4915 } else {
4916 if (zsetobj->type != REDIS_ZSET) {
4917 addReply(c,shared.wrongtypeerr);
4918 return;
4919 }
4920 }
fd8ccf44 4921 zs = zsetobj->ptr;
e2665397 4922
7db723ad 4923 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 4924 * needs to handle the two different conditions. It's all about setting
4925 * '*score', that is, the new score to set, to the right value. */
4926 score = zmalloc(sizeof(double));
4927 if (doincrement) {
4928 dictEntry *de;
4929
4930 /* Read the old score. If the element was not present starts from 0 */
4931 de = dictFind(zs->dict,ele);
4932 if (de) {
4933 double *oldscore = dictGetEntryVal(de);
4934 *score = *oldscore + scoreval;
4935 } else {
4936 *score = scoreval;
4937 }
4938 } else {
4939 *score = scoreval;
4940 }
4941
4942 /* What follows is a simple remove and re-insert operation that is common
7db723ad 4943 * to both ZADD and ZINCRBY... */
e2665397 4944 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 4945 /* case 1: New element */
e2665397 4946 incrRefCount(ele); /* added to hash */
4947 zslInsert(zs->zsl,*score,ele);
4948 incrRefCount(ele); /* added to skiplist */
fd8ccf44 4949 server.dirty++;
e2665397 4950 if (doincrement)
e2665397 4951 addReplyDouble(c,*score);
91d71bfc 4952 else
4953 addReply(c,shared.cone);
fd8ccf44 4954 } else {
4955 dictEntry *de;
4956 double *oldscore;
4957
4958 /* case 2: Score update operation */
e2665397 4959 de = dictFind(zs->dict,ele);
dfc5e96c 4960 redisAssert(de != NULL);
fd8ccf44 4961 oldscore = dictGetEntryVal(de);
4962 if (*score != *oldscore) {
4963 int deleted;
4964
e2665397 4965 /* Remove and insert the element in the skip list with new score */
4966 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 4967 redisAssert(deleted != 0);
e2665397 4968 zslInsert(zs->zsl,*score,ele);
4969 incrRefCount(ele);
4970 /* Update the score in the hash table */
4971 dictReplace(zs->dict,ele,score);
fd8ccf44 4972 server.dirty++;
2161a965 4973 } else {
4974 zfree(score);
fd8ccf44 4975 }
e2665397 4976 if (doincrement)
4977 addReplyDouble(c,*score);
4978 else
4979 addReply(c,shared.czero);
fd8ccf44 4980 }
4981}
4982
e2665397 4983static void zaddCommand(redisClient *c) {
4984 double scoreval;
4985
4986 scoreval = strtod(c->argv[2]->ptr,NULL);
4987 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
4988}
4989
7db723ad 4990static void zincrbyCommand(redisClient *c) {
e2665397 4991 double scoreval;
4992
4993 scoreval = strtod(c->argv[2]->ptr,NULL);
4994 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
4995}
4996
1b7106e7 4997static void zremCommand(redisClient *c) {
4998 robj *zsetobj;
4999 zset *zs;
5000
5001 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5002 if (zsetobj == NULL) {
5003 addReply(c,shared.czero);
5004 } else {
5005 dictEntry *de;
5006 double *oldscore;
5007 int deleted;
5008
5009 if (zsetobj->type != REDIS_ZSET) {
5010 addReply(c,shared.wrongtypeerr);
5011 return;
5012 }
5013 zs = zsetobj->ptr;
5014 de = dictFind(zs->dict,c->argv[2]);
5015 if (de == NULL) {
5016 addReply(c,shared.czero);
5017 return;
5018 }
5019 /* Delete from the skiplist */
5020 oldscore = dictGetEntryVal(de);
5021 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
dfc5e96c 5022 redisAssert(deleted != 0);
1b7106e7 5023
5024 /* Delete from the hash table */
5025 dictDelete(zs->dict,c->argv[2]);
5026 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5027 server.dirty++;
5028 addReply(c,shared.cone);
5029 }
5030}
5031
1807985b 5032static void zremrangebyscoreCommand(redisClient *c) {
5033 double min = strtod(c->argv[2]->ptr,NULL);
5034 double max = strtod(c->argv[3]->ptr,NULL);
5035 robj *zsetobj;
5036 zset *zs;
5037
5038 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
5039 if (zsetobj == NULL) {
5040 addReply(c,shared.czero);
5041 } else {
5042 long deleted;
5043
5044 if (zsetobj->type != REDIS_ZSET) {
5045 addReply(c,shared.wrongtypeerr);
5046 return;
5047 }
5048 zs = zsetobj->ptr;
5049 deleted = zslDeleteRange(zs->zsl,min,max,zs->dict);
5050 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
5051 server.dirty += deleted;
5052 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",deleted));
5053 }
5054}
5055
e3870fab 5056static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 5057 robj *o;
5058 int start = atoi(c->argv[2]->ptr);
5059 int end = atoi(c->argv[3]->ptr);
752da584 5060 int withscores = 0;
5061
5062 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5063 withscores = 1;
5064 } else if (c->argc >= 5) {
5065 addReply(c,shared.syntaxerr);
5066 return;
5067 }
cc812361 5068
5069 o = lookupKeyRead(c->db,c->argv[1]);
5070 if (o == NULL) {
5071 addReply(c,shared.nullmultibulk);
5072 } else {
5073 if (o->type != REDIS_ZSET) {
5074 addReply(c,shared.wrongtypeerr);
5075 } else {
5076 zset *zsetobj = o->ptr;
5077 zskiplist *zsl = zsetobj->zsl;
5078 zskiplistNode *ln;
5079
5080 int llen = zsl->length;
5081 int rangelen, j;
5082 robj *ele;
5083
5084 /* convert negative indexes */
5085 if (start < 0) start = llen+start;
5086 if (end < 0) end = llen+end;
5087 if (start < 0) start = 0;
5088 if (end < 0) end = 0;
5089
5090 /* indexes sanity checks */
5091 if (start > end || start >= llen) {
5092 /* Out of range start or start > end result in empty list */
5093 addReply(c,shared.emptymultibulk);
5094 return;
5095 }
5096 if (end >= llen) end = llen-1;
5097 rangelen = (end-start)+1;
5098
5099 /* Return the result in form of a multi-bulk reply */
e3870fab 5100 if (reverse) {
5101 ln = zsl->tail;
5102 while (start--)
5103 ln = ln->backward;
5104 } else {
5105 ln = zsl->header->forward[0];
5106 while (start--)
5107 ln = ln->forward[0];
5108 }
cc812361 5109
752da584 5110 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5111 withscores ? (rangelen*2) : rangelen));
cc812361 5112 for (j = 0; j < rangelen; j++) {
0aad7a19 5113 ele = ln->obj;
cc812361 5114 addReplyBulkLen(c,ele);
5115 addReply(c,ele);
5116 addReply(c,shared.crlf);
752da584 5117 if (withscores)
5118 addReplyDouble(c,ln->score);
e3870fab 5119 ln = reverse ? ln->backward : ln->forward[0];
cc812361 5120 }
5121 }
5122 }
5123}
5124
e3870fab 5125static void zrangeCommand(redisClient *c) {
5126 zrangeGenericCommand(c,0);
5127}
5128
5129static void zrevrangeCommand(redisClient *c) {
5130 zrangeGenericCommand(c,1);
5131}
5132
50c55df5 5133static void zrangebyscoreCommand(redisClient *c) {
5134 robj *o;
5135 double min = strtod(c->argv[2]->ptr,NULL);
5136 double max = strtod(c->argv[3]->ptr,NULL);
80181f78 5137 int offset = 0, limit = -1;
5138
5139 if (c->argc != 4 && c->argc != 7) {
454d4e43 5140 addReplySds(c,
5141 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 5142 return;
5143 } else if (c->argc == 7 && strcasecmp(c->argv[4]->ptr,"limit")) {
5144 addReply(c,shared.syntaxerr);
5145 return;
5146 } else if (c->argc == 7) {
5147 offset = atoi(c->argv[5]->ptr);
5148 limit = atoi(c->argv[6]->ptr);
0b13687c 5149 if (offset < 0) offset = 0;
80181f78 5150 }
50c55df5 5151
5152 o = lookupKeyRead(c->db,c->argv[1]);
5153 if (o == NULL) {
5154 addReply(c,shared.nullmultibulk);
5155 } else {
5156 if (o->type != REDIS_ZSET) {
5157 addReply(c,shared.wrongtypeerr);
5158 } else {
5159 zset *zsetobj = o->ptr;
5160 zskiplist *zsl = zsetobj->zsl;
5161 zskiplistNode *ln;
5162 robj *ele, *lenobj;
5163 unsigned int rangelen = 0;
5164
5165 /* Get the first node with the score >= min */
5166 ln = zslFirstWithScore(zsl,min);
5167 if (ln == NULL) {
5168 /* No element matching the speciifed interval */
5169 addReply(c,shared.emptymultibulk);
5170 return;
5171 }
5172
5173 /* We don't know in advance how many matching elements there
5174 * are in the list, so we push this object that will represent
5175 * the multi-bulk length in the output buffer, and will "fix"
5176 * it later */
5177 lenobj = createObject(REDIS_STRING,NULL);
5178 addReply(c,lenobj);
c74e7c77 5179 decrRefCount(lenobj);
50c55df5 5180
dbbc7285 5181 while(ln && ln->score <= max) {
80181f78 5182 if (offset) {
5183 offset--;
5184 ln = ln->forward[0];
5185 continue;
5186 }
5187 if (limit == 0) break;
50c55df5 5188 ele = ln->obj;
5189 addReplyBulkLen(c,ele);
5190 addReply(c,ele);
5191 addReply(c,shared.crlf);
5192 ln = ln->forward[0];
5193 rangelen++;
80181f78 5194 if (limit > 0) limit--;
50c55df5 5195 }
5196 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",rangelen);
5197 }
5198 }
5199}
5200
3c41331e 5201static void zcardCommand(redisClient *c) {
e197b441 5202 robj *o;
5203 zset *zs;
5204
5205 o = lookupKeyRead(c->db,c->argv[1]);
5206 if (o == NULL) {
5207 addReply(c,shared.czero);
5208 return;
5209 } else {
5210 if (o->type != REDIS_ZSET) {
5211 addReply(c,shared.wrongtypeerr);
5212 } else {
5213 zs = o->ptr;
682ac724 5214 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",zs->zsl->length));
e197b441 5215 }
5216 }
5217}
5218
6e333bbe 5219static void zscoreCommand(redisClient *c) {
5220 robj *o;
5221 zset *zs;
5222
5223 o = lookupKeyRead(c->db,c->argv[1]);
5224 if (o == NULL) {
96d8b4ee 5225 addReply(c,shared.nullbulk);
6e333bbe 5226 return;
5227 } else {
5228 if (o->type != REDIS_ZSET) {
5229 addReply(c,shared.wrongtypeerr);
5230 } else {
5231 dictEntry *de;
5232
5233 zs = o->ptr;
5234 de = dictFind(zs->dict,c->argv[2]);
5235 if (!de) {
5236 addReply(c,shared.nullbulk);
5237 } else {
6e333bbe 5238 double *score = dictGetEntryVal(de);
5239
e2665397 5240 addReplyDouble(c,*score);
6e333bbe 5241 }
5242 }
5243 }
5244}
5245
6b47e12e 5246/* ========================= Non type-specific commands ==================== */
5247
ed9b544e 5248static void flushdbCommand(redisClient *c) {
ca37e9cd 5249 server.dirty += dictSize(c->db->dict);
3305306f 5250 dictEmpty(c->db->dict);
5251 dictEmpty(c->db->expires);
ed9b544e 5252 addReply(c,shared.ok);
ed9b544e 5253}
5254
5255static void flushallCommand(redisClient *c) {
ca37e9cd 5256 server.dirty += emptyDb();
ed9b544e 5257 addReply(c,shared.ok);
f78fd11b 5258 rdbSave(server.dbfilename);
ca37e9cd 5259 server.dirty++;
ed9b544e 5260}
5261
56906eef 5262static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 5263 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 5264 so->type = type;
5265 so->pattern = pattern;
5266 return so;
5267}
5268
5269/* Return the value associated to the key with a name obtained
5270 * substituting the first occurence of '*' in 'pattern' with 'subst' */
56906eef 5271static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
ed9b544e 5272 char *p;
5273 sds spat, ssub;
5274 robj keyobj;
5275 int prefixlen, sublen, postfixlen;
ed9b544e 5276 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
5277 struct {
f1017b3f 5278 long len;
5279 long free;
ed9b544e 5280 char buf[REDIS_SORTKEY_MAX+1];
5281 } keyname;
5282
28173a49 5283 /* If the pattern is "#" return the substitution object itself in order
5284 * to implement the "SORT ... GET #" feature. */
5285 spat = pattern->ptr;
5286 if (spat[0] == '#' && spat[1] == '\0') {
5287 return subst;
5288 }
5289
5290 /* The substitution object may be specially encoded. If so we create
9d65a1bb 5291 * a decoded object on the fly. Otherwise getDecodedObject will just
5292 * increment the ref count, that we'll decrement later. */
5293 subst = getDecodedObject(subst);
942a3961 5294
ed9b544e 5295 ssub = subst->ptr;
5296 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
5297 p = strchr(spat,'*');
ed5a857a 5298 if (!p) {
5299 decrRefCount(subst);
5300 return NULL;
5301 }
ed9b544e 5302
5303 prefixlen = p-spat;
5304 sublen = sdslen(ssub);
5305 postfixlen = sdslen(spat)-(prefixlen+1);
5306 memcpy(keyname.buf,spat,prefixlen);
5307 memcpy(keyname.buf+prefixlen,ssub,sublen);
5308 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
5309 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
5310 keyname.len = prefixlen+sublen+postfixlen;
5311
dfc5e96c 5312 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
942a3961 5313 decrRefCount(subst);
5314
a4d1ba9a 5315 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
3305306f 5316 return lookupKeyRead(db,&keyobj);
ed9b544e 5317}
5318
5319/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
5320 * the additional parameter is not standard but a BSD-specific we have to
5321 * pass sorting parameters via the global 'server' structure */
5322static int sortCompare(const void *s1, const void *s2) {
5323 const redisSortObject *so1 = s1, *so2 = s2;
5324 int cmp;
5325
5326 if (!server.sort_alpha) {
5327 /* Numeric sorting. Here it's trivial as we precomputed scores */
5328 if (so1->u.score > so2->u.score) {
5329 cmp = 1;
5330 } else if (so1->u.score < so2->u.score) {
5331 cmp = -1;
5332 } else {
5333 cmp = 0;
5334 }
5335 } else {
5336 /* Alphanumeric sorting */
5337 if (server.sort_bypattern) {
5338 if (!so1->u.cmpobj || !so2->u.cmpobj) {
5339 /* At least one compare object is NULL */
5340 if (so1->u.cmpobj == so2->u.cmpobj)
5341 cmp = 0;
5342 else if (so1->u.cmpobj == NULL)
5343 cmp = -1;
5344 else
5345 cmp = 1;
5346 } else {
5347 /* We have both the objects, use strcoll */
5348 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
5349 }
5350 } else {
5351 /* Compare elements directly */
9d65a1bb 5352 robj *dec1, *dec2;
5353
5354 dec1 = getDecodedObject(so1->obj);
5355 dec2 = getDecodedObject(so2->obj);
5356 cmp = strcoll(dec1->ptr,dec2->ptr);
5357 decrRefCount(dec1);
5358 decrRefCount(dec2);
ed9b544e 5359 }
5360 }
5361 return server.sort_desc ? -cmp : cmp;
5362}
5363
5364/* The SORT command is the most complex command in Redis. Warning: this code
5365 * is optimized for speed and a bit less for readability */
5366static void sortCommand(redisClient *c) {
ed9b544e 5367 list *operations;
5368 int outputlen = 0;
5369 int desc = 0, alpha = 0;
5370 int limit_start = 0, limit_count = -1, start, end;
5371 int j, dontsort = 0, vectorlen;
5372 int getop = 0; /* GET operation counter */
443c6409 5373 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 5374 redisSortObject *vector; /* Resulting vector to sort */
5375
5376 /* Lookup the key to sort. It must be of the right types */
3305306f 5377 sortval = lookupKeyRead(c->db,c->argv[1]);
5378 if (sortval == NULL) {
d922ae65 5379 addReply(c,shared.nullmultibulk);
ed9b544e 5380 return;
5381 }
a5eb649b 5382 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
5383 sortval->type != REDIS_ZSET)
5384 {
c937aa89 5385 addReply(c,shared.wrongtypeerr);
ed9b544e 5386 return;
5387 }
5388
5389 /* Create a list of operations to perform for every sorted element.
5390 * Operations can be GET/DEL/INCR/DECR */
5391 operations = listCreate();
092dac2a 5392 listSetFreeMethod(operations,zfree);
ed9b544e 5393 j = 2;
5394
5395 /* Now we need to protect sortval incrementing its count, in the future
5396 * SORT may have options able to overwrite/delete keys during the sorting
5397 * and the sorted key itself may get destroied */
5398 incrRefCount(sortval);
5399
5400 /* The SORT command has an SQL-alike syntax, parse it */
5401 while(j < c->argc) {
5402 int leftargs = c->argc-j-1;
5403 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
5404 desc = 0;
5405 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
5406 desc = 1;
5407 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
5408 alpha = 1;
5409 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
5410 limit_start = atoi(c->argv[j+1]->ptr);
5411 limit_count = atoi(c->argv[j+2]->ptr);
5412 j+=2;
443c6409 5413 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
5414 storekey = c->argv[j+1];
5415 j++;
ed9b544e 5416 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
5417 sortby = c->argv[j+1];
5418 /* If the BY pattern does not contain '*', i.e. it is constant,
5419 * we don't need to sort nor to lookup the weight keys. */
5420 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
5421 j++;
5422 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
5423 listAddNodeTail(operations,createSortOperation(
5424 REDIS_SORT_GET,c->argv[j+1]));
5425 getop++;
5426 j++;
ed9b544e 5427 } else {
5428 decrRefCount(sortval);
5429 listRelease(operations);
c937aa89 5430 addReply(c,shared.syntaxerr);
ed9b544e 5431 return;
5432 }
5433 j++;
5434 }
5435
5436 /* Load the sorting vector with all the objects to sort */
a5eb649b 5437 switch(sortval->type) {
5438 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
5439 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
5440 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
dfc5e96c 5441 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
a5eb649b 5442 }
ed9b544e 5443 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 5444 j = 0;
a5eb649b 5445
ed9b544e 5446 if (sortval->type == REDIS_LIST) {
5447 list *list = sortval->ptr;
6208b3a7 5448 listNode *ln;
c7df85a4 5449 listIter li;
6208b3a7 5450
c7df85a4 5451 listRewind(list,&li);
5452 while((ln = listNext(&li))) {
ed9b544e 5453 robj *ele = ln->value;
5454 vector[j].obj = ele;
5455 vector[j].u.score = 0;
5456 vector[j].u.cmpobj = NULL;
ed9b544e 5457 j++;
5458 }
5459 } else {
a5eb649b 5460 dict *set;
ed9b544e 5461 dictIterator *di;
5462 dictEntry *setele;
5463
a5eb649b 5464 if (sortval->type == REDIS_SET) {
5465 set = sortval->ptr;
5466 } else {
5467 zset *zs = sortval->ptr;
5468 set = zs->dict;
5469 }
5470
ed9b544e 5471 di = dictGetIterator(set);
ed9b544e 5472 while((setele = dictNext(di)) != NULL) {
5473 vector[j].obj = dictGetEntryKey(setele);
5474 vector[j].u.score = 0;
5475 vector[j].u.cmpobj = NULL;
5476 j++;
5477 }
5478 dictReleaseIterator(di);
5479 }
dfc5e96c 5480 redisAssert(j == vectorlen);
ed9b544e 5481
5482 /* Now it's time to load the right scores in the sorting vector */
5483 if (dontsort == 0) {
5484 for (j = 0; j < vectorlen; j++) {
5485 if (sortby) {
5486 robj *byval;
5487
3305306f 5488 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
ed9b544e 5489 if (!byval || byval->type != REDIS_STRING) continue;
5490 if (alpha) {
9d65a1bb 5491 vector[j].u.cmpobj = getDecodedObject(byval);
ed9b544e 5492 } else {
942a3961 5493 if (byval->encoding == REDIS_ENCODING_RAW) {
5494 vector[j].u.score = strtod(byval->ptr,NULL);
5495 } else {
9d65a1bb 5496 /* Don't need to decode the object if it's
5497 * integer-encoded (the only encoding supported) so
5498 * far. We can just cast it */
f1017b3f 5499 if (byval->encoding == REDIS_ENCODING_INT) {
942a3961 5500 vector[j].u.score = (long)byval->ptr;
f1017b3f 5501 } else
dfc5e96c 5502 redisAssert(1 != 1);
942a3961 5503 }
ed9b544e 5504 }
5505 } else {
942a3961 5506 if (!alpha) {
5507 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
5508 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
5509 else {
5510 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
5511 vector[j].u.score = (long) vector[j].obj->ptr;
5512 else
dfc5e96c 5513 redisAssert(1 != 1);
942a3961 5514 }
5515 }
ed9b544e 5516 }
5517 }
5518 }
5519
5520 /* We are ready to sort the vector... perform a bit of sanity check
5521 * on the LIMIT option too. We'll use a partial version of quicksort. */
5522 start = (limit_start < 0) ? 0 : limit_start;
5523 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
5524 if (start >= vectorlen) {
5525 start = vectorlen-1;
5526 end = vectorlen-2;
5527 }
5528 if (end >= vectorlen) end = vectorlen-1;
5529
5530 if (dontsort == 0) {
5531 server.sort_desc = desc;
5532 server.sort_alpha = alpha;
5533 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 5534 if (sortby && (start != 0 || end != vectorlen-1))
5535 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
5536 else
5537 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 5538 }
5539
5540 /* Send command output to the output buffer, performing the specified
5541 * GET/DEL/INCR/DECR operations if any. */
5542 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 5543 if (storekey == NULL) {
5544 /* STORE option not specified, sent the sorting result to client */
5545 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
5546 for (j = start; j <= end; j++) {
5547 listNode *ln;
c7df85a4 5548 listIter li;
5549
443c6409 5550 if (!getop) {
5551 addReplyBulkLen(c,vector[j].obj);
5552 addReply(c,vector[j].obj);
5553 addReply(c,shared.crlf);
5554 }
c7df85a4 5555 listRewind(operations,&li);
5556 while((ln = listNext(&li))) {
443c6409 5557 redisSortOperation *sop = ln->value;
5558 robj *val = lookupKeyByPattern(c->db,sop->pattern,
5559 vector[j].obj);
5560
5561 if (sop->type == REDIS_SORT_GET) {
5562 if (!val || val->type != REDIS_STRING) {
5563 addReply(c,shared.nullbulk);
5564 } else {
5565 addReplyBulkLen(c,val);
5566 addReply(c,val);
5567 addReply(c,shared.crlf);
5568 }
5569 } else {
dfc5e96c 5570 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 5571 }
5572 }
ed9b544e 5573 }
443c6409 5574 } else {
5575 robj *listObject = createListObject();
5576 list *listPtr = (list*) listObject->ptr;
5577
5578 /* STORE option specified, set the sorting result as a List object */
5579 for (j = start; j <= end; j++) {
5580 listNode *ln;
c7df85a4 5581 listIter li;
5582
443c6409 5583 if (!getop) {
5584 listAddNodeTail(listPtr,vector[j].obj);
5585 incrRefCount(vector[j].obj);
5586 }
c7df85a4 5587 listRewind(operations,&li);
5588 while((ln = listNext(&li))) {
443c6409 5589 redisSortOperation *sop = ln->value;
5590 robj *val = lookupKeyByPattern(c->db,sop->pattern,
5591 vector[j].obj);
5592
5593 if (sop->type == REDIS_SORT_GET) {
5594 if (!val || val->type != REDIS_STRING) {
5595 listAddNodeTail(listPtr,createStringObject("",0));
5596 } else {
5597 listAddNodeTail(listPtr,val);
5598 incrRefCount(val);
5599 }
ed9b544e 5600 } else {
dfc5e96c 5601 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 5602 }
ed9b544e 5603 }
ed9b544e 5604 }
121796f7 5605 if (dictReplace(c->db->dict,storekey,listObject)) {
5606 incrRefCount(storekey);
5607 }
443c6409 5608 /* Note: we add 1 because the DB is dirty anyway since even if the
5609 * SORT result is empty a new key is set and maybe the old content
5610 * replaced. */
5611 server.dirty += 1+outputlen;
5612 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 5613 }
5614
5615 /* Cleanup */
5616 decrRefCount(sortval);
5617 listRelease(operations);
5618 for (j = 0; j < vectorlen; j++) {
5619 if (sortby && alpha && vector[j].u.cmpobj)
5620 decrRefCount(vector[j].u.cmpobj);
5621 }
5622 zfree(vector);
5623}
5624
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer and 'n' the number of bytes to describe.
 * Values below 1024 are printed as an integer followed by 'B'; larger values
 * are printed with two decimals and a K, M, G, T or P suffix (powers of
 * 1024). Values too large for the P suffix fall back to the plain byte count.
 *
 * Every value of 'n' writes a NUL terminated string into 's'. The longest
 * possible output is the byte-count fallback: 20 digits, 'B' and the
 * terminator, so 's' must provide at least 22 bytes. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Never leave the caller's buffer untouched: print raw bytes. */
        sprintf(s,"%lluB",n);
    }
}
5645
1c85b79f 5646/* Create the string returned by the INFO command. This is decoupled
5647 * by the INFO command itself as we need to report the same information
5648 * on memory corruption problems. */
5649static sds genRedisInfoString(void) {
ed9b544e 5650 sds info;
5651 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 5652 int j;
ec6c7a1d 5653 char hmem[64];
5654
b72f6a4b 5655 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 5656 info = sdscatprintf(sdsempty(),
5657 "redis_version:%s\r\n"
f1017b3f 5658 "arch_bits:%s\r\n"
7a932b74 5659 "multiplexing_api:%s\r\n"
0d7170a4 5660 "process_id:%ld\r\n"
682ac724 5661 "uptime_in_seconds:%ld\r\n"
5662 "uptime_in_days:%ld\r\n"
ed9b544e 5663 "connected_clients:%d\r\n"
5664 "connected_slaves:%d\r\n"
f86a74e9 5665 "blocked_clients:%d\r\n"
5fba9f71 5666 "used_memory:%zu\r\n"
ec6c7a1d 5667 "used_memory_human:%s\r\n"
ed9b544e 5668 "changes_since_last_save:%lld\r\n"
be2bb6b0 5669 "bgsave_in_progress:%d\r\n"
682ac724 5670 "last_save_time:%ld\r\n"
b3fad521 5671 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 5672 "total_connections_received:%lld\r\n"
5673 "total_commands_processed:%lld\r\n"
7d98e08c 5674 "vm_enabled:%d\r\n"
a0f643ea 5675 "role:%s\r\n"
ed9b544e 5676 ,REDIS_VERSION,
f1017b3f 5677 (sizeof(long) == 8) ? "64" : "32",
7a932b74 5678 aeGetApiName(),
0d7170a4 5679 (long) getpid(),
a0f643ea 5680 uptime,
5681 uptime/(3600*24),
ed9b544e 5682 listLength(server.clients)-listLength(server.slaves),
5683 listLength(server.slaves),
d5d55fc3 5684 server.blpop_blocked_clients,
b72f6a4b 5685 zmalloc_used_memory(),
ec6c7a1d 5686 hmem,
ed9b544e 5687 server.dirty,
9d65a1bb 5688 server.bgsavechildpid != -1,
ed9b544e 5689 server.lastsave,
b3fad521 5690 server.bgrewritechildpid != -1,
ed9b544e 5691 server.stat_numconnections,
5692 server.stat_numcommands,
7d98e08c 5693 server.vm_enabled != 0,
a0f643ea 5694 server.masterhost == NULL ? "master" : "slave"
ed9b544e 5695 );
a0f643ea 5696 if (server.masterhost) {
5697 info = sdscatprintf(info,
5698 "master_host:%s\r\n"
5699 "master_port:%d\r\n"
5700 "master_link_status:%s\r\n"
5701 "master_last_io_seconds_ago:%d\r\n"
5702 ,server.masterhost,
5703 server.masterport,
5704 (server.replstate == REDIS_REPL_CONNECTED) ?
5705 "up" : "down",
f72b934d 5706 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 5707 );
5708 }
7d98e08c 5709 if (server.vm_enabled) {
1064ef87 5710 lockThreadedIO();
7d98e08c 5711 info = sdscatprintf(info,
5712 "vm_conf_max_memory:%llu\r\n"
5713 "vm_conf_page_size:%llu\r\n"
5714 "vm_conf_pages:%llu\r\n"
5715 "vm_stats_used_pages:%llu\r\n"
5716 "vm_stats_swapped_objects:%llu\r\n"
5717 "vm_stats_swappin_count:%llu\r\n"
5718 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 5719 "vm_stats_io_newjobs_len:%lu\r\n"
5720 "vm_stats_io_processing_len:%lu\r\n"
5721 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 5722 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 5723 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 5724 ,(unsigned long long) server.vm_max_memory,
5725 (unsigned long long) server.vm_page_size,
5726 (unsigned long long) server.vm_pages,
5727 (unsigned long long) server.vm_stats_used_pages,
5728 (unsigned long long) server.vm_stats_swapped_objects,
5729 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 5730 (unsigned long long) server.vm_stats_swapouts,
5731 (unsigned long) listLength(server.io_newjobs),
5732 (unsigned long) listLength(server.io_processing),
5733 (unsigned long) listLength(server.io_processed),
d5d55fc3 5734 (unsigned long) server.io_active_threads,
5735 (unsigned long) server.vm_blocked_clients
7d98e08c 5736 );
1064ef87 5737 unlockThreadedIO();
7d98e08c 5738 }
c3cb078d 5739 for (j = 0; j < server.dbnum; j++) {
5740 long long keys, vkeys;
5741
5742 keys = dictSize(server.db[j].dict);
5743 vkeys = dictSize(server.db[j].expires);
5744 if (keys || vkeys) {
9d65a1bb 5745 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 5746 j, keys, vkeys);
5747 }
5748 }
1c85b79f 5749 return info;
5750}
5751
5752static void infoCommand(redisClient *c) {
5753 sds info = genRedisInfoString();
83c6a618 5754 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
5755 (unsigned long)sdslen(info)));
ed9b544e 5756 addReplySds(c,info);
70003d28 5757 addReply(c,shared.crlf);
ed9b544e 5758}
5759
3305306f 5760static void monitorCommand(redisClient *c) {
5761 /* ignore MONITOR if aleady slave or in monitor mode */
5762 if (c->flags & REDIS_SLAVE) return;
5763
5764 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
5765 c->slaveseldb = 0;
6b47e12e 5766 listAddNodeTail(server.monitors,c);
3305306f 5767 addReply(c,shared.ok);
5768}
5769
5770/* ================================= Expire ================================= */
5771static int removeExpire(redisDb *db, robj *key) {
5772 if (dictDelete(db->expires,key) == DICT_OK) {
5773 return 1;
5774 } else {
5775 return 0;
5776 }
5777}
5778
5779static int setExpire(redisDb *db, robj *key, time_t when) {
5780 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
5781 return 0;
5782 } else {
5783 incrRefCount(key);
5784 return 1;
5785 }
5786}
5787
bb32ede5 5788/* Return the expire time of the specified key, or -1 if no expire
5789 * is associated with this key (i.e. the key is non volatile) */
5790static time_t getExpire(redisDb *db, robj *key) {
5791 dictEntry *de;
5792
5793 /* No expire? return ASAP */
5794 if (dictSize(db->expires) == 0 ||
5795 (de = dictFind(db->expires,key)) == NULL) return -1;
5796
5797 return (time_t) dictGetEntryVal(de);
5798}
5799
3305306f 5800static int expireIfNeeded(redisDb *db, robj *key) {
5801 time_t when;
5802 dictEntry *de;
5803
5804 /* No expire? return ASAP */
5805 if (dictSize(db->expires) == 0 ||
5806 (de = dictFind(db->expires,key)) == NULL) return 0;
5807
5808 /* Lookup the expire */
5809 when = (time_t) dictGetEntryVal(de);
5810 if (time(NULL) <= when) return 0;
5811
5812 /* Delete the key */
5813 dictDelete(db->expires,key);
5814 return dictDelete(db->dict,key) == DICT_OK;
5815}
5816
5817static int deleteIfVolatile(redisDb *db, robj *key) {
5818 dictEntry *de;
5819
5820 /* No expire? return ASAP */
5821 if (dictSize(db->expires) == 0 ||
5822 (de = dictFind(db->expires,key)) == NULL) return 0;
5823
5824 /* Delete the key */
0c66a471 5825 server.dirty++;
3305306f 5826 dictDelete(db->expires,key);
5827 return dictDelete(db->dict,key) == DICT_OK;
5828}
5829
802e8373 5830static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
3305306f 5831 dictEntry *de;
3305306f 5832
802e8373 5833 de = dictFind(c->db->dict,key);
3305306f 5834 if (de == NULL) {
5835 addReply(c,shared.czero);
5836 return;
5837 }
43e5ccdf 5838 if (seconds < 0) {
5839 if (deleteKey(c->db,key)) server.dirty++;
5840 addReply(c, shared.cone);
3305306f 5841 return;
5842 } else {
5843 time_t when = time(NULL)+seconds;
802e8373 5844 if (setExpire(c->db,key,when)) {
3305306f 5845 addReply(c,shared.cone);
77423026 5846 server.dirty++;
5847 } else {
3305306f 5848 addReply(c,shared.czero);
77423026 5849 }
3305306f 5850 return;
5851 }
5852}
5853
802e8373 5854static void expireCommand(redisClient *c) {
5855 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
5856}
5857
5858static void expireatCommand(redisClient *c) {
5859 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
5860}
5861
fd88489a 5862static void ttlCommand(redisClient *c) {
5863 time_t expire;
5864 int ttl = -1;
5865
5866 expire = getExpire(c->db,c->argv[1]);
5867 if (expire != -1) {
5868 ttl = (int) (expire-time(NULL));
5869 if (ttl < 0) ttl = -1;
5870 }
5871 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
5872}
5873
6e469882 5874/* ================================ MULTI/EXEC ============================== */
5875
5876/* Client state initialization for MULTI/EXEC */
5877static void initClientMultiState(redisClient *c) {
5878 c->mstate.commands = NULL;
5879 c->mstate.count = 0;
5880}
5881
5882/* Release all the resources associated with MULTI/EXEC state */
5883static void freeClientMultiState(redisClient *c) {
5884 int j;
5885
5886 for (j = 0; j < c->mstate.count; j++) {
5887 int i;
5888 multiCmd *mc = c->mstate.commands+j;
5889
5890 for (i = 0; i < mc->argc; i++)
5891 decrRefCount(mc->argv[i]);
5892 zfree(mc->argv);
5893 }
5894 zfree(c->mstate.commands);
5895}
5896
5897/* Add a new command into the MULTI commands queue */
5898static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
5899 multiCmd *mc;
5900 int j;
5901
5902 c->mstate.commands = zrealloc(c->mstate.commands,
5903 sizeof(multiCmd)*(c->mstate.count+1));
5904 mc = c->mstate.commands+c->mstate.count;
5905 mc->cmd = cmd;
5906 mc->argc = c->argc;
5907 mc->argv = zmalloc(sizeof(robj*)*c->argc);
5908 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
5909 for (j = 0; j < c->argc; j++)
5910 incrRefCount(mc->argv[j]);
5911 c->mstate.count++;
5912}
5913
5914static void multiCommand(redisClient *c) {
5915 c->flags |= REDIS_MULTI;
36c548f0 5916 addReply(c,shared.ok);
6e469882 5917}
5918
5919static void execCommand(redisClient *c) {
5920 int j;
5921 robj **orig_argv;
5922 int orig_argc;
5923
5924 if (!(c->flags & REDIS_MULTI)) {
5925 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
5926 return;
5927 }
5928
5929 orig_argv = c->argv;
5930 orig_argc = c->argc;
5931 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
5932 for (j = 0; j < c->mstate.count; j++) {
5933 c->argc = c->mstate.commands[j].argc;
5934 c->argv = c->mstate.commands[j].argv;
5935 call(c,c->mstate.commands[j].cmd);
5936 }
5937 c->argv = orig_argv;
5938 c->argc = orig_argc;
5939 freeClientMultiState(c);
5940 initClientMultiState(c);
5941 c->flags &= (~REDIS_MULTI);
5942}
5943
4409877e 5944/* =========================== Blocking Operations ========================= */
5945
5946/* Currently Redis blocking operations support is limited to list POP ops,
5947 * so the current implementation is not fully generic, but it is also not
5948 * completely specific so it will not require a rewrite to support new
5949 * kind of blocking operations in the future.
5950 *
5951 * Still it's important to note that list blocking operations can be already
5952 * used as a notification mechanism in order to implement other blocking
5953 * operations at application level, so there must be a very strong evidence
5954 * of usefulness and generality before new blocking operations are implemented.
5955 *
 * This is how the current blocking POP works, using BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
 * if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 * empty we need to block. In order to do so we remove the notification for
 * new data to read in the client socket (so that we'll not serve new
 * requests if the blocking request is not served). Also we put the client
 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
 * blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 * performed, we serve the first in the list: basically instead of pushing
 * the new element inside the list we return it to the (first / oldest)
 * blocking client, unblock the client, and remove it from the list.
5970 *
5971 * The above comment and the source code should be enough in order to understand
5972 * the implementation and modify / fix it later.
5973 */
5974
5975/* Set a client in blocking mode for the specified key, with the specified
5976 * timeout */
b177fd30 5977static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 5978 dictEntry *de;
5979 list *l;
b177fd30 5980 int j;
4409877e 5981
b177fd30 5982 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
5983 c->blockingkeysnum = numkeys;
4409877e 5984 c->blockingto = timeout;
b177fd30 5985 for (j = 0; j < numkeys; j++) {
5986 /* Add the key in the client structure, to map clients -> keys */
5987 c->blockingkeys[j] = keys[j];
5988 incrRefCount(keys[j]);
4409877e 5989
b177fd30 5990 /* And in the other "side", to map keys -> clients */
5991 de = dictFind(c->db->blockingkeys,keys[j]);
5992 if (de == NULL) {
5993 int retval;
5994
5995 /* For every key we take a list of clients blocked for it */
5996 l = listCreate();
5997 retval = dictAdd(c->db->blockingkeys,keys[j],l);
5998 incrRefCount(keys[j]);
5999 assert(retval == DICT_OK);
6000 } else {
6001 l = dictGetEntryVal(de);
6002 }
6003 listAddNodeTail(l,c);
4409877e 6004 }
b177fd30 6005 /* Mark the client as a blocked client */
4409877e 6006 c->flags |= REDIS_BLOCKED;
6007 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
d5d55fc3 6008 server.blpop_blocked_clients++;
4409877e 6009}
6010
6011/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 6012static void unblockClientWaitingData(redisClient *c) {
4409877e 6013 dictEntry *de;
6014 list *l;
b177fd30 6015 int j;
4409877e 6016
b177fd30 6017 assert(c->blockingkeys != NULL);
6018 /* The client may wait for multiple keys, so unblock it for every key. */
6019 for (j = 0; j < c->blockingkeysnum; j++) {
6020 /* Remove this client from the list of clients waiting for this key. */
6021 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
6022 assert(de != NULL);
6023 l = dictGetEntryVal(de);
6024 listDelNode(l,listSearchKey(l,c));
6025 /* If the list is empty we need to remove it to avoid wasting memory */
6026 if (listLength(l) == 0)
6027 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
6028 decrRefCount(c->blockingkeys[j]);
6029 }
6030 /* Cleanup the client structure */
6031 zfree(c->blockingkeys);
6032 c->blockingkeys = NULL;
4409877e 6033 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 6034 server.blpop_blocked_clients--;
4409877e 6035 /* Ok now we are ready to get read events from socket, note that we
b0d8747d 6036 * can't trap errors here as it's possible that unblockClientWaitingDatas() is
4409877e 6037 * called from freeClient() itself, and the only thing we can do
6038 * if we failed to register the READABLE event is to kill the client.
6039 * Still the following function should never fail in the real world as
6040 * we are sure the file descriptor is sane, and we exit on out of mem. */
6041 aeCreateFileEvent(server.el, c->fd, AE_READABLE, readQueryFromClient, c);
6042 /* As a final step we want to process data if there is some command waiting
b0d8747d 6043 * in the input buffer. Note that this is safe even if
6044 * unblockClientWaitingData() gets called from freeClient() because
6045 * freeClient() will be smart enough to call this function
6046 * *after* c->querybuf was set to NULL. */
4409877e 6047 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
6048}
6049
6050/* This should be called from any function PUSHing into lists.
6051 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
6052 * 'ele' is the element pushed.
6053 *
6054 * If the function returns 0 there was no client waiting for a list push
6055 * against this key.
6056 *
6057 * If the function returns 1 there was a client waiting for a list push
6058 * against this key, the element was passed to this client thus it's not
6059 * needed to actually add it to the list and the caller should return asap. */
6060static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
6061 struct dictEntry *de;
6062 redisClient *receiver;
6063 list *l;
6064 listNode *ln;
6065
6066 de = dictFind(c->db->blockingkeys,key);
6067 if (de == NULL) return 0;
6068 l = dictGetEntryVal(de);
6069 ln = listFirst(l);
6070 assert(ln != NULL);
6071 receiver = ln->value;
4409877e 6072
b177fd30 6073 addReplySds(receiver,sdsnew("*2\r\n"));
6074 addReplyBulkLen(receiver,key);
6075 addReply(receiver,key);
6076 addReply(receiver,shared.crlf);
4409877e 6077 addReplyBulkLen(receiver,ele);
6078 addReply(receiver,ele);
6079 addReply(receiver,shared.crlf);
b0d8747d 6080 unblockClientWaitingData(receiver);
4409877e 6081 return 1;
6082}
6083
6084/* Blocking RPOP/LPOP */
6085static void blockingPopGenericCommand(redisClient *c, int where) {
6086 robj *o;
6087 time_t timeout;
b177fd30 6088 int j;
4409877e 6089
b177fd30 6090 for (j = 1; j < c->argc-1; j++) {
6091 o = lookupKeyWrite(c->db,c->argv[j]);
6092 if (o != NULL) {
6093 if (o->type != REDIS_LIST) {
6094 addReply(c,shared.wrongtypeerr);
4409877e 6095 return;
b177fd30 6096 } else {
6097 list *list = o->ptr;
6098 if (listLength(list) != 0) {
6099 /* If the list contains elements fall back to the usual
6100 * non-blocking POP operation */
6101 robj *argv[2], **orig_argv;
6102 int orig_argc;
6103
6104 /* We need to alter the command arguments before to call
6105 * popGenericCommand() as the command takes a single key. */
6106 orig_argv = c->argv;
6107 orig_argc = c->argc;
6108 argv[1] = c->argv[j];
6109 c->argv = argv;
6110 c->argc = 2;
6111
6112 /* Also the return value is different, we need to output
6113 * the multi bulk reply header and the key name. The
6114 * "real" command will add the last element (the value)
6115 * for us. If this souds like an hack to you it's just
6116 * because it is... */
6117 addReplySds(c,sdsnew("*2\r\n"));
6118 addReplyBulkLen(c,argv[1]);
6119 addReply(c,argv[1]);
6120 addReply(c,shared.crlf);
6121 popGenericCommand(c,where);
6122
6123 /* Fix the client structure with the original stuff */
6124 c->argv = orig_argv;
6125 c->argc = orig_argc;
6126 return;
6127 }
4409877e 6128 }
6129 }
6130 }
6131 /* If the list is empty or the key does not exists we must block */
b177fd30 6132 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 6133 if (timeout > 0) timeout += time(NULL);
b177fd30 6134 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 6135}
6136
6137static void blpopCommand(redisClient *c) {
6138 blockingPopGenericCommand(c,REDIS_HEAD);
6139}
6140
6141static void brpopCommand(redisClient *c) {
6142 blockingPopGenericCommand(c,REDIS_TAIL);
6143}
6144
ed9b544e 6145/* =============================== Replication ============================= */
6146
a4d1ba9a 6147static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 6148 ssize_t nwritten, ret = size;
6149 time_t start = time(NULL);
6150
6151 timeout++;
6152 while(size) {
6153 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
6154 nwritten = write(fd,ptr,size);
6155 if (nwritten == -1) return -1;
6156 ptr += nwritten;
6157 size -= nwritten;
6158 }
6159 if ((time(NULL)-start) > timeout) {
6160 errno = ETIMEDOUT;
6161 return -1;
6162 }
6163 }
6164 return ret;
6165}
6166
a4d1ba9a 6167static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 6168 ssize_t nread, totread = 0;
6169 time_t start = time(NULL);
6170
6171 timeout++;
6172 while(size) {
6173 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
6174 nread = read(fd,ptr,size);
6175 if (nread == -1) return -1;
6176 ptr += nread;
6177 size -= nread;
6178 totread += nread;
6179 }
6180 if ((time(NULL)-start) > timeout) {
6181 errno = ETIMEDOUT;
6182 return -1;
6183 }
6184 }
6185 return totread;
6186}
6187
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one byte at
 * a time via syncRead() (each byte is read with the given 'timeout').
 *
 * Reading stops at the first '\n', which is not stored; a '\r' right before
 * it is stripped as well. Reading also stops when the buffer is full: at
 * most size-1 characters are stored, so that the result is always NUL
 * terminated within the buffer.
 *
 * Returns the number of characters stored before the line terminator (a
 * stripped '\r' is still counted), or -1 if syncRead() fails. With a
 * 'size' of zero or less nothing is read or written and 0 is returned. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0;
    *ptr = '\0';
    size--; /* Reserve room for the terminator */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* Account for the stored byte, otherwise the buffer bound
             * would never be enforced. */
            size--;
        }
    }
    return nread;
}
6208
6209static void syncCommand(redisClient *c) {
40d224a9 6210 /* ignore SYNC if aleady slave or in monitor mode */
6211 if (c->flags & REDIS_SLAVE) return;
6212
6213 /* SYNC can't be issued when the server has pending data to send to
6214 * the client about already issued commands. We need a fresh reply
6215 * buffer registering the differences between the BGSAVE and the current
6216 * dataset, so that we can copy to other slaves if needed. */
6217 if (listLength(c->reply) != 0) {
6218 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
6219 return;
6220 }
6221
6222 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
6223 /* Here we need to check if there is a background saving operation
6224 * in progress, or if it is required to start one */
9d65a1bb 6225 if (server.bgsavechildpid != -1) {
40d224a9 6226 /* Ok a background save is in progress. Let's check if it is a good
6227 * one for replication, i.e. if there is another slave that is
6228 * registering differences since the server forked to save */
6229 redisClient *slave;
6230 listNode *ln;
c7df85a4 6231 listIter li;
40d224a9 6232
c7df85a4 6233 listRewind(server.slaves,&li);
6234 while((ln = listNext(&li))) {
40d224a9 6235 slave = ln->value;
6236 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 6237 }
6238 if (ln) {
6239 /* Perfect, the server is already registering differences for
6240 * another slave. Set the right state, and copy the buffer. */
6241 listRelease(c->reply);
6242 c->reply = listDup(slave->reply);
40d224a9 6243 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6244 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
6245 } else {
6246 /* No way, we need to wait for the next BGSAVE in order to
6247 * register differences */
6248 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
6249 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
6250 }
6251 } else {
6252 /* Ok we don't have a BGSAVE in progress, let's start one */
6253 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
6254 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6255 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
6256 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
6257 return;
6258 }
6259 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6260 }
6208b3a7 6261 c->repldbfd = -1;
40d224a9 6262 c->flags |= REDIS_SLAVE;
6263 c->slaveseldb = 0;
6b47e12e 6264 listAddNodeTail(server.slaves,c);
40d224a9 6265 return;
6266}
6267
6208b3a7 6268static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
6269 redisClient *slave = privdata;
6270 REDIS_NOTUSED(el);
6271 REDIS_NOTUSED(mask);
6272 char buf[REDIS_IOBUF_LEN];
6273 ssize_t nwritten, buflen;
6274
6275 if (slave->repldboff == 0) {
6276 /* Write the bulk write count before to transfer the DB. In theory here
6277 * we don't know how much room there is in the output buffer of the
6278 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
6279 * operations) will never be smaller than the few bytes we need. */
6280 sds bulkcount;
6281
6282 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
6283 slave->repldbsize);
6284 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
6285 {
6286 sdsfree(bulkcount);
6287 freeClient(slave);
6288 return;
6289 }
6290 sdsfree(bulkcount);
6291 }
6292 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
6293 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
6294 if (buflen <= 0) {
6295 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
6296 (buflen == 0) ? "premature EOF" : strerror(errno));
6297 freeClient(slave);
6298 return;
6299 }
6300 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 6301 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 6302 strerror(errno));
6303 freeClient(slave);
6304 return;
6305 }
6306 slave->repldboff += nwritten;
6307 if (slave->repldboff == slave->repldbsize) {
6308 close(slave->repldbfd);
6309 slave->repldbfd = -1;
6310 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
6311 slave->replstate = REDIS_REPL_ONLINE;
6312 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 6313 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 6314 freeClient(slave);
6315 return;
6316 }
6317 addReplySds(slave,sdsempty());
6318 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
6319 }
6320}
ed9b544e 6321
a3b21203 6322/* This function is called at the end of every backgrond saving.
6323 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
6324 * otherwise REDIS_ERR is passed to the function.
6325 *
6326 * The goal of this function is to handle slaves waiting for a successful
6327 * background saving in order to perform non-blocking synchronization. */
6328static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 6329 listNode *ln;
6330 int startbgsave = 0;
c7df85a4 6331 listIter li;
ed9b544e 6332
c7df85a4 6333 listRewind(server.slaves,&li);
6334 while((ln = listNext(&li))) {
6208b3a7 6335 redisClient *slave = ln->value;
ed9b544e 6336
6208b3a7 6337 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
6338 startbgsave = 1;
6339 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6340 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 6341 struct redis_stat buf;
6208b3a7 6342
6343 if (bgsaveerr != REDIS_OK) {
6344 freeClient(slave);
6345 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
6346 continue;
6347 }
6348 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 6349 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 6350 freeClient(slave);
6351 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
6352 continue;
6353 }
6354 slave->repldboff = 0;
6355 slave->repldbsize = buf.st_size;
6356 slave->replstate = REDIS_REPL_SEND_BULK;
6357 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 6358 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 6359 freeClient(slave);
6360 continue;
6361 }
6362 }
ed9b544e 6363 }
6208b3a7 6364 if (startbgsave) {
6365 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 6366 listIter li;
6367
6368 listRewind(server.slaves,&li);
6208b3a7 6369 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 6370 while((ln = listNext(&li))) {
6208b3a7 6371 redisClient *slave = ln->value;
ed9b544e 6372
6208b3a7 6373 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
6374 freeClient(slave);
6375 }
6376 }
6377 }
ed9b544e 6378}
6379
6380static int syncWithMaster(void) {
d0ccebcf 6381 char buf[1024], tmpfile[256], authcmd[1024];
ed9b544e 6382 int dumpsize;
6383 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
6384 int dfd;
6385
6386 if (fd == -1) {
6387 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
6388 strerror(errno));
6389 return REDIS_ERR;
6390 }
d0ccebcf 6391
6392 /* AUTH with the master if required. */
6393 if(server.masterauth) {
6394 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
6395 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
6396 close(fd);
6397 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
6398 strerror(errno));
6399 return REDIS_ERR;
6400 }
6401 /* Read the AUTH result. */
6402 if (syncReadLine(fd,buf,1024,3600) == -1) {
6403 close(fd);
6404 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
6405 strerror(errno));
6406 return REDIS_ERR;
6407 }
6408 if (buf[0] != '+') {
6409 close(fd);
6410 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
6411 return REDIS_ERR;
6412 }
6413 }
6414
ed9b544e 6415 /* Issue the SYNC command */
6416 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
6417 close(fd);
6418 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
6419 strerror(errno));
6420 return REDIS_ERR;
6421 }
6422 /* Read the bulk write count */
8c4d91fc 6423 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 6424 close(fd);
6425 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
6426 strerror(errno));
6427 return REDIS_ERR;
6428 }
4aa701c1 6429 if (buf[0] != '$') {
6430 close(fd);
6431 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
6432 return REDIS_ERR;
6433 }
c937aa89 6434 dumpsize = atoi(buf+1);
ed9b544e 6435 redisLog(REDIS_NOTICE,"Receiving %d bytes data dump from MASTER",dumpsize);
6436 /* Read the bulk write data on a temp file */
6437 snprintf(tmpfile,256,"temp-%d.%ld.rdb",(int)time(NULL),(long int)random());
6438 dfd = open(tmpfile,O_CREAT|O_WRONLY,0644);
6439 if (dfd == -1) {
6440 close(fd);
6441 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
6442 return REDIS_ERR;
6443 }
6444 while(dumpsize) {
6445 int nread, nwritten;
6446
6447 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
6448 if (nread == -1) {
6449 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
6450 strerror(errno));
6451 close(fd);
6452 close(dfd);
6453 return REDIS_ERR;
6454 }
6455 nwritten = write(dfd,buf,nread);
6456 if (nwritten == -1) {
6457 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
6458 close(fd);
6459 close(dfd);
6460 return REDIS_ERR;
6461 }
6462 dumpsize -= nread;
6463 }
6464 close(dfd);
6465 if (rename(tmpfile,server.dbfilename) == -1) {
6466 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
6467 unlink(tmpfile);
6468 close(fd);
6469 return REDIS_ERR;
6470 }
6471 emptyDb();
f78fd11b 6472 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 6473 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
6474 close(fd);
6475 return REDIS_ERR;
6476 }
6477 server.master = createClient(fd);
6478 server.master->flags |= REDIS_MASTER;
179b3952 6479 server.master->authenticated = 1;
ed9b544e 6480 server.replstate = REDIS_REPL_CONNECTED;
6481 return REDIS_OK;
6482}
6483
321b0e13 6484static void slaveofCommand(redisClient *c) {
6485 if (!strcasecmp(c->argv[1]->ptr,"no") &&
6486 !strcasecmp(c->argv[2]->ptr,"one")) {
6487 if (server.masterhost) {
6488 sdsfree(server.masterhost);
6489 server.masterhost = NULL;
6490 if (server.master) freeClient(server.master);
6491 server.replstate = REDIS_REPL_NONE;
6492 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
6493 }
6494 } else {
6495 sdsfree(server.masterhost);
6496 server.masterhost = sdsdup(c->argv[1]->ptr);
6497 server.masterport = atoi(c->argv[2]->ptr);
6498 if (server.master) freeClient(server.master);
6499 server.replstate = REDIS_REPL_CONNECT;
6500 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
6501 server.masterhost, server.masterport);
6502 }
6503 addReply(c,shared.ok);
6504}
6505
3fd78bcd 6506/* ============================ Maxmemory directive ======================== */
6507
a5819310 6508/* Try to free one object form the pre-allocated objects free list.
6509 * This is useful under low mem conditions as by default we take 1 million
6510 * free objects allocated. On success REDIS_OK is returned, otherwise
6511 * REDIS_ERR. */
6512static int tryFreeOneObjectFromFreelist(void) {
f870935d 6513 robj *o;
6514
a5819310 6515 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
6516 if (listLength(server.objfreelist)) {
6517 listNode *head = listFirst(server.objfreelist);
6518 o = listNodeValue(head);
6519 listDelNode(server.objfreelist,head);
6520 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
6521 zfree(o);
6522 return REDIS_OK;
6523 } else {
6524 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
6525 return REDIS_ERR;
6526 }
f870935d 6527}
6528
3fd78bcd 6529/* This function gets called when 'maxmemory' is set on the config file to limit
6530 * the max memory used by the server, and we are out of memory.
6531 * This function will try to, in order:
6532 *
6533 * - Free objects from the free list
6534 * - Try to remove keys with an EXPIRE set
6535 *
6536 * It is not possible to free enough memory to reach used-memory < maxmemory
6537 * the server will start refusing commands that will enlarge even more the
6538 * memory usage.
6539 */
6540static void freeMemoryIfNeeded(void) {
6541 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 6542 int j, k, freed = 0;
6543
6544 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
6545 for (j = 0; j < server.dbnum; j++) {
6546 int minttl = -1;
6547 robj *minkey = NULL;
6548 struct dictEntry *de;
6549
6550 if (dictSize(server.db[j].expires)) {
6551 freed = 1;
6552 /* From a sample of three keys drop the one nearest to
6553 * the natural expire */
6554 for (k = 0; k < 3; k++) {
6555 time_t t;
6556
6557 de = dictGetRandomKey(server.db[j].expires);
6558 t = (time_t) dictGetEntryVal(de);
6559 if (minttl == -1 || t < minttl) {
6560 minkey = dictGetEntryKey(de);
6561 minttl = t;
3fd78bcd 6562 }
3fd78bcd 6563 }
a5819310 6564 deleteKey(server.db+j,minkey);
3fd78bcd 6565 }
3fd78bcd 6566 }
a5819310 6567 if (!freed) return; /* nothing to free... */
3fd78bcd 6568 }
6569}
6570
f80dff62 6571/* ============================== Append Only file ========================== */
6572
6573static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
6574 sds buf = sdsempty();
6575 int j;
6576 ssize_t nwritten;
6577 time_t now;
6578 robj *tmpargv[3];
6579
6580 /* The DB this command was targetting is not the same as the last command
6581 * we appendend. To issue a SELECT command is needed. */
6582 if (dictid != server.appendseldb) {
6583 char seldb[64];
6584
6585 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 6586 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 6587 (unsigned long)strlen(seldb),seldb);
f80dff62 6588 server.appendseldb = dictid;
6589 }
6590
6591 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
6592 * EXPIREs into EXPIREATs calls */
6593 if (cmd->proc == expireCommand) {
6594 long when;
6595
6596 tmpargv[0] = createStringObject("EXPIREAT",8);
6597 tmpargv[1] = argv[1];
6598 incrRefCount(argv[1]);
6599 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
6600 tmpargv[2] = createObject(REDIS_STRING,
6601 sdscatprintf(sdsempty(),"%ld",when));
6602 argv = tmpargv;
6603 }
6604
6605 /* Append the actual command */
6606 buf = sdscatprintf(buf,"*%d\r\n",argc);
6607 for (j = 0; j < argc; j++) {
6608 robj *o = argv[j];
6609
9d65a1bb 6610 o = getDecodedObject(o);
83c6a618 6611 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
f80dff62 6612 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
6613 buf = sdscatlen(buf,"\r\n",2);
9d65a1bb 6614 decrRefCount(o);
f80dff62 6615 }
6616
6617 /* Free the objects from the modified argv for EXPIREAT */
6618 if (cmd->proc == expireCommand) {
6619 for (j = 0; j < 3; j++)
6620 decrRefCount(argv[j]);
6621 }
6622
6623 /* We want to perform a single write. This should be guaranteed atomic
6624 * at least if the filesystem we are writing is a real physical one.
6625 * While this will save us against the server being killed I don't think
6626 * there is much to do about the whole server stopping for power problems
6627 * or alike */
6628 nwritten = write(server.appendfd,buf,sdslen(buf));
6629 if (nwritten != (signed)sdslen(buf)) {
6630 /* Ooops, we are in troubles. The best thing to do for now is
6631 * to simply exit instead to give the illusion that everything is
6632 * working as expected. */
6633 if (nwritten == -1) {
6634 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
6635 } else {
6636 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
6637 }
6638 exit(1);
6639 }
85a83172 6640 /* If a background append only file rewriting is in progress we want to
6641 * accumulate the differences between the child DB and the current one
6642 * in a buffer, so that when the child process will do its work we
6643 * can append the differences to the new append only file. */
6644 if (server.bgrewritechildpid != -1)
6645 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
6646
6647 sdsfree(buf);
f80dff62 6648 now = time(NULL);
6649 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
6650 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
6651 now-server.lastfsync > 1))
6652 {
6653 fsync(server.appendfd); /* Let's try to get this data on the disk */
6654 server.lastfsync = now;
6655 }
6656}
6657
6658/* In Redis commands are always executed in the context of a client, so in
6659 * order to load the append only file we need to create a fake client. */
6660static struct redisClient *createFakeClient(void) {
6661 struct redisClient *c = zmalloc(sizeof(*c));
6662
6663 selectDb(c,0);
6664 c->fd = -1;
6665 c->querybuf = sdsempty();
6666 c->argc = 0;
6667 c->argv = NULL;
6668 c->flags = 0;
9387d17d 6669 /* We set the fake client as a slave waiting for the synchronization
6670 * so that Redis will not try to send replies to this client. */
6671 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 6672 c->reply = listCreate();
6673 listSetFreeMethod(c->reply,decrRefCount);
6674 listSetDupMethod(c->reply,dupClientReplyValue);
6675 return c;
6676}
6677
6678static void freeFakeClient(struct redisClient *c) {
6679 sdsfree(c->querybuf);
6680 listRelease(c->reply);
6681 zfree(c);
6682}
6683
6684/* Replay the append log file. On error REDIS_OK is returned. On non fatal
6685 * error (the append only file is zero-length) REDIS_ERR is returned. On
6686 * fatal error an error message is logged and the program exists. */
6687int loadAppendOnlyFile(char *filename) {
6688 struct redisClient *fakeClient;
6689 FILE *fp = fopen(filename,"r");
6690 struct redis_stat sb;
b492cf00 6691 unsigned long long loadedkeys = 0;
f80dff62 6692
6693 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
6694 return REDIS_ERR;
6695
6696 if (fp == NULL) {
6697 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
6698 exit(1);
6699 }
6700
6701 fakeClient = createFakeClient();
6702 while(1) {
6703 int argc, j;
6704 unsigned long len;
6705 robj **argv;
6706 char buf[128];
6707 sds argsds;
6708 struct redisCommand *cmd;
6709
6710 if (fgets(buf,sizeof(buf),fp) == NULL) {
6711 if (feof(fp))
6712 break;
6713 else
6714 goto readerr;
6715 }
6716 if (buf[0] != '*') goto fmterr;
6717 argc = atoi(buf+1);
6718 argv = zmalloc(sizeof(robj*)*argc);
6719 for (j = 0; j < argc; j++) {
6720 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
6721 if (buf[0] != '$') goto fmterr;
6722 len = strtol(buf+1,NULL,10);
6723 argsds = sdsnewlen(NULL,len);
0f151ef1 6724 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 6725 argv[j] = createObject(REDIS_STRING,argsds);
6726 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
6727 }
6728
6729 /* Command lookup */
6730 cmd = lookupCommand(argv[0]->ptr);
6731 if (!cmd) {
6732 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
6733 exit(1);
6734 }
6735 /* Try object sharing and encoding */
6736 if (server.shareobjects) {
6737 int j;
6738 for(j = 1; j < argc; j++)
6739 argv[j] = tryObjectSharing(argv[j]);
6740 }
6741 if (cmd->flags & REDIS_CMD_BULK)
6742 tryObjectEncoding(argv[argc-1]);
6743 /* Run the command in the context of a fake client */
6744 fakeClient->argc = argc;
6745 fakeClient->argv = argv;
6746 cmd->proc(fakeClient);
6747 /* Discard the reply objects list from the fake client */
6748 while(listLength(fakeClient->reply))
6749 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
6750 /* Clean up, ready for the next command */
6751 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
6752 zfree(argv);
b492cf00 6753 /* Handle swapping while loading big datasets when VM is on */
6754 loadedkeys++;
6755 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
6756 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 6757 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 6758 }
6759 }
f80dff62 6760 }
6761 fclose(fp);
6762 freeFakeClient(fakeClient);
6763 return REDIS_OK;
6764
6765readerr:
6766 if (feof(fp)) {
6767 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
6768 } else {
6769 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
6770 }
6771 exit(1);
6772fmterr:
6773 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
6774 exit(1);
6775}
6776
9d65a1bb 6777/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
6778static int fwriteBulk(FILE *fp, robj *obj) {
6779 char buf[128];
b9bc0eef 6780 int decrrc = 0;
6781
f2d9f50f 6782 /* Avoid the incr/decr ref count business if possible to help
6783 * copy-on-write (we are often in a child process when this function
6784 * is called).
6785 * Also makes sure that key objects don't get incrRefCount-ed when VM
6786 * is enabled */
6787 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 6788 obj = getDecodedObject(obj);
6789 decrrc = 1;
6790 }
9d65a1bb 6791 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
6792 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 6793 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
6794 goto err;
9d65a1bb 6795 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 6796 if (decrrc) decrRefCount(obj);
9d65a1bb 6797 return 1;
6798err:
b9bc0eef 6799 if (decrrc) decrRefCount(obj);
9d65a1bb 6800 return 0;
6801}
6802
/* Write the double 'd', formatted with "%.17g", to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n. Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    unsigned long count;

    /* The payload buffer already includes the trailing CRLF, which is not
     * part of the announced count. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    count = (unsigned long)strlen(payload)-2;
    snprintf(header,sizeof(header),"$%lu\r\n",count);
    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,strlen(payload),1,fp) != 0;
}
6813
/* Write the long 'l', in decimal, to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n. Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    unsigned long count;

    /* The payload buffer already includes the trailing CRLF, which is not
     * part of the announced count. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    count = (unsigned long)strlen(payload)-2;
    snprintf(header,sizeof(header),"$%lu\r\n",count);
    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,strlen(payload),1,fp) != 0;
}
6824
6825/* Write a sequence of commands able to fully rebuild the dataset into
6826 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
6827static int rewriteAppendOnlyFile(char *filename) {
6828 dictIterator *di = NULL;
6829 dictEntry *de;
6830 FILE *fp;
6831 char tmpfile[256];
6832 int j;
6833 time_t now = time(NULL);
6834
6835 /* Note that we have to use a different temp name here compared to the
6836 * one used by rewriteAppendOnlyFileBackground() function. */
6837 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
6838 fp = fopen(tmpfile,"w");
6839 if (!fp) {
6840 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
6841 return REDIS_ERR;
6842 }
6843 for (j = 0; j < server.dbnum; j++) {
6844 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
6845 redisDb *db = server.db+j;
6846 dict *d = db->dict;
6847 if (dictSize(d) == 0) continue;
6848 di = dictGetIterator(d);
6849 if (!di) {
6850 fclose(fp);
6851 return REDIS_ERR;
6852 }
6853
6854 /* SELECT the new DB */
6855 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 6856 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 6857
6858 /* Iterate this DB writing every entry */
6859 while((de = dictNext(di)) != NULL) {
e7546c63 6860 robj *key, *o;
6861 time_t expiretime;
6862 int swapped;
6863
6864 key = dictGetEntryKey(de);
b9bc0eef 6865 /* If the value for this key is swapped, load a preview in memory.
6866 * We use a "swapped" flag to remember if we need to free the
6867 * value object instead to just increment the ref count anyway
6868 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 6869 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
6870 key->storage == REDIS_VM_SWAPPING) {
e7546c63 6871 o = dictGetEntryVal(de);
6872 swapped = 0;
6873 } else {
6874 o = vmPreviewObject(key);
e7546c63 6875 swapped = 1;
6876 }
6877 expiretime = getExpire(db,key);
9d65a1bb 6878
6879 /* Save the key and associated value */
9d65a1bb 6880 if (o->type == REDIS_STRING) {
6881 /* Emit a SET command */
6882 char cmd[]="*3\r\n$3\r\nSET\r\n";
6883 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6884 /* Key and value */
6885 if (fwriteBulk(fp,key) == 0) goto werr;
6886 if (fwriteBulk(fp,o) == 0) goto werr;
6887 } else if (o->type == REDIS_LIST) {
6888 /* Emit the RPUSHes needed to rebuild the list */
6889 list *list = o->ptr;
6890 listNode *ln;
c7df85a4 6891 listIter li;
9d65a1bb 6892
c7df85a4 6893 listRewind(list,&li);
6894 while((ln = listNext(&li))) {
9d65a1bb 6895 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
6896 robj *eleobj = listNodeValue(ln);
6897
6898 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6899 if (fwriteBulk(fp,key) == 0) goto werr;
6900 if (fwriteBulk(fp,eleobj) == 0) goto werr;
6901 }
6902 } else if (o->type == REDIS_SET) {
6903 /* Emit the SADDs needed to rebuild the set */
6904 dict *set = o->ptr;
6905 dictIterator *di = dictGetIterator(set);
6906 dictEntry *de;
6907
6908 while((de = dictNext(di)) != NULL) {
6909 char cmd[]="*3\r\n$4\r\nSADD\r\n";
6910 robj *eleobj = dictGetEntryKey(de);
6911
6912 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6913 if (fwriteBulk(fp,key) == 0) goto werr;
6914 if (fwriteBulk(fp,eleobj) == 0) goto werr;
6915 }
6916 dictReleaseIterator(di);
6917 } else if (o->type == REDIS_ZSET) {
6918 /* Emit the ZADDs needed to rebuild the sorted set */
6919 zset *zs = o->ptr;
6920 dictIterator *di = dictGetIterator(zs->dict);
6921 dictEntry *de;
6922
6923 while((de = dictNext(di)) != NULL) {
6924 char cmd[]="*4\r\n$4\r\nZADD\r\n";
6925 robj *eleobj = dictGetEntryKey(de);
6926 double *score = dictGetEntryVal(de);
6927
6928 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6929 if (fwriteBulk(fp,key) == 0) goto werr;
6930 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
6931 if (fwriteBulk(fp,eleobj) == 0) goto werr;
6932 }
6933 dictReleaseIterator(di);
6934 } else {
dfc5e96c 6935 redisAssert(0 != 0);
9d65a1bb 6936 }
6937 /* Save the expire time */
6938 if (expiretime != -1) {
e96e4fbf 6939 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 6940 /* If this key is already expired skip it */
6941 if (expiretime < now) continue;
6942 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6943 if (fwriteBulk(fp,key) == 0) goto werr;
6944 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
6945 }
b9bc0eef 6946 if (swapped) decrRefCount(o);
9d65a1bb 6947 }
6948 dictReleaseIterator(di);
6949 }
6950
6951 /* Make sure data will not remain on the OS's output buffers */
6952 fflush(fp);
6953 fsync(fileno(fp));
6954 fclose(fp);
6955
6956 /* Use RENAME to make sure the DB file is changed atomically only
6957 * if the generate DB file is ok. */
6958 if (rename(tmpfile,filename) == -1) {
6959 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
6960 unlink(tmpfile);
6961 return REDIS_ERR;
6962 }
6963 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
6964 return REDIS_OK;
6965
6966werr:
6967 fclose(fp);
6968 unlink(tmpfile);
e96e4fbf 6969 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 6970 if (di) dictReleaseIterator(di);
6971 return REDIS_ERR;
6972}
6973
6974/* This is how rewriting of the append only file in background works:
6975 *
6976 * 1) The user calls BGREWRITEAOF
6977 * 2) Redis calls this function, that forks():
6978 * 2a) the child rewrite the append only file in a temp file.
6979 * 2b) the parent accumulates differences in server.bgrewritebuf.
6980 * 3) When the child finished '2a' exists.
6981 * 4) The parent will trap the exit code, if it's OK, will append the
6982 * data accumulated into server.bgrewritebuf into the temp file, and
6983 * finally will rename(2) the temp file in the actual file name.
6984 * The the new file is reopened as the new append only file. Profit!
6985 */
6986static int rewriteAppendOnlyFileBackground(void) {
6987 pid_t childpid;
6988
6989 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 6990 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 6991 if ((childpid = fork()) == 0) {
6992 /* Child */
6993 char tmpfile[256];
9d65a1bb 6994
054e426d 6995 if (server.vm_enabled) vmReopenSwapFile();
6996 close(server.fd);
9d65a1bb 6997 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
6998 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 6999 _exit(0);
9d65a1bb 7000 } else {
478c2c6f 7001 _exit(1);
9d65a1bb 7002 }
7003 } else {
7004 /* Parent */
7005 if (childpid == -1) {
7006 redisLog(REDIS_WARNING,
7007 "Can't rewrite append only file in background: fork: %s",
7008 strerror(errno));
7009 return REDIS_ERR;
7010 }
7011 redisLog(REDIS_NOTICE,
7012 "Background append only file rewriting started by pid %d",childpid);
7013 server.bgrewritechildpid = childpid;
85a83172 7014 /* We set appendseldb to -1 in order to force the next call to the
7015 * feedAppendOnlyFile() to issue a SELECT command, so the differences
7016 * accumulated by the parent into server.bgrewritebuf will start
7017 * with a SELECT statement and it will be safe to merge. */
7018 server.appendseldb = -1;
9d65a1bb 7019 return REDIS_OK;
7020 }
7021 return REDIS_OK; /* unreached */
7022}
7023
7024static void bgrewriteaofCommand(redisClient *c) {
7025 if (server.bgrewritechildpid != -1) {
7026 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
7027 return;
7028 }
7029 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 7030 char *status = "+Background append only file rewriting started\r\n";
7031 addReplySds(c,sdsnew(status));
9d65a1bb 7032 } else {
7033 addReply(c,shared.err);
7034 }
7035}
7036
/* Remove the temp file named "temp-rewriteaof-bg-<childpid>.aof", that is
 * the name used by the child started by rewriteAppendOnlyFileBackground(). */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
7043
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make it clearer which functionality belongs to the blocking VM and which
 * to the threaded (non-blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
7064
7065/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 7066
7067/* substitute the first occurrence of '%p' with the process pid in the
7068 * swap file name. */
7069static void expandVmSwapFilename(void) {
7070 char *p = strstr(server.vm_swap_file,"%p");
7071 sds new;
7072
7073 if (!p) return;
7074 new = sdsempty();
7075 *p = '\0';
7076 new = sdscat(new,server.vm_swap_file);
7077 new = sdscatprintf(new,"%ld",(long) getpid());
7078 new = sdscat(new,p+2);
7079 zfree(server.vm_swap_file);
7080 server.vm_swap_file = new;
7081}
7082
75680a3c 7083static void vmInit(void) {
7084 off_t totsize;
996cb5f7 7085 int pipefds[2];
bcaa7a4f 7086 size_t stacksize;
75680a3c 7087
4ad37480 7088 if (server.vm_max_threads != 0)
7089 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
7090
054e426d 7091 expandVmSwapFilename();
7092 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
6fa987e3 7093 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
7094 server.vm_fp = fopen(server.vm_swap_file,"w+b");
7095 }
75680a3c 7096 if (server.vm_fp == NULL) {
6fa987e3 7097 redisLog(REDIS_WARNING,
7098 "Impossible to open the swap file: %s. Exiting.",
7099 strerror(errno));
75680a3c 7100 exit(1);
7101 }
7102 server.vm_fd = fileno(server.vm_fp);
7103 server.vm_next_page = 0;
7104 server.vm_near_pages = 0;
7d98e08c 7105 server.vm_stats_used_pages = 0;
7106 server.vm_stats_swapped_objects = 0;
7107 server.vm_stats_swapouts = 0;
7108 server.vm_stats_swapins = 0;
75680a3c 7109 totsize = server.vm_pages*server.vm_page_size;
7110 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
7111 if (ftruncate(server.vm_fd,totsize) == -1) {
7112 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
7113 strerror(errno));
7114 exit(1);
7115 } else {
7116 redisLog(REDIS_NOTICE,"Swap file allocated with success");
7117 }
7d30035d 7118 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 7119 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 7120 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 7121 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 7122
996cb5f7 7123 /* Initialize threaded I/O (used by Virtual Memory) */
7124 server.io_newjobs = listCreate();
7125 server.io_processing = listCreate();
7126 server.io_processed = listCreate();
d5d55fc3 7127 server.io_ready_clients = listCreate();
92f8e882 7128 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 7129 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
7130 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 7131 server.io_active_threads = 0;
996cb5f7 7132 if (pipe(pipefds) == -1) {
7133 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
7134 ,strerror(errno));
7135 exit(1);
7136 }
7137 server.io_ready_pipe_read = pipefds[0];
7138 server.io_ready_pipe_write = pipefds[1];
7139 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 7140 /* LZF requires a lot of stack */
7141 pthread_attr_init(&server.io_threads_attr);
7142 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
7143 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
7144 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 7145 /* Listen for events in the threaded I/O pipe */
7146 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
7147 vmThreadedIOCompletedJob, NULL) == AE_ERR)
7148 oom("creating file event");
75680a3c 7149}
7150
06224fec 7151/* Mark the page as used */
7152static void vmMarkPageUsed(off_t page) {
7153 off_t byte = page/8;
7154 int bit = page&7;
970e10bb 7155 redisAssert(vmFreePage(page) == 1);
06224fec 7156 server.vm_bitmap[byte] |= 1<<bit;
f870935d 7157 redisLog(REDIS_DEBUG,"Mark used: %lld (byte:%lld bit:%d)\n",
7158 (long long)page, (long long)byte, bit);
06224fec 7159}
7160
7161/* Mark N contiguous pages as used, with 'page' being the first. */
7162static void vmMarkPagesUsed(off_t page, off_t count) {
7163 off_t j;
7164
7165 for (j = 0; j < count; j++)
7d30035d 7166 vmMarkPageUsed(page+j);
7d98e08c 7167 server.vm_stats_used_pages += count;
06224fec 7168}
7169
7170/* Mark the page as free */
7171static void vmMarkPageFree(off_t page) {
7172 off_t byte = page/8;
7173 int bit = page&7;
970e10bb 7174 redisAssert(vmFreePage(page) == 0);
06224fec 7175 server.vm_bitmap[byte] &= ~(1<<bit);
970e10bb 7176 redisLog(REDIS_DEBUG,"Mark free: %lld (byte:%lld bit:%d)\n",
7177 (long long)page, (long long)byte, bit);
06224fec 7178}
7179
7180/* Mark N contiguous pages as free, with 'page' being the first. */
7181static void vmMarkPagesFree(off_t page, off_t count) {
7182 off_t j;
7183
7184 for (j = 0; j < count; j++)
7d30035d 7185 vmMarkPageFree(page+j);
7d98e08c 7186 server.vm_stats_used_pages -= count;
970e10bb 7187 if (server.vm_stats_used_pages > 100000000) {
7188 *((char*)-1) = 'x';
7189 }
06224fec 7190}
7191
7192/* Test if the page is free */
7193static int vmFreePage(off_t page) {
7194 off_t byte = page/8;
7195 int bit = page&7;
7d30035d 7196 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 7197}
7198
7199/* Find N contiguous free pages storing the first page of the cluster in *first.
3a66edc7 7200 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
7201 * REDIS_ERR is returned.
06224fec 7202 *
7203 * This function uses a simple algorithm: we try to allocate
7204 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
7205 * again from the start of the swap file searching for free spaces.
7206 *
7207 * If it looks pretty clear that there are no free pages near our offset
7208 * we try to find less populated places doing a forward jump of
7209 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
7210 * without hurry, and then we jump again and so forth...
7211 *
7212 * This function can be improved using a free list to avoid to guess
7213 * too much, since we could collect data about freed pages.
7214 *
7215 * note: I implemented this function just after watching an episode of
7216 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
7217 */
c7df85a4 7218static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 7219 off_t base, offset = 0, since_jump = 0, numfree = 0;
7220
7221 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
7222 server.vm_near_pages = 0;
7223 server.vm_next_page = 0;
7224 }
7225 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
7226 base = server.vm_next_page;
7227
7228 while(offset < server.vm_pages) {
7229 off_t this = base+offset;
7230
7231 /* If we overflow, restart from page zero */
7232 if (this >= server.vm_pages) {
7233 this -= server.vm_pages;
7234 if (this == 0) {
7235 /* Just overflowed, what we found on tail is no longer
7236 * interesting, as it's no longer contiguous. */
7237 numfree = 0;
7238 }
7239 }
f6c0bba8 7240 redisLog(REDIS_DEBUG, "THIS: %lld (%c)\n", (long long) this, vmFreePage(this) ? 'F' : 'X');
06224fec 7241 if (vmFreePage(this)) {
7242 /* This is a free page */
7243 numfree++;
7244 /* Already got N free pages? Return to the caller, with success */
7245 if (numfree == n) {
7d30035d 7246 *first = this-(n-1);
7247 server.vm_next_page = this+1;
3a66edc7 7248 return REDIS_OK;
06224fec 7249 }
7250 } else {
7251 /* The current one is not a free page */
7252 numfree = 0;
7253 }
7254
7255 /* Fast-forward if the current page is not free and we already
7256 * searched enough near this place. */
7257 since_jump++;
7258 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
7259 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
7260 since_jump = 0;
7261 /* Note that even if we rewind after the jump, we are don't need
7262 * to make sure numfree is set to zero as we only jump *if* it
7263 * is set to zero. */
7264 } else {
7265 /* Otherwise just check the next page */
7266 offset++;
7267 }
7268 }
3a66edc7 7269 return REDIS_ERR;
7270}
7271
a5819310 7272/* Write the specified object at the specified page of the swap file */
7273static int vmWriteObjectOnSwap(robj *o, off_t page) {
7274 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7275 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
7276 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7277 redisLog(REDIS_WARNING,
7278 "Critical VM problem in vmSwapObjectBlocking(): can't seek: %s",
7279 strerror(errno));
7280 return REDIS_ERR;
7281 }
7282 rdbSaveObject(server.vm_fp,o);
7283 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7284 return REDIS_OK;
7285}
7286
3a66edc7 7287/* Swap the 'val' object relative to 'key' into disk. Store all the information
7288 * needed to later retrieve the object into the key object.
7289 * If we can't find enough contiguous empty pages to swap the object on disk
7290 * REDIS_ERR is returned. */
a69a0c9c 7291static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 7292 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 7293 off_t page;
7294
7295 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 7296 assert(key->refcount == 1);
3a66edc7 7297 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 7298 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 7299 key->vm.page = page;
7300 key->vm.usedpages = pages;
7301 key->storage = REDIS_VM_SWAPPED;
d894161b 7302 key->vtype = val->type;
3a66edc7 7303 decrRefCount(val); /* Deallocate the object from memory. */
7304 vmMarkPagesUsed(page,pages);
7d30035d 7305 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
7306 (unsigned char*) key->ptr,
7307 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 7308 server.vm_stats_swapped_objects++;
7309 server.vm_stats_swapouts++;
0841cc92 7310 fflush(server.vm_fp);
3a66edc7 7311 return REDIS_OK;
7312}
7313
a5819310 7314static robj *vmReadObjectFromSwap(off_t page, int type) {
7315 robj *o;
3a66edc7 7316
a5819310 7317 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7318 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 7319 redisLog(REDIS_WARNING,
d5d55fc3 7320 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 7321 strerror(errno));
478c2c6f 7322 _exit(1);
3a66edc7 7323 }
a5819310 7324 o = rdbLoadObject(type,server.vm_fp);
7325 if (o == NULL) {
d5d55fc3 7326 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 7327 _exit(1);
3a66edc7 7328 }
a5819310 7329 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7330 return o;
7331}
7332
7333/* Load the value object relative to the 'key' object from swap to memory.
7334 * The newly allocated object is returned.
7335 *
7336 * If preview is true the unserialized object is returned to the caller but
7337 * no changes are made to the key object, nor the pages are marked as freed */
7338static robj *vmGenericLoadObject(robj *key, int preview) {
7339 robj *val;
7340
d5d55fc3 7341 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 7342 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 7343 if (!preview) {
7344 key->storage = REDIS_VM_MEMORY;
7345 key->vm.atime = server.unixtime;
7346 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
7347 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
7348 (unsigned char*) key->ptr);
7d98e08c 7349 server.vm_stats_swapped_objects--;
38aba9a1 7350 } else {
7351 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
7352 (unsigned char*) key->ptr);
7e69548d 7353 }
7d98e08c 7354 server.vm_stats_swapins++;
3a66edc7 7355 return val;
06224fec 7356}
7357
7e69548d 7358/* Plain object loading, from swap to memory */
7359static robj *vmLoadObject(robj *key) {
996cb5f7 7360 /* If we are loading the object in background, stop it, we
7361 * need to load this object synchronously ASAP. */
7362 if (key->storage == REDIS_VM_LOADING)
7363 vmCancelThreadedIOJob(key);
7e69548d 7364 return vmGenericLoadObject(key,0);
7365}
7366
7367/* Just load the value on disk, without to modify the key.
7368 * This is useful when we want to perform some operation on the value
7369 * without to really bring it from swap to memory, like while saving the
7370 * dataset or rewriting the append only log. */
7371static robj *vmPreviewObject(robj *key) {
7372 return vmGenericLoadObject(key,1);
7373}
7374
4ef8de8a 7375/* How a good candidate is this object for swapping?
7376 * The better candidate it is, the greater the returned value.
7377 *
7378 * Currently we try to perform a fast estimation of the object size in
7379 * memory, and combine it with aging informations.
7380 *
7381 * Basically swappability = idle-time * log(estimated size)
7382 *
7383 * Bigger objects are preferred over smaller objects, but not
7384 * proportionally, this is why we use the logarithm. This algorithm is
7385 * just a first try and will probably be tuned later. */
7386static double computeObjectSwappability(robj *o) {
7387 time_t age = server.unixtime - o->vm.atime;
7388 long asize = 0;
7389 list *l;
7390 dict *d;
7391 struct dictEntry *de;
7392 int z;
7393
7394 if (age <= 0) return 0;
7395 switch(o->type) {
7396 case REDIS_STRING:
7397 if (o->encoding != REDIS_ENCODING_RAW) {
7398 asize = sizeof(*o);
7399 } else {
7400 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
7401 }
7402 break;
7403 case REDIS_LIST:
7404 l = o->ptr;
7405 listNode *ln = listFirst(l);
7406
7407 asize = sizeof(list);
7408 if (ln) {
7409 robj *ele = ln->value;
7410 long elesize;
7411
7412 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7413 (sizeof(*o)+sdslen(ele->ptr)) :
7414 sizeof(*o);
7415 asize += (sizeof(listNode)+elesize)*listLength(l);
7416 }
7417 break;
7418 case REDIS_SET:
7419 case REDIS_ZSET:
7420 z = (o->type == REDIS_ZSET);
7421 d = z ? ((zset*)o->ptr)->dict : o->ptr;
7422
7423 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
7424 if (z) asize += sizeof(zset)-sizeof(dict);
7425 if (dictSize(d)) {
7426 long elesize;
7427 robj *ele;
7428
7429 de = dictGetRandomKey(d);
7430 ele = dictGetEntryKey(de);
7431 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7432 (sizeof(*o)+sdslen(ele->ptr)) :
7433 sizeof(*o);
7434 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
7435 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
7436 }
7437 break;
7438 }
7439 return (double)asize*log(1+asize);
7440}
7441
7442/* Try to swap an object that's a good candidate for swapping.
7443 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 7444 * to swap any object at all.
7445 *
7446 * If 'usethreaded' is true, Redis will try to swap the object in background
7447 * using I/O threads. */
7448static int vmSwapOneObject(int usethreads) {
4ef8de8a 7449 int j, i;
7450 struct dictEntry *best = NULL;
7451 double best_swappability = 0;
b9bc0eef 7452 redisDb *best_db = NULL;
4ef8de8a 7453 robj *key, *val;
7454
7455 for (j = 0; j < server.dbnum; j++) {
7456 redisDb *db = server.db+j;
b72f6a4b 7457 /* Why maxtries is set to 100?
7458 * Because this way (usually) we'll find 1 object even if just 1% - 2%
7459 * are swappable objects */
b0d8747d 7460 int maxtries = 100;
4ef8de8a 7461
7462 if (dictSize(db->dict) == 0) continue;
7463 for (i = 0; i < 5; i++) {
7464 dictEntry *de;
7465 double swappability;
7466
e3cadb8a 7467 if (maxtries) maxtries--;
4ef8de8a 7468 de = dictGetRandomKey(db->dict);
7469 key = dictGetEntryKey(de);
7470 val = dictGetEntryVal(de);
1064ef87 7471 /* Only swap objects that are currently in memory.
7472 *
7473 * Also don't swap shared objects if threaded VM is on, as we
7474 * try to ensure that the main thread does not touch the
7475 * object while the I/O thread is using it, but we can't
7476 * control other keys without adding additional mutex. */
7477 if (key->storage != REDIS_VM_MEMORY ||
7478 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 7479 if (maxtries) i--; /* don't count this try */
7480 continue;
7481 }
4ef8de8a 7482 swappability = computeObjectSwappability(val);
7483 if (!best || swappability > best_swappability) {
7484 best = de;
7485 best_swappability = swappability;
b9bc0eef 7486 best_db = db;
4ef8de8a 7487 }
7488 }
7489 }
e3cadb8a 7490 if (best == NULL) {
7491 redisLog(REDIS_DEBUG,"No swappable key found!");
7492 return REDIS_ERR;
7493 }
4ef8de8a 7494 key = dictGetEntryKey(best);
7495 val = dictGetEntryVal(best);
7496
e3cadb8a 7497 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 7498 key->ptr, best_swappability);
7499
7500 /* Unshare the key if needed */
7501 if (key->refcount > 1) {
7502 robj *newkey = dupStringObject(key);
7503 decrRefCount(key);
7504 key = dictGetEntryKey(best) = newkey;
7505 }
7506 /* Swap it */
a69a0c9c 7507 if (usethreads) {
b9bc0eef 7508 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 7509 return REDIS_OK;
7510 } else {
a69a0c9c 7511 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7512 dictGetEntryVal(best) = NULL;
7513 return REDIS_OK;
7514 } else {
7515 return REDIS_ERR;
7516 }
4ef8de8a 7517 }
7518}
7519
/* Swap one object synchronously. Same return value as vmSwapOneObject().
 * The parameter list is spelled (void) so that the definition acts as a
 * real prototype instead of an old-style unchecked declaration. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
7523
/* Swap one object using the I/O threads. Same return value as
 * vmSwapOneObject(). The parameter list is spelled (void) so that the
 * definition acts as a real prototype instead of an old-style unchecked
 * declaration. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
7527
7e69548d 7528/* Return true if it's safe to swap out objects in a given moment.
7529 * Basically we don't want to swap objects out while there is a BGSAVE
7530 * or a BGAEOREWRITE running in backgroud. */
7531static int vmCanSwapOut(void) {
7532 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
7533}
7534
1b03836c 7535/* Delete a key if swapped. Returns 1 if the key was found, was swapped
7536 * and was deleted. Otherwise 0 is returned. */
7537static int deleteIfSwapped(redisDb *db, robj *key) {
7538 dictEntry *de;
7539 robj *foundkey;
7540
7541 if ((de = dictFind(db->dict,key)) == NULL) return 0;
7542 foundkey = dictGetEntryKey(de);
7543 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
7544 deleteKey(db,key);
7545 return 1;
7546}
7547
996cb5f7 7548/* =================== Virtual Memory - Threaded I/O ======================= */
7549
b9bc0eef 7550static void freeIOJob(iojob *j) {
d5d55fc3 7551 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
7552 j->type == REDIS_IOJOB_DO_SWAP ||
7553 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 7554 decrRefCount(j->val);
7555 decrRefCount(j->key);
7556 zfree(j);
7557}
7558
996cb5f7 7559/* Every time a thread finished a Job, it writes a byte into the write side
7560 * of an unix pipe in order to "awake" the main thread, and this function
7561 * is called. */
7562static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
7563 int mask)
7564{
7565 char buf[1];
b0d8747d 7566 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 7567 REDIS_NOTUSED(el);
7568 REDIS_NOTUSED(mask);
7569 REDIS_NOTUSED(privdata);
7570
7571 /* For every byte we read in the read side of the pipe, there is one
7572 * I/O job completed to process. */
7573 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 7574 iojob *j;
7575 listNode *ln;
7576 robj *key;
7577 struct dictEntry *de;
7578
996cb5f7 7579 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 7580
7581 /* Get the processed element (the oldest one) */
7582 lockThreadedIO();
1064ef87 7583 assert(listLength(server.io_processed) != 0);
f6c0bba8 7584 if (toprocess == -1) {
7585 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
7586 if (toprocess <= 0) toprocess = 1;
7587 }
b9bc0eef 7588 ln = listFirst(server.io_processed);
7589 j = ln->value;
7590 listDelNode(server.io_processed,ln);
7591 unlockThreadedIO();
7592 /* If this job is marked as canceled, just ignore it */
7593 if (j->canceled) {
7594 freeIOJob(j);
7595 continue;
7596 }
7597 /* Post process it in the main thread, as there are things we
7598 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 7599 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 7600 de = dictFind(j->db->dict,j->key);
7601 assert(de != NULL);
7602 key = dictGetEntryKey(de);
7603 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 7604 redisDb *db;
7605
b9bc0eef 7606 /* Key loaded, bring it at home */
7607 key->storage = REDIS_VM_MEMORY;
7608 key->vm.atime = server.unixtime;
7609 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
7610 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
7611 (unsigned char*) key->ptr);
7612 server.vm_stats_swapped_objects--;
7613 server.vm_stats_swapins++;
d5d55fc3 7614 dictGetEntryVal(de) = j->val;
7615 incrRefCount(j->val);
7616 db = j->db;
b9bc0eef 7617 freeIOJob(j);
d5d55fc3 7618 /* Handle clients waiting for this key to be loaded. */
7619 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 7620 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
7621 /* Now we know the amount of pages required to swap this object.
7622 * Let's find some space for it, and queue this task again
7623 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 7624 if (!vmCanSwapOut() ||
7625 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
7626 {
7627 /* Ooops... no space or we can't swap as there is
7628 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 7629 freeIOJob(j);
054e426d 7630 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 7631 } else {
c7df85a4 7632 /* Note that we need to mark this pages as used now,
7633 * if the job will be canceled, we'll mark them as freed
7634 * again. */
7635 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 7636 j->type = REDIS_IOJOB_DO_SWAP;
7637 lockThreadedIO();
7638 queueIOJob(j);
7639 unlockThreadedIO();
7640 }
7641 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
7642 robj *val;
7643
7644 /* Key swapped. We can finally free some memory. */
6c96ba7d 7645 if (key->storage != REDIS_VM_SWAPPING) {
7646 printf("key->storage: %d\n",key->storage);
7647 printf("key->name: %s\n",(char*)key->ptr);
7648 printf("key->refcount: %d\n",key->refcount);
7649 printf("val: %p\n",(void*)j->val);
7650 printf("val->type: %d\n",j->val->type);
7651 printf("val->ptr: %s\n",(char*)j->val->ptr);
7652 }
7653 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 7654 val = dictGetEntryVal(de);
7655 key->vm.page = j->page;
7656 key->vm.usedpages = j->pages;
7657 key->storage = REDIS_VM_SWAPPED;
7658 key->vtype = j->val->type;
7659 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 7660 dictGetEntryVal(de) = NULL;
b9bc0eef 7661 redisLog(REDIS_DEBUG,
7662 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
7663 (unsigned char*) key->ptr,
7664 (unsigned long long) j->page, (unsigned long long) j->pages);
7665 server.vm_stats_swapped_objects++;
7666 server.vm_stats_swapouts++;
7667 freeIOJob(j);
f11b8647 7668 /* Put a few more swap requests in queue if we are still
7669 * out of memory */
b0d8747d 7670 if (trytoswap && vmCanSwapOut() &&
7671 zmalloc_used_memory() > server.vm_max_memory)
7672 {
f11b8647 7673 int more = 1;
7674 while(more) {
7675 lockThreadedIO();
7676 more = listLength(server.io_newjobs) <
7677 (unsigned) server.vm_max_threads;
7678 unlockThreadedIO();
7679 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 7680 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
7681 trytoswap = 0;
7682 break;
7683 }
f11b8647 7684 }
7685 }
b9bc0eef 7686 }
c953f24b 7687 processed++;
f6c0bba8 7688 if (processed == toprocess) return;
996cb5f7 7689 }
7690 if (retval < 0 && errno != EAGAIN) {
7691 redisLog(REDIS_WARNING,
7692 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
7693 strerror(errno));
7694 }
7695}
7696
7697static void lockThreadedIO(void) {
7698 pthread_mutex_lock(&server.io_mutex);
7699}
7700
7701static void unlockThreadedIO(void) {
7702 pthread_mutex_unlock(&server.io_mutex);
7703}
7704
7705/* Remove the specified object from the threaded I/O queue if still not
7706 * processed, otherwise make sure to flag it as canceled. */
7707static void vmCancelThreadedIOJob(robj *o) {
7708 list *lists[3] = {
6c96ba7d 7709 server.io_newjobs, /* 0 */
7710 server.io_processing, /* 1 */
7711 server.io_processed /* 2 */
996cb5f7 7712 };
7713 int i;
7714
7715 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 7716again:
996cb5f7 7717 lockThreadedIO();
7718 /* Search for a matching key in one of the queues */
7719 for (i = 0; i < 3; i++) {
7720 listNode *ln;
c7df85a4 7721 listIter li;
996cb5f7 7722
c7df85a4 7723 listRewind(lists[i],&li);
7724 while ((ln = listNext(&li)) != NULL) {
996cb5f7 7725 iojob *job = ln->value;
7726
6c96ba7d 7727 if (job->canceled) continue; /* Skip this, already canceled. */
996cb5f7 7728 if (compareStringObjects(job->key,o) == 0) {
970e10bb 7729 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
7730 (void*)job, (char*)o->ptr, job->type, i);
427a2153 7731 /* Mark the pages as free since the swap didn't happened
7732 * or happened but is now discarded. */
970e10bb 7733 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 7734 vmMarkPagesFree(job->page,job->pages);
7735 /* Cancel the job. It depends on the list the job is
7736 * living in. */
996cb5f7 7737 switch(i) {
7738 case 0: /* io_newjobs */
6c96ba7d 7739 /* If the job was yet not processed the best thing to do
996cb5f7 7740 * is to remove it from the queue at all */
6c96ba7d 7741 freeIOJob(job);
996cb5f7 7742 listDelNode(lists[i],ln);
7743 break;
7744 case 1: /* io_processing */
d5d55fc3 7745 /* Oh Shi- the thread is messing with the Job:
7746 *
7747 * Probably it's accessing the object if this is a
7748 * PREPARE_SWAP or DO_SWAP job.
7749 * If it's a LOAD job it may be reading from disk and
7750 * if we don't wait for the job to terminate before to
7751 * cancel it, maybe in a few microseconds data can be
7752 * corrupted in this pages. So the short story is:
7753 *
7754 * Better to wait for the job to move into the
7755 * next queue (processed)... */
7756
7757 /* We try again and again until the job is completed. */
7758 unlockThreadedIO();
7759 /* But let's wait some time for the I/O thread
7760 * to finish with this job. After all this condition
7761 * should be very rare. */
7762 usleep(1);
7763 goto again;
996cb5f7 7764 case 2: /* io_processed */
2e111efe 7765 /* The job was already processed, that's easy...
7766 * just mark it as canceled so that we'll ignore it
7767 * when processing completed jobs. */
996cb5f7 7768 job->canceled = 1;
7769 break;
7770 }
c7df85a4 7771 /* Finally we have to adjust the storage type of the object
7772 * in order to "UNDO" the operaiton. */
996cb5f7 7773 if (o->storage == REDIS_VM_LOADING)
7774 o->storage = REDIS_VM_SWAPPED;
7775 else if (o->storage == REDIS_VM_SWAPPING)
7776 o->storage = REDIS_VM_MEMORY;
7777 unlockThreadedIO();
7778 return;
7779 }
7780 }
7781 }
7782 unlockThreadedIO();
7783 assert(1 != 1); /* We should never reach this */
7784}
7785
b9bc0eef 7786static void *IOThreadEntryPoint(void *arg) {
7787 iojob *j;
7788 listNode *ln;
7789 REDIS_NOTUSED(arg);
7790
7791 pthread_detach(pthread_self());
7792 while(1) {
7793 /* Get a new job to process */
7794 lockThreadedIO();
7795 if (listLength(server.io_newjobs) == 0) {
7796 /* No new jobs in queue, exit. */
b74880b4 7797 redisLog(REDIS_DEBUG,"Thread %lld exiting, nothing to do",
b9bc0eef 7798 (long long) pthread_self());
7799 server.io_active_threads--;
7800 unlockThreadedIO();
7801 return NULL;
7802 }
7803 ln = listFirst(server.io_newjobs);
7804 j = ln->value;
7805 listDelNode(server.io_newjobs,ln);
7806 /* Add the job in the processing queue */
7807 j->thread = pthread_self();
7808 listAddNodeTail(server.io_processing,j);
7809 ln = listLast(server.io_processing); /* We use ln later to remove it */
7810 unlockThreadedIO();
b74880b4 7811 redisLog(REDIS_DEBUG,"Thread %lld got a new job (type %d): %p about key '%s'",
6c96ba7d 7812 (long long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 7813
7814 /* Process the Job */
7815 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 7816 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 7817 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
7818 FILE *fp = fopen("/dev/null","w+");
7819 j->pages = rdbSavedObjectPages(j->val,fp);
7820 fclose(fp);
7821 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 7822 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
7823 j->canceled = 1;
b9bc0eef 7824 }
7825
7826 /* Done: insert the job into the processed queue */
b74880b4 7827 redisLog(REDIS_DEBUG,"Thread %lld completed the job: %p (key %s)",
6c96ba7d 7828 (long long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 7829 lockThreadedIO();
7830 listDelNode(server.io_processing,ln);
7831 listAddNodeTail(server.io_processed,j);
7832 unlockThreadedIO();
7833
7834 /* Signal the main thread there is new stuff to process */
7835 assert(write(server.io_ready_pipe_write,"x",1) == 1);
7836 }
7837 return NULL; /* never reached */
7838}
7839
7840static void spawnIOThread(void) {
7841 pthread_t thread;
478c2c6f 7842 sigset_t mask, omask;
b9bc0eef 7843
478c2c6f 7844 sigemptyset(&mask);
7845 sigaddset(&mask,SIGCHLD);
7846 sigaddset(&mask,SIGHUP);
7847 sigaddset(&mask,SIGPIPE);
7848 pthread_sigmask(SIG_SETMASK, &mask, &omask);
bcaa7a4f 7849 pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL);
478c2c6f 7850 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 7851 server.io_active_threads++;
7852}
7853
4ee9488d 7854/* We need to wait for the last thread to exit before we are able to
7855 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 7856static void waitEmptyIOJobsQueue(void) {
4ee9488d 7857 while(1) {
76b7233a 7858 int io_processed_len;
7859
4ee9488d 7860 lockThreadedIO();
054e426d 7861 if (listLength(server.io_newjobs) == 0 &&
7862 listLength(server.io_processing) == 0 &&
7863 server.io_active_threads == 0)
7864 {
4ee9488d 7865 unlockThreadedIO();
7866 return;
7867 }
76b7233a 7868 /* While waiting for empty jobs queue condition we post-process some
7869 * finshed job, as I/O threads may be hanging trying to write against
7870 * the io_ready_pipe_write FD but there are so much pending jobs that
7871 * it's blocking. */
7872 io_processed_len = listLength(server.io_processed);
4ee9488d 7873 unlockThreadedIO();
76b7233a 7874 if (io_processed_len) {
7875 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
7876 usleep(1000); /* 1 millisecond */
7877 } else {
7878 usleep(10000); /* 10 milliseconds */
7879 }
4ee9488d 7880 }
7881}
7882
054e426d 7883static void vmReopenSwapFile(void) {
478c2c6f 7884 /* Note: we don't close the old one as we are in the child process
7885 * and don't want to mess at all with the original file object. */
054e426d 7886 server.vm_fp = fopen(server.vm_swap_file,"r+b");
7887 if (server.vm_fp == NULL) {
7888 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
7889 server.vm_swap_file);
478c2c6f 7890 _exit(1);
054e426d 7891 }
7892 server.vm_fd = fileno(server.vm_fp);
7893}
7894
b9bc0eef 7895/* This function must be called while with threaded IO locked */
7896static void queueIOJob(iojob *j) {
6c96ba7d 7897 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
7898 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 7899 listAddNodeTail(server.io_newjobs,j);
7900 if (server.io_active_threads < server.vm_max_threads)
7901 spawnIOThread();
7902}
7903
7904static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
7905 iojob *j;
7906
7907 assert(key->storage == REDIS_VM_MEMORY);
7908 assert(key->refcount == 1);
7909
7910 j = zmalloc(sizeof(*j));
7911 j->type = REDIS_IOJOB_PREPARE_SWAP;
7912 j->db = db;
7913 j->key = dupStringObject(key);
7914 j->val = val;
7915 incrRefCount(val);
7916 j->canceled = 0;
7917 j->thread = (pthread_t) -1;
f11b8647 7918 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 7919
7920 lockThreadedIO();
7921 queueIOJob(j);
7922 unlockThreadedIO();
7923 return REDIS_OK;
7924}
7925
b0d8747d 7926/* ============ Virtual Memory - Blocking clients on missing keys =========== */
7927
d5d55fc3 7928/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
7929 * If there is not already a job loading the key, it is craeted.
7930 * The key is added to the io_keys list in the client structure, and also
7931 * in the hash table mapping swapped keys to waiting clients, that is,
7932 * server.io_waited_keys. */
7933static int waitForSwappedKey(redisClient *c, robj *key) {
7934 struct dictEntry *de;
7935 robj *o;
7936 list *l;
7937
7938 /* If the key does not exist or is already in RAM we don't need to
7939 * block the client at all. */
7940 de = dictFind(c->db->dict,key);
7941 if (de == NULL) return 0;
7942 o = dictGetEntryKey(de);
7943 if (o->storage == REDIS_VM_MEMORY) {
7944 return 0;
7945 } else if (o->storage == REDIS_VM_SWAPPING) {
7946 /* We were swapping the key, undo it! */
7947 vmCancelThreadedIOJob(o);
7948 return 0;
7949 }
7950
7951 /* OK: the key is either swapped, or being loaded just now. */
7952
7953 /* Add the key to the list of keys this client is waiting for.
7954 * This maps clients to keys they are waiting for. */
7955 listAddNodeTail(c->io_keys,key);
7956 incrRefCount(key);
7957
7958 /* Add the client to the swapped keys => clients waiting map. */
7959 de = dictFind(c->db->io_keys,key);
7960 if (de == NULL) {
7961 int retval;
7962
7963 /* For every key we take a list of clients blocked for it */
7964 l = listCreate();
7965 retval = dictAdd(c->db->io_keys,key,l);
7966 incrRefCount(key);
7967 assert(retval == DICT_OK);
7968 } else {
7969 l = dictGetEntryVal(de);
7970 }
7971 listAddNodeTail(l,c);
7972
7973 /* Are we already loading the key from disk? If not create a job */
7974 if (o->storage == REDIS_VM_SWAPPED) {
7975 iojob *j;
7976
7977 o->storage = REDIS_VM_LOADING;
7978 j = zmalloc(sizeof(*j));
7979 j->type = REDIS_IOJOB_LOAD;
7980 j->db = c->db;
7981 j->key = dupStringObject(key);
7982 j->key->vtype = o->vtype;
7983 j->page = o->vm.page;
7984 j->val = NULL;
7985 j->canceled = 0;
7986 j->thread = (pthread_t) -1;
7987 lockThreadedIO();
7988 queueIOJob(j);
7989 unlockThreadedIO();
7990 }
7991 return 1;
7992}
7993
b0d8747d 7994/* Is this client attempting to run a command against swapped keys?
d5d55fc3 7995 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 7996 *
d5d55fc3 7997 * The important idea about this function is that it can fail! If keys will
7998 * still be swapped when the client is resumed, this key lookups will
7999 * just block loading keys from disk. In practical terms this should only
8000 * happen with SORT BY command or if there is a bug in this function.
8001 *
8002 * Return 1 if the client is marked as blocked, 0 if the client can
8003 * continue as the keys it is going to access appear to be in memory. */
8004static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
8005 if (cmd->proc == getCommand) {
8006 waitForSwappedKey(c,c->argv[1]);
8007 }
8008 /* If the client was blocked for at least one key, mark it as blocked. */
8009 if (listLength(c->io_keys)) {
8010 c->flags |= REDIS_IO_WAIT;
8011 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
8012 server.vm_blocked_clients++;
8013 return 1;
8014 } else {
8015 return 0;
8016 }
8017}
8018
8019/* Remove the 'key' from the list of blocked keys for a given client.
8020 *
8021 * The function returns 1 when there are no longer blocking keys after
8022 * the current one was removed (and the client can be unblocked). */
8023static int dontWaitForSwappedKey(redisClient *c, robj *key) {
8024 list *l;
8025 listNode *ln;
8026 listIter li;
8027 struct dictEntry *de;
8028
8029 /* Remove the key from the list of keys this client is waiting for. */
8030 listRewind(c->io_keys,&li);
8031 while ((ln = listNext(&li)) != NULL) {
8032 if (compareStringObjects(ln->value,key) == 0) {
8033 listDelNode(c->io_keys,ln);
8034 break;
8035 }
8036 }
8037 assert(ln != NULL);
8038
8039 /* Remove the client form the key => waiting clients map. */
8040 de = dictFind(c->db->io_keys,key);
8041 assert(de != NULL);
8042 l = dictGetEntryVal(de);
8043 ln = listSearchKey(l,c);
8044 assert(ln != NULL);
8045 listDelNode(l,ln);
8046 if (listLength(l) == 0)
8047 dictDelete(c->db->io_keys,key);
8048
8049 return listLength(c->io_keys) == 0;
8050}
8051
8052static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
8053 struct dictEntry *de;
8054 list *l;
8055 listNode *ln;
8056 int len;
8057
8058 de = dictFind(db->io_keys,key);
8059 if (!de) return;
8060
8061 l = dictGetEntryVal(de);
8062 len = listLength(l);
8063 /* Note: we can't use something like while(listLength(l)) as the list
8064 * can be freed by the calling function when we remove the last element. */
8065 while (len--) {
8066 ln = listFirst(l);
8067 redisClient *c = ln->value;
8068
8069 if (dontWaitForSwappedKey(c,key)) {
8070 /* Put the client in the list of clients ready to go as we
8071 * loaded all the keys about it. */
8072 listAddNodeTail(server.io_ready_clients,c);
8073 }
8074 }
b0d8747d 8075}
b0d8747d 8076
7f957c92 8077/* ================================= Debugging ============================== */
8078
8079static void debugCommand(redisClient *c) {
8080 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
8081 *((char*)-1) = 'x';
210e29f7 8082 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
8083 if (rdbSave(server.dbfilename) != REDIS_OK) {
8084 addReply(c,shared.err);
8085 return;
8086 }
8087 emptyDb();
8088 if (rdbLoad(server.dbfilename) != REDIS_OK) {
8089 addReply(c,shared.err);
8090 return;
8091 }
8092 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
8093 addReply(c,shared.ok);
71c2b467 8094 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
8095 emptyDb();
8096 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
8097 addReply(c,shared.err);
8098 return;
8099 }
8100 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
8101 addReply(c,shared.ok);
333298da 8102 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
8103 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
8104 robj *key, *val;
8105
8106 if (!de) {
8107 addReply(c,shared.nokeyerr);
8108 return;
8109 }
8110 key = dictGetEntryKey(de);
8111 val = dictGetEntryVal(de);
b9bc0eef 8112 if (server.vm_enabled && (key->storage == REDIS_VM_MEMORY ||
8113 key->storage == REDIS_VM_SWAPPING)) {
ace06542 8114 addReplySds(c,sdscatprintf(sdsempty(),
8115 "+Key at:%p refcount:%d, value at:%p refcount:%d "
8116 "encoding:%d serializedlength:%lld\r\n",
682ac724 8117 (void*)key, key->refcount, (void*)val, val->refcount,
459f52a8 8118 val->encoding, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 8119 } else {
8120 addReplySds(c,sdscatprintf(sdsempty(),
8121 "+Key at:%p refcount:%d, value swapped at: page %llu "
8122 "using %llu pages\r\n",
8123 (void*)key, key->refcount, (unsigned long long) key->vm.page,
8124 (unsigned long long) key->vm.usedpages));
8125 }
7d30035d 8126 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
8127 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
8128 robj *key, *val;
8129
8130 if (!server.vm_enabled) {
8131 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
8132 return;
8133 }
8134 if (!de) {
8135 addReply(c,shared.nokeyerr);
8136 return;
8137 }
8138 key = dictGetEntryKey(de);
8139 val = dictGetEntryVal(de);
4ef8de8a 8140 /* If the key is shared we want to create a copy */
8141 if (key->refcount > 1) {
8142 robj *newkey = dupStringObject(key);
8143 decrRefCount(key);
8144 key = dictGetEntryKey(de) = newkey;
8145 }
8146 /* Swap it */
7d30035d 8147 if (key->storage != REDIS_VM_MEMORY) {
8148 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 8149 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 8150 dictGetEntryVal(de) = NULL;
8151 addReply(c,shared.ok);
8152 } else {
8153 addReply(c,shared.err);
8154 }
7f957c92 8155 } else {
333298da 8156 addReplySds(c,sdsnew(
7d30035d 8157 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 8158 }
8159}
56906eef 8160
6c96ba7d 8161static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 8162 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
6c96ba7d 8163 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
dfc5e96c 8164#ifdef HAVE_BACKTRACE
8165 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
8166 *((char*)-1) = 'x';
8167#endif
8168}
8169
bcfc686d 8170/* =================================== Main! ================================ */
56906eef 8171
bcfc686d 8172#ifdef __linux__
/* Read /proc/sys/vm/overcommit_memory and return its content converted to
 * an integer with atoi(). Returns -1 if the file can't be opened or no line
 * can be read from it. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *procfile = fopen("/proc/sys/vm/overcommit_memory","r");

    if (procfile == NULL) return -1;

    /* Read the first line, then release the stream whatever the outcome. */
    got = fgets(line,sizeof(line),procfile);
    fclose(procfile);
    if (got == NULL) return -1;

    return atoi(line);
}
8186
8187void linuxOvercommitMemoryWarning(void) {
8188 if (linuxOvercommitMemoryValue() == 0) {
8189 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
8190 }
8191}
8192#endif /* __linux__ */
8193
8194static void daemonize(void) {
8195 int fd;
8196 FILE *fp;
8197
8198 if (fork() != 0) exit(0); /* parent exits */
8199 setsid(); /* create a new session */
8200
8201 /* Every output goes to /dev/null. If Redis is daemonized but
8202 * the 'logfile' is set to 'stdout' in the configuration file
8203 * it will not log at all. */
8204 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
8205 dup2(fd, STDIN_FILENO);
8206 dup2(fd, STDOUT_FILENO);
8207 dup2(fd, STDERR_FILENO);
8208 if (fd > STDERR_FILENO) close(fd);
8209 }
8210 /* Try to write the pid file */
8211 fp = fopen(server.pidfile,"w");
8212 if (fp) {
8213 fprintf(fp,"%d\n",getpid());
8214 fclose(fp);
56906eef 8215 }
56906eef 8216}
8217
bcfc686d 8218int main(int argc, char **argv) {
9651a787 8219 time_t start;
8220
bcfc686d 8221 initServerConfig();
8222 if (argc == 2) {
8223 resetServerSaveParams();
8224 loadServerConfig(argv[1]);
8225 } else if (argc > 2) {
8226 fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
8227 exit(1);
8228 } else {
8229 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
8230 }
bcfc686d 8231 if (server.daemonize) daemonize();
71c54b21 8232 initServer();
bcfc686d 8233 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
8234#ifdef __linux__
8235 linuxOvercommitMemoryWarning();
8236#endif
9651a787 8237 start = time(NULL);
bcfc686d 8238 if (server.appendonly) {
8239 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 8240 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 8241 } else {
8242 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 8243 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 8244 }
bcfc686d 8245 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 8246 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 8247 aeMain(server.el);
8248 aeDeleteEventLoop(server.el);
8249 return 0;
8250}
8251
8252/* ============================= Backtrace support ========================= */
8253
8254#ifdef HAVE_BACKTRACE
8255static char *findFuncName(void *pointer, unsigned long *offset);
8256
56906eef 8257static void *getMcontextEip(ucontext_t *uc) {
8258#if defined(__FreeBSD__)
8259 return (void*) uc->uc_mcontext.mc_eip;
8260#elif defined(__dietlibc__)
8261 return (void*) uc->uc_mcontext.eip;
06db1f50 8262#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 8263 #if __x86_64__
8264 return (void*) uc->uc_mcontext->__ss.__rip;
8265 #else
56906eef 8266 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 8267 #endif
06db1f50 8268#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 8269 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 8270 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 8271 #else
8272 return (void*) uc->uc_mcontext->__ss.__eip;
8273 #endif
c04c9ac9 8274#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
8275 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 8276#elif defined(__ia64__) /* Linux IA64 */
8277 return (void*) uc->uc_mcontext.sc_ip;
8278#else
8279 return NULL;
56906eef 8280#endif
8281}
8282
8283static void segvHandler(int sig, siginfo_t *info, void *secret) {
8284 void *trace[100];
8285 char **messages = NULL;
8286 int i, trace_size = 0;
8287 unsigned long offset=0;
56906eef 8288 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 8289 sds infostring;
56906eef 8290 REDIS_NOTUSED(info);
8291
8292 redisLog(REDIS_WARNING,
8293 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 8294 infostring = genRedisInfoString();
8295 redisLog(REDIS_WARNING, "%s",infostring);
8296 /* It's not safe to sdsfree() the returned string under memory
8297 * corruption conditions. Let it leak as we are going to abort */
56906eef 8298
8299 trace_size = backtrace(trace, 100);
de96dbfe 8300 /* overwrite sigaction with caller's address */
b91cf5ef 8301 if (getMcontextEip(uc) != NULL) {
8302 trace[1] = getMcontextEip(uc);
8303 }
56906eef 8304 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 8305
d76412d1 8306 for (i=1; i<trace_size; ++i) {
56906eef 8307 char *fn = findFuncName(trace[i], &offset), *p;
8308
8309 p = strchr(messages[i],'+');
8310 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
8311 redisLog(REDIS_WARNING,"%s", messages[i]);
8312 } else {
8313 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
8314 }
8315 }
b177fd30 8316 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 8317 _exit(0);
fe3bbfbe 8318}
56906eef 8319
8320static void setupSigSegvAction(void) {
8321 struct sigaction act;
8322
8323 sigemptyset (&act.sa_mask);
8324 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
8325 * is used. Otherwise, sa_handler is used */
8326 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
8327 act.sa_sigaction = segvHandler;
8328 sigaction (SIGSEGV, &act, NULL);
8329 sigaction (SIGBUS, &act, NULL);
12fea928 8330 sigaction (SIGFPE, &act, NULL);
8331 sigaction (SIGILL, &act, NULL);
8332 sigaction (SIGBUS, &act, NULL);
e65fdc78 8333 return;
56906eef 8334}
e65fdc78 8335
bcfc686d 8336#include "staticsymbols.h"
8337/* This function try to convert a pointer into a function name. It's used in
8338 * oreder to provide a backtrace under segmentation fault that's able to
8339 * display functions declared as static (otherwise the backtrace is useless). */
8340static char *findFuncName(void *pointer, unsigned long *offset){
8341 int i, ret = -1;
8342 unsigned long off, minoff = 0;
ed9b544e 8343
bcfc686d 8344 /* Try to match against the Symbol with the smallest offset */
8345 for (i=0; symsTable[i].pointer; i++) {
8346 unsigned long lp = (unsigned long) pointer;
0bc03378 8347
bcfc686d 8348 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
8349 off=lp-symsTable[i].pointer;
8350 if (ret < 0 || off < minoff) {
8351 minoff=off;
8352 ret=i;
8353 }
8354 }
0bc03378 8355 }
bcfc686d 8356 if (ret == -1) return NULL;
8357 *offset = minoff;
8358 return symsTable[ret].name;
0bc03378 8359}
bcfc686d 8360#else /* HAVE_BACKTRACE */
/* Without HAVE_BACKTRACE there is no handler to install: this is a no-op
 * version of the function. */
static void setupSigSegvAction(void) {
}
bcfc686d 8363#endif /* HAVE_BACKTRACE */
0bc03378 8364
ed9b544e 8365
ed9b544e 8366
bcfc686d 8367/* The End */
8368
8369
ed9b544e 8370