]> git.saurik.com Git - redis.git/blame - redis.c
fix for the just added new test
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
2 * Copyright (c) 2006-2009, Salvatore Sanfilippo <antirez at gmail dot com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
5dc70bff 30#define REDIS_VERSION "1.3.2"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
c9468bcf 40#define __USE_POSIX199309
ed9b544e 41#include <signal.h>
fbf9bcdb 42
43#ifdef HAVE_BACKTRACE
c9468bcf 44#include <execinfo.h>
45#include <ucontext.h>
fbf9bcdb 46#endif /* HAVE_BACKTRACE */
47
ed9b544e 48#include <sys/wait.h>
49#include <errno.h>
50#include <assert.h>
51#include <ctype.h>
52#include <stdarg.h>
53#include <inttypes.h>
54#include <arpa/inet.h>
55#include <sys/stat.h>
56#include <fcntl.h>
57#include <sys/time.h>
58#include <sys/resource.h>
2895e862 59#include <sys/uio.h>
f78fd11b 60#include <limits.h>
a7866db6 61#include <math.h>
92f8e882 62#include <pthread.h>
0bc1b2f6 63
64#if defined(__sun)
5043dff3 65#include "solarisfixes.h"
66#endif
ed9b544e 67
c9468bcf 68#include "redis.h"
ed9b544e 69#include "ae.h" /* Event driven programming library */
70#include "sds.h" /* Dynamic safe strings */
71#include "anet.h" /* Networking the easy way */
72#include "dict.h" /* Hash tables */
73#include "adlist.h" /* Linked lists */
74#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 75#include "lzf.h" /* LZF compression library */
76#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
ed9b544e 77
b04a5df9 78/* #define REDIS_HELGRIND_FRIENDLY */
79#if defined(__GNUC__) && defined(REDIS_HELGRIND_FRIENDLY)
80#warning "Remember to undef REDIS_HELGRIND_FRIENDLY before to commit"
81#endif
82
ed9b544e 83/* Error codes */
84#define REDIS_OK 0
85#define REDIS_ERR -1
86
87/* Static server configuration */
88#define REDIS_SERVERPORT 6379 /* TCP port */
89#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 90#define REDIS_IOBUF_LEN 1024
ed9b544e 91#define REDIS_LOADBUF_LEN 1024
93ea3759 92#define REDIS_STATIC_ARGS 4
ed9b544e 93#define REDIS_DEFAULT_DBNUM 16
94#define REDIS_CONFIGLINE_MAX 1024
95#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
96#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
94754ccc 97#define REDIS_EXPIRELOOKUPS_PER_CRON 100 /* try to expire 100 keys/second */
6f376729 98#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 99#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
100
101/* If more than REDIS_WRITEV_THRESHOLD write packets are pending, use writev */
102#define REDIS_WRITEV_THRESHOLD 3
103/* Max number of iovecs used for each writev call */
104#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 105
106/* Hash table parameters */
107#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 108
109/* Command flags */
3fd78bcd 110#define REDIS_CMD_BULK 1 /* Bulk write command */
111#define REDIS_CMD_INLINE 2 /* Inline command */
112/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
113 this flag will return an error when the 'maxmemory' option is set in the
114 config file and the server is using more than maxmemory bytes of memory.
115 In short, these commands are denied on low memory conditions. */
116#define REDIS_CMD_DENYOOM 4
ed9b544e 117
118/* Object types */
119#define REDIS_STRING 0
120#define REDIS_LIST 1
121#define REDIS_SET 2
1812e024 122#define REDIS_ZSET 3
123#define REDIS_HASH 4
f78fd11b 124
942a3961 125/* Objects encoding */
126#define REDIS_ENCODING_RAW 0 /* Raw representation */
127#define REDIS_ENCODING_INT 1 /* Encoded as integer */
128
f78fd11b 129/* Object types only used for dumping to disk */
bb32ede5 130#define REDIS_EXPIRETIME 253
ed9b544e 131#define REDIS_SELECTDB 254
132#define REDIS_EOF 255
133
f78fd11b 134/* Defines related to the dump file format. To store 32 bits lengths for short
135 * keys requires a lot of space, so we check the most significant 2 bits of
136 * the first byte to interpret the length:
137 *
138 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
139 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
140 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 141 * 11|000000 this means: specially encoded object will follow. The six bits
142 * number specifies the kind of object that follows.
143 * See the REDIS_RDB_ENC_* defines.
f78fd11b 144 *
10c43610 145 * Lengths up to 63 are stored using a single byte, most DB keys, and many
146 * values, will fit inside. */
f78fd11b 147#define REDIS_RDB_6BITLEN 0
148#define REDIS_RDB_14BITLEN 1
149#define REDIS_RDB_32BITLEN 2
17be1a4a 150#define REDIS_RDB_ENCVAL 3
f78fd11b 151#define REDIS_RDB_LENERR UINT_MAX
152
a4d1ba9a 153/* When a length of a string object stored on disk has the first two bits
154 * set, the remaining six bits specify a special encoding for the object
155 * accordingly to the following defines: */
156#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
157#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
158#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 159#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
a4d1ba9a 160
75680a3c 161/* Virtual memory object->where field. */
162#define REDIS_VM_MEMORY 0 /* The object is on memory */
163#define REDIS_VM_SWAPPED 1 /* The object is on disk */
164#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
165#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
166
06224fec 167/* Virtual memory static configuration stuff.
168 * Check vmFindContiguousPages() to know more about these magic numbers. */
169#define REDIS_VM_MAX_NEAR_PAGES 65536
170#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 171#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 172#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
c953f24b 173/* The following is the number of completed I/O jobs to process when the
174 * handler is called. 1 is the minimum, and also the default, as it allows
175 * to block as little as possible other accessing clients. While Virtual
176 * Memory I/O operations are performed by threads, these operations must
177 * be processed by the main thread when completed to take effect. */
178#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 179
ed9b544e 180/* Client flags */
181#define REDIS_CLOSE 1 /* This client connection should be closed ASAP */
182#define REDIS_SLAVE 2 /* This client is a slave server */
183#define REDIS_MASTER 4 /* This client is a master server */
87eca727 184#define REDIS_MONITOR 8 /* This client is a slave monitor, see MONITOR */
6e469882 185#define REDIS_MULTI 16 /* This client is in a MULTI context */
4409877e 186#define REDIS_BLOCKED 32 /* The client is waiting in a blocking operation */
996cb5f7 187#define REDIS_IO_WAIT 64 /* The client is waiting for Virtual Memory I/O */
ed9b544e 188
40d224a9 189/* Slave replication state - slave side */
ed9b544e 190#define REDIS_REPL_NONE 0 /* No active replication */
191#define REDIS_REPL_CONNECT 1 /* Must connect to master */
192#define REDIS_REPL_CONNECTED 2 /* Connected to master */
193
40d224a9 194/* Slave replication state - from the point of view of master
195 * Note that in SEND_BULK and ONLINE state the slave receives new updates
196 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
197 * to start the next background saving in order to send updates to it. */
198#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
199#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
200#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
201#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
202
ed9b544e 203/* List related stuff */
204#define REDIS_HEAD 0
205#define REDIS_TAIL 1
206
207/* Sort operations */
208#define REDIS_SORT_GET 0
443c6409 209#define REDIS_SORT_ASC 1
210#define REDIS_SORT_DESC 2
ed9b544e 211#define REDIS_SORTKEY_MAX 1024
212
213/* Log levels */
214#define REDIS_DEBUG 0
f870935d 215#define REDIS_VERBOSE 1
216#define REDIS_NOTICE 2
217#define REDIS_WARNING 3
ed9b544e 218
219/* Anti-warning macro... */
220#define REDIS_NOTUSED(V) ((void) V)
221
6b47e12e 222#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
223#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 224
48f0308a 225/* Append only defines */
226#define APPENDFSYNC_NO 0
227#define APPENDFSYNC_ALWAYS 1
228#define APPENDFSYNC_EVERYSEC 2
229
dfc5e96c 230/* We can print the stacktrace, so our assert is defined this way:
 * when _e evaluates to false, _redisAssert() is called with the stringified
 * expression, the source file name and the line number, and then exit(1)
 * is called. When _e is true the macro expands to the no-op (void)0. */
6c96ba7d 231#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),exit(1)))
232static void _redisAssert(char *estr, char *file, int line);
dfc5e96c 233
ed9b544e 234/*================================= Data types ============================== */
235
236/* A redis object, that is a type able to hold a string / list / set */
75680a3c 237
238/* The VM object structure */
239struct redisObjectVM {
3a66edc7 240 off_t page; /* the page at witch the object is stored on disk */
241 off_t usedpages; /* number of pages used on disk */
242 time_t atime; /* Last access time */
75680a3c 243} vm;
244
245/* The actual Redis Object: a reference counted, typed container around
 * the payload pointed to by 'ptr'. */
ed9b544e 246typedef struct redisObject {
ed9b544e 247 void *ptr;
942a3961 248 unsigned char type; /* e.g. REDIS_STRING, see the "Object types" defines */
249 unsigned char encoding; /* e.g. REDIS_ENCODING_RAW, see "Objects encoding" */
d894161b 250 unsigned char storage; /* If this object is a key, where is the value?
251 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
252 unsigned char vtype; /* If this object is a key, and value is swapped out,
253 * this is the type of the swapped out object. */
ed9b544e 254 int refcount;
75680a3c 255 /* VM fields, these are only allocated if VM is active, otherwise the
256 * object allocation function will just allocate
257 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
258 * Redis without VM active will not have any overhead. */
259 struct redisObjectVM vm;
ed9b544e 260} robj;
261
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the object itself (not a pointer to it) and _ptr becomes its
 * payload. The object is set up as a raw-encoded string with refcount 1;
 * the 'storage' field is written only when server.vm_enabled is set.
 *
 * The expansion deliberately has no trailing semicolon: the caller's own
 * ';' completes the do/while, so the macro acts as a single statement and
 * can be used in an unbraced if/else. Arguments are parenthesized so that
 * expressions such as *p or a[i] expand correctly. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
273
/* A single Redis database: the keyspace plus the auxiliary dictionaries
 * described field by field below. */
3305306f 274typedef struct redisDb {
4409877e 275 dict *dict; /* The keyspace for this DB */
276 dict *expires; /* Timeout of keys with a timeout set */
277 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
3305306f 278 int id;
279} redisDb;
280
6e469882 281/* Client MULTI/EXEC state */
/* One entry of the multiState.commands array: an argument vector, its
 * length, and the command table entry it refers to. */
282typedef struct multiCmd {
283 robj **argv;
284 int argc;
285 struct redisCommand *cmd;
286} multiCmd;
287
/* Per-client MULTI/EXEC state, stored in redisClient.mstate. */
288typedef struct multiState {
289 multiCmd *commands; /* Array of MULTI commands */
290 int count; /* Total number of MULTI commands */
291} multiState;
292
ed9b544e 293/* With multiplexing we need to keep per-client state.
294 * Clients are kept in a linked list. */
295typedef struct redisClient {
296 int fd;
3305306f 297 redisDb *db;
ed9b544e 298 int dictid;
299 sds querybuf;
e8a74421 300 robj **argv, **mbargv;
301 int argc, mbargc;
40d224a9 302 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 303 int multibulk; /* multi bulk command format active */
ed9b544e 304 list *reply;
305 int sentlen;
306 time_t lastinteraction; /* time of the last interaction, used for timeout */
40d224a9 307 int flags; /* REDIS_CLOSE | REDIS_SLAVE | REDIS_MONITOR */
6e469882 308 /* REDIS_MULTI (see the "Client flags" defines) */
40d224a9 309 int slaveseldb; /* slave selected db, if this client is a slave */
310 int authenticated; /* when requirepass is non-NULL */
311 int replstate; /* replication state if this is a slave */
312 int repldbfd; /* replication DB file descriptor */
6e469882 313 long repldboff; /* replication DB file offset */
40d224a9 314 off_t repldbsize; /* replication DB file size */
6e469882 315 multiState mstate; /* MULTI/EXEC state */
b177fd30 316 robj **blockingkeys; /* The keys we are waiting on to terminate a blocking
4409877e 317 * operation such as BLPOP. Otherwise NULL. */
b177fd30 318 int blockingkeysnum; /* Number of blocking keys */
4409877e 319 time_t blockingto; /* Blocking operation timeout. If UNIX current time
320 * is >= blockingto then the operation timed out. */
92f8e882 321 list *io_keys; /* Keys this client is waiting to be loaded from the
322 * swap file in order to continue. */
ed9b544e 323} redisClient;
324
/* One element of the redisServer.saveparams array.
 * NOTE(review): presumably "save when at least 'changes' changes happened
 * within 'seconds' seconds" -- confirm against the code that reads it. */
325struct saveparam {
326 time_t seconds;
327 int changes;
328};
329
330/* Global server state structure */
331struct redisServer {
332 int port;
333 int fd;
3305306f 334 redisDb *db;
4409877e 335 dict *sharingpool; /* Pool used for object sharing */
10c43610 336 unsigned int sharingpoolsize;
ed9b544e 337 long long dirty; /* changes to DB from the last save */
338 list *clients;
87eca727 339 list *slaves, *monitors;
ed9b544e 340 char neterr[ANET_ERR_LEN];
341 aeEventLoop *el;
342 int cronloops; /* number of times the cron function run */
343 list *objfreelist; /* A list of freed objects to avoid malloc() */
344 time_t lastsave; /* Unix time of last successful save */
5fba9f71 345 size_t usedmemory; /* Used memory in megabytes */
ed9b544e 346 /* Fields used only for stats */
347 time_t stat_starttime; /* server start time */
348 long long stat_numcommands; /* number of processed commands */
349 long long stat_numconnections; /* number of connections received */
350 /* Configuration */
351 int verbosity;
352 int glueoutputbuf;
353 int maxidletime;
354 int dbnum;
355 int daemonize;
44b38ef4 356 int appendonly;
48f0308a 357 int appendfsync;
358 time_t lastfsync;
44b38ef4 359 int appendfd;
360 int appendseldb;
ed329fcf 361 char *pidfile;
9f3c422c 362 pid_t bgsavechildpid;
9d65a1bb 363 pid_t bgrewritechildpid;
364 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
ed9b544e 365 struct saveparam *saveparams;
366 int saveparamslen;
367 char *logfile;
368 char *bindaddr;
369 char *dbfilename;
44b38ef4 370 char *appendfilename;
abcb223e 371 char *requirepass;
10c43610 372 int shareobjects;
121f70cf 373 int rdbcompression;
ed9b544e 374 /* Replication related */
375 int isslave;
d0ccebcf 376 char *masterauth;
ed9b544e 377 char *masterhost;
378 int masterport;
40d224a9 379 redisClient *master; /* client that is master for this slave */
ed9b544e 380 int replstate;
285add55 381 unsigned int maxclients;
4ef8de8a 382 unsigned long long maxmemory;
f86a74e9 383 unsigned int blockedclients;
ed9b544e 384 /* Sort parameters - qsort_r() is only available under BSD so we
385 * have to take this state global, in order to pass it to sortCompare() */
386 int sort_desc;
387 int sort_alpha;
388 int sort_bypattern;
75680a3c 389 /* Virtual memory configuration */
390 int vm_enabled;
391 off_t vm_page_size;
392 off_t vm_pages;
4ef8de8a 393 unsigned long long vm_max_memory;
75680a3c 394 /* Virtual memory state */
395 FILE *vm_fp;
396 int vm_fd;
397 off_t vm_next_page; /* Next probably empty page */
398 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 399 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 400 time_t unixtime; /* Unix time sampled every second. */
92f8e882 401 /* Virtual memory I/O threads stuff */
92f8e882 402 /* An I/O thread processes an element taken from the io_jobs queue and
996cb5f7 403 * puts the result of the operation in the io_done list. While the
404 * job is being processed, it's put on io_processing queue.
 * NOTE(review): io_jobs and io_done are not fields of this struct; they
 * presumably refer to io_newjobs and io_processed below -- confirm. */
405 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
406 list *io_processing; /* List of VM I/O jobs being processed */
407 list *io_processed; /* List of VM I/O jobs already processed */
92f8e882 408 list *io_clients; /* All the clients waiting for SWAP I/O operations */
996cb5f7 409 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job
 * NOTE(review): stale names, presumably the
 * io_* job lists above -- confirm */
a5819310 410 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
411 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 412 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 413 int io_active_threads; /* Number of running I/O threads */
414 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 415 /* Our main thread is blocked on the event loop, looking for sockets ready
416 * to be read or written, so when a threaded I/O operation is ready to be
417 * processed by the main thread, the I/O thread will use a unix pipe to
418 * awake the main thread. The following are the two pipe FDs. */
419 int io_ready_pipe_read;
420 int io_ready_pipe_write;
7d98e08c 421 /* Virtual memory stats */
422 unsigned long long vm_stats_used_pages;
423 unsigned long long vm_stats_swapped_objects;
424 unsigned long long vm_stats_swapouts;
425 unsigned long long vm_stats_swapins;
b9bc0eef 426 FILE *devnull;
ed9b544e 427};
428
/* Signature shared by all the command implementations: they receive the
 * client that issued the command and return nothing. */
429typedef void redisCommandProc(redisClient *c);
/* One command table entry (see cmdTable). */
430struct redisCommand {
431 char *name;
432 redisCommandProc *proc;
433 int arity;
434 int flags; /* Combination of the REDIS_CMD_* "Command flags" defines */
435};
436
/* A (name, address) pair describing a function symbol.
 * NOTE(review): presumably used to resolve addresses to names when printing
 * a stack trace -- confirm against the code that fills this in. */
de96dbfe 437struct redisFunctionSym {
438 char *name;
56906eef 439 unsigned long pointer;
de96dbfe 440};
441
/* An element being sorted: the object itself plus the value it is compared
 * by, which is either a numeric score or another object.
 * NOTE(review): which union member is active presumably depends on the
 * server.sort_alpha / sort_bypattern settings -- confirm in sortCompare(). */
ed9b544e 442typedef struct _redisSortObject {
443 robj *obj;
444 union {
445 double score;
446 robj *cmpobj;
447 } u;
448} redisSortObject;
449
/* An operation attached to a sort: an operation type (see the
 * "Sort operations" defines, e.g. REDIS_SORT_GET) and its pattern. */
450typedef struct _redisSortOperation {
451 int type;
452 robj *pattern;
453} redisSortOperation;
454
6b47e12e 455/* ZSETs use a specialized version of Skiplists */
456
/* A skiplist node: a score, the associated object, a single backward
 * pointer and an array of forward pointers.
 * NOTE(review): presumably one forward pointer per level, up to
 * ZSKIPLIST_MAXLEVEL -- confirm in the node creation code. */
457typedef struct zskiplistNode {
458 struct zskiplistNode **forward;
e3870fab 459 struct zskiplistNode *backward;
6b47e12e 460 double score;
461 robj *obj;
462} zskiplistNode;
463
/* The skiplist itself: header and tail nodes, plus the 'length' and 'level'
 * counters.
 * NOTE(review): 'length' is presumably the element count and 'level' the
 * highest level currently in use -- confirm in zslInsert(). */
464typedef struct zskiplist {
e3870fab 465 struct zskiplistNode *header, *tail;
d13f767c 466 unsigned long length;
6b47e12e 467 int level;
468} zskiplist;
469
/* A sorted set is a pair of data structures: a hash table and a skiplist.
 * NOTE(review): presumably the dict maps members to scores while the
 * skiplist keeps them ordered by score -- confirm in zaddCommand(). */
1812e024 470typedef struct zset {
471 dict *dict;
6b47e12e 472 zskiplist *zsl;
1812e024 473} zset;
474
6b47e12e 475/* Our shared "common" objects: a single global instance, 'shared', holding
 * pointers to frequently used objects.
 * NOTE(review): judging by the names these are canned protocol replies and
 * SELECT commands -- confirm where 'shared' is populated. */
476
ed9b544e 477struct sharedObjectsStruct {
c937aa89 478 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 479 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 480 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
481 *outofrangeerr, *plus,
ed9b544e 482 *select0, *select1, *select2, *select3, *select4,
483 *select5, *select6, *select7, *select8, *select9;
484} shared;
485
a7866db6 486/* Global vars that are actually used as constants. The following double
487 * values are used for double on-disk serialization, and are initialized
488 * at runtime to avoid strange compiler optimizations. */
489
490static double R_Zero, R_PosInf, R_NegInf, R_Nan;
491
92f8e882 492/* VM threaded I/O request message */
b9bc0eef 493#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
494#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
495#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
/* A VM threaded I/O job. NOTE(review): the struct tag is spelled 'iojon'
 * while the typedef name is 'iojob'; the tag is left untouched here. */
996cb5f7 496typedef struct iojon {
497 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 498 redisDb *db;/* Redis database */
92f8e882 499 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 500 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
92f8e882 501 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
502 off_t page; /* Swap page where to read/write the object */
b9bc0eef 503 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 504 int canceled; /* True if this command was canceled by blocking side of VM */
505 pthread_t thread; /* ID of the thread processing this entry */
506} iojob;
92f8e882 507
ed9b544e 508/*================================ Prototypes =============================== */
509
510static void freeStringObject(robj *o);
511static void freeListObject(robj *o);
512static void freeSetObject(robj *o);
513static void decrRefCount(void *o);
514static robj *createObject(int type, void *ptr);
515static void freeClient(redisClient *c);
f78fd11b 516static int rdbLoad(char *filename);
ed9b544e 517static void addReply(redisClient *c, robj *obj);
518static void addReplySds(redisClient *c, sds s);
519static void incrRefCount(robj *o);
f78fd11b 520static int rdbSaveBackground(char *filename);
ed9b544e 521static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 522static robj *dupStringObject(robj *o);
87eca727 523static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc);
44b38ef4 524static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 525static int syncWithMaster(void);
10c43610 526static robj *tryObjectSharing(robj *o);
942a3961 527static int tryObjectEncoding(robj *o);
9d65a1bb 528static robj *getDecodedObject(robj *o);
3305306f 529static int removeExpire(redisDb *db, robj *key);
530static int expireIfNeeded(redisDb *db, robj *key);
531static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 532static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 533static int deleteKey(redisDb *db, robj *key);
bb32ede5 534static time_t getExpire(redisDb *db, robj *key);
535static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 536static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 537static void freeMemoryIfNeeded(void);
de96dbfe 538static int processCommand(redisClient *c);
56906eef 539static void setupSigSegvAction(void);
a3b21203 540static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 541static void aofRemoveTempFile(pid_t childpid);
0ea663ea 542static size_t stringObjectLen(robj *o);
638e42ac 543static void processInputBuffer(redisClient *c);
6b47e12e 544static zskiplist *zslCreate(void);
fd8ccf44 545static void zslFree(zskiplist *zsl);
2b59cfdf 546static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 547static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 548static void initClientMultiState(redisClient *c);
549static void freeClientMultiState(redisClient *c);
550static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
4409877e 551static void unblockClient(redisClient *c);
552static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 553static void vmInit(void);
a35ddf12 554static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 555static robj *vmLoadObject(robj *key);
7e69548d 556static robj *vmPreviewObject(robj *key);
a69a0c9c 557static int vmSwapOneObjectBlocking(void);
558static int vmSwapOneObjectThreaded(void);
7e69548d 559static int vmCanSwapOut(void);
a5819310 560static int tryFreeOneObjectFromFreelist(void);
996cb5f7 561static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
562static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
563static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 564static void lockThreadedIO(void);
565static void unlockThreadedIO(void);
566static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
567static void freeIOJob(iojob *j);
568static void queueIOJob(iojob *j);
a5819310 569static int vmWriteObjectOnSwap(robj *o, off_t page);
570static robj *vmReadObjectFromSwap(off_t page, int type);
4ee9488d 571static void waitZeroActiveThreads(void);
ed9b544e 572
abcb223e 573static void authCommand(redisClient *c);
ed9b544e 574static void pingCommand(redisClient *c);
575static void echoCommand(redisClient *c);
576static void setCommand(redisClient *c);
577static void setnxCommand(redisClient *c);
578static void getCommand(redisClient *c);
579static void delCommand(redisClient *c);
580static void existsCommand(redisClient *c);
581static void incrCommand(redisClient *c);
582static void decrCommand(redisClient *c);
583static void incrbyCommand(redisClient *c);
584static void decrbyCommand(redisClient *c);
585static void selectCommand(redisClient *c);
586static void randomkeyCommand(redisClient *c);
587static void keysCommand(redisClient *c);
588static void dbsizeCommand(redisClient *c);
589static void lastsaveCommand(redisClient *c);
590static void saveCommand(redisClient *c);
591static void bgsaveCommand(redisClient *c);
9d65a1bb 592static void bgrewriteaofCommand(redisClient *c);
ed9b544e 593static void shutdownCommand(redisClient *c);
594static void moveCommand(redisClient *c);
595static void renameCommand(redisClient *c);
596static void renamenxCommand(redisClient *c);
597static void lpushCommand(redisClient *c);
598static void rpushCommand(redisClient *c);
599static void lpopCommand(redisClient *c);
600static void rpopCommand(redisClient *c);
601static void llenCommand(redisClient *c);
602static void lindexCommand(redisClient *c);
603static void lrangeCommand(redisClient *c);
604static void ltrimCommand(redisClient *c);
605static void typeCommand(redisClient *c);
606static void lsetCommand(redisClient *c);
607static void saddCommand(redisClient *c);
608static void sremCommand(redisClient *c);
a4460ef4 609static void smoveCommand(redisClient *c);
ed9b544e 610static void sismemberCommand(redisClient *c);
611static void scardCommand(redisClient *c);
12fea928 612static void spopCommand(redisClient *c);
2abb95a9 613static void srandmemberCommand(redisClient *c);
ed9b544e 614static void sinterCommand(redisClient *c);
615static void sinterstoreCommand(redisClient *c);
40d224a9 616static void sunionCommand(redisClient *c);
617static void sunionstoreCommand(redisClient *c);
f4f56e1d 618static void sdiffCommand(redisClient *c);
619static void sdiffstoreCommand(redisClient *c);
ed9b544e 620static void syncCommand(redisClient *c);
621static void flushdbCommand(redisClient *c);
622static void flushallCommand(redisClient *c);
623static void sortCommand(redisClient *c);
624static void lremCommand(redisClient *c);
0f5f7e9a 625static void rpoplpushcommand(redisClient *c);
ed9b544e 626static void infoCommand(redisClient *c);
70003d28 627static void mgetCommand(redisClient *c);
87eca727 628static void monitorCommand(redisClient *c);
3305306f 629static void expireCommand(redisClient *c);
802e8373 630static void expireatCommand(redisClient *c);
f6b141c5 631static void getsetCommand(redisClient *c);
fd88489a 632static void ttlCommand(redisClient *c);
321b0e13 633static void slaveofCommand(redisClient *c);
7f957c92 634static void debugCommand(redisClient *c);
f6b141c5 635static void msetCommand(redisClient *c);
636static void msetnxCommand(redisClient *c);
fd8ccf44 637static void zaddCommand(redisClient *c);
7db723ad 638static void zincrbyCommand(redisClient *c);
cc812361 639static void zrangeCommand(redisClient *c);
50c55df5 640static void zrangebyscoreCommand(redisClient *c);
e3870fab 641static void zrevrangeCommand(redisClient *c);
3c41331e 642static void zcardCommand(redisClient *c);
1b7106e7 643static void zremCommand(redisClient *c);
6e333bbe 644static void zscoreCommand(redisClient *c);
1807985b 645static void zremrangebyscoreCommand(redisClient *c);
6e469882 646static void multiCommand(redisClient *c);
647static void execCommand(redisClient *c);
4409877e 648static void blpopCommand(redisClient *c);
649static void brpopCommand(redisClient *c);
f6b141c5 650
ed9b544e 651/*================================= Globals ================================= */
652
653/* Global vars */
654static struct redisServer server; /* server global state */
/* Command table. Every entry initializes a struct redisCommand positionally
 * as {name, proc, arity, flags}, where flags is a combination of the
 * REDIS_CMD_* defines. The table is terminated by an all NULL/zero entry.
 * NOTE(review): arity is negative for several commands (e.g. "del", "mget");
 * presumably -N means "at least N arguments" -- confirm in processCommand().
 * Note that "smembers" is served by sinterCommand. */
655static struct redisCommand cmdTable[] = {
656 {"get",getCommand,2,REDIS_CMD_INLINE},
3fd78bcd 657 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
658 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
5109cdff 659 {"del",delCommand,-2,REDIS_CMD_INLINE},
ed9b544e 660 {"exists",existsCommand,2,REDIS_CMD_INLINE},
3fd78bcd 661 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
662 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
70003d28 663 {"mget",mgetCommand,-2,REDIS_CMD_INLINE},
3fd78bcd 664 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
665 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
ed9b544e 666 {"rpop",rpopCommand,2,REDIS_CMD_INLINE},
667 {"lpop",lpopCommand,2,REDIS_CMD_INLINE},
b177fd30 668 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE},
669 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE},
ed9b544e 670 {"llen",llenCommand,2,REDIS_CMD_INLINE},
671 {"lindex",lindexCommand,3,REDIS_CMD_INLINE},
3fd78bcd 672 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
ed9b544e 673 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE},
674 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE},
675 {"lrem",lremCommand,4,REDIS_CMD_BULK},
0b13687c 676 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
3fd78bcd 677 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
ed9b544e 678 {"srem",sremCommand,3,REDIS_CMD_BULK},
a4460ef4 679 {"smove",smoveCommand,4,REDIS_CMD_BULK},
ed9b544e 680 {"sismember",sismemberCommand,3,REDIS_CMD_BULK},
681 {"scard",scardCommand,2,REDIS_CMD_INLINE},
12fea928 682 {"spop",spopCommand,2,REDIS_CMD_INLINE},
2abb95a9 683 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE},
3fd78bcd 684 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
685 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
686 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
687 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
688 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
689 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
ed9b544e 690 {"smembers",sinterCommand,2,REDIS_CMD_INLINE},
fd8ccf44 691 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
7db723ad 692 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
1b7106e7 693 {"zrem",zremCommand,3,REDIS_CMD_BULK},
1807985b 694 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE},
752da584 695 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE},
80181f78 696 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE},
752da584 697 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE},
3c41331e 698 {"zcard",zcardCommand,2,REDIS_CMD_INLINE},
6e333bbe 699 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
3fd78bcd 700 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
701 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
f6b141c5 702 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
703 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
704 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM},
ed9b544e 705 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE},
706 {"select",selectCommand,2,REDIS_CMD_INLINE},
707 {"move",moveCommand,3,REDIS_CMD_INLINE},
708 {"rename",renameCommand,3,REDIS_CMD_INLINE},
709 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE},
321b0e13 710 {"expire",expireCommand,3,REDIS_CMD_INLINE},
802e8373 711 {"expireat",expireatCommand,3,REDIS_CMD_INLINE},
ed9b544e 712 {"keys",keysCommand,2,REDIS_CMD_INLINE},
713 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE},
abcb223e 714 {"auth",authCommand,2,REDIS_CMD_INLINE},
ed9b544e 715 {"ping",pingCommand,1,REDIS_CMD_INLINE},
716 {"echo",echoCommand,2,REDIS_CMD_BULK},
717 {"save",saveCommand,1,REDIS_CMD_INLINE},
718 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE},
9d65a1bb 719 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE},
ed9b544e 720 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE},
721 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE},
722 {"type",typeCommand,2,REDIS_CMD_INLINE},
6e469882 723 {"multi",multiCommand,1,REDIS_CMD_INLINE},
724 {"exec",execCommand,1,REDIS_CMD_INLINE},
ed9b544e 725 {"sync",syncCommand,1,REDIS_CMD_INLINE},
726 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE},
727 {"flushall",flushallCommand,1,REDIS_CMD_INLINE},
3fd78bcd 728 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM},
ed9b544e 729 {"info",infoCommand,1,REDIS_CMD_INLINE},
87eca727 730 {"monitor",monitorCommand,1,REDIS_CMD_INLINE},
fd88489a 731 {"ttl",ttlCommand,2,REDIS_CMD_INLINE},
321b0e13 732 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE},
7f957c92 733 {"debug",debugCommand,-2,REDIS_CMD_INLINE},
ed9b544e 734 {NULL,NULL,0,0}
735};
bcfc686d 736
ed9b544e 737/*============================ Utility functions ============================ */
738
/* Glob-style pattern matching.
 *
 * Matches the first stringLen bytes of 'string' against the first patternLen
 * bytes of 'pattern'. Every read of either buffer is bounded by the
 * corresponding length, so neither has to be NUL terminated.
 *
 * Supported syntax:
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of the set; [^abc] negates the set, [a-z] is a
 *           range, and \x inside the brackets takes x literally
 *   \x      the byte x taken literally
 *
 * When 'nocase' is non-zero, literals and ranges are compared after
 * tolower(). Returns 1 on match, 0 otherwise. */
int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of '*' into one, never looking past the
             * end of the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing '*' matches whatever is left */
            /* Try to match the rest of the pattern at every suffix. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A character class always consumes one byte of input. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen >= 2 && pattern[0] == '\\') {
                    /* Escaped class member. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * advance after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs one byte of input to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Input exhausted: only trailing '*' may remain. */
            while(patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
861
56906eef 862static void redisLog(int level, const char *fmt, ...) {
ed9b544e 863 va_list ap;
864 FILE *fp;
865
866 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
867 if (!fp) return;
868
869 va_start(ap, fmt);
870 if (level >= server.verbosity) {
871 char *c = ".-*";
1904ecc1 872 char buf[64];
873 time_t now;
874
875 now = time(NULL);
6c9385e0 876 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
1904ecc1 877 fprintf(fp,"%s %c ",buf,c[level]);
ed9b544e 878 vfprintf(fp, fmt, ap);
879 fprintf(fp,"\n");
880 fflush(fp);
881 }
882 va_end(ap);
883
884 if (server.logfile) fclose(fp);
885}
886
887/*====================== Hash table type implementation ==================== */
888
889/* This is a hash table type that uses the SDS dynamic strings library as
890 * keys and Redis objects as values (objects can hold SDS strings,
891 * lists, sets). */
892
/* Dict destructor callback that releases 'val' with zfree().
 * 'privdata' is ignored. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
898
4409877e 899static void dictListDestructor(void *privdata, void *val)
900{
901 DICT_NOTUSED(privdata);
902 listRelease((list*)val);
903}
904
ed9b544e 905static int sdsDictKeyCompare(void *privdata, const void *key1,
906 const void *key2)
907{
908 int l1,l2;
909 DICT_NOTUSED(privdata);
910
911 l1 = sdslen((sds)key1);
912 l2 = sdslen((sds)key2);
913 if (l1 != l2) return 0;
914 return memcmp(key1, key2, l1) == 0;
915}
916
/* Dict destructor callback for Redis objects: drops one reference.
 * A NULL value is legal (values of swapped out keys are set to NULL) and is
 * left alone. 'privdata' is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL)
        decrRefCount(val);
}
924
942a3961 925static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 926 const void *key2)
927{
928 const robj *o1 = key1, *o2 = key2;
929 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
930}
931
942a3961 932static unsigned int dictObjHash(const void *key) {
ed9b544e 933 const robj *o = key;
934 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
935}
936
942a3961 937static int dictEncObjKeyCompare(void *privdata, const void *key1,
938 const void *key2)
939{
9d65a1bb 940 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
941 int cmp;
942a3961 942
9d65a1bb 943 o1 = getDecodedObject(o1);
944 o2 = getDecodedObject(o2);
945 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
946 decrRefCount(o1);
947 decrRefCount(o2);
948 return cmp;
942a3961 949}
950
951static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 952 robj *o = (robj*) key;
942a3961 953
9d65a1bb 954 o = getDecodedObject(o);
955 unsigned int hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
956 decrRefCount(o);
957 return hash;
942a3961 958}
959
f2d9f50f 960/* Sets type and expires */
ed9b544e 961static dictType setDictType = {
942a3961 962 dictEncObjHash, /* hash function */
ed9b544e 963 NULL, /* key dup */
964 NULL, /* val dup */
942a3961 965 dictEncObjKeyCompare, /* key compare */
ed9b544e 966 dictRedisObjectDestructor, /* key destructor */
967 NULL /* val destructor */
968};
969
f2d9f50f 970/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1812e024 971static dictType zsetDictType = {
972 dictEncObjHash, /* hash function */
973 NULL, /* key dup */
974 NULL, /* val dup */
975 dictEncObjKeyCompare, /* key compare */
976 dictRedisObjectDestructor, /* key destructor */
da0a1620 977 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 978};
979
f2d9f50f 980/* Db->dict */
ed9b544e 981static dictType hashDictType = {
942a3961 982 dictObjHash, /* hash function */
ed9b544e 983 NULL, /* key dup */
984 NULL, /* val dup */
942a3961 985 dictObjKeyCompare, /* key compare */
ed9b544e 986 dictRedisObjectDestructor, /* key destructor */
987 dictRedisObjectDestructor /* val destructor */
988};
989
f2d9f50f 990/* Db->expires */
991static dictType keyptrDictType = {
992 dictObjHash, /* hash function */
993 NULL, /* key dup */
994 NULL, /* val dup */
995 dictObjKeyCompare, /* key compare */
996 dictRedisObjectDestructor, /* key destructor */
997 NULL /* val destructor */
998};
999
4409877e 1000/* Keylist hash table type has unencoded redis objects as keys and
1001 * lists as values. It's used for blocking operations (BLPOP) */
1002static dictType keylistDictType = {
1003 dictObjHash, /* hash function */
1004 NULL, /* key dup */
1005 NULL, /* val dup */
1006 dictObjKeyCompare, /* key compare */
1007 dictRedisObjectDestructor, /* key destructor */
1008 dictListDestructor /* val destructor */
1009};
1010
ed9b544e 1011/* ========================= Random utility functions ======================= */
1012
1013/* Redis generally does not try to recover from out of memory conditions
1014 * when allocating objects or strings, it is not clear if it will be possible
1015 * to report this condition to the client since the networking layer itself
1016 * is based on heap allocation for send buffers, so we simply abort.
1017 * At least the code will be simpler to read... */
1018static void oom(const char *msg) {
71c54b21 1019 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1020 sleep(1);
1021 abort();
1022}
1023
1024/* ====================== Redis server networking stuff ===================== */
56906eef 1025static void closeTimedoutClients(void) {
ed9b544e 1026 redisClient *c;
ed9b544e 1027 listNode *ln;
1028 time_t now = time(NULL);
c7df85a4 1029 listIter li;
ed9b544e 1030
c7df85a4 1031 listRewind(server.clients,&li);
1032 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1033 c = listNodeValue(ln);
f86a74e9 1034 if (server.maxidletime &&
1035 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1036 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
f86a74e9 1037 (now - c->lastinteraction > server.maxidletime))
1038 {
f870935d 1039 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1040 freeClient(c);
f86a74e9 1041 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1042 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1043 addReply(c,shared.nullmultibulk);
f86a74e9 1044 unblockClient(c);
1045 }
ed9b544e 1046 }
1047 }
ed9b544e 1048}
1049
12fea928 1050static int htNeedsResize(dict *dict) {
1051 long long size, used;
1052
1053 size = dictSlots(dict);
1054 used = dictSize(dict);
1055 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1056 (used*100/size < REDIS_HT_MINFILL));
1057}
1058
0bc03378 1059/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1060 * we resize the hash table to save memory */
56906eef 1061static void tryResizeHashTables(void) {
0bc03378 1062 int j;
1063
1064 for (j = 0; j < server.dbnum; j++) {
12fea928 1065 if (htNeedsResize(server.db[j].dict)) {
f870935d 1066 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
0bc03378 1067 dictResize(server.db[j].dict);
f870935d 1068 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
0bc03378 1069 }
12fea928 1070 if (htNeedsResize(server.db[j].expires))
1071 dictResize(server.db[j].expires);
0bc03378 1072 }
1073}
1074
9d65a1bb 1075/* A background saving child (BGSAVE) terminated its work. Handle this. */
1076void backgroundSaveDoneHandler(int statloc) {
1077 int exitcode = WEXITSTATUS(statloc);
1078 int bysignal = WIFSIGNALED(statloc);
1079
1080 if (!bysignal && exitcode == 0) {
1081 redisLog(REDIS_NOTICE,
1082 "Background saving terminated with success");
1083 server.dirty = 0;
1084 server.lastsave = time(NULL);
1085 } else if (!bysignal && exitcode != 0) {
1086 redisLog(REDIS_WARNING, "Background saving error");
1087 } else {
1088 redisLog(REDIS_WARNING,
1089 "Background saving terminated by signal");
1090 rdbRemoveTempFile(server.bgsavechildpid);
1091 }
1092 server.bgsavechildpid = -1;
1093 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1094 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1095 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1096}
1097
1098/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1099 * Handle this. */
1100void backgroundRewriteDoneHandler(int statloc) {
1101 int exitcode = WEXITSTATUS(statloc);
1102 int bysignal = WIFSIGNALED(statloc);
1103
1104 if (!bysignal && exitcode == 0) {
1105 int fd;
1106 char tmpfile[256];
1107
1108 redisLog(REDIS_NOTICE,
1109 "Background append only file rewriting terminated with success");
1110 /* Now it's time to flush the differences accumulated by the parent */
1111 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1112 fd = open(tmpfile,O_WRONLY|O_APPEND);
1113 if (fd == -1) {
1114 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1115 goto cleanup;
1116 }
1117 /* Flush our data... */
1118 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1119 (signed) sdslen(server.bgrewritebuf)) {
1120 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1121 close(fd);
1122 goto cleanup;
1123 }
b32627cd 1124 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1125 /* Now our work is to rename the temp file into the stable file. And
1126 * switch the file descriptor used by the server for append only. */
1127 if (rename(tmpfile,server.appendfilename) == -1) {
1128 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1129 close(fd);
1130 goto cleanup;
1131 }
1132 /* Mission completed... almost */
1133 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1134 if (server.appendfd != -1) {
1135 /* If append only is actually enabled... */
1136 close(server.appendfd);
1137 server.appendfd = fd;
1138 fsync(fd);
85a83172 1139 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1140 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1141 } else {
1142 /* If append only is disabled we just generate a dump in this
1143 * format. Why not? */
1144 close(fd);
1145 }
1146 } else if (!bysignal && exitcode != 0) {
1147 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1148 } else {
1149 redisLog(REDIS_WARNING,
1150 "Background append only file rewriting terminated by signal");
1151 }
1152cleanup:
1153 sdsfree(server.bgrewritebuf);
1154 server.bgrewritebuf = sdsempty();
1155 aofRemoveTempFile(server.bgrewritechildpid);
1156 server.bgrewritechildpid = -1;
1157}
1158
/* Periodic housekeeping, registered as a time event by initServer().
 *
 * In order, every call:
 *  - caches the current unix time and the amount of used memory;
 *  - every 5th call logs per-database and client statistics;
 *  - resizes sparse hash tables, unless a background save is running;
 *  - closes timed out clients and unblocks expired blocking clients;
 *  - reaps a terminated BGSAVE / BGREWRITEAOF child or, when no child is
 *    running, starts a background save if a save point was reached;
 *  - expires a sample of volatile keys in every database;
 *  - frees or swaps out objects while over the VM memory limit;
 *  - tries to sync with the master when a connection is pending.
 *
 * The three arguments are unused. Returns 1000.
 * NOTE(review): presumably the delay in milliseconds before the event loop
 * fires this timer again -- confirm against the ae time event API. */
56906eef 1159static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1160 int j, loops = server.cronloops++;
ed9b544e 1161 REDIS_NOTUSED(eventLoop);
1162 REDIS_NOTUSED(id);
1163 REDIS_NOTUSED(clientData);
1164
3a66edc7 1165 /* We take a cached value of the unix time in the global state because
1166 * with virtual memory and aging there is to store the current time
1167 * in objects at every object access, and accuracy is not needed.
1168 * To access a global var is faster than calling time(NULL) */
1169 server.unixtime = time(NULL);
1170
ed9b544e 1171 /* Update the global state with the amount of used memory */
1172 server.usedmemory = zmalloc_used_memory();
1173
0bc03378 1174 /* Show some info about non-empty databases */
ed9b544e 1175 for (j = 0; j < server.dbnum; j++) {
dec423d9 1176 long long size, used, vkeys;
94754ccc 1177
3305306f 1178 size = dictSlots(server.db[j].dict);
1179 used = dictSize(server.db[j].dict);
94754ccc 1180 vkeys = dictSize(server.db[j].expires);
/* Log only once every 5 calls, and only for databases holding keys. */
c3cb078d 1181 if (!(loops % 5) && (used || vkeys)) {
f870935d 1182 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1183 /* dictPrintStats(server.dict); */
ed9b544e 1184 }
ed9b544e 1185 }
1186
0bc03378 1187 /* We don't want to resize the hash tables while a background saving
1188 * is in progress: the saving child is created using fork() that is
1189 * implemented with a copy-on-write semantic in most modern systems, so
1190 * if we resize the HT while there is the saving child at work actually
1191 * a lot of memory movements in the parent will cause a lot of pages
1192 * copied. */
9d65a1bb 1193 if (server.bgsavechildpid == -1) tryResizeHashTables();
0bc03378 1194
ed9b544e 1195 /* Show information about connected clients */
1196 if (!(loops % 5)) {
f870935d 1197 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use, %d shared objects",
ed9b544e 1198 listLength(server.clients)-listLength(server.slaves),
1199 listLength(server.slaves),
10c43610 1200 server.usedmemory,
3305306f 1201 dictSize(server.sharingpool));
ed9b544e 1202 }
1203
1204 /* Close connections of timedout clients */
/* With an idle timeout configured this runs every 10th call; while any
 * client is blocked it runs on every call so that blocking timeouts are
 * honoured promptly. */
f86a74e9 1205 if ((server.maxidletime && !(loops % 10)) || server.blockedclients)
ed9b544e 1206 closeTimedoutClients();
1207
9d65a1bb 1208 /* Check if a background saving or AOF rewrite in progress terminated */
1209 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1210 int statloc;
9d65a1bb 1211 pid_t pid;
1212
/* WNOHANG: wait3() returns 0 right away when no child has terminated.
 * NOTE(review): any non-zero pid other than the BGSAVE child -- including
 * -1 on wait3() failure, in which case statloc is not set -- is handled as
 * a finished AOF rewrite. Confirm that this is intended. */
1213 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1214 if (pid == server.bgsavechildpid) {
1215 backgroundSaveDoneHandler(statloc);
ed9b544e 1216 } else {
9d65a1bb 1217 backgroundRewriteDoneHandler(statloc);
ed9b544e 1218 }
ed9b544e 1219 }
1220 } else {
1221 /* If there is not a background saving in progress check if
1222 * we have to save now */
1223 time_t now = time(NULL);
1224 for (j = 0; j < server.saveparamslen; j++) {
1225 struct saveparam *sp = server.saveparams+j;
1226
/* The first save point that is satisfied starts the save. */
1227 if (server.dirty >= sp->changes &&
1228 now-server.lastsave > sp->seconds) {
1229 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1230 sp->changes, sp->seconds);
f78fd11b 1231 rdbSaveBackground(server.dbfilename);
ed9b544e 1232 break;
1233 }
1234 }
1235 }
94754ccc 1236
f2324293 1237 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1238 * will use few CPU cycles if there are few expiring keys, otherwise
1239 * it will get more aggressive to avoid that too much memory is used by
1240 * keys that can be removed from the keyspace. */
94754ccc 1241 for (j = 0; j < server.dbnum; j++) {
f2324293 1242 int expired;
94754ccc 1243 redisDb *db = server.db+j;
94754ccc 1244
f2324293 1245 /* Continue to expire if at the end of the cycle more than 25%
1246 * of the keys were expired. */
1247 do {
4ef8de8a 1248 long num = dictSize(db->expires);
94754ccc 1249 time_t now = time(NULL);
1250
f2324293 1251 expired = 0;
/* Sample at most REDIS_EXPIRELOOKUPS_PER_CRON random entries per round. */
94754ccc 1252 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1253 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1254 while (num--) {
1255 dictEntry *de;
1256 time_t t;
1257
1258 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
/* The entry value itself is read as the expire time. */
1259 t = (time_t) dictGetEntryVal(de);
1260 if (now > t) {
1261 deleteKey(db,dictGetEntryKey(de));
f2324293 1262 expired++;
94754ccc 1263 }
1264 }
f2324293 1265 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1266 }
1267
4ef8de8a 1268 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1269 * is enabled. Try to free objects from the free list first. */
7e69548d 1270 if (vmCanSwapOut()) {
1271 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1272 server.vm_max_memory)
1273 {
72e9fd40 1274 int retval;
1275
a5819310 1276 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
/* Blocking swap when no I/O threads are configured, threaded otherwise. */
72e9fd40 1277 retval = (server.vm_max_threads == 0) ?
1278 vmSwapOneObjectBlocking() :
1279 vmSwapOneObjectThreaded();
/* The warning is only considered on calls where loops is a multiple of 30. */
1280 if (retval == REDIS_ERR && (loops % 30) == 0 &&
1281 zmalloc_used_memory() >
1282 (server.vm_max_memory+server.vm_max_memory/10))
1283 {
1284 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1285 }
72e9fd40 1286 /* Note that when using threaded I/O we free just one object,
1287 * because anyway when the I/O thread in charge to swap this
1288 * object out will finish, the handler of completed jobs
1289 * will try to swap more objects if we are still out of memory. */
1290 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1291 }
1292 }
1293
ed9b544e 1294 /* Check if we should connect to a MASTER */
1295 if (server.replstate == REDIS_REPL_CONNECT) {
1296 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1297 if (syncWithMaster() == REDIS_OK) {
1298 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1299 }
1300 }
1301 return 1000;
1302}
1303
1304static void createSharedObjects(void) {
1305 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1306 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1307 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1308 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1309 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1310 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1311 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1312 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1313 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1314 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1315 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1316 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1317 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1318 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1319 "-ERR no such key\r\n"));
ed9b544e 1320 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1321 "-ERR syntax error\r\n"));
c937aa89 1322 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1323 "-ERR source and destination objects are the same\r\n"));
1324 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1325 "-ERR index out of range\r\n"));
ed9b544e 1326 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1327 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1328 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1329 shared.select0 = createStringObject("select 0\r\n",10);
1330 shared.select1 = createStringObject("select 1\r\n",10);
1331 shared.select2 = createStringObject("select 2\r\n",10);
1332 shared.select3 = createStringObject("select 3\r\n",10);
1333 shared.select4 = createStringObject("select 4\r\n",10);
1334 shared.select5 = createStringObject("select 5\r\n",10);
1335 shared.select6 = createStringObject("select 6\r\n",10);
1336 shared.select7 = createStringObject("select 7\r\n",10);
1337 shared.select8 = createStringObject("select 8\r\n",10);
1338 shared.select9 = createStringObject("select 9\r\n",10);
1339}
1340
1341static void appendServerSaveParams(time_t seconds, int changes) {
1342 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1343 server.saveparams[server.saveparamslen].seconds = seconds;
1344 server.saveparams[server.saveparamslen].changes = changes;
1345 server.saveparamslen++;
1346}
1347
bcfc686d 1348static void resetServerSaveParams() {
ed9b544e 1349 zfree(server.saveparams);
1350 server.saveparams = NULL;
1351 server.saveparamslen = 0;
1352}
1353
1354static void initServerConfig() {
1355 server.dbnum = REDIS_DEFAULT_DBNUM;
1356 server.port = REDIS_SERVERPORT;
f870935d 1357 server.verbosity = REDIS_VERBOSE;
ed9b544e 1358 server.maxidletime = REDIS_MAXIDLETIME;
1359 server.saveparams = NULL;
1360 server.logfile = NULL; /* NULL = log on standard output */
1361 server.bindaddr = NULL;
1362 server.glueoutputbuf = 1;
1363 server.daemonize = 0;
44b38ef4 1364 server.appendonly = 0;
4e141d5a 1365 server.appendfsync = APPENDFSYNC_ALWAYS;
48f0308a 1366 server.lastfsync = time(NULL);
44b38ef4 1367 server.appendfd = -1;
1368 server.appendseldb = -1; /* Make sure the first time will not match */
ed329fcf 1369 server.pidfile = "/var/run/redis.pid";
ed9b544e 1370 server.dbfilename = "dump.rdb";
9d65a1bb 1371 server.appendfilename = "appendonly.aof";
abcb223e 1372 server.requirepass = NULL;
10c43610 1373 server.shareobjects = 0;
b0553789 1374 server.rdbcompression = 1;
21aecf4b 1375 server.sharingpoolsize = 1024;
285add55 1376 server.maxclients = 0;
f86a74e9 1377 server.blockedclients = 0;
3fd78bcd 1378 server.maxmemory = 0;
75680a3c 1379 server.vm_enabled = 0;
1380 server.vm_page_size = 256; /* 256 bytes per page */
1381 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1382 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1383 server.vm_max_threads = 4;
75680a3c 1384
bcfc686d 1385 resetServerSaveParams();
ed9b544e 1386
1387 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1388 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1389 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1390 /* Replication related */
1391 server.isslave = 0;
d0ccebcf 1392 server.masterauth = NULL;
ed9b544e 1393 server.masterhost = NULL;
1394 server.masterport = 6379;
1395 server.master = NULL;
1396 server.replstate = REDIS_REPL_NONE;
a7866db6 1397
1398 /* Double constants initialization */
1399 R_Zero = 0.0;
1400 R_PosInf = 1.0/R_Zero;
1401 R_NegInf = -1.0/R_Zero;
1402 R_Nan = R_Zero/R_Zero;
ed9b544e 1403}
1404
1405static void initServer() {
1406 int j;
1407
1408 signal(SIGHUP, SIG_IGN);
1409 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1410 setupSigSegvAction();
ed9b544e 1411
b9bc0eef 1412 server.devnull = fopen("/dev/null","w");
1413 if (server.devnull == NULL) {
1414 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1415 exit(1);
1416 }
ed9b544e 1417 server.clients = listCreate();
1418 server.slaves = listCreate();
87eca727 1419 server.monitors = listCreate();
ed9b544e 1420 server.objfreelist = listCreate();
1421 createSharedObjects();
1422 server.el = aeCreateEventLoop();
3305306f 1423 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
10c43610 1424 server.sharingpool = dictCreate(&setDictType,NULL);
ed9b544e 1425 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1426 if (server.fd == -1) {
1427 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1428 exit(1);
1429 }
3305306f 1430 for (j = 0; j < server.dbnum; j++) {
1431 server.db[j].dict = dictCreate(&hashDictType,NULL);
f2d9f50f 1432 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
4409877e 1433 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
3305306f 1434 server.db[j].id = j;
1435 }
ed9b544e 1436 server.cronloops = 0;
9f3c422c 1437 server.bgsavechildpid = -1;
9d65a1bb 1438 server.bgrewritechildpid = -1;
1439 server.bgrewritebuf = sdsempty();
ed9b544e 1440 server.lastsave = time(NULL);
1441 server.dirty = 0;
1442 server.usedmemory = 0;
1443 server.stat_numcommands = 0;
1444 server.stat_numconnections = 0;
1445 server.stat_starttime = time(NULL);
3a66edc7 1446 server.unixtime = time(NULL);
d8f8b666 1447 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1448 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1449 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1450
1451 if (server.appendonly) {
71eba477 1452 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1453 if (server.appendfd == -1) {
1454 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1455 strerror(errno));
1456 exit(1);
1457 }
1458 }
75680a3c 1459
1460 if (server.vm_enabled) vmInit();
ed9b544e 1461}
1462
1463/* Empty the whole database */
ca37e9cd 1464static long long emptyDb() {
ed9b544e 1465 int j;
ca37e9cd 1466 long long removed = 0;
ed9b544e 1467
3305306f 1468 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1469 removed += dictSize(server.db[j].dict);
3305306f 1470 dictEmpty(server.db[j].dict);
1471 dictEmpty(server.db[j].expires);
1472 }
ca37e9cd 1473 return removed;
ed9b544e 1474}
1475
/* Convert a "yes"/"no" string (case insensitive) to a boolean.
 * Returns 1 for "yes", 0 for "no" and -1 for anything else. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1481
ed9b544e 1482/* I agree, this is a very rudimentary way to load a configuration...
1483 it will be improved later if the config gets more complex */
1484static void loadServerConfig(char *filename) {
c9a111ac 1485 FILE *fp;
ed9b544e 1486 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1487 int linenum = 0;
1488 sds line = NULL;
c9a111ac 1489
1490 if (filename[0] == '-' && filename[1] == '\0')
1491 fp = stdin;
1492 else {
1493 if ((fp = fopen(filename,"r")) == NULL) {
1494 redisLog(REDIS_WARNING,"Fatal error, can't open config file");
1495 exit(1);
1496 }
ed9b544e 1497 }
c9a111ac 1498
ed9b544e 1499 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1500 sds *argv;
1501 int argc, j;
1502
1503 linenum++;
1504 line = sdsnew(buf);
1505 line = sdstrim(line," \t\r\n");
1506
1507 /* Skip comments and blank lines*/
1508 if (line[0] == '#' || line[0] == '\0') {
1509 sdsfree(line);
1510 continue;
1511 }
1512
1513 /* Split into arguments */
1514 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1515 sdstolower(argv[0]);
1516
1517 /* Execute config directives */
bb0b03a3 1518 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1519 server.maxidletime = atoi(argv[1]);
0150db36 1520 if (server.maxidletime < 0) {
ed9b544e 1521 err = "Invalid timeout value"; goto loaderr;
1522 }
bb0b03a3 1523 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1524 server.port = atoi(argv[1]);
1525 if (server.port < 1 || server.port > 65535) {
1526 err = "Invalid port"; goto loaderr;
1527 }
bb0b03a3 1528 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1529 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1530 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1531 int seconds = atoi(argv[1]);
1532 int changes = atoi(argv[2]);
1533 if (seconds < 1 || changes < 0) {
1534 err = "Invalid save parameters"; goto loaderr;
1535 }
1536 appendServerSaveParams(seconds,changes);
bb0b03a3 1537 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1538 if (chdir(argv[1]) == -1) {
1539 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1540 argv[1], strerror(errno));
1541 exit(1);
1542 }
bb0b03a3 1543 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1544 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1545 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1546 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1547 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1548 else {
1549 err = "Invalid log level. Must be one of debug, notice, warning";
1550 goto loaderr;
1551 }
bb0b03a3 1552 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1553 FILE *logfp;
ed9b544e 1554
1555 server.logfile = zstrdup(argv[1]);
bb0b03a3 1556 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1557 zfree(server.logfile);
1558 server.logfile = NULL;
1559 }
1560 if (server.logfile) {
1561 /* Test if we are able to open the file. The server will not
1562 * be able to abort just for this problem later... */
c9a111ac 1563 logfp = fopen(server.logfile,"a");
1564 if (logfp == NULL) {
ed9b544e 1565 err = sdscatprintf(sdsempty(),
1566 "Can't open the log file: %s", strerror(errno));
1567 goto loaderr;
1568 }
c9a111ac 1569 fclose(logfp);
ed9b544e 1570 }
bb0b03a3 1571 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1572 server.dbnum = atoi(argv[1]);
1573 if (server.dbnum < 1) {
1574 err = "Invalid number of databases"; goto loaderr;
1575 }
285add55 1576 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1577 server.maxclients = atoi(argv[1]);
3fd78bcd 1578 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
d4465900 1579 server.maxmemory = strtoll(argv[1], NULL, 10);
bb0b03a3 1580 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1581 server.masterhost = sdsnew(argv[1]);
1582 server.masterport = atoi(argv[2]);
1583 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1584 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1585 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1586 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1587 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1588 err = "argument must be 'yes' or 'no'"; goto loaderr;
1589 }
bb0b03a3 1590 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
85dd2f3a 1591 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
10c43610 1592 err = "argument must be 'yes' or 'no'"; goto loaderr;
1593 }
121f70cf 1594 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1595 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1596 err = "argument must be 'yes' or 'no'"; goto loaderr;
1597 }
e52c65b9 1598 } else if (!strcasecmp(argv[0],"shareobjectspoolsize") && argc == 2) {
1599 server.sharingpoolsize = atoi(argv[1]);
1600 if (server.sharingpoolsize < 1) {
1601 err = "invalid object sharing pool size"; goto loaderr;
1602 }
bb0b03a3 1603 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1604 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1605 err = "argument must be 'yes' or 'no'"; goto loaderr;
1606 }
44b38ef4 1607 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1608 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1609 err = "argument must be 'yes' or 'no'"; goto loaderr;
1610 }
48f0308a 1611 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1612 if (!strcasecmp(argv[1],"no")) {
48f0308a 1613 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1614 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1615 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1616 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1617 server.appendfsync = APPENDFSYNC_EVERYSEC;
1618 } else {
1619 err = "argument must be 'no', 'always' or 'everysec'";
1620 goto loaderr;
1621 }
bb0b03a3 1622 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
abcb223e 1623 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1624 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
ed329fcf 1625 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1626 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
b8b553c8 1627 server.dbfilename = zstrdup(argv[1]);
75680a3c 1628 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1629 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1630 err = "argument must be 'yes' or 'no'"; goto loaderr;
1631 }
4ef8de8a 1632 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1633 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1634 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1635 server.vm_page_size = strtoll(argv[1], NULL, 10);
1636 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1637 server.vm_pages = strtoll(argv[1], NULL, 10);
92f8e882 1638 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1639 server.vm_max_threads = strtoll(argv[1], NULL, 10);
ed9b544e 1640 } else {
1641 err = "Bad directive or wrong number of arguments"; goto loaderr;
1642 }
1643 for (j = 0; j < argc; j++)
1644 sdsfree(argv[j]);
1645 zfree(argv);
1646 sdsfree(line);
1647 }
c9a111ac 1648 if (fp != stdin) fclose(fp);
ed9b544e 1649 return;
1650
1651loaderr:
1652 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1653 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1654 fprintf(stderr, ">>> '%s'\n", line);
1655 fprintf(stderr, "%s\n", err);
1656 exit(1);
1657}
1658
1659static void freeClientArgv(redisClient *c) {
1660 int j;
1661
1662 for (j = 0; j < c->argc; j++)
1663 decrRefCount(c->argv[j]);
e8a74421 1664 for (j = 0; j < c->mbargc; j++)
1665 decrRefCount(c->mbargv[j]);
ed9b544e 1666 c->argc = 0;
e8a74421 1667 c->mbargc = 0;
ed9b544e 1668}
1669
1670static void freeClient(redisClient *c) {
1671 listNode *ln;
1672
4409877e 1673 /* Note that if the client we are freeing is blocked into a blocking
1674 * call, we have to set querybuf to NULL *before* to call unblockClient()
1675 * to avoid processInputBuffer() will get called. Also it is important
1676 * to remove the file events after this, because this call adds
1677 * the READABLE event. */
1678 sdsfree(c->querybuf);
1679 c->querybuf = NULL;
1680 if (c->flags & REDIS_BLOCKED)
1681 unblockClient(c);
1682
ed9b544e 1683 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1684 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 1685 listRelease(c->reply);
1686 freeClientArgv(c);
1687 close(c->fd);
92f8e882 1688 /* Remove from the list of clients */
ed9b544e 1689 ln = listSearchKey(server.clients,c);
dfc5e96c 1690 redisAssert(ln != NULL);
ed9b544e 1691 listDelNode(server.clients,ln);
92f8e882 1692 /* Remove from the list of clients waiting for VM operations */
1693 if (server.vm_enabled && listLength(c->io_keys)) {
1694 ln = listSearchKey(server.io_clients,c);
1695 if (ln) listDelNode(server.io_clients,ln);
1696 listRelease(c->io_keys);
1697 }
b3e3d0d7 1698 listRelease(c->io_keys);
92f8e882 1699 /* Other cleanup */
ed9b544e 1700 if (c->flags & REDIS_SLAVE) {
6208b3a7 1701 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1702 close(c->repldbfd);
87eca727 1703 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1704 ln = listSearchKey(l,c);
dfc5e96c 1705 redisAssert(ln != NULL);
87eca727 1706 listDelNode(l,ln);
ed9b544e 1707 }
1708 if (c->flags & REDIS_MASTER) {
1709 server.master = NULL;
1710 server.replstate = REDIS_REPL_CONNECT;
1711 }
93ea3759 1712 zfree(c->argv);
e8a74421 1713 zfree(c->mbargv);
6e469882 1714 freeClientMultiState(c);
ed9b544e 1715 zfree(c);
1716}
1717
cc30e368 1718#define GLUEREPLY_UP_TO (1024)
ed9b544e 1719static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 1720 int copylen = 0;
1721 char buf[GLUEREPLY_UP_TO];
6208b3a7 1722 listNode *ln;
c7df85a4 1723 listIter li;
ed9b544e 1724 robj *o;
1725
c7df85a4 1726 listRewind(c->reply,&li);
1727 while((ln = listNext(&li))) {
c28b42ac 1728 int objlen;
1729
ed9b544e 1730 o = ln->value;
c28b42ac 1731 objlen = sdslen(o->ptr);
1732 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1733 memcpy(buf+copylen,o->ptr,objlen);
1734 copylen += objlen;
ed9b544e 1735 listDelNode(c->reply,ln);
c28b42ac 1736 } else {
1737 if (copylen == 0) return;
1738 break;
ed9b544e 1739 }
ed9b544e 1740 }
c28b42ac 1741 /* Now the output buffer is empty, add the new single element */
1742 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1743 listAddNodeHead(c->reply,o);
ed9b544e 1744}
1745
1746static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1747 redisClient *c = privdata;
1748 int nwritten = 0, totwritten = 0, objlen;
1749 robj *o;
1750 REDIS_NOTUSED(el);
1751 REDIS_NOTUSED(mask);
1752
2895e862 1753 /* Use writev() if we have enough buffers to send */
7ea870c0 1754 if (!server.glueoutputbuf &&
1755 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1756 !(c->flags & REDIS_MASTER))
2895e862 1757 {
1758 sendReplyToClientWritev(el, fd, privdata, mask);
1759 return;
1760 }
2895e862 1761
ed9b544e 1762 while(listLength(c->reply)) {
c28b42ac 1763 if (server.glueoutputbuf && listLength(c->reply) > 1)
1764 glueReplyBuffersIfNeeded(c);
1765
ed9b544e 1766 o = listNodeValue(listFirst(c->reply));
1767 objlen = sdslen(o->ptr);
1768
1769 if (objlen == 0) {
1770 listDelNode(c->reply,listFirst(c->reply));
1771 continue;
1772 }
1773
1774 if (c->flags & REDIS_MASTER) {
6f376729 1775 /* Don't reply to a master */
ed9b544e 1776 nwritten = objlen - c->sentlen;
1777 } else {
a4d1ba9a 1778 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 1779 if (nwritten <= 0) break;
1780 }
1781 c->sentlen += nwritten;
1782 totwritten += nwritten;
1783 /* If we fully sent the object on head go to the next one */
1784 if (c->sentlen == objlen) {
1785 listDelNode(c->reply,listFirst(c->reply));
1786 c->sentlen = 0;
1787 }
6f376729 1788 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 1789 * bytes, in a single threaded server it's a good idea to serve
6f376729 1790 * other clients as well, even if a very large request comes from
1791 * super fast link that is always able to accept data (in real world
12f9d551 1792 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 1793 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 1794 }
1795 if (nwritten == -1) {
1796 if (errno == EAGAIN) {
1797 nwritten = 0;
1798 } else {
f870935d 1799 redisLog(REDIS_VERBOSE,
ed9b544e 1800 "Error writing to client: %s", strerror(errno));
1801 freeClient(c);
1802 return;
1803 }
1804 }
1805 if (totwritten > 0) c->lastinteraction = time(NULL);
1806 if (listLength(c->reply) == 0) {
1807 c->sentlen = 0;
1808 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1809 }
1810}
1811
2895e862 1812static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
1813{
1814 redisClient *c = privdata;
1815 int nwritten = 0, totwritten = 0, objlen, willwrite;
1816 robj *o;
1817 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
1818 int offset, ion = 0;
1819 REDIS_NOTUSED(el);
1820 REDIS_NOTUSED(mask);
1821
1822 listNode *node;
1823 while (listLength(c->reply)) {
1824 offset = c->sentlen;
1825 ion = 0;
1826 willwrite = 0;
1827
1828 /* fill-in the iov[] array */
1829 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
1830 o = listNodeValue(node);
1831 objlen = sdslen(o->ptr);
1832
1833 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
1834 break;
1835
1836 if(ion == REDIS_WRITEV_IOVEC_COUNT)
1837 break; /* no more iovecs */
1838
1839 iov[ion].iov_base = ((char*)o->ptr) + offset;
1840 iov[ion].iov_len = objlen - offset;
1841 willwrite += objlen - offset;
1842 offset = 0; /* just for the first item */
1843 ion++;
1844 }
1845
1846 if(willwrite == 0)
1847 break;
1848
1849 /* write all collected blocks at once */
1850 if((nwritten = writev(fd, iov, ion)) < 0) {
1851 if (errno != EAGAIN) {
f870935d 1852 redisLog(REDIS_VERBOSE,
2895e862 1853 "Error writing to client: %s", strerror(errno));
1854 freeClient(c);
1855 return;
1856 }
1857 break;
1858 }
1859
1860 totwritten += nwritten;
1861 offset = c->sentlen;
1862
1863 /* remove written robjs from c->reply */
1864 while (nwritten && listLength(c->reply)) {
1865 o = listNodeValue(listFirst(c->reply));
1866 objlen = sdslen(o->ptr);
1867
1868 if(nwritten >= objlen - offset) {
1869 listDelNode(c->reply, listFirst(c->reply));
1870 nwritten -= objlen - offset;
1871 c->sentlen = 0;
1872 } else {
1873 /* partial write */
1874 c->sentlen += nwritten;
1875 break;
1876 }
1877 offset = 0;
1878 }
1879 }
1880
1881 if (totwritten > 0)
1882 c->lastinteraction = time(NULL);
1883
1884 if (listLength(c->reply) == 0) {
1885 c->sentlen = 0;
1886 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
1887 }
1888}
1889
ed9b544e 1890static struct redisCommand *lookupCommand(char *name) {
1891 int j = 0;
1892 while(cmdTable[j].name != NULL) {
bb0b03a3 1893 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 1894 j++;
1895 }
1896 return NULL;
1897}
1898
1899/* resetClient prepare the client to process the next command */
1900static void resetClient(redisClient *c) {
1901 freeClientArgv(c);
1902 c->bulklen = -1;
e8a74421 1903 c->multibulk = 0;
ed9b544e 1904}
1905
6e469882 1906/* Call() is the core of Redis execution of a command */
1907static void call(redisClient *c, struct redisCommand *cmd) {
1908 long long dirty;
1909
1910 dirty = server.dirty;
1911 cmd->proc(c);
1912 if (server.appendonly && server.dirty-dirty)
1913 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
1914 if (server.dirty-dirty && listLength(server.slaves))
1915 replicationFeedSlaves(server.slaves,cmd,c->db->id,c->argv,c->argc);
1916 if (listLength(server.monitors))
1917 replicationFeedSlaves(server.monitors,cmd,c->db->id,c->argv,c->argc);
1918 server.stat_numcommands++;
1919}
1920
ed9b544e 1921/* If this function gets called we already read a whole
1922 * command, argments are in the client argv/argc fields.
1923 * processCommand() execute the command or prepare the
1924 * server for a bulk read from the client.
1925 *
1926 * If 1 is returned the client is still alive and valid and
1927 * and other operations can be performed by the caller. Otherwise
1928 * if 0 is returned the client was destroied (i.e. after QUIT). */
1929static int processCommand(redisClient *c) {
1930 struct redisCommand *cmd;
ed9b544e 1931
3fd78bcd 1932 /* Free some memory if needed (maxmemory setting) */
1933 if (server.maxmemory) freeMemoryIfNeeded();
1934
e8a74421 1935 /* Handle the multi bulk command type. This is an alternative protocol
1936 * supported by Redis in order to receive commands that are composed of
1937 * multiple binary-safe "bulk" arguments. The latency of processing is
1938 * a bit higher but this allows things like multi-sets, so if this
1939 * protocol is used only for MSET and similar commands this is a big win. */
1940 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
1941 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
1942 if (c->multibulk <= 0) {
1943 resetClient(c);
1944 return 1;
1945 } else {
1946 decrRefCount(c->argv[c->argc-1]);
1947 c->argc--;
1948 return 1;
1949 }
1950 } else if (c->multibulk) {
1951 if (c->bulklen == -1) {
1952 if (((char*)c->argv[0]->ptr)[0] != '$') {
1953 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
1954 resetClient(c);
1955 return 1;
1956 } else {
1957 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
1958 decrRefCount(c->argv[0]);
1959 if (bulklen < 0 || bulklen > 1024*1024*1024) {
1960 c->argc--;
1961 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
1962 resetClient(c);
1963 return 1;
1964 }
1965 c->argc--;
1966 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
1967 return 1;
1968 }
1969 } else {
1970 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
1971 c->mbargv[c->mbargc] = c->argv[0];
1972 c->mbargc++;
1973 c->argc--;
1974 c->multibulk--;
1975 if (c->multibulk == 0) {
1976 robj **auxargv;
1977 int auxargc;
1978
1979 /* Here we need to swap the multi-bulk argc/argv with the
1980 * normal argc/argv of the client structure. */
1981 auxargv = c->argv;
1982 c->argv = c->mbargv;
1983 c->mbargv = auxargv;
1984
1985 auxargc = c->argc;
1986 c->argc = c->mbargc;
1987 c->mbargc = auxargc;
1988
1989 /* We need to set bulklen to something different than -1
1990 * in order for the code below to process the command without
1991 * to try to read the last argument of a bulk command as
1992 * a special argument. */
1993 c->bulklen = 0;
1994 /* continue below and process the command */
1995 } else {
1996 c->bulklen = -1;
1997 return 1;
1998 }
1999 }
2000 }
2001 /* -- end of multi bulk commands processing -- */
2002
ed9b544e 2003 /* The QUIT command is handled as a special case. Normal command
2004 * procs are unable to close the client connection safely */
bb0b03a3 2005 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2006 freeClient(c);
2007 return 0;
2008 }
2009 cmd = lookupCommand(c->argv[0]->ptr);
2010 if (!cmd) {
2c14807b 2011 addReplySds(c,
2012 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2013 (char*)c->argv[0]->ptr));
ed9b544e 2014 resetClient(c);
2015 return 1;
2016 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2017 (c->argc < -cmd->arity)) {
454d4e43 2018 addReplySds(c,
2019 sdscatprintf(sdsempty(),
2020 "-ERR wrong number of arguments for '%s' command\r\n",
2021 cmd->name));
ed9b544e 2022 resetClient(c);
2023 return 1;
3fd78bcd 2024 } else if (server.maxmemory && cmd->flags & REDIS_CMD_DENYOOM && zmalloc_used_memory() > server.maxmemory) {
2025 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2026 resetClient(c);
2027 return 1;
ed9b544e 2028 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
2029 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2030
2031 decrRefCount(c->argv[c->argc-1]);
2032 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2033 c->argc--;
2034 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2035 resetClient(c);
2036 return 1;
2037 }
2038 c->argc--;
2039 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2040 /* It is possible that the bulk read is already in the
8d0490e7 2041 * buffer. Check this condition and handle it accordingly.
2042 * This is just a fast path, alternative to call processInputBuffer().
2043 * It's a good idea since the code is small and this condition
2044 * happens most of the times. */
ed9b544e 2045 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2046 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2047 c->argc++;
2048 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2049 } else {
2050 return 1;
2051 }
2052 }
10c43610 2053 /* Let's try to share objects on the command arguments vector */
2054 if (server.shareobjects) {
2055 int j;
2056 for(j = 1; j < c->argc; j++)
2057 c->argv[j] = tryObjectSharing(c->argv[j]);
2058 }
942a3961 2059 /* Let's try to encode the bulk object to save space. */
2060 if (cmd->flags & REDIS_CMD_BULK)
2061 tryObjectEncoding(c->argv[c->argc-1]);
2062
e63943a4 2063 /* Check if the user is authenticated */
2064 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2065 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2066 resetClient(c);
2067 return 1;
2068 }
2069
ed9b544e 2070 /* Exec the command */
6e469882 2071 if (c->flags & REDIS_MULTI && cmd->proc != execCommand) {
2072 queueMultiCommand(c,cmd);
2073 addReply(c,shared.queued);
2074 } else {
2075 call(c,cmd);
2076 }
ed9b544e 2077
2078 /* Prepare the client for the next command */
2079 if (c->flags & REDIS_CLOSE) {
2080 freeClient(c);
2081 return 0;
2082 }
2083 resetClient(c);
2084 return 1;
2085}
2086
87eca727 2087static void replicationFeedSlaves(list *slaves, struct redisCommand *cmd, int dictid, robj **argv, int argc) {
6208b3a7 2088 listNode *ln;
c7df85a4 2089 listIter li;
ed9b544e 2090 int outc = 0, j;
93ea3759 2091 robj **outv;
2092 /* (args*2)+1 is enough room for args, spaces, newlines */
2093 robj *static_outv[REDIS_STATIC_ARGS*2+1];
2094
2095 if (argc <= REDIS_STATIC_ARGS) {
2096 outv = static_outv;
2097 } else {
2098 outv = zmalloc(sizeof(robj*)*(argc*2+1));
93ea3759 2099 }
ed9b544e 2100
2101 for (j = 0; j < argc; j++) {
2102 if (j != 0) outv[outc++] = shared.space;
2103 if ((cmd->flags & REDIS_CMD_BULK) && j == argc-1) {
2104 robj *lenobj;
2105
2106 lenobj = createObject(REDIS_STRING,
682ac724 2107 sdscatprintf(sdsempty(),"%lu\r\n",
83c6a618 2108 (unsigned long) stringObjectLen(argv[j])));
ed9b544e 2109 lenobj->refcount = 0;
2110 outv[outc++] = lenobj;
2111 }
2112 outv[outc++] = argv[j];
2113 }
2114 outv[outc++] = shared.crlf;
2115
40d224a9 2116 /* Increment all the refcounts at start and decrement at end in order to
2117 * be sure to free objects if there is no slave in a replication state
2118 * able to be feed with commands */
2119 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2120 listRewind(slaves,&li);
2121 while((ln = listNext(&li))) {
ed9b544e 2122 redisClient *slave = ln->value;
40d224a9 2123
2124 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2125 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2126
2127 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2128 if (slave->slaveseldb != dictid) {
2129 robj *selectcmd;
2130
2131 switch(dictid) {
2132 case 0: selectcmd = shared.select0; break;
2133 case 1: selectcmd = shared.select1; break;
2134 case 2: selectcmd = shared.select2; break;
2135 case 3: selectcmd = shared.select3; break;
2136 case 4: selectcmd = shared.select4; break;
2137 case 5: selectcmd = shared.select5; break;
2138 case 6: selectcmd = shared.select6; break;
2139 case 7: selectcmd = shared.select7; break;
2140 case 8: selectcmd = shared.select8; break;
2141 case 9: selectcmd = shared.select9; break;
2142 default:
2143 selectcmd = createObject(REDIS_STRING,
2144 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2145 selectcmd->refcount = 0;
2146 break;
2147 }
2148 addReply(slave,selectcmd);
2149 slave->slaveseldb = dictid;
2150 }
2151 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2152 }
40d224a9 2153 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2154 if (outv != static_outv) zfree(outv);
ed9b544e 2155}
2156
638e42ac 2157static void processInputBuffer(redisClient *c) {
ed9b544e 2158again:
4409877e 2159 /* Before to process the input buffer, make sure the client is not
2160 * waitig for a blocking operation such as BLPOP. Note that the first
2161 * iteration the client is never blocked, otherwise the processInputBuffer
2162 * would not be called at all, but after the execution of the first commands
2163 * in the input buffer the client may be blocked, and the "goto again"
2164 * will try to reiterate. The following line will make it return asap. */
92f8e882 2165 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2166 if (c->bulklen == -1) {
2167 /* Read the first line of the query */
2168 char *p = strchr(c->querybuf,'\n');
2169 size_t querylen;
644fafa3 2170
ed9b544e 2171 if (p) {
2172 sds query, *argv;
2173 int argc, j;
2174
2175 query = c->querybuf;
2176 c->querybuf = sdsempty();
2177 querylen = 1+(p-(query));
2178 if (sdslen(query) > querylen) {
2179 /* leave data after the first line of the query in the buffer */
2180 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2181 }
2182 *p = '\0'; /* remove "\n" */
2183 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2184 sdsupdatelen(query);
2185
2186 /* Now we can split the query in arguments */
ed9b544e 2187 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2188 sdsfree(query);
2189
2190 if (c->argv) zfree(c->argv);
2191 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2192
2193 for (j = 0; j < argc; j++) {
ed9b544e 2194 if (sdslen(argv[j])) {
2195 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2196 c->argc++;
2197 } else {
2198 sdsfree(argv[j]);
2199 }
2200 }
2201 zfree(argv);
7c49733c 2202 if (c->argc) {
2203 /* Execute the command. If the client is still valid
2204 * after processCommand() return and there is something
2205 * on the query buffer try to process the next command. */
2206 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2207 } else {
2208 /* Nothing to process, argc == 0. Just process the query
2209 * buffer if it's not empty or return to the caller */
2210 if (sdslen(c->querybuf)) goto again;
2211 }
ed9b544e 2212 return;
644fafa3 2213 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2214 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2215 freeClient(c);
2216 return;
2217 }
2218 } else {
2219 /* Bulk read handling. Note that if we are at this point
2220 the client already sent a command terminated with a newline,
2221 we are reading the bulk data that is actually the last
2222 argument of the command. */
2223 int qbl = sdslen(c->querybuf);
2224
2225 if (c->bulklen <= qbl) {
2226 /* Copy everything but the final CRLF as final argument */
2227 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2228 c->argc++;
2229 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2230 /* Process the command. If the client is still valid after
2231 * the processing and there is more data in the buffer
2232 * try to parse it. */
2233 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2234 return;
2235 }
2236 }
2237}
2238
638e42ac 2239static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2240 redisClient *c = (redisClient*) privdata;
2241 char buf[REDIS_IOBUF_LEN];
2242 int nread;
2243 REDIS_NOTUSED(el);
2244 REDIS_NOTUSED(mask);
2245
2246 nread = read(fd, buf, REDIS_IOBUF_LEN);
2247 if (nread == -1) {
2248 if (errno == EAGAIN) {
2249 nread = 0;
2250 } else {
f870935d 2251 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2252 freeClient(c);
2253 return;
2254 }
2255 } else if (nread == 0) {
f870935d 2256 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2257 freeClient(c);
2258 return;
2259 }
2260 if (nread) {
2261 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2262 c->lastinteraction = time(NULL);
2263 } else {
2264 return;
2265 }
2266 processInputBuffer(c);
2267}
2268
ed9b544e 2269static int selectDb(redisClient *c, int id) {
2270 if (id < 0 || id >= server.dbnum)
2271 return REDIS_ERR;
3305306f 2272 c->db = &server.db[id];
ed9b544e 2273 return REDIS_OK;
2274}
2275
40d224a9 2276static void *dupClientReplyValue(void *o) {
2277 incrRefCount((robj*)o);
2278 return 0;
2279}
2280
ed9b544e 2281static redisClient *createClient(int fd) {
2282 redisClient *c = zmalloc(sizeof(*c));
2283
2284 anetNonBlock(NULL,fd);
2285 anetTcpNoDelay(NULL,fd);
2286 if (!c) return NULL;
2287 selectDb(c,0);
2288 c->fd = fd;
2289 c->querybuf = sdsempty();
2290 c->argc = 0;
93ea3759 2291 c->argv = NULL;
ed9b544e 2292 c->bulklen = -1;
e8a74421 2293 c->multibulk = 0;
2294 c->mbargc = 0;
2295 c->mbargv = NULL;
ed9b544e 2296 c->sentlen = 0;
2297 c->flags = 0;
2298 c->lastinteraction = time(NULL);
abcb223e 2299 c->authenticated = 0;
40d224a9 2300 c->replstate = REDIS_REPL_NONE;
6b47e12e 2301 c->reply = listCreate();
ed9b544e 2302 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2303 listSetDupMethod(c->reply,dupClientReplyValue);
92f8e882 2304 c->blockingkeys = NULL;
2305 c->blockingkeysnum = 0;
2306 c->io_keys = listCreate();
2307 listSetFreeMethod(c->io_keys,decrRefCount);
ed9b544e 2308 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2309 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2310 freeClient(c);
2311 return NULL;
2312 }
6b47e12e 2313 listAddNodeTail(server.clients,c);
6e469882 2314 initClientMultiState(c);
ed9b544e 2315 return c;
2316}
2317
2318static void addReply(redisClient *c, robj *obj) {
2319 if (listLength(c->reply) == 0 &&
6208b3a7 2320 (c->replstate == REDIS_REPL_NONE ||
2321 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2322 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2323 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2324
2325 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2326 obj = dupStringObject(obj);
2327 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2328 }
9d65a1bb 2329 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2330}
2331
2332static void addReplySds(redisClient *c, sds s) {
2333 robj *o = createObject(REDIS_STRING,s);
2334 addReply(c,o);
2335 decrRefCount(o);
2336}
2337
e2665397 2338static void addReplyDouble(redisClient *c, double d) {
2339 char buf[128];
2340
2341 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2342 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2343 (unsigned long) strlen(buf),buf));
e2665397 2344}
2345
942a3961 2346static void addReplyBulkLen(redisClient *c, robj *obj) {
2347 size_t len;
2348
2349 if (obj->encoding == REDIS_ENCODING_RAW) {
2350 len = sdslen(obj->ptr);
2351 } else {
2352 long n = (long)obj->ptr;
2353
e054afda 2354 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2355 len = 1;
2356 if (n < 0) {
2357 len++;
2358 n = -n;
2359 }
2360 while((n = n/10) != 0) {
2361 len++;
2362 }
2363 }
83c6a618 2364 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
942a3961 2365}
2366
ed9b544e 2367static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2368 int cport, cfd;
2369 char cip[128];
285add55 2370 redisClient *c;
ed9b544e 2371 REDIS_NOTUSED(el);
2372 REDIS_NOTUSED(mask);
2373 REDIS_NOTUSED(privdata);
2374
2375 cfd = anetAccept(server.neterr, fd, cip, &cport);
2376 if (cfd == AE_ERR) {
f870935d 2377 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2378 return;
2379 }
f870935d 2380 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2381 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2382 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2383 close(cfd); /* May be already closed, just ingore errors */
2384 return;
2385 }
285add55 2386 /* If maxclient directive is set and this is one client more... close the
2387 * connection. Note that we create the client instead to check before
2388 * for this condition, since now the socket is already set in nonblocking
2389 * mode and we can send an error for free using the Kernel I/O */
2390 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2391 char *err = "-ERR max number of clients reached\r\n";
2392
2393 /* That's a best effort error message, don't check write errors */
fee803ba 2394 if (write(c->fd,err,strlen(err)) == -1) {
2395 /* Nothing to do, Just to avoid the warning... */
2396 }
285add55 2397 freeClient(c);
2398 return;
2399 }
ed9b544e 2400 server.stat_numconnections++;
2401}
2402
2403/* ======================= Redis objects implementation ===================== */
2404
2405static robj *createObject(int type, void *ptr) {
2406 robj *o;
2407
a5819310 2408 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2409 if (listLength(server.objfreelist)) {
2410 listNode *head = listFirst(server.objfreelist);
2411 o = listNodeValue(head);
2412 listDelNode(server.objfreelist,head);
a5819310 2413 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2414 } else {
75680a3c 2415 if (server.vm_enabled) {
a5819310 2416 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2417 o = zmalloc(sizeof(*o));
2418 } else {
2419 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2420 }
ed9b544e 2421 }
ed9b544e 2422 o->type = type;
942a3961 2423 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2424 o->ptr = ptr;
2425 o->refcount = 1;
3a66edc7 2426 if (server.vm_enabled) {
1064ef87 2427 /* Note that this code may run in the context of an I/O thread
2428 * and accessing to server.unixtime in theory is an error
2429 * (no locks). But in practice this is safe, and even if we read
2430 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2431 o->vm.atime = server.unixtime;
2432 o->storage = REDIS_VM_MEMORY;
2433 }
ed9b544e 2434 return o;
2435}
2436
2437static robj *createStringObject(char *ptr, size_t len) {
2438 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2439}
2440
4ef8de8a 2441static robj *dupStringObject(robj *o) {
b9bc0eef 2442 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2443 return createStringObject(o->ptr,sdslen(o->ptr));
2444}
2445
ed9b544e 2446static robj *createListObject(void) {
2447 list *l = listCreate();
2448
ed9b544e 2449 listSetFreeMethod(l,decrRefCount);
2450 return createObject(REDIS_LIST,l);
2451}
2452
2453static robj *createSetObject(void) {
2454 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2455 return createObject(REDIS_SET,d);
2456}
2457
1812e024 2458static robj *createZsetObject(void) {
6b47e12e 2459 zset *zs = zmalloc(sizeof(*zs));
2460
2461 zs->dict = dictCreate(&zsetDictType,NULL);
2462 zs->zsl = zslCreate();
2463 return createObject(REDIS_ZSET,zs);
1812e024 2464}
2465
ed9b544e 2466static void freeStringObject(robj *o) {
942a3961 2467 if (o->encoding == REDIS_ENCODING_RAW) {
2468 sdsfree(o->ptr);
2469 }
ed9b544e 2470}
2471
2472static void freeListObject(robj *o) {
2473 listRelease((list*) o->ptr);
2474}
2475
2476static void freeSetObject(robj *o) {
2477 dictRelease((dict*) o->ptr);
2478}
2479
fd8ccf44 2480static void freeZsetObject(robj *o) {
2481 zset *zs = o->ptr;
2482
2483 dictRelease(zs->dict);
2484 zslFree(zs->zsl);
2485 zfree(zs);
2486}
2487
ed9b544e 2488static void freeHashObject(robj *o) {
2489 dictRelease((dict*) o->ptr);
2490}
2491
2492static void incrRefCount(robj *o) {
f2b8ab34 2493 redisAssert(!server.vm_enabled || o->storage == REDIS_VM_MEMORY);
ed9b544e 2494 o->refcount++;
2495}
2496
2497static void decrRefCount(void *obj) {
2498 robj *o = obj;
94754ccc 2499
996cb5f7 2500 /* Object is swapped out, or in the process of being loaded. */
2501 if (server.vm_enabled &&
2502 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2503 {
2504 if (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING) {
2505 redisAssert(o->refcount == 1);
2506 }
2507 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 2508 redisAssert(o->type == REDIS_STRING);
a35ddf12 2509 freeStringObject(o);
2510 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 2511 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 2512 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2513 !listAddNodeHead(server.objfreelist,o))
2514 zfree(o);
a5819310 2515 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 2516 server.vm_stats_swapped_objects--;
a35ddf12 2517 return;
2518 }
996cb5f7 2519 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 2520 if (--(o->refcount) == 0) {
996cb5f7 2521 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2522 vmCancelThreadedIOJob(obj);
ed9b544e 2523 switch(o->type) {
2524 case REDIS_STRING: freeStringObject(o); break;
2525 case REDIS_LIST: freeListObject(o); break;
2526 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 2527 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 2528 case REDIS_HASH: freeHashObject(o); break;
dfc5e96c 2529 default: redisAssert(0 != 0); break;
ed9b544e 2530 }
a5819310 2531 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2532 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2533 !listAddNodeHead(server.objfreelist,o))
2534 zfree(o);
a5819310 2535 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2536 }
2537}
2538
942a3961 2539static robj *lookupKey(redisDb *db, robj *key) {
2540 dictEntry *de = dictFind(db->dict,key);
3a66edc7 2541 if (de) {
55cf8433 2542 robj *key = dictGetEntryKey(de);
2543 robj *val = dictGetEntryVal(de);
3a66edc7 2544
55cf8433 2545 if (server.vm_enabled) {
996cb5f7 2546 if (key->storage == REDIS_VM_MEMORY ||
2547 key->storage == REDIS_VM_SWAPPING)
2548 {
2549 /* If we were swapping the object out, stop it, this key
2550 * was requested. */
2551 if (key->storage == REDIS_VM_SWAPPING)
2552 vmCancelThreadedIOJob(key);
55cf8433 2553 /* Update the access time of the key for the aging algorithm. */
2554 key->vm.atime = server.unixtime;
2555 } else {
2556 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 2557 redisAssert(val == NULL);
55cf8433 2558 val = vmLoadObject(key);
2559 dictGetEntryVal(de) = val;
2560 }
2561 }
2562 return val;
3a66edc7 2563 } else {
2564 return NULL;
2565 }
942a3961 2566}
2567
2568static robj *lookupKeyRead(redisDb *db, robj *key) {
2569 expireIfNeeded(db,key);
2570 return lookupKey(db,key);
2571}
2572
2573static robj *lookupKeyWrite(redisDb *db, robj *key) {
2574 deleteIfVolatile(db,key);
2575 return lookupKey(db,key);
2576}
2577
2578static int deleteKey(redisDb *db, robj *key) {
2579 int retval;
2580
2581 /* We need to protect key from destruction: after the first dictDelete()
2582 * it may happen that 'key' is no longer valid if we don't increment
2583 * it's count. This may happen when we get the object reference directly
2584 * from the hash table with dictRandomKey() or dict iterators */
2585 incrRefCount(key);
2586 if (dictSize(db->expires)) dictDelete(db->expires,key);
2587 retval = dictDelete(db->dict,key);
2588 decrRefCount(key);
2589
2590 return retval == DICT_OK;
2591}
2592
10c43610 2593/* Try to share an object against the shared objects pool */
2594static robj *tryObjectSharing(robj *o) {
2595 struct dictEntry *de;
2596 unsigned long c;
2597
3305306f 2598 if (o == NULL || server.shareobjects == 0) return o;
10c43610 2599
dfc5e96c 2600 redisAssert(o->type == REDIS_STRING);
10c43610 2601 de = dictFind(server.sharingpool,o);
2602 if (de) {
2603 robj *shared = dictGetEntryKey(de);
2604
2605 c = ((unsigned long) dictGetEntryVal(de))+1;
2606 dictGetEntryVal(de) = (void*) c;
2607 incrRefCount(shared);
2608 decrRefCount(o);
2609 return shared;
2610 } else {
2611 /* Here we are using a stream algorihtm: Every time an object is
2612 * shared we increment its count, everytime there is a miss we
2613 * recrement the counter of a random object. If this object reaches
2614 * zero we remove the object and put the current object instead. */
3305306f 2615 if (dictSize(server.sharingpool) >=
10c43610 2616 server.sharingpoolsize) {
2617 de = dictGetRandomKey(server.sharingpool);
dfc5e96c 2618 redisAssert(de != NULL);
10c43610 2619 c = ((unsigned long) dictGetEntryVal(de))-1;
2620 dictGetEntryVal(de) = (void*) c;
2621 if (c == 0) {
2622 dictDelete(server.sharingpool,de->key);
2623 }
2624 } else {
2625 c = 0; /* If the pool is empty we want to add this object */
2626 }
2627 if (c == 0) {
2628 int retval;
2629
2630 retval = dictAdd(server.sharingpool,o,(void*)1);
dfc5e96c 2631 redisAssert(retval == DICT_OK);
10c43610 2632 incrRefCount(o);
2633 }
2634 return o;
2635 }
2636}
2637
724a51b1 2638/* Check if the nul-terminated string 's' can be represented by a long
2639 * (that is, is a number that fits into long without any other space or
2640 * character before or after the digits).
2641 *
2642 * If so, the function returns REDIS_OK and *longval is set to the value
2643 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 2644static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 2645 char buf[32], *endptr;
2646 long value;
2647 int slen;
2648
2649 value = strtol(s, &endptr, 10);
2650 if (endptr[0] != '\0') return REDIS_ERR;
2651 slen = snprintf(buf,32,"%ld",value);
2652
2653 /* If the number converted back into a string is not identical
2654 * then it's not possible to encode the string as integer */
f69f2cba 2655 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 2656 if (longval) *longval = value;
2657 return REDIS_OK;
2658}
2659
942a3961 2660/* Try to encode a string object in order to save space */
2661static int tryObjectEncoding(robj *o) {
2662 long value;
942a3961 2663 sds s = o->ptr;
3305306f 2664
942a3961 2665 if (o->encoding != REDIS_ENCODING_RAW)
2666 return REDIS_ERR; /* Already encoded */
3305306f 2667
942a3961 2668 /* It's not save to encode shared objects: shared objects can be shared
2669 * everywhere in the "object space" of Redis. Encoded objects can only
2670 * appear as "values" (and not, for instance, as keys) */
2671 if (o->refcount > 1) return REDIS_ERR;
3305306f 2672
942a3961 2673 /* Currently we try to encode only strings */
dfc5e96c 2674 redisAssert(o->type == REDIS_STRING);
94754ccc 2675
724a51b1 2676 /* Check if we can represent this string as a long integer */
2677 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
942a3961 2678
2679 /* Ok, this object can be encoded */
2680 o->encoding = REDIS_ENCODING_INT;
2681 sdsfree(o->ptr);
2682 o->ptr = (void*) value;
2683 return REDIS_OK;
2684}
2685
9d65a1bb 2686/* Get a decoded version of an encoded object (returned as a new object).
2687 * If the object is already raw-encoded just increment the ref count. */
2688static robj *getDecodedObject(robj *o) {
942a3961 2689 robj *dec;
2690
9d65a1bb 2691 if (o->encoding == REDIS_ENCODING_RAW) {
2692 incrRefCount(o);
2693 return o;
2694 }
942a3961 2695 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
2696 char buf[32];
2697
2698 snprintf(buf,32,"%ld",(long)o->ptr);
2699 dec = createStringObject(buf,strlen(buf));
2700 return dec;
2701 } else {
dfc5e96c 2702 redisAssert(1 != 1);
942a3961 2703 }
3305306f 2704}
2705
d7f43c08 2706/* Compare two string objects via strcmp() or alike.
2707 * Note that the objects may be integer-encoded. In such a case we
2708 * use snprintf() to get a string representation of the numbers on the stack
1fd9bc8a 2709 * and compare the strings, it's much faster than calling getDecodedObject().
2710 *
2711 * Important note: if objects are not integer encoded, but binary-safe strings,
2712 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
2713 * binary safe. */
724a51b1 2714static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 2715 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 2716 char bufa[128], bufb[128], *astr, *bstr;
2717 int bothsds = 1;
724a51b1 2718
e197b441 2719 if (a == b) return 0;
d7f43c08 2720 if (a->encoding != REDIS_ENCODING_RAW) {
2721 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
2722 astr = bufa;
2723 bothsds = 0;
724a51b1 2724 } else {
d7f43c08 2725 astr = a->ptr;
724a51b1 2726 }
d7f43c08 2727 if (b->encoding != REDIS_ENCODING_RAW) {
2728 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
2729 bstr = bufb;
2730 bothsds = 0;
2731 } else {
2732 bstr = b->ptr;
2733 }
2734 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 2735}
2736
0ea663ea 2737static size_t stringObjectLen(robj *o) {
dfc5e96c 2738 redisAssert(o->type == REDIS_STRING);
0ea663ea 2739 if (o->encoding == REDIS_ENCODING_RAW) {
2740 return sdslen(o->ptr);
2741 } else {
2742 char buf[32];
2743
2744 return snprintf(buf,32,"%ld",(long)o->ptr);
2745 }
2746}
2747
06233c45 2748/*============================ RDB saving/loading =========================== */
ed9b544e 2749
/* Write the one byte type/opcode 'type' to 'fp'.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
2754
/* Write the time 't' to 'fp' as a 32 bit signed integer in host byte
 * order. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t seconds = (int32_t) t;
    size_t written = fwrite(&seconds,4,1,fp);

    return (written == 0) ? -1 : 0;
}
2760
e3566d4b 2761/* check rdbLoadLen() comments for more info */
f78fd11b 2762static int rdbSaveLen(FILE *fp, uint32_t len) {
2763 unsigned char buf[2];
2764
2765 if (len < (1<<6)) {
2766 /* Save a 6 bit len */
10c43610 2767 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 2768 if (fwrite(buf,1,1,fp) == 0) return -1;
2769 } else if (len < (1<<14)) {
2770 /* Save a 14 bit len */
10c43610 2771 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 2772 buf[1] = len&0xFF;
17be1a4a 2773 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 2774 } else {
2775 /* Save a 32 bit len */
10c43610 2776 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 2777 if (fwrite(buf,1,1,fp) == 0) return -1;
2778 len = htonl(len);
2779 if (fwrite(&len,4,1,fp) == 0) return -1;
2780 }
2781 return 0;
2782}
2783
e3566d4b 2784/* String objects in the form "2391" "-100" without any space and with a
2785 * range of values that can fit in an 8, 16 or 32 bit signed value can be
2786 * encoded as integers to save space */
56906eef 2787static int rdbTryIntegerEncoding(sds s, unsigned char *enc) {
e3566d4b 2788 long long value;
2789 char *endptr, buf[32];
2790
2791 /* Check if it's possible to encode this value as a number */
2792 value = strtoll(s, &endptr, 10);
2793 if (endptr[0] != '\0') return 0;
2794 snprintf(buf,32,"%lld",value);
2795
2796 /* If the number converted back into a string is not identical
2797 * then it's not possible to encode the string as integer */
2798 if (strlen(buf) != sdslen(s) || memcmp(buf,s,sdslen(s))) return 0;
2799
2800 /* Finally check if it fits in our ranges */
2801 if (value >= -(1<<7) && value <= (1<<7)-1) {
2802 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
2803 enc[1] = value&0xFF;
2804 return 2;
2805 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
2806 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
2807 enc[1] = value&0xFF;
2808 enc[2] = (value>>8)&0xFF;
2809 return 3;
2810 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
2811 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
2812 enc[1] = value&0xFF;
2813 enc[2] = (value>>8)&0xFF;
2814 enc[3] = (value>>16)&0xFF;
2815 enc[4] = (value>>24)&0xFF;
2816 return 5;
2817 } else {
2818 return 0;
2819 }
2820}
2821
774e3047 2822static int rdbSaveLzfStringObject(FILE *fp, robj *obj) {
2823 unsigned int comprlen, outlen;
2824 unsigned char byte;
2825 void *out;
2826
2827 /* We require at least four bytes compression for this to be worth it */
2828 outlen = sdslen(obj->ptr)-4;
2829 if (outlen <= 0) return 0;
3a2694c4 2830 if ((out = zmalloc(outlen+1)) == NULL) return 0;
774e3047 2831 comprlen = lzf_compress(obj->ptr, sdslen(obj->ptr), out, outlen);
2832 if (comprlen == 0) {
88e85998 2833 zfree(out);
774e3047 2834 return 0;
2835 }
2836 /* Data compressed! Let's save it on disk */
2837 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
2838 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
2839 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
2840 if (rdbSaveLen(fp,sdslen(obj->ptr)) == -1) goto writeerr;
2841 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 2842 zfree(out);
774e3047 2843 return comprlen;
2844
2845writeerr:
88e85998 2846 zfree(out);
774e3047 2847 return -1;
2848}
2849
e3566d4b 2850/* Save a string objet as [len][data] on disk. If the object is a string
2851 * representation of an integer value we try to safe it in a special form */
942a3961 2852static int rdbSaveStringObjectRaw(FILE *fp, robj *obj) {
2853 size_t len;
e3566d4b 2854 int enclen;
10c43610 2855
942a3961 2856 len = sdslen(obj->ptr);
2857
774e3047 2858 /* Try integer encoding */
e3566d4b 2859 if (len <= 11) {
2860 unsigned char buf[5];
2861 if ((enclen = rdbTryIntegerEncoding(obj->ptr,buf)) > 0) {
2862 if (fwrite(buf,enclen,1,fp) == 0) return -1;
2863 return 0;
2864 }
2865 }
774e3047 2866
2867 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 2868 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 2869 if (server.rdbcompression && len > 20) {
774e3047 2870 int retval;
2871
2872 retval = rdbSaveLzfStringObject(fp,obj);
2873 if (retval == -1) return -1;
2874 if (retval > 0) return 0;
2875 /* retval == 0 means data can't be compressed, save the old way */
2876 }
2877
2878 /* Store verbatim */
10c43610 2879 if (rdbSaveLen(fp,len) == -1) return -1;
2880 if (len && fwrite(obj->ptr,len,1,fp) == 0) return -1;
2881 return 0;
2882}
2883
942a3961 2884/* Like rdbSaveStringObjectRaw() but handle encoded objects */
2885static int rdbSaveStringObject(FILE *fp, robj *obj) {
2886 int retval;
942a3961 2887
f2d9f50f 2888 /* Avoid incr/decr ref count business when possible.
2889 * This plays well with copy-on-write given that we are probably
2890 * in a child process (BGSAVE). Also this makes sure key objects
2891 * of swapped objects are not incRefCount-ed (an assert does not allow
2892 * this in order to avoid bugs) */
2893 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 2894 obj = getDecodedObject(obj);
2895 retval = rdbSaveStringObjectRaw(fp,obj);
2896 decrRefCount(obj);
2897 } else {
996cb5f7 2898 retval = rdbSaveStringObjectRaw(fp,obj);
2899 }
9d65a1bb 2900 return retval;
942a3961 2901}
2902
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * In the three special cases only the prefix byte is written.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char out[128];
    char *text = (char*)out+1;
    int total = 1;

    if (isnan(val)) {
        out[0] = 253;
    } else if (!isfinite(val)) {
        out[0] = (val < 0) ? 255 : 254;
    } else {
        /* The textual form goes right after the length byte */
        snprintf(text,sizeof(out)-1,"%.17g",val);
        out[0] = strlen(text);
        total = out[0]+1;
    }
    return (fwrite(out,total,1,fp) == 0) ? -1 : 0;
}
2929
06233c45 2930/* Save a Redis object. */
2931static int rdbSaveObject(FILE *fp, robj *o) {
2932 if (o->type == REDIS_STRING) {
2933 /* Save a string value */
2934 if (rdbSaveStringObject(fp,o) == -1) return -1;
2935 } else if (o->type == REDIS_LIST) {
2936 /* Save a list value */
2937 list *list = o->ptr;
c7df85a4 2938 listIter li;
06233c45 2939 listNode *ln;
2940
06233c45 2941 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 2942 listRewind(list,&li);
2943 while((ln = listNext(&li))) {
06233c45 2944 robj *eleobj = listNodeValue(ln);
2945
2946 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
2947 }
2948 } else if (o->type == REDIS_SET) {
2949 /* Save a set value */
2950 dict *set = o->ptr;
2951 dictIterator *di = dictGetIterator(set);
2952 dictEntry *de;
2953
2954 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
2955 while((de = dictNext(di)) != NULL) {
2956 robj *eleobj = dictGetEntryKey(de);
2957
2958 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
2959 }
2960 dictReleaseIterator(di);
2961 } else if (o->type == REDIS_ZSET) {
2962 /* Save a set value */
2963 zset *zs = o->ptr;
2964 dictIterator *di = dictGetIterator(zs->dict);
2965 dictEntry *de;
2966
2967 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
2968 while((de = dictNext(di)) != NULL) {
2969 robj *eleobj = dictGetEntryKey(de);
2970 double *score = dictGetEntryVal(de);
2971
2972 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
2973 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
2974 }
2975 dictReleaseIterator(di);
2976 } else {
2977 redisAssert(0 != 0);
2978 }
2979 return 0;
2980}
2981
2982/* Return the length the object will have on disk if saved with
2983 * the rdbSaveObject() function. Currently we use a trick to get
2984 * this length with very little changes to the code. In the future
2985 * we could switch to a faster solution. */
b9bc0eef 2986static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
2987 if (fp == NULL) fp = server.devnull;
06233c45 2988 rewind(fp);
2989 assert(rdbSaveObject(fp,o) != 1);
2990 return ftello(fp);
2991}
2992
06224fec 2993/* Return the number of pages required to save this object in the swap file */
b9bc0eef 2994static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
2995 off_t bytes = rdbSavedObjectLen(o,fp);
06224fec 2996
2997 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
2998}
2999
ed9b544e 3000/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3001static int rdbSave(char *filename) {
ed9b544e 3002 dictIterator *di = NULL;
3003 dictEntry *de;
ed9b544e 3004 FILE *fp;
3005 char tmpfile[256];
3006 int j;
bb32ede5 3007 time_t now = time(NULL);
ed9b544e 3008
a3b21203 3009 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3010 fp = fopen(tmpfile,"w");
3011 if (!fp) {
3012 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3013 return REDIS_ERR;
3014 }
f78fd11b 3015 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3016 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3017 redisDb *db = server.db+j;
3018 dict *d = db->dict;
3305306f 3019 if (dictSize(d) == 0) continue;
ed9b544e 3020 di = dictGetIterator(d);
3021 if (!di) {
3022 fclose(fp);
3023 return REDIS_ERR;
3024 }
3025
3026 /* Write the SELECT DB opcode */
f78fd11b 3027 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3028 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3029
3030 /* Iterate this DB writing every entry */
3031 while((de = dictNext(di)) != NULL) {
3032 robj *key = dictGetEntryKey(de);
3033 robj *o = dictGetEntryVal(de);
bb32ede5 3034 time_t expiretime = getExpire(db,key);
3035
3036 /* Save the expire time */
3037 if (expiretime != -1) {
3038 /* If this key is already expired skip it */
3039 if (expiretime < now) continue;
3040 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3041 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3042 }
7e69548d 3043 /* Save the key and associated value. This requires special
3044 * handling if the value is swapped out. */
996cb5f7 3045 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3046 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3047 /* Save type, key, value */
3048 if (rdbSaveType(fp,o->type) == -1) goto werr;
3049 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3050 if (rdbSaveObject(fp,o) == -1) goto werr;
3051 } else {
996cb5f7 3052 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3053 robj *po;
7e69548d 3054 /* Get a preview of the object in memory */
3055 po = vmPreviewObject(key);
7e69548d 3056 /* Save type, key, value */
3057 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3058 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3059 if (rdbSaveObject(fp,po) == -1) goto werr;
3060 /* Remove the loaded object from memory */
3061 decrRefCount(po);
7e69548d 3062 }
ed9b544e 3063 }
3064 dictReleaseIterator(di);
3065 }
3066 /* EOF opcode */
f78fd11b 3067 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3068
3069 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3070 fflush(fp);
3071 fsync(fileno(fp));
3072 fclose(fp);
3073
3074 /* Use RENAME to make sure the DB file is changed atomically only
3075 * if the generate DB file is ok. */
3076 if (rename(tmpfile,filename) == -1) {
325d1eb4 3077 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3078 unlink(tmpfile);
3079 return REDIS_ERR;
3080 }
3081 redisLog(REDIS_NOTICE,"DB saved on disk");
3082 server.dirty = 0;
3083 server.lastsave = time(NULL);
3084 return REDIS_OK;
3085
3086werr:
3087 fclose(fp);
3088 unlink(tmpfile);
3089 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3090 if (di) dictReleaseIterator(di);
3091 return REDIS_ERR;
3092}
3093
f78fd11b 3094static int rdbSaveBackground(char *filename) {
ed9b544e 3095 pid_t childpid;
3096
9d65a1bb 3097 if (server.bgsavechildpid != -1) return REDIS_ERR;
4ee9488d 3098 if (server.vm_enabled) waitZeroActiveThreads();
ed9b544e 3099 if ((childpid = fork()) == 0) {
3100 /* Child */
3101 close(server.fd);
f78fd11b 3102 if (rdbSave(filename) == REDIS_OK) {
ed9b544e 3103 exit(0);
3104 } else {
3105 exit(1);
3106 }
3107 } else {
3108 /* Parent */
5a7c647e 3109 if (childpid == -1) {
3110 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3111 strerror(errno));
3112 return REDIS_ERR;
3113 }
ed9b544e 3114 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3115 server.bgsavechildpid = childpid;
ed9b544e 3116 return REDIS_OK;
3117 }
3118 return REDIS_OK; /* unreached */
3119}
3120
/* Remove the temp file "temp-<pid>.rdb" named after 'childpid' (the same
 * name pattern rdbSave() writes to). The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];
    int id = (int) childpid;

    snprintf(path,256,"temp-%d.rdb", id);
    unlink(path);
}
3127
/* Read a one byte type/opcode from 'fp'.
 * Returns the byte as a value in the 0-255 range, or -1 if it can't be
 * read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3133
/* Read a time saved by rdbSaveTime(): a 32 bit signed integer in host
 * byte order. Returns the time, or -1 if it can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t seconds;

    if (fread(&seconds,4,1,fp) == 0) return -1;
    return (time_t) seconds;
}
3139
e3566d4b 3140/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3141 * of this file for a description of how this are stored on disk.
3142 *
3143 * isencoded is set to 1 if the readed length is not actually a length but
3144 * an "encoding type", check the above comments for more info */
c78a8ccc 3145static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3146 unsigned char buf[2];
3147 uint32_t len;
c78a8ccc 3148 int type;
f78fd11b 3149
e3566d4b 3150 if (isencoded) *isencoded = 0;
c78a8ccc 3151 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3152 type = (buf[0]&0xC0)>>6;
3153 if (type == REDIS_RDB_6BITLEN) {
3154 /* Read a 6 bit len */
3155 return buf[0]&0x3F;
3156 } else if (type == REDIS_RDB_ENCVAL) {
3157 /* Read a 6 bit len encoding type */
3158 if (isencoded) *isencoded = 1;
3159 return buf[0]&0x3F;
3160 } else if (type == REDIS_RDB_14BITLEN) {
3161 /* Read a 14 bit len */
3162 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3163 return ((buf[0]&0x3F)<<8)|buf[1];
3164 } else {
3165 /* Read a 32 bit len */
f78fd11b 3166 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3167 return ntohl(len);
f78fd11b 3168 }
f78fd11b 3169}
3170
e3566d4b 3171static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3172 unsigned char enc[4];
3173 long long val;
3174
3175 if (enctype == REDIS_RDB_ENC_INT8) {
3176 if (fread(enc,1,1,fp) == 0) return NULL;
3177 val = (signed char)enc[0];
3178 } else if (enctype == REDIS_RDB_ENC_INT16) {
3179 uint16_t v;
3180 if (fread(enc,2,1,fp) == 0) return NULL;
3181 v = enc[0]|(enc[1]<<8);
3182 val = (int16_t)v;
3183 } else if (enctype == REDIS_RDB_ENC_INT32) {
3184 uint32_t v;
3185 if (fread(enc,4,1,fp) == 0) return NULL;
3186 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3187 val = (int32_t)v;
3188 } else {
3189 val = 0; /* anti-warning */
dfc5e96c 3190 redisAssert(0!=0);
e3566d4b 3191 }
3192 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3193}
3194
c78a8ccc 3195static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3196 unsigned int len, clen;
3197 unsigned char *c = NULL;
3198 sds val = NULL;
3199
c78a8ccc 3200 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3201 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3202 if ((c = zmalloc(clen)) == NULL) goto err;
3203 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3204 if (fread(c,clen,1,fp) == 0) goto err;
3205 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3206 zfree(c);
88e85998 3207 return createObject(REDIS_STRING,val);
3208err:
3209 zfree(c);
3210 sdsfree(val);
3211 return NULL;
3212}
3213
c78a8ccc 3214static robj *rdbLoadStringObject(FILE*fp) {
e3566d4b 3215 int isencoded;
3216 uint32_t len;
f78fd11b 3217 sds val;
3218
c78a8ccc 3219 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3220 if (isencoded) {
3221 switch(len) {
3222 case REDIS_RDB_ENC_INT8:
3223 case REDIS_RDB_ENC_INT16:
3224 case REDIS_RDB_ENC_INT32:
3305306f 3225 return tryObjectSharing(rdbLoadIntegerObject(fp,len));
88e85998 3226 case REDIS_RDB_ENC_LZF:
c78a8ccc 3227 return tryObjectSharing(rdbLoadLzfStringObject(fp));
e3566d4b 3228 default:
dfc5e96c 3229 redisAssert(0!=0);
e3566d4b 3230 }
3231 }
3232
f78fd11b 3233 if (len == REDIS_RDB_LENERR) return NULL;
3234 val = sdsnewlen(NULL,len);
3235 if (len && fread(val,len,1,fp) == 0) {
3236 sdsfree(val);
3237 return NULL;
3238 }
10c43610 3239 return tryObjectSharing(createObject(REDIS_STRING,val));
f78fd11b 3240}
3241
a7866db6 3242/* For information about double serialization check rdbSaveDoubleValue() */
3243static int rdbLoadDoubleValue(FILE *fp, double *val) {
3244 char buf[128];
3245 unsigned char len;
3246
3247 if (fread(&len,1,1,fp) == 0) return -1;
3248 switch(len) {
3249 case 255: *val = R_NegInf; return 0;
3250 case 254: *val = R_PosInf; return 0;
3251 case 253: *val = R_Nan; return 0;
3252 default:
3253 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3254 buf[len] = '\0';
a7866db6 3255 sscanf(buf, "%lg", val);
3256 return 0;
3257 }
3258}
3259
c78a8ccc 3260/* Load a Redis object of the specified type from the specified file.
3261 * On success a newly allocated object is returned, otherwise NULL. */
3262static robj *rdbLoadObject(int type, FILE *fp) {
3263 robj *o;
3264
3265 if (type == REDIS_STRING) {
3266 /* Read string value */
3267 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3268 tryObjectEncoding(o);
3269 } else if (type == REDIS_LIST || type == REDIS_SET) {
3270 /* Read list/set value */
3271 uint32_t listlen;
3272
3273 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3274 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3275 /* Load every single element of the list/set */
3276 while(listlen--) {
3277 robj *ele;
3278
3279 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3280 tryObjectEncoding(ele);
3281 if (type == REDIS_LIST) {
3282 listAddNodeTail((list*)o->ptr,ele);
3283 } else {
3284 dictAdd((dict*)o->ptr,ele,NULL);
3285 }
3286 }
3287 } else if (type == REDIS_ZSET) {
3288 /* Read list/set value */
3289 uint32_t zsetlen;
3290 zset *zs;
3291
3292 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3293 o = createZsetObject();
3294 zs = o->ptr;
3295 /* Load every single element of the list/set */
3296 while(zsetlen--) {
3297 robj *ele;
3298 double *score = zmalloc(sizeof(double));
3299
3300 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3301 tryObjectEncoding(ele);
3302 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3303 dictAdd(zs->dict,ele,score);
3304 zslInsert(zs->zsl,*score,ele);
3305 incrRefCount(ele); /* added to skiplist */
3306 }
3307 } else {
3308 redisAssert(0 != 0);
3309 }
3310 return o;
3311}
3312
f78fd11b 3313static int rdbLoad(char *filename) {
ed9b544e 3314 FILE *fp;
f78fd11b 3315 robj *keyobj = NULL;
3316 uint32_t dbid;
bb32ede5 3317 int type, retval, rdbver;
3305306f 3318 dict *d = server.db[0].dict;
bb32ede5 3319 redisDb *db = server.db+0;
f78fd11b 3320 char buf[1024];
bb32ede5 3321 time_t expiretime = -1, now = time(NULL);
b492cf00 3322 long long loadedkeys = 0;
bb32ede5 3323
ed9b544e 3324 fp = fopen(filename,"r");
3325 if (!fp) return REDIS_ERR;
3326 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 3327 buf[9] = '\0';
3328 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 3329 fclose(fp);
3330 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3331 return REDIS_ERR;
3332 }
f78fd11b 3333 rdbver = atoi(buf+5);
c78a8ccc 3334 if (rdbver != 1) {
f78fd11b 3335 fclose(fp);
3336 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3337 return REDIS_ERR;
3338 }
ed9b544e 3339 while(1) {
3340 robj *o;
3341
3342 /* Read type. */
f78fd11b 3343 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 3344 if (type == REDIS_EXPIRETIME) {
3345 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3346 /* We read the time so we need to read the object type again */
3347 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3348 }
ed9b544e 3349 if (type == REDIS_EOF) break;
3350 /* Handle SELECT DB opcode as a special case */
3351 if (type == REDIS_SELECTDB) {
c78a8ccc 3352 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 3353 goto eoferr;
ed9b544e 3354 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 3355 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 3356 exit(1);
3357 }
bb32ede5 3358 db = server.db+dbid;
3359 d = db->dict;
ed9b544e 3360 continue;
3361 }
3362 /* Read key */
c78a8ccc 3363 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3364 /* Read value */
3365 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
ed9b544e 3366 /* Add the new object in the hash table */
f78fd11b 3367 retval = dictAdd(d,keyobj,o);
ed9b544e 3368 if (retval == DICT_ERR) {
f78fd11b 3369 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
ed9b544e 3370 exit(1);
3371 }
bb32ede5 3372 /* Set the expire time if needed */
3373 if (expiretime != -1) {
3374 setExpire(db,keyobj,expiretime);
3375 /* Delete this key if already expired */
3376 if (expiretime < now) deleteKey(db,keyobj);
3377 expiretime = -1;
3378 }
f78fd11b 3379 keyobj = o = NULL;
b492cf00 3380 /* Handle swapping while loading big datasets when VM is on */
3381 loadedkeys++;
3382 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3383 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 3384 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 3385 }
3386 }
ed9b544e 3387 }
3388 fclose(fp);
3389 return REDIS_OK;
3390
3391eoferr: /* unexpected end of file is handled here with a fatal exit */
e3566d4b 3392 if (keyobj) decrRefCount(keyobj);
f80dff62 3393 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 3394 exit(1);
3395 return REDIS_ERR; /* Just to avoid warning */
3396}
3397
3398/*================================== Commands =============================== */
3399
abcb223e 3400static void authCommand(redisClient *c) {
2e77c2ee 3401 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
3402 c->authenticated = 1;
3403 addReply(c,shared.ok);
3404 } else {
3405 c->authenticated = 0;
fa4c0aba 3406 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
3407 }
3408}
3409
ed9b544e 3410static void pingCommand(redisClient *c) {
3411 addReply(c,shared.pong);
3412}
3413
3414static void echoCommand(redisClient *c) {
942a3961 3415 addReplyBulkLen(c,c->argv[1]);
ed9b544e 3416 addReply(c,c->argv[1]);
3417 addReply(c,shared.crlf);
3418}
3419
3420/*=================================== Strings =============================== */
3421
3422static void setGenericCommand(redisClient *c, int nx) {
3423 int retval;
3424
333fd216 3425 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3305306f 3426 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3427 if (retval == DICT_ERR) {
3428 if (!nx) {
1b03836c 3429 /* If the key is about a swapped value, we want a new key object
3430 * to overwrite the old. So we delete the old key in the database.
3431 * This will also make sure that swap pages about the old object
3432 * will be marked as free. */
3433 if (deleteIfSwapped(c->db,c->argv[1]))
3434 incrRefCount(c->argv[1]);
3305306f 3435 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3436 incrRefCount(c->argv[2]);
3437 } else {
c937aa89 3438 addReply(c,shared.czero);
ed9b544e 3439 return;
3440 }
3441 } else {
3442 incrRefCount(c->argv[1]);
3443 incrRefCount(c->argv[2]);
3444 }
3445 server.dirty++;
3305306f 3446 removeExpire(c->db,c->argv[1]);
c937aa89 3447 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 3448}
3449
3450static void setCommand(redisClient *c) {
a4d1ba9a 3451 setGenericCommand(c,0);
ed9b544e 3452}
3453
3454static void setnxCommand(redisClient *c) {
a4d1ba9a 3455 setGenericCommand(c,1);
ed9b544e 3456}
3457
322fc7d8 3458static int getGenericCommand(redisClient *c) {
3305306f 3459 robj *o = lookupKeyRead(c->db,c->argv[1]);
3460
3461 if (o == NULL) {
c937aa89 3462 addReply(c,shared.nullbulk);
322fc7d8 3463 return REDIS_OK;
ed9b544e 3464 } else {
ed9b544e 3465 if (o->type != REDIS_STRING) {
c937aa89 3466 addReply(c,shared.wrongtypeerr);
322fc7d8 3467 return REDIS_ERR;
ed9b544e 3468 } else {
942a3961 3469 addReplyBulkLen(c,o);
ed9b544e 3470 addReply(c,o);
3471 addReply(c,shared.crlf);
322fc7d8 3472 return REDIS_OK;
ed9b544e 3473 }
3474 }
3475}
3476
322fc7d8 3477static void getCommand(redisClient *c) {
3478 getGenericCommand(c);
3479}
3480
f6b141c5 3481static void getsetCommand(redisClient *c) {
322fc7d8 3482 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 3483 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3484 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3485 } else {
3486 incrRefCount(c->argv[1]);
3487 }
3488 incrRefCount(c->argv[2]);
3489 server.dirty++;
3490 removeExpire(c->db,c->argv[1]);
3491}
3492
70003d28 3493static void mgetCommand(redisClient *c) {
70003d28 3494 int j;
3495
c937aa89 3496 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 3497 for (j = 1; j < c->argc; j++) {
3305306f 3498 robj *o = lookupKeyRead(c->db,c->argv[j]);
3499 if (o == NULL) {
c937aa89 3500 addReply(c,shared.nullbulk);
70003d28 3501 } else {
70003d28 3502 if (o->type != REDIS_STRING) {
c937aa89 3503 addReply(c,shared.nullbulk);
70003d28 3504 } else {
942a3961 3505 addReplyBulkLen(c,o);
70003d28 3506 addReply(c,o);
3507 addReply(c,shared.crlf);
3508 }
3509 }
3510 }
3511}
3512
6c446631 3513static void msetGenericCommand(redisClient *c, int nx) {
906573e7 3514 int j, busykeys = 0;
6c446631 3515
3516 if ((c->argc % 2) == 0) {
454d4e43 3517 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 3518 return;
3519 }
3520 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3521 * set nothing at all if at least one already key exists. */
3522 if (nx) {
3523 for (j = 1; j < c->argc; j += 2) {
906573e7 3524 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3525 busykeys++;
6c446631 3526 }
3527 }
3528 }
906573e7 3529 if (busykeys) {
3530 addReply(c, shared.czero);
3531 return;
3532 }
6c446631 3533
3534 for (j = 1; j < c->argc; j += 2) {
3535 int retval;
3536
17511391 3537 tryObjectEncoding(c->argv[j+1]);
6c446631 3538 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3539 if (retval == DICT_ERR) {
3540 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3541 incrRefCount(c->argv[j+1]);
3542 } else {
3543 incrRefCount(c->argv[j]);
3544 incrRefCount(c->argv[j+1]);
3545 }
3546 removeExpire(c->db,c->argv[j]);
3547 }
3548 server.dirty += (c->argc-1)/2;
3549 addReply(c, nx ? shared.cone : shared.ok);
3550}
3551
3552static void msetCommand(redisClient *c) {
3553 msetGenericCommand(c,0);
3554}
3555
3556static void msetnxCommand(redisClient *c) {
3557 msetGenericCommand(c,1);
3558}
3559
d68ed120 3560static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 3561 long long value;
3562 int retval;
3563 robj *o;
3564
3305306f 3565 o = lookupKeyWrite(c->db,c->argv[1]);
3566 if (o == NULL) {
ed9b544e 3567 value = 0;
3568 } else {
ed9b544e 3569 if (o->type != REDIS_STRING) {
3570 value = 0;
3571 } else {
3572 char *eptr;
3573
942a3961 3574 if (o->encoding == REDIS_ENCODING_RAW)
3575 value = strtoll(o->ptr, &eptr, 10);
3576 else if (o->encoding == REDIS_ENCODING_INT)
3577 value = (long)o->ptr;
3578 else
dfc5e96c 3579 redisAssert(1 != 1);
ed9b544e 3580 }
3581 }
3582
3583 value += incr;
3584 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
942a3961 3585 tryObjectEncoding(o);
3305306f 3586 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 3587 if (retval == DICT_ERR) {
3305306f 3588 dictReplace(c->db->dict,c->argv[1],o);
3589 removeExpire(c->db,c->argv[1]);
ed9b544e 3590 } else {
3591 incrRefCount(c->argv[1]);
3592 }
3593 server.dirty++;
c937aa89 3594 addReply(c,shared.colon);
ed9b544e 3595 addReply(c,o);
3596 addReply(c,shared.crlf);
3597}
3598
3599static void incrCommand(redisClient *c) {
a4d1ba9a 3600 incrDecrCommand(c,1);
ed9b544e 3601}
3602
3603static void decrCommand(redisClient *c) {
a4d1ba9a 3604 incrDecrCommand(c,-1);
ed9b544e 3605}
3606
3607static void incrbyCommand(redisClient *c) {
d68ed120 3608 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3609 incrDecrCommand(c,incr);
ed9b544e 3610}
3611
3612static void decrbyCommand(redisClient *c) {
d68ed120 3613 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3614 incrDecrCommand(c,-incr);
ed9b544e 3615}
3616
3617/* ========================= Type agnostic commands ========================= */
3618
3619static void delCommand(redisClient *c) {
5109cdff 3620 int deleted = 0, j;
3621
3622 for (j = 1; j < c->argc; j++) {
3623 if (deleteKey(c->db,c->argv[j])) {
3624 server.dirty++;
3625 deleted++;
3626 }
3627 }
3628 switch(deleted) {
3629 case 0:
c937aa89 3630 addReply(c,shared.czero);
5109cdff 3631 break;
3632 case 1:
3633 addReply(c,shared.cone);
3634 break;
3635 default:
3636 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",deleted));
3637 break;
ed9b544e 3638 }
3639}
3640
3641static void existsCommand(redisClient *c) {
3305306f 3642 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
ed9b544e 3643}
3644
3645static void selectCommand(redisClient *c) {
3646 int id = atoi(c->argv[1]->ptr);
3647
3648 if (selectDb(c,id) == REDIS_ERR) {
774e3047 3649 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 3650 } else {
3651 addReply(c,shared.ok);
3652 }
3653}
3654
3655static void randomkeyCommand(redisClient *c) {
3656 dictEntry *de;
3305306f 3657
3658 while(1) {
3659 de = dictGetRandomKey(c->db->dict);
ce7bef07 3660 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 3661 }
ed9b544e 3662 if (de == NULL) {
ce7bef07 3663 addReply(c,shared.plus);
ed9b544e 3664 addReply(c,shared.crlf);
3665 } else {
c937aa89 3666 addReply(c,shared.plus);
ed9b544e 3667 addReply(c,dictGetEntryKey(de));
3668 addReply(c,shared.crlf);
3669 }
3670}
3671
3672static void keysCommand(redisClient *c) {
3673 dictIterator *di;
3674 dictEntry *de;
3675 sds pattern = c->argv[1]->ptr;
3676 int plen = sdslen(pattern);
682ac724 3677 unsigned long numkeys = 0, keyslen = 0;
ed9b544e 3678 robj *lenobj = createObject(REDIS_STRING,NULL);
3679
3305306f 3680 di = dictGetIterator(c->db->dict);
ed9b544e 3681 addReply(c,lenobj);
3682 decrRefCount(lenobj);
3683 while((de = dictNext(di)) != NULL) {
3684 robj *keyobj = dictGetEntryKey(de);
3305306f 3685
ed9b544e 3686 sds key = keyobj->ptr;
3687 if ((pattern[0] == '*' && pattern[1] == '\0') ||
3688 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 3689 if (expireIfNeeded(c->db,keyobj) == 0) {
3690 if (numkeys != 0)
3691 addReply(c,shared.space);
3692 addReply(c,keyobj);
3693 numkeys++;
3694 keyslen += sdslen(key);
3695 }
ed9b544e 3696 }
3697 }
3698 dictReleaseIterator(di);
c937aa89 3699 lenobj->ptr = sdscatprintf(sdsempty(),"$%lu\r\n",keyslen+(numkeys ? (numkeys-1) : 0));
ed9b544e 3700 addReply(c,shared.crlf);
3701}
3702
3703static void dbsizeCommand(redisClient *c) {
3704 addReplySds(c,
3305306f 3705 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 3706}
3707
3708static void lastsaveCommand(redisClient *c) {
3709 addReplySds(c,
c937aa89 3710 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 3711}
3712
3713static void typeCommand(redisClient *c) {
3305306f 3714 robj *o;
ed9b544e 3715 char *type;
3305306f 3716
3717 o = lookupKeyRead(c->db,c->argv[1]);
3718 if (o == NULL) {
c937aa89 3719 type = "+none";
ed9b544e 3720 } else {
ed9b544e 3721 switch(o->type) {
c937aa89 3722 case REDIS_STRING: type = "+string"; break;
3723 case REDIS_LIST: type = "+list"; break;
3724 case REDIS_SET: type = "+set"; break;
412a8bce 3725 case REDIS_ZSET: type = "+zset"; break;
ed9b544e 3726 default: type = "unknown"; break;
3727 }
3728 }
3729 addReplySds(c,sdsnew(type));
3730 addReply(c,shared.crlf);
3731}
3732
3733static void saveCommand(redisClient *c) {
9d65a1bb 3734 if (server.bgsavechildpid != -1) {
05557f6d 3735 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
3736 return;
3737 }
f78fd11b 3738 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 3739 addReply(c,shared.ok);
3740 } else {
3741 addReply(c,shared.err);
3742 }
3743}
3744
3745static void bgsaveCommand(redisClient *c) {
9d65a1bb 3746 if (server.bgsavechildpid != -1) {
ed9b544e 3747 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
3748 return;
3749 }
f78fd11b 3750 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 3751 char *status = "+Background saving started\r\n";
3752 addReplySds(c,sdsnew(status));
ed9b544e 3753 } else {
3754 addReply(c,shared.err);
3755 }
3756}
3757
3758static void shutdownCommand(redisClient *c) {
3759 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
a3b21203 3760 /* Kill the saving child if there is a background saving in progress.
3761 We want to avoid race conditions, for instance our saving child may
3762 overwrite the synchronous saving did by SHUTDOWN. */
9d65a1bb 3763 if (server.bgsavechildpid != -1) {
9f3c422c 3764 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
3765 kill(server.bgsavechildpid,SIGKILL);
a3b21203 3766 rdbRemoveTempFile(server.bgsavechildpid);
9f3c422c 3767 }
ac945e2d 3768 if (server.appendonly) {
3769 /* Append only file: fsync() the AOF and exit */
3770 fsync(server.appendfd);
3771 exit(0);
ed9b544e 3772 } else {
ac945e2d 3773 /* Snapshotting. Perform a SYNC SAVE and exit */
3774 if (rdbSave(server.dbfilename) == REDIS_OK) {
3775 if (server.daemonize)
3776 unlink(server.pidfile);
3777 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
3778 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
3779 exit(0);
3780 } else {
3781 /* Ooops.. error saving! The best we can do is to continue operating.
3782 * Note that if there was a background saving process, in the next
3783 * cron() Redis will be notified that the background saving aborted,
3784 * handling special stuff like slaves pending for synchronization... */
3785 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
3786 addReplySds(c,sdsnew("-ERR can't quit, problems saving the DB\r\n"));
3787 }
ed9b544e 3788 }
3789}
3790
3791static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 3792 robj *o;
3793
3794 /* To use the same key as src and dst is probably an error */
3795 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 3796 addReply(c,shared.sameobjecterr);
ed9b544e 3797 return;
3798 }
3799
3305306f 3800 o = lookupKeyWrite(c->db,c->argv[1]);
3801 if (o == NULL) {
c937aa89 3802 addReply(c,shared.nokeyerr);
ed9b544e 3803 return;
3804 }
ed9b544e 3805 incrRefCount(o);
3305306f 3806 deleteIfVolatile(c->db,c->argv[2]);
3807 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 3808 if (nx) {
3809 decrRefCount(o);
c937aa89 3810 addReply(c,shared.czero);
ed9b544e 3811 return;
3812 }
3305306f 3813 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 3814 } else {
3815 incrRefCount(c->argv[2]);
3816 }
3305306f 3817 deleteKey(c->db,c->argv[1]);
ed9b544e 3818 server.dirty++;
c937aa89 3819 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 3820}
3821
3822static void renameCommand(redisClient *c) {
3823 renameGenericCommand(c,0);
3824}
3825
3826static void renamenxCommand(redisClient *c) {
3827 renameGenericCommand(c,1);
3828}
3829
3830static void moveCommand(redisClient *c) {
3305306f 3831 robj *o;
3832 redisDb *src, *dst;
ed9b544e 3833 int srcid;
3834
3835 /* Obtain source and target DB pointers */
3305306f 3836 src = c->db;
3837 srcid = c->db->id;
ed9b544e 3838 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 3839 addReply(c,shared.outofrangeerr);
ed9b544e 3840 return;
3841 }
3305306f 3842 dst = c->db;
3843 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 3844
3845 /* If the user is moving using as target the same
3846 * DB as the source DB it is probably an error. */
3847 if (src == dst) {
c937aa89 3848 addReply(c,shared.sameobjecterr);
ed9b544e 3849 return;
3850 }
3851
3852 /* Check if the element exists and get a reference */
3305306f 3853 o = lookupKeyWrite(c->db,c->argv[1]);
3854 if (!o) {
c937aa89 3855 addReply(c,shared.czero);
ed9b544e 3856 return;
3857 }
3858
3859 /* Try to add the element to the target DB */
3305306f 3860 deleteIfVolatile(dst,c->argv[1]);
3861 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 3862 addReply(c,shared.czero);
ed9b544e 3863 return;
3864 }
3305306f 3865 incrRefCount(c->argv[1]);
ed9b544e 3866 incrRefCount(o);
3867
3868 /* OK! key moved, free the entry in the source DB */
3305306f 3869 deleteKey(src,c->argv[1]);
ed9b544e 3870 server.dirty++;
c937aa89 3871 addReply(c,shared.cone);
ed9b544e 3872}
3873
3874/* =================================== Lists ================================ */
3875static void pushGenericCommand(redisClient *c, int where) {
3876 robj *lobj;
ed9b544e 3877 list *list;
3305306f 3878
3879 lobj = lookupKeyWrite(c->db,c->argv[1]);
3880 if (lobj == NULL) {
95242ab5 3881 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
3882 addReply(c,shared.ok);
3883 return;
3884 }
ed9b544e 3885 lobj = createListObject();
3886 list = lobj->ptr;
3887 if (where == REDIS_HEAD) {
6b47e12e 3888 listAddNodeHead(list,c->argv[2]);
ed9b544e 3889 } else {
6b47e12e 3890 listAddNodeTail(list,c->argv[2]);
ed9b544e 3891 }
3305306f 3892 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 3893 incrRefCount(c->argv[1]);
3894 incrRefCount(c->argv[2]);
3895 } else {
ed9b544e 3896 if (lobj->type != REDIS_LIST) {
3897 addReply(c,shared.wrongtypeerr);
3898 return;
3899 }
95242ab5 3900 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
3901 addReply(c,shared.ok);
3902 return;
3903 }
ed9b544e 3904 list = lobj->ptr;
3905 if (where == REDIS_HEAD) {
6b47e12e 3906 listAddNodeHead(list,c->argv[2]);
ed9b544e 3907 } else {
6b47e12e 3908 listAddNodeTail(list,c->argv[2]);
ed9b544e 3909 }
3910 incrRefCount(c->argv[2]);
3911 }
3912 server.dirty++;
3913 addReply(c,shared.ok);
3914}
3915
3916static void lpushCommand(redisClient *c) {
3917 pushGenericCommand(c,REDIS_HEAD);
3918}
3919
3920static void rpushCommand(redisClient *c) {
3921 pushGenericCommand(c,REDIS_TAIL);
3922}
3923
3924static void llenCommand(redisClient *c) {
3305306f 3925 robj *o;
ed9b544e 3926 list *l;
3927
3305306f 3928 o = lookupKeyRead(c->db,c->argv[1]);
3929 if (o == NULL) {
c937aa89 3930 addReply(c,shared.czero);
ed9b544e 3931 return;
3932 } else {
ed9b544e 3933 if (o->type != REDIS_LIST) {
c937aa89 3934 addReply(c,shared.wrongtypeerr);
ed9b544e 3935 } else {
3936 l = o->ptr;
c937aa89 3937 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(l)));
ed9b544e 3938 }
3939 }
3940}
3941
3942static void lindexCommand(redisClient *c) {
3305306f 3943 robj *o;
ed9b544e 3944 int index = atoi(c->argv[2]->ptr);
3945
3305306f 3946 o = lookupKeyRead(c->db,c->argv[1]);
3947 if (o == NULL) {
c937aa89 3948 addReply(c,shared.nullbulk);
ed9b544e 3949 } else {
ed9b544e 3950 if (o->type != REDIS_LIST) {
c937aa89 3951 addReply(c,shared.wrongtypeerr);
ed9b544e 3952 } else {
3953 list *list = o->ptr;
3954 listNode *ln;
3955
3956 ln = listIndex(list, index);
3957 if (ln == NULL) {
c937aa89 3958 addReply(c,shared.nullbulk);
ed9b544e 3959 } else {
3960 robj *ele = listNodeValue(ln);
942a3961 3961 addReplyBulkLen(c,ele);
ed9b544e 3962 addReply(c,ele);
3963 addReply(c,shared.crlf);
3964 }
3965 }
3966 }
3967}
3968
3969static void lsetCommand(redisClient *c) {
3305306f 3970 robj *o;
ed9b544e 3971 int index = atoi(c->argv[2]->ptr);
3972
3305306f 3973 o = lookupKeyWrite(c->db,c->argv[1]);
3974 if (o == NULL) {
ed9b544e 3975 addReply(c,shared.nokeyerr);
3976 } else {
ed9b544e 3977 if (o->type != REDIS_LIST) {
3978 addReply(c,shared.wrongtypeerr);
3979 } else {
3980 list *list = o->ptr;
3981 listNode *ln;
3982
3983 ln = listIndex(list, index);
3984 if (ln == NULL) {
c937aa89 3985 addReply(c,shared.outofrangeerr);
ed9b544e 3986 } else {
3987 robj *ele = listNodeValue(ln);
3988
3989 decrRefCount(ele);
3990 listNodeValue(ln) = c->argv[3];
3991 incrRefCount(c->argv[3]);
3992 addReply(c,shared.ok);
3993 server.dirty++;
3994 }
3995 }
3996 }
3997}
3998
3999static void popGenericCommand(redisClient *c, int where) {
3305306f 4000 robj *o;
4001
4002 o = lookupKeyWrite(c->db,c->argv[1]);
4003 if (o == NULL) {
c937aa89 4004 addReply(c,shared.nullbulk);
ed9b544e 4005 } else {
ed9b544e 4006 if (o->type != REDIS_LIST) {
c937aa89 4007 addReply(c,shared.wrongtypeerr);
ed9b544e 4008 } else {
4009 list *list = o->ptr;
4010 listNode *ln;
4011
4012 if (where == REDIS_HEAD)
4013 ln = listFirst(list);
4014 else
4015 ln = listLast(list);
4016
4017 if (ln == NULL) {
c937aa89 4018 addReply(c,shared.nullbulk);
ed9b544e 4019 } else {
4020 robj *ele = listNodeValue(ln);
942a3961 4021 addReplyBulkLen(c,ele);
ed9b544e 4022 addReply(c,ele);
4023 addReply(c,shared.crlf);
4024 listDelNode(list,ln);
4025 server.dirty++;
4026 }
4027 }
4028 }
4029}
4030
4031static void lpopCommand(redisClient *c) {
4032 popGenericCommand(c,REDIS_HEAD);
4033}
4034
4035static void rpopCommand(redisClient *c) {
4036 popGenericCommand(c,REDIS_TAIL);
4037}
4038
4039static void lrangeCommand(redisClient *c) {
3305306f 4040 robj *o;
ed9b544e 4041 int start = atoi(c->argv[2]->ptr);
4042 int end = atoi(c->argv[3]->ptr);
3305306f 4043
4044 o = lookupKeyRead(c->db,c->argv[1]);
4045 if (o == NULL) {
c937aa89 4046 addReply(c,shared.nullmultibulk);
ed9b544e 4047 } else {
ed9b544e 4048 if (o->type != REDIS_LIST) {
c937aa89 4049 addReply(c,shared.wrongtypeerr);
ed9b544e 4050 } else {
4051 list *list = o->ptr;
4052 listNode *ln;
4053 int llen = listLength(list);
4054 int rangelen, j;
4055 robj *ele;
4056
4057 /* convert negative indexes */
4058 if (start < 0) start = llen+start;
4059 if (end < 0) end = llen+end;
4060 if (start < 0) start = 0;
4061 if (end < 0) end = 0;
4062
4063 /* indexes sanity checks */
4064 if (start > end || start >= llen) {
4065 /* Out of range start or start > end result in empty list */
c937aa89 4066 addReply(c,shared.emptymultibulk);
ed9b544e 4067 return;
4068 }
4069 if (end >= llen) end = llen-1;
4070 rangelen = (end-start)+1;
4071
4072 /* Return the result in form of a multi-bulk reply */
4073 ln = listIndex(list, start);
c937aa89 4074 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
ed9b544e 4075 for (j = 0; j < rangelen; j++) {
4076 ele = listNodeValue(ln);
942a3961 4077 addReplyBulkLen(c,ele);
ed9b544e 4078 addReply(c,ele);
4079 addReply(c,shared.crlf);
4080 ln = ln->next;
4081 }
4082 }
4083 }
4084}
4085
4086static void ltrimCommand(redisClient *c) {
3305306f 4087 robj *o;
ed9b544e 4088 int start = atoi(c->argv[2]->ptr);
4089 int end = atoi(c->argv[3]->ptr);
4090
3305306f 4091 o = lookupKeyWrite(c->db,c->argv[1]);
4092 if (o == NULL) {
ab9d4cb1 4093 addReply(c,shared.ok);
ed9b544e 4094 } else {
ed9b544e 4095 if (o->type != REDIS_LIST) {
4096 addReply(c,shared.wrongtypeerr);
4097 } else {
4098 list *list = o->ptr;
4099 listNode *ln;
4100 int llen = listLength(list);
4101 int j, ltrim, rtrim;
4102
4103 /* convert negative indexes */
4104 if (start < 0) start = llen+start;
4105 if (end < 0) end = llen+end;
4106 if (start < 0) start = 0;
4107 if (end < 0) end = 0;
4108
4109 /* indexes sanity checks */
4110 if (start > end || start >= llen) {
4111 /* Out of range start or start > end result in empty list */
4112 ltrim = llen;
4113 rtrim = 0;
4114 } else {
4115 if (end >= llen) end = llen-1;
4116 ltrim = start;
4117 rtrim = llen-end-1;
4118 }
4119
4120 /* Remove list elements to perform the trim */
4121 for (j = 0; j < ltrim; j++) {
4122 ln = listFirst(list);
4123 listDelNode(list,ln);
4124 }
4125 for (j = 0; j < rtrim; j++) {
4126 ln = listLast(list);
4127 listDelNode(list,ln);
4128 }
ed9b544e 4129 server.dirty++;
e59229a2 4130 addReply(c,shared.ok);
ed9b544e 4131 }
4132 }
4133}
4134
4135static void lremCommand(redisClient *c) {
3305306f 4136 robj *o;
ed9b544e 4137
3305306f 4138 o = lookupKeyWrite(c->db,c->argv[1]);
4139 if (o == NULL) {
33c08b39 4140 addReply(c,shared.czero);
ed9b544e 4141 } else {
ed9b544e 4142 if (o->type != REDIS_LIST) {
c937aa89 4143 addReply(c,shared.wrongtypeerr);
ed9b544e 4144 } else {
4145 list *list = o->ptr;
4146 listNode *ln, *next;
4147 int toremove = atoi(c->argv[2]->ptr);
4148 int removed = 0;
4149 int fromtail = 0;
4150
4151 if (toremove < 0) {
4152 toremove = -toremove;
4153 fromtail = 1;
4154 }
4155 ln = fromtail ? list->tail : list->head;
4156 while (ln) {
ed9b544e 4157 robj *ele = listNodeValue(ln);
a4d1ba9a 4158
4159 next = fromtail ? ln->prev : ln->next;
724a51b1 4160 if (compareStringObjects(ele,c->argv[3]) == 0) {
ed9b544e 4161 listDelNode(list,ln);
4162 server.dirty++;
4163 removed++;
4164 if (toremove && removed == toremove) break;
4165 }
4166 ln = next;
4167 }
c937aa89 4168 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 4169 }
4170 }
4171}
4172
12f9d551 4173/* This is the semantic of this command:
0f5f7e9a 4174 * RPOPLPUSH srclist dstlist:
12f9d551 4175 * IF LLEN(srclist) > 0
4176 * element = RPOP srclist
4177 * LPUSH dstlist element
4178 * RETURN element
4179 * ELSE
4180 * RETURN nil
4181 * END
4182 * END
4183 *
4184 * The idea is to be able to get an element from a list in a reliable way
4185 * since the element is not just returned but pushed against another list
4186 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4187 */
0f5f7e9a 4188static void rpoplpushcommand(redisClient *c) {
12f9d551 4189 robj *sobj;
4190
4191 sobj = lookupKeyWrite(c->db,c->argv[1]);
4192 if (sobj == NULL) {
4193 addReply(c,shared.nullbulk);
4194 } else {
4195 if (sobj->type != REDIS_LIST) {
4196 addReply(c,shared.wrongtypeerr);
4197 } else {
4198 list *srclist = sobj->ptr;
4199 listNode *ln = listLast(srclist);
4200
4201 if (ln == NULL) {
4202 addReply(c,shared.nullbulk);
4203 } else {
4204 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4205 robj *ele = listNodeValue(ln);
4206 list *dstlist;
4207
e20fb74f 4208 if (dobj && dobj->type != REDIS_LIST) {
12f9d551 4209 addReply(c,shared.wrongtypeerr);
4210 return;
4211 }
e20fb74f 4212
4213 /* Add the element to the target list (unless it's directly
4214 * passed to some BLPOP-ing client */
4215 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4216 if (dobj == NULL) {
4217 /* Create the list if the key does not exist */
4218 dobj = createListObject();
4219 dictAdd(c->db->dict,c->argv[2],dobj);
4220 incrRefCount(c->argv[2]);
4221 }
4222 dstlist = dobj->ptr;
4223 listAddNodeHead(dstlist,ele);
4224 incrRefCount(ele);
4225 }
12f9d551 4226
4227 /* Send the element to the client as reply as well */
4228 addReplyBulkLen(c,ele);
4229 addReply(c,ele);
4230 addReply(c,shared.crlf);
4231
4232 /* Finally remove the element from the source list */
4233 listDelNode(srclist,ln);
4234 server.dirty++;
4235 }
4236 }
4237 }
4238}
4239
4240
ed9b544e 4241/* ==================================== Sets ================================ */
4242
4243static void saddCommand(redisClient *c) {
ed9b544e 4244 robj *set;
4245
3305306f 4246 set = lookupKeyWrite(c->db,c->argv[1]);
4247 if (set == NULL) {
ed9b544e 4248 set = createSetObject();
3305306f 4249 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 4250 incrRefCount(c->argv[1]);
4251 } else {
ed9b544e 4252 if (set->type != REDIS_SET) {
c937aa89 4253 addReply(c,shared.wrongtypeerr);
ed9b544e 4254 return;
4255 }
4256 }
4257 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4258 incrRefCount(c->argv[2]);
4259 server.dirty++;
c937aa89 4260 addReply(c,shared.cone);
ed9b544e 4261 } else {
c937aa89 4262 addReply(c,shared.czero);
ed9b544e 4263 }
4264}
4265
4266static void sremCommand(redisClient *c) {
3305306f 4267 robj *set;
ed9b544e 4268
3305306f 4269 set = lookupKeyWrite(c->db,c->argv[1]);
4270 if (set == NULL) {
c937aa89 4271 addReply(c,shared.czero);
ed9b544e 4272 } else {
ed9b544e 4273 if (set->type != REDIS_SET) {
c937aa89 4274 addReply(c,shared.wrongtypeerr);
ed9b544e 4275 return;
4276 }
4277 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4278 server.dirty++;
12fea928 4279 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
c937aa89 4280 addReply(c,shared.cone);
ed9b544e 4281 } else {
c937aa89 4282 addReply(c,shared.czero);
ed9b544e 4283 }
4284 }
4285}
4286
a4460ef4 4287static void smoveCommand(redisClient *c) {
4288 robj *srcset, *dstset;
4289
4290 srcset = lookupKeyWrite(c->db,c->argv[1]);
4291 dstset = lookupKeyWrite(c->db,c->argv[2]);
4292
4293 /* If the source key does not exist return 0, if it's of the wrong type
4294 * raise an error */
4295 if (srcset == NULL || srcset->type != REDIS_SET) {
4296 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4297 return;
4298 }
4299 /* Error if the destination key is not a set as well */
4300 if (dstset && dstset->type != REDIS_SET) {
4301 addReply(c,shared.wrongtypeerr);
4302 return;
4303 }
4304 /* Remove the element from the source set */
4305 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4306 /* Key not found in the src set! return zero */
4307 addReply(c,shared.czero);
4308 return;
4309 }
4310 server.dirty++;
4311 /* Add the element to the destination set */
4312 if (!dstset) {
4313 dstset = createSetObject();
4314 dictAdd(c->db->dict,c->argv[2],dstset);
4315 incrRefCount(c->argv[2]);
4316 }
4317 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4318 incrRefCount(c->argv[3]);
4319 addReply(c,shared.cone);
4320}
4321
ed9b544e 4322static void sismemberCommand(redisClient *c) {
3305306f 4323 robj *set;
ed9b544e 4324
3305306f 4325 set = lookupKeyRead(c->db,c->argv[1]);
4326 if (set == NULL) {
c937aa89 4327 addReply(c,shared.czero);
ed9b544e 4328 } else {
ed9b544e 4329 if (set->type != REDIS_SET) {
c937aa89 4330 addReply(c,shared.wrongtypeerr);
ed9b544e 4331 return;
4332 }
4333 if (dictFind(set->ptr,c->argv[2]))
c937aa89 4334 addReply(c,shared.cone);
ed9b544e 4335 else
c937aa89 4336 addReply(c,shared.czero);
ed9b544e 4337 }
4338}
4339
4340static void scardCommand(redisClient *c) {
3305306f 4341 robj *o;
ed9b544e 4342 dict *s;
4343
3305306f 4344 o = lookupKeyRead(c->db,c->argv[1]);
4345 if (o == NULL) {
c937aa89 4346 addReply(c,shared.czero);
ed9b544e 4347 return;
4348 } else {
ed9b544e 4349 if (o->type != REDIS_SET) {
c937aa89 4350 addReply(c,shared.wrongtypeerr);
ed9b544e 4351 } else {
4352 s = o->ptr;
682ac724 4353 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
3305306f 4354 dictSize(s)));
ed9b544e 4355 }
4356 }
4357}
4358
12fea928 4359static void spopCommand(redisClient *c) {
4360 robj *set;
4361 dictEntry *de;
4362
4363 set = lookupKeyWrite(c->db,c->argv[1]);
4364 if (set == NULL) {
4365 addReply(c,shared.nullbulk);
4366 } else {
4367 if (set->type != REDIS_SET) {
4368 addReply(c,shared.wrongtypeerr);
4369 return;
4370 }
4371 de = dictGetRandomKey(set->ptr);
4372 if (de == NULL) {
4373 addReply(c,shared.nullbulk);
4374 } else {
4375 robj *ele = dictGetEntryKey(de);
4376
942a3961 4377 addReplyBulkLen(c,ele);
12fea928 4378 addReply(c,ele);
4379 addReply(c,shared.crlf);
4380 dictDelete(set->ptr,ele);
4381 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
4382 server.dirty++;
4383 }
4384 }
4385}
4386
2abb95a9 4387static void srandmemberCommand(redisClient *c) {
4388 robj *set;
4389 dictEntry *de;
4390
4391 set = lookupKeyRead(c->db,c->argv[1]);
4392 if (set == NULL) {
4393 addReply(c,shared.nullbulk);
4394 } else {
4395 if (set->type != REDIS_SET) {
4396 addReply(c,shared.wrongtypeerr);
4397 return;
4398 }
4399 de = dictGetRandomKey(set->ptr);
4400 if (de == NULL) {
4401 addReply(c,shared.nullbulk);
4402 } else {
4403 robj *ele = dictGetEntryKey(de);
4404
4405 addReplyBulkLen(c,ele);
4406 addReply(c,ele);
4407 addReply(c,shared.crlf);
4408 }
4409 }
4410}
4411
ed9b544e 4412static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4413 dict **d1 = (void*) s1, **d2 = (void*) s2;
4414
3305306f 4415 return dictSize(*d1)-dictSize(*d2);
ed9b544e 4416}
4417
682ac724 4418static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 4419 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4420 dictIterator *di;
4421 dictEntry *de;
4422 robj *lenobj = NULL, *dstset = NULL;
682ac724 4423 unsigned long j, cardinality = 0;
ed9b544e 4424
ed9b544e 4425 for (j = 0; j < setsnum; j++) {
4426 robj *setobj;
3305306f 4427
4428 setobj = dstkey ?
4429 lookupKeyWrite(c->db,setskeys[j]) :
4430 lookupKeyRead(c->db,setskeys[j]);
4431 if (!setobj) {
ed9b544e 4432 zfree(dv);
5faa6025 4433 if (dstkey) {
fdcaae84 4434 if (deleteKey(c->db,dstkey))
4435 server.dirty++;
0d36ded0 4436 addReply(c,shared.czero);
5faa6025 4437 } else {
4438 addReply(c,shared.nullmultibulk);
4439 }
ed9b544e 4440 return;
4441 }
ed9b544e 4442 if (setobj->type != REDIS_SET) {
4443 zfree(dv);
c937aa89 4444 addReply(c,shared.wrongtypeerr);
ed9b544e 4445 return;
4446 }
4447 dv[j] = setobj->ptr;
4448 }
4449 /* Sort sets from the smallest to largest, this will improve our
4450 * algorithm's performace */
4451 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4452
4453 /* The first thing we should output is the total number of elements...
4454 * since this is a multi-bulk write, but at this stage we don't know
4455 * the intersection set size, so we use a trick, append an empty object
4456 * to the output list and save the pointer to later modify it with the
4457 * right length */
4458 if (!dstkey) {
4459 lenobj = createObject(REDIS_STRING,NULL);
4460 addReply(c,lenobj);
4461 decrRefCount(lenobj);
4462 } else {
4463 /* If we have a target key where to store the resulting set
4464 * create this key with an empty set inside */
4465 dstset = createSetObject();
ed9b544e 4466 }
4467
4468 /* Iterate all the elements of the first (smallest) set, and test
4469 * the element against all the other sets, if at least one set does
4470 * not include the element it is discarded */
4471 di = dictGetIterator(dv[0]);
ed9b544e 4472
4473 while((de = dictNext(di)) != NULL) {
4474 robj *ele;
4475
4476 for (j = 1; j < setsnum; j++)
4477 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4478 if (j != setsnum)
4479 continue; /* at least one set does not contain the member */
4480 ele = dictGetEntryKey(de);
4481 if (!dstkey) {
942a3961 4482 addReplyBulkLen(c,ele);
ed9b544e 4483 addReply(c,ele);
4484 addReply(c,shared.crlf);
4485 cardinality++;
4486 } else {
4487 dictAdd(dstset->ptr,ele,NULL);
4488 incrRefCount(ele);
4489 }
4490 }
4491 dictReleaseIterator(di);
4492
83cdfe18
AG
4493 if (dstkey) {
4494 /* Store the resulting set into the target */
4495 deleteKey(c->db,dstkey);
4496 dictAdd(c->db->dict,dstkey,dstset);
4497 incrRefCount(dstkey);
4498 }
4499
40d224a9 4500 if (!dstkey) {
682ac724 4501 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 4502 } else {
682ac724 4503 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
03fd01c7 4504 dictSize((dict*)dstset->ptr)));
40d224a9 4505 server.dirty++;
4506 }
ed9b544e 4507 zfree(dv);
4508}
4509
4510static void sinterCommand(redisClient *c) {
4511 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4512}
4513
4514static void sinterstoreCommand(redisClient *c) {
4515 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4516}
4517
f4f56e1d 4518#define REDIS_OP_UNION 0
4519#define REDIS_OP_DIFF 1
4520
4521static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 4522 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4523 dictIterator *di;
4524 dictEntry *de;
f4f56e1d 4525 robj *dstset = NULL;
40d224a9 4526 int j, cardinality = 0;
4527
40d224a9 4528 for (j = 0; j < setsnum; j++) {
4529 robj *setobj;
4530
4531 setobj = dstkey ?
4532 lookupKeyWrite(c->db,setskeys[j]) :
4533 lookupKeyRead(c->db,setskeys[j]);
4534 if (!setobj) {
4535 dv[j] = NULL;
4536 continue;
4537 }
4538 if (setobj->type != REDIS_SET) {
4539 zfree(dv);
4540 addReply(c,shared.wrongtypeerr);
4541 return;
4542 }
4543 dv[j] = setobj->ptr;
4544 }
4545
4546 /* We need a temp set object to store our union. If the dstkey
4547 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4548 * this set object will be the resulting object to set into the target key*/
4549 dstset = createSetObject();
4550
40d224a9 4551 /* Iterate all the elements of all the sets, add every element a single
4552 * time to the result set */
4553 for (j = 0; j < setsnum; j++) {
51829ed3 4554 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 4555 if (!dv[j]) continue; /* non existing keys are like empty sets */
4556
4557 di = dictGetIterator(dv[j]);
40d224a9 4558
4559 while((de = dictNext(di)) != NULL) {
4560 robj *ele;
4561
4562 /* dictAdd will not add the same element multiple times */
4563 ele = dictGetEntryKey(de);
f4f56e1d 4564 if (op == REDIS_OP_UNION || j == 0) {
4565 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4566 incrRefCount(ele);
40d224a9 4567 cardinality++;
4568 }
f4f56e1d 4569 } else if (op == REDIS_OP_DIFF) {
4570 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4571 cardinality--;
4572 }
40d224a9 4573 }
4574 }
4575 dictReleaseIterator(di);
51829ed3
AG
4576
4577 if (op == REDIS_OP_DIFF && cardinality == 0) break; /* result set is empty */
40d224a9 4578 }
4579
f4f56e1d 4580 /* Output the content of the resulting set, if not in STORE mode */
4581 if (!dstkey) {
4582 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4583 di = dictGetIterator(dstset->ptr);
f4f56e1d 4584 while((de = dictNext(di)) != NULL) {
4585 robj *ele;
4586
4587 ele = dictGetEntryKey(de);
942a3961 4588 addReplyBulkLen(c,ele);
f4f56e1d 4589 addReply(c,ele);
4590 addReply(c,shared.crlf);
4591 }
4592 dictReleaseIterator(di);
83cdfe18
AG
4593 } else {
4594 /* If we have a target key where to store the resulting set
4595 * create this key with the result set inside */
4596 deleteKey(c->db,dstkey);
4597 dictAdd(c->db->dict,dstkey,dstset);
4598 incrRefCount(dstkey);
f4f56e1d 4599 }
4600
4601 /* Cleanup */
40d224a9 4602 if (!dstkey) {
40d224a9 4603 decrRefCount(dstset);
4604 } else {
682ac724 4605 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",
03fd01c7 4606 dictSize((dict*)dstset->ptr)));
40d224a9 4607 server.dirty++;
4608 }
4609 zfree(dv);
4610}
4611
4612static void sunionCommand(redisClient *c) {
f4f56e1d 4613 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 4614}
4615
4616static void sunionstoreCommand(redisClient *c) {
f4f56e1d 4617 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4618}
4619
4620static void sdiffCommand(redisClient *c) {
4621 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4622}
4623
4624static void sdiffstoreCommand(redisClient *c) {
4625 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 4626}
4627
6b47e12e 4628/* ==================================== ZSets =============================== */
4629
4630/* ZSETs are ordered sets using two data structures to hold the same elements
4631 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4632 * data structure.
4633 *
4634 * The elements are added to a hash table mapping Redis objects to scores.
4635 * At the same time the elements are added to a skip list mapping scores
4636 * to Redis objects (so objects are sorted by scores in this "view"). */
4637
4638/* This skiplist implementation is almost a C translation of the original
4639 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
4640 * Alternative to Balanced Trees", modified in three ways:
4641 * a) this implementation allows for repeated values.
4642 * b) the comparison is not just by key (our 'score') but by satellite data.
4643 * c) there is a back pointer, so it's a doubly linked list with the back
4644 * pointers being only at "level 1". This allows to traverse the list
4645 * from tail to head, useful for ZREVRANGE. */
4646
4647static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
4648 zskiplistNode *zn = zmalloc(sizeof(*zn));
4649
4650 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
4651 zn->score = score;
4652 zn->obj = obj;
4653 return zn;
4654}
4655
4656static zskiplist *zslCreate(void) {
4657 int j;
4658 zskiplist *zsl;
4659
4660 zsl = zmalloc(sizeof(*zsl));
4661 zsl->level = 1;
cc812361 4662 zsl->length = 0;
6b47e12e 4663 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
4664 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++)
4665 zsl->header->forward[j] = NULL;
e3870fab 4666 zsl->header->backward = NULL;
4667 zsl->tail = NULL;
6b47e12e 4668 return zsl;
4669}
4670
fd8ccf44 4671static void zslFreeNode(zskiplistNode *node) {
4672 decrRefCount(node->obj);
ad807e6f 4673 zfree(node->forward);
fd8ccf44 4674 zfree(node);
4675}
4676
4677static void zslFree(zskiplist *zsl) {
ad807e6f 4678 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 4679
ad807e6f 4680 zfree(zsl->header->forward);
4681 zfree(zsl->header);
fd8ccf44 4682 while(node) {
599379dd 4683 next = node->forward[0];
fd8ccf44 4684 zslFreeNode(node);
4685 node = next;
4686 }
ad807e6f 4687 zfree(zsl);
fd8ccf44 4688}
4689
6b47e12e 4690static int zslRandomLevel(void) {
4691 int level = 1;
4692 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
4693 level += 1;
4694 return level;
4695}
4696
4697static void zslInsert(zskiplist *zsl, double score, robj *obj) {
4698 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4699 int i, level;
4700
4701 x = zsl->header;
4702 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 4703 while (x->forward[i] &&
4704 (x->forward[i]->score < score ||
4705 (x->forward[i]->score == score &&
4706 compareStringObjects(x->forward[i]->obj,obj) < 0)))
6b47e12e 4707 x = x->forward[i];
4708 update[i] = x;
4709 }
6b47e12e 4710 /* we assume the key is not already inside, since we allow duplicated
4711 * scores, and the re-insertion of score and redis object should never
4712 * happpen since the caller of zslInsert() should test in the hash table
4713 * if the element is already inside or not. */
4714 level = zslRandomLevel();
4715 if (level > zsl->level) {
4716 for (i = zsl->level; i < level; i++)
4717 update[i] = zsl->header;
4718 zsl->level = level;
4719 }
4720 x = zslCreateNode(level,score,obj);
4721 for (i = 0; i < level; i++) {
4722 x->forward[i] = update[i]->forward[i];
4723 update[i]->forward[i] = x;
4724 }
bb975144 4725 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 4726 if (x->forward[0])
4727 x->forward[0]->backward = x;
4728 else
4729 zsl->tail = x;
cc812361 4730 zsl->length++;
6b47e12e 4731}
4732
50c55df5 4733/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 4734static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 4735 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4736 int i;
4737
4738 x = zsl->header;
4739 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 4740 while (x->forward[i] &&
4741 (x->forward[i]->score < score ||
4742 (x->forward[i]->score == score &&
4743 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 4744 x = x->forward[i];
4745 update[i] = x;
4746 }
4747 /* We may have multiple elements with the same score, what we need
4748 * is to find the element with both the right score and object. */
4749 x = x->forward[0];
50c55df5 4750 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
9d60e6e4 4751 for (i = 0; i < zsl->level; i++) {
4752 if (update[i]->forward[i] != x) break;
4753 update[i]->forward[i] = x->forward[i];
4754 }
4755 if (x->forward[0]) {
4756 x->forward[0]->backward = (x->backward == zsl->header) ?
4757 NULL : x->backward;
e197b441 4758 } else {
9d60e6e4 4759 zsl->tail = x->backward;
e197b441 4760 }
9d60e6e4 4761 zslFreeNode(x);
4762 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
4763 zsl->level--;
4764 zsl->length--;
4765 return 1;
4766 } else {
4767 return 0; /* not found */
e197b441 4768 }
4769 return 0; /* not found */
fd8ccf44 4770}
4771
1807985b 4772/* Delete all the elements with score between min and max from the skiplist.
4773 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
4774 * Note that this function takes the reference to the hash table view of the
4775 * sorted set, in order to remove the elements from the hash table too. */
4776static unsigned long zslDeleteRange(zskiplist *zsl, double min, double max, dict *dict) {
4777 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
4778 unsigned long removed = 0;
4779 int i;
4780
4781 x = zsl->header;
4782 for (i = zsl->level-1; i >= 0; i--) {
4783 while (x->forward[i] && x->forward[i]->score < min)
4784 x = x->forward[i];
4785 update[i] = x;
4786 }
4787 /* We may have multiple elements with the same score, what we need
4788 * is to find the element with both the right score and object. */
4789 x = x->forward[0];
4790 while (x && x->score <= max) {
4791 zskiplistNode *next;
4792
4793 for (i = 0; i < zsl->level; i++) {
4794 if (update[i]->forward[i] != x) break;
4795 update[i]->forward[i] = x->forward[i];
4796 }
4797 if (x->forward[0]) {
4798 x->forward[0]->backward = (x->backward == zsl->header) ?
4799 NULL : x->backward;
4800 } else {
4801 zsl->tail = x->backward;
4802 }
4803 next = x->forward[0];
4804 dictDelete(dict,x->obj);
4805 zslFreeNode(x);
4806 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
4807 zsl->level--;
4808 zsl->length--;
4809 removed++;
4810 x = next;
4811 }
4812 return removed; /* not found */
4813}
4814
50c55df5 4815/* Find the first node having a score equal or greater than the specified one.
4816 * Returns NULL if there is no match. */
4817static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
4818 zskiplistNode *x;
4819 int i;
4820
4821 x = zsl->header;
4822 for (i = zsl->level-1; i >= 0; i--) {
4823 while (x->forward[i] && x->forward[i]->score < score)
4824 x = x->forward[i];
4825 }
4826 /* We may have multiple elements with the same score, what we need
4827 * is to find the element with both the right score and object. */
4828 return x->forward[0];
4829}
4830
fd8ccf44 4831/* The actual Z-commands implementations */
4832
7db723ad 4833/* This generic command implements both ZADD and ZINCRBY.
e2665397 4834 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 4835 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 4836static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 4837 robj *zsetobj;
4838 zset *zs;
4839 double *score;
4840
e2665397 4841 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 4842 if (zsetobj == NULL) {
4843 zsetobj = createZsetObject();
e2665397 4844 dictAdd(c->db->dict,key,zsetobj);
4845 incrRefCount(key);
fd8ccf44 4846 } else {
4847 if (zsetobj->type != REDIS_ZSET) {
4848 addReply(c,shared.wrongtypeerr);
4849 return;
4850 }
4851 }
fd8ccf44 4852 zs = zsetobj->ptr;
e2665397 4853
7db723ad 4854 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 4855 * needs to handle the two different conditions. It's all about setting
4856 * '*score', that is, the new score to set, to the right value. */
4857 score = zmalloc(sizeof(double));
4858 if (doincrement) {
4859 dictEntry *de;
4860
4861 /* Read the old score. If the element was not present starts from 0 */
4862 de = dictFind(zs->dict,ele);
4863 if (de) {
4864 double *oldscore = dictGetEntryVal(de);
4865 *score = *oldscore + scoreval;
4866 } else {
4867 *score = scoreval;
4868 }
4869 } else {
4870 *score = scoreval;
4871 }
4872
4873 /* What follows is a simple remove and re-insert operation that is common
7db723ad 4874 * to both ZADD and ZINCRBY... */
e2665397 4875 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 4876 /* case 1: New element */
e2665397 4877 incrRefCount(ele); /* added to hash */
4878 zslInsert(zs->zsl,*score,ele);
4879 incrRefCount(ele); /* added to skiplist */
fd8ccf44 4880 server.dirty++;
e2665397 4881 if (doincrement)
e2665397 4882 addReplyDouble(c,*score);
91d71bfc 4883 else
4884 addReply(c,shared.cone);
fd8ccf44 4885 } else {
4886 dictEntry *de;
4887 double *oldscore;
4888
4889 /* case 2: Score update operation */
e2665397 4890 de = dictFind(zs->dict,ele);
dfc5e96c 4891 redisAssert(de != NULL);
fd8ccf44 4892 oldscore = dictGetEntryVal(de);
4893 if (*score != *oldscore) {
4894 int deleted;
4895
e2665397 4896 /* Remove and insert the element in the skip list with new score */
4897 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 4898 redisAssert(deleted != 0);
e2665397 4899 zslInsert(zs->zsl,*score,ele);
4900 incrRefCount(ele);
4901 /* Update the score in the hash table */
4902 dictReplace(zs->dict,ele,score);
fd8ccf44 4903 server.dirty++;
2161a965 4904 } else {
4905 zfree(score);
fd8ccf44 4906 }
e2665397 4907 if (doincrement)
4908 addReplyDouble(c,*score);
4909 else
4910 addReply(c,shared.czero);
fd8ccf44 4911 }
4912}
4913
e2665397 4914static void zaddCommand(redisClient *c) {
4915 double scoreval;
4916
4917 scoreval = strtod(c->argv[2]->ptr,NULL);
4918 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
4919}
4920
7db723ad 4921static void zincrbyCommand(redisClient *c) {
e2665397 4922 double scoreval;
4923
4924 scoreval = strtod(c->argv[2]->ptr,NULL);
4925 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
4926}
4927
1b7106e7 4928static void zremCommand(redisClient *c) {
4929 robj *zsetobj;
4930 zset *zs;
4931
4932 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
4933 if (zsetobj == NULL) {
4934 addReply(c,shared.czero);
4935 } else {
4936 dictEntry *de;
4937 double *oldscore;
4938 int deleted;
4939
4940 if (zsetobj->type != REDIS_ZSET) {
4941 addReply(c,shared.wrongtypeerr);
4942 return;
4943 }
4944 zs = zsetobj->ptr;
4945 de = dictFind(zs->dict,c->argv[2]);
4946 if (de == NULL) {
4947 addReply(c,shared.czero);
4948 return;
4949 }
4950 /* Delete from the skiplist */
4951 oldscore = dictGetEntryVal(de);
4952 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
dfc5e96c 4953 redisAssert(deleted != 0);
1b7106e7 4954
4955 /* Delete from the hash table */
4956 dictDelete(zs->dict,c->argv[2]);
4957 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
4958 server.dirty++;
4959 addReply(c,shared.cone);
4960 }
4961}
4962
1807985b 4963static void zremrangebyscoreCommand(redisClient *c) {
4964 double min = strtod(c->argv[2]->ptr,NULL);
4965 double max = strtod(c->argv[3]->ptr,NULL);
4966 robj *zsetobj;
4967 zset *zs;
4968
4969 zsetobj = lookupKeyWrite(c->db,c->argv[1]);
4970 if (zsetobj == NULL) {
4971 addReply(c,shared.czero);
4972 } else {
4973 long deleted;
4974
4975 if (zsetobj->type != REDIS_ZSET) {
4976 addReply(c,shared.wrongtypeerr);
4977 return;
4978 }
4979 zs = zsetobj->ptr;
4980 deleted = zslDeleteRange(zs->zsl,min,max,zs->dict);
4981 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
4982 server.dirty += deleted;
4983 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",deleted));
4984 }
4985}
4986
e3870fab 4987static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 4988 robj *o;
4989 int start = atoi(c->argv[2]->ptr);
4990 int end = atoi(c->argv[3]->ptr);
752da584 4991 int withscores = 0;
4992
4993 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
4994 withscores = 1;
4995 } else if (c->argc >= 5) {
4996 addReply(c,shared.syntaxerr);
4997 return;
4998 }
cc812361 4999
5000 o = lookupKeyRead(c->db,c->argv[1]);
5001 if (o == NULL) {
5002 addReply(c,shared.nullmultibulk);
5003 } else {
5004 if (o->type != REDIS_ZSET) {
5005 addReply(c,shared.wrongtypeerr);
5006 } else {
5007 zset *zsetobj = o->ptr;
5008 zskiplist *zsl = zsetobj->zsl;
5009 zskiplistNode *ln;
5010
5011 int llen = zsl->length;
5012 int rangelen, j;
5013 robj *ele;
5014
5015 /* convert negative indexes */
5016 if (start < 0) start = llen+start;
5017 if (end < 0) end = llen+end;
5018 if (start < 0) start = 0;
5019 if (end < 0) end = 0;
5020
5021 /* indexes sanity checks */
5022 if (start > end || start >= llen) {
5023 /* Out of range start or start > end result in empty list */
5024 addReply(c,shared.emptymultibulk);
5025 return;
5026 }
5027 if (end >= llen) end = llen-1;
5028 rangelen = (end-start)+1;
5029
5030 /* Return the result in form of a multi-bulk reply */
e3870fab 5031 if (reverse) {
5032 ln = zsl->tail;
5033 while (start--)
5034 ln = ln->backward;
5035 } else {
5036 ln = zsl->header->forward[0];
5037 while (start--)
5038 ln = ln->forward[0];
5039 }
cc812361 5040
752da584 5041 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5042 withscores ? (rangelen*2) : rangelen));
cc812361 5043 for (j = 0; j < rangelen; j++) {
0aad7a19 5044 ele = ln->obj;
cc812361 5045 addReplyBulkLen(c,ele);
5046 addReply(c,ele);
5047 addReply(c,shared.crlf);
752da584 5048 if (withscores)
5049 addReplyDouble(c,ln->score);
e3870fab 5050 ln = reverse ? ln->backward : ln->forward[0];
cc812361 5051 }
5052 }
5053 }
5054}
5055
e3870fab 5056static void zrangeCommand(redisClient *c) {
5057 zrangeGenericCommand(c,0);
5058}
5059
5060static void zrevrangeCommand(redisClient *c) {
5061 zrangeGenericCommand(c,1);
5062}
5063
50c55df5 5064static void zrangebyscoreCommand(redisClient *c) {
5065 robj *o;
5066 double min = strtod(c->argv[2]->ptr,NULL);
5067 double max = strtod(c->argv[3]->ptr,NULL);
80181f78 5068 int offset = 0, limit = -1;
5069
5070 if (c->argc != 4 && c->argc != 7) {
454d4e43 5071 addReplySds(c,
5072 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 5073 return;
5074 } else if (c->argc == 7 && strcasecmp(c->argv[4]->ptr,"limit")) {
5075 addReply(c,shared.syntaxerr);
5076 return;
5077 } else if (c->argc == 7) {
5078 offset = atoi(c->argv[5]->ptr);
5079 limit = atoi(c->argv[6]->ptr);
0b13687c 5080 if (offset < 0) offset = 0;
80181f78 5081 }
50c55df5 5082
5083 o = lookupKeyRead(c->db,c->argv[1]);
5084 if (o == NULL) {
5085 addReply(c,shared.nullmultibulk);
5086 } else {
5087 if (o->type != REDIS_ZSET) {
5088 addReply(c,shared.wrongtypeerr);
5089 } else {
5090 zset *zsetobj = o->ptr;
5091 zskiplist *zsl = zsetobj->zsl;
5092 zskiplistNode *ln;
5093 robj *ele, *lenobj;
5094 unsigned int rangelen = 0;
5095
5096 /* Get the first node with the score >= min */
5097 ln = zslFirstWithScore(zsl,min);
5098 if (ln == NULL) {
5099 /* No element matching the speciifed interval */
5100 addReply(c,shared.emptymultibulk);
5101 return;
5102 }
5103
5104 /* We don't know in advance how many matching elements there
5105 * are in the list, so we push this object that will represent
5106 * the multi-bulk length in the output buffer, and will "fix"
5107 * it later */
5108 lenobj = createObject(REDIS_STRING,NULL);
5109 addReply(c,lenobj);
c74e7c77 5110 decrRefCount(lenobj);
50c55df5 5111
dbbc7285 5112 while(ln && ln->score <= max) {
80181f78 5113 if (offset) {
5114 offset--;
5115 ln = ln->forward[0];
5116 continue;
5117 }
5118 if (limit == 0) break;
50c55df5 5119 ele = ln->obj;
5120 addReplyBulkLen(c,ele);
5121 addReply(c,ele);
5122 addReply(c,shared.crlf);
5123 ln = ln->forward[0];
5124 rangelen++;
80181f78 5125 if (limit > 0) limit--;
50c55df5 5126 }
5127 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",rangelen);
5128 }
5129 }
5130}
5131
3c41331e 5132static void zcardCommand(redisClient *c) {
e197b441 5133 robj *o;
5134 zset *zs;
5135
5136 o = lookupKeyRead(c->db,c->argv[1]);
5137 if (o == NULL) {
5138 addReply(c,shared.czero);
5139 return;
5140 } else {
5141 if (o->type != REDIS_ZSET) {
5142 addReply(c,shared.wrongtypeerr);
5143 } else {
5144 zs = o->ptr;
682ac724 5145 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",zs->zsl->length));
e197b441 5146 }
5147 }
5148}
5149
6e333bbe 5150static void zscoreCommand(redisClient *c) {
5151 robj *o;
5152 zset *zs;
5153
5154 o = lookupKeyRead(c->db,c->argv[1]);
5155 if (o == NULL) {
96d8b4ee 5156 addReply(c,shared.nullbulk);
6e333bbe 5157 return;
5158 } else {
5159 if (o->type != REDIS_ZSET) {
5160 addReply(c,shared.wrongtypeerr);
5161 } else {
5162 dictEntry *de;
5163
5164 zs = o->ptr;
5165 de = dictFind(zs->dict,c->argv[2]);
5166 if (!de) {
5167 addReply(c,shared.nullbulk);
5168 } else {
6e333bbe 5169 double *score = dictGetEntryVal(de);
5170
e2665397 5171 addReplyDouble(c,*score);
6e333bbe 5172 }
5173 }
5174 }
5175}
5176
6b47e12e 5177/* ========================= Non type-specific commands ==================== */
5178
ed9b544e 5179static void flushdbCommand(redisClient *c) {
ca37e9cd 5180 server.dirty += dictSize(c->db->dict);
3305306f 5181 dictEmpty(c->db->dict);
5182 dictEmpty(c->db->expires);
ed9b544e 5183 addReply(c,shared.ok);
ed9b544e 5184}
5185
5186static void flushallCommand(redisClient *c) {
ca37e9cd 5187 server.dirty += emptyDb();
ed9b544e 5188 addReply(c,shared.ok);
f78fd11b 5189 rdbSave(server.dbfilename);
ca37e9cd 5190 server.dirty++;
ed9b544e 5191}
5192
56906eef 5193static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 5194 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 5195 so->type = type;
5196 so->pattern = pattern;
5197 return so;
5198}
5199
5200/* Return the value associated to the key with a name obtained
5201 * substituting the first occurence of '*' in 'pattern' with 'subst' */
56906eef 5202static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
ed9b544e 5203 char *p;
5204 sds spat, ssub;
5205 robj keyobj;
5206 int prefixlen, sublen, postfixlen;
ed9b544e 5207 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
5208 struct {
f1017b3f 5209 long len;
5210 long free;
ed9b544e 5211 char buf[REDIS_SORTKEY_MAX+1];
5212 } keyname;
5213
28173a49 5214 /* If the pattern is "#" return the substitution object itself in order
5215 * to implement the "SORT ... GET #" feature. */
5216 spat = pattern->ptr;
5217 if (spat[0] == '#' && spat[1] == '\0') {
5218 return subst;
5219 }
5220
5221 /* The substitution object may be specially encoded. If so we create
9d65a1bb 5222 * a decoded object on the fly. Otherwise getDecodedObject will just
5223 * increment the ref count, that we'll decrement later. */
5224 subst = getDecodedObject(subst);
942a3961 5225
ed9b544e 5226 ssub = subst->ptr;
5227 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
5228 p = strchr(spat,'*');
ed5a857a 5229 if (!p) {
5230 decrRefCount(subst);
5231 return NULL;
5232 }
ed9b544e 5233
5234 prefixlen = p-spat;
5235 sublen = sdslen(ssub);
5236 postfixlen = sdslen(spat)-(prefixlen+1);
5237 memcpy(keyname.buf,spat,prefixlen);
5238 memcpy(keyname.buf+prefixlen,ssub,sublen);
5239 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
5240 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
5241 keyname.len = prefixlen+sublen+postfixlen;
5242
dfc5e96c 5243 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
942a3961 5244 decrRefCount(subst);
5245
a4d1ba9a 5246 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
3305306f 5247 return lookupKeyRead(db,&keyobj);
ed9b544e 5248}
5249
5250/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
5251 * the additional parameter is not standard but a BSD-specific we have to
5252 * pass sorting parameters via the global 'server' structure */
5253static int sortCompare(const void *s1, const void *s2) {
5254 const redisSortObject *so1 = s1, *so2 = s2;
5255 int cmp;
5256
5257 if (!server.sort_alpha) {
5258 /* Numeric sorting. Here it's trivial as we precomputed scores */
5259 if (so1->u.score > so2->u.score) {
5260 cmp = 1;
5261 } else if (so1->u.score < so2->u.score) {
5262 cmp = -1;
5263 } else {
5264 cmp = 0;
5265 }
5266 } else {
5267 /* Alphanumeric sorting */
5268 if (server.sort_bypattern) {
5269 if (!so1->u.cmpobj || !so2->u.cmpobj) {
5270 /* At least one compare object is NULL */
5271 if (so1->u.cmpobj == so2->u.cmpobj)
5272 cmp = 0;
5273 else if (so1->u.cmpobj == NULL)
5274 cmp = -1;
5275 else
5276 cmp = 1;
5277 } else {
5278 /* We have both the objects, use strcoll */
5279 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
5280 }
5281 } else {
5282 /* Compare elements directly */
9d65a1bb 5283 robj *dec1, *dec2;
5284
5285 dec1 = getDecodedObject(so1->obj);
5286 dec2 = getDecodedObject(so2->obj);
5287 cmp = strcoll(dec1->ptr,dec2->ptr);
5288 decrRefCount(dec1);
5289 decrRefCount(dec2);
ed9b544e 5290 }
5291 }
5292 return server.sort_desc ? -cmp : cmp;
5293}
5294
5295/* The SORT command is the most complex command in Redis. Warning: this code
5296 * is optimized for speed and a bit less for readability */
5297static void sortCommand(redisClient *c) {
ed9b544e 5298 list *operations;
5299 int outputlen = 0;
5300 int desc = 0, alpha = 0;
5301 int limit_start = 0, limit_count = -1, start, end;
5302 int j, dontsort = 0, vectorlen;
5303 int getop = 0; /* GET operation counter */
443c6409 5304 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 5305 redisSortObject *vector; /* Resulting vector to sort */
5306
5307 /* Lookup the key to sort. It must be of the right types */
3305306f 5308 sortval = lookupKeyRead(c->db,c->argv[1]);
5309 if (sortval == NULL) {
d922ae65 5310 addReply(c,shared.nullmultibulk);
ed9b544e 5311 return;
5312 }
a5eb649b 5313 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
5314 sortval->type != REDIS_ZSET)
5315 {
c937aa89 5316 addReply(c,shared.wrongtypeerr);
ed9b544e 5317 return;
5318 }
5319
5320 /* Create a list of operations to perform for every sorted element.
5321 * Operations can be GET/DEL/INCR/DECR */
5322 operations = listCreate();
092dac2a 5323 listSetFreeMethod(operations,zfree);
ed9b544e 5324 j = 2;
5325
5326 /* Now we need to protect sortval incrementing its count, in the future
5327 * SORT may have options able to overwrite/delete keys during the sorting
5328 * and the sorted key itself may get destroied */
5329 incrRefCount(sortval);
5330
5331 /* The SORT command has an SQL-alike syntax, parse it */
5332 while(j < c->argc) {
5333 int leftargs = c->argc-j-1;
5334 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
5335 desc = 0;
5336 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
5337 desc = 1;
5338 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
5339 alpha = 1;
5340 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
5341 limit_start = atoi(c->argv[j+1]->ptr);
5342 limit_count = atoi(c->argv[j+2]->ptr);
5343 j+=2;
443c6409 5344 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
5345 storekey = c->argv[j+1];
5346 j++;
ed9b544e 5347 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
5348 sortby = c->argv[j+1];
5349 /* If the BY pattern does not contain '*', i.e. it is constant,
5350 * we don't need to sort nor to lookup the weight keys. */
5351 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
5352 j++;
5353 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
5354 listAddNodeTail(operations,createSortOperation(
5355 REDIS_SORT_GET,c->argv[j+1]));
5356 getop++;
5357 j++;
ed9b544e 5358 } else {
5359 decrRefCount(sortval);
5360 listRelease(operations);
c937aa89 5361 addReply(c,shared.syntaxerr);
ed9b544e 5362 return;
5363 }
5364 j++;
5365 }
5366
5367 /* Load the sorting vector with all the objects to sort */
a5eb649b 5368 switch(sortval->type) {
5369 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
5370 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
5371 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
dfc5e96c 5372 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
a5eb649b 5373 }
ed9b544e 5374 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 5375 j = 0;
a5eb649b 5376
ed9b544e 5377 if (sortval->type == REDIS_LIST) {
5378 list *list = sortval->ptr;
6208b3a7 5379 listNode *ln;
c7df85a4 5380 listIter li;
6208b3a7 5381
c7df85a4 5382 listRewind(list,&li);
5383 while((ln = listNext(&li))) {
ed9b544e 5384 robj *ele = ln->value;
5385 vector[j].obj = ele;
5386 vector[j].u.score = 0;
5387 vector[j].u.cmpobj = NULL;
ed9b544e 5388 j++;
5389 }
5390 } else {
a5eb649b 5391 dict *set;
ed9b544e 5392 dictIterator *di;
5393 dictEntry *setele;
5394
a5eb649b 5395 if (sortval->type == REDIS_SET) {
5396 set = sortval->ptr;
5397 } else {
5398 zset *zs = sortval->ptr;
5399 set = zs->dict;
5400 }
5401
ed9b544e 5402 di = dictGetIterator(set);
ed9b544e 5403 while((setele = dictNext(di)) != NULL) {
5404 vector[j].obj = dictGetEntryKey(setele);
5405 vector[j].u.score = 0;
5406 vector[j].u.cmpobj = NULL;
5407 j++;
5408 }
5409 dictReleaseIterator(di);
5410 }
dfc5e96c 5411 redisAssert(j == vectorlen);
ed9b544e 5412
5413 /* Now it's time to load the right scores in the sorting vector */
5414 if (dontsort == 0) {
5415 for (j = 0; j < vectorlen; j++) {
5416 if (sortby) {
5417 robj *byval;
5418
3305306f 5419 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
ed9b544e 5420 if (!byval || byval->type != REDIS_STRING) continue;
5421 if (alpha) {
9d65a1bb 5422 vector[j].u.cmpobj = getDecodedObject(byval);
ed9b544e 5423 } else {
942a3961 5424 if (byval->encoding == REDIS_ENCODING_RAW) {
5425 vector[j].u.score = strtod(byval->ptr,NULL);
5426 } else {
9d65a1bb 5427 /* Don't need to decode the object if it's
5428 * integer-encoded (the only encoding supported) so
5429 * far. We can just cast it */
f1017b3f 5430 if (byval->encoding == REDIS_ENCODING_INT) {
942a3961 5431 vector[j].u.score = (long)byval->ptr;
f1017b3f 5432 } else
dfc5e96c 5433 redisAssert(1 != 1);
942a3961 5434 }
ed9b544e 5435 }
5436 } else {
942a3961 5437 if (!alpha) {
5438 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
5439 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
5440 else {
5441 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
5442 vector[j].u.score = (long) vector[j].obj->ptr;
5443 else
dfc5e96c 5444 redisAssert(1 != 1);
942a3961 5445 }
5446 }
ed9b544e 5447 }
5448 }
5449 }
5450
5451 /* We are ready to sort the vector... perform a bit of sanity check
5452 * on the LIMIT option too. We'll use a partial version of quicksort. */
5453 start = (limit_start < 0) ? 0 : limit_start;
5454 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
5455 if (start >= vectorlen) {
5456 start = vectorlen-1;
5457 end = vectorlen-2;
5458 }
5459 if (end >= vectorlen) end = vectorlen-1;
5460
5461 if (dontsort == 0) {
5462 server.sort_desc = desc;
5463 server.sort_alpha = alpha;
5464 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 5465 if (sortby && (start != 0 || end != vectorlen-1))
5466 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
5467 else
5468 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 5469 }
5470
5471 /* Send command output to the output buffer, performing the specified
5472 * GET/DEL/INCR/DECR operations if any. */
5473 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 5474 if (storekey == NULL) {
5475 /* STORE option not specified, sent the sorting result to client */
5476 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
5477 for (j = start; j <= end; j++) {
5478 listNode *ln;
c7df85a4 5479 listIter li;
5480
443c6409 5481 if (!getop) {
5482 addReplyBulkLen(c,vector[j].obj);
5483 addReply(c,vector[j].obj);
5484 addReply(c,shared.crlf);
5485 }
c7df85a4 5486 listRewind(operations,&li);
5487 while((ln = listNext(&li))) {
443c6409 5488 redisSortOperation *sop = ln->value;
5489 robj *val = lookupKeyByPattern(c->db,sop->pattern,
5490 vector[j].obj);
5491
5492 if (sop->type == REDIS_SORT_GET) {
5493 if (!val || val->type != REDIS_STRING) {
5494 addReply(c,shared.nullbulk);
5495 } else {
5496 addReplyBulkLen(c,val);
5497 addReply(c,val);
5498 addReply(c,shared.crlf);
5499 }
5500 } else {
dfc5e96c 5501 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 5502 }
5503 }
ed9b544e 5504 }
443c6409 5505 } else {
5506 robj *listObject = createListObject();
5507 list *listPtr = (list*) listObject->ptr;
5508
5509 /* STORE option specified, set the sorting result as a List object */
5510 for (j = start; j <= end; j++) {
5511 listNode *ln;
c7df85a4 5512 listIter li;
5513
443c6409 5514 if (!getop) {
5515 listAddNodeTail(listPtr,vector[j].obj);
5516 incrRefCount(vector[j].obj);
5517 }
c7df85a4 5518 listRewind(operations,&li);
5519 while((ln = listNext(&li))) {
443c6409 5520 redisSortOperation *sop = ln->value;
5521 robj *val = lookupKeyByPattern(c->db,sop->pattern,
5522 vector[j].obj);
5523
5524 if (sop->type == REDIS_SORT_GET) {
5525 if (!val || val->type != REDIS_STRING) {
5526 listAddNodeTail(listPtr,createStringObject("",0));
5527 } else {
5528 listAddNodeTail(listPtr,val);
5529 incrRefCount(val);
5530 }
ed9b544e 5531 } else {
dfc5e96c 5532 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 5533 }
ed9b544e 5534 }
ed9b544e 5535 }
121796f7 5536 if (dictReplace(c->db->dict,storekey,listObject)) {
5537 incrRefCount(storekey);
5538 }
443c6409 5539 /* Note: we add 1 because the DB is dirty anyway since even if the
5540 * SORT result is empty a new key is set and maybe the old content
5541 * replaced. */
5542 server.dirty += 1+outputlen;
5543 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 5544 }
5545
5546 /* Cleanup */
5547 decrRefCount(sortval);
5548 listRelease(operations);
5549 for (j = 0; j < vectorlen; j++) {
5550 if (sortby && alpha && vector[j].u.cmpobj)
5551 decrRefCount(vector[j].u.cmpobj);
5552 }
5553 zfree(vector);
5554}
5555
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer: it is written with sprintf() and no length
 * is checked, so the caller must provide enough room (the longest possible
 * output is well under 16 bytes, terminator included).
 * 'n' is the amount of bytes to describe. Units are binary (1K = 1024B) and
 * fractional values are printed with two decimals. Amounts of 1024G and
 * above are expressed in terabytes, so 's' is always filled in. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* Everything bigger is reported in terabytes instead of leaving
         * the output buffer untouched. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
5576
1c85b79f 5577/* Create the string returned by the INFO command. This is decoupled
5578 * by the INFO command itself as we need to report the same information
5579 * on memory corruption problems. */
5580static sds genRedisInfoString(void) {
ed9b544e 5581 sds info;
5582 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 5583 int j;
ec6c7a1d 5584 char hmem[64];
5585
5586 bytesToHuman(hmem,server.usedmemory);
ed9b544e 5587 info = sdscatprintf(sdsempty(),
5588 "redis_version:%s\r\n"
f1017b3f 5589 "arch_bits:%s\r\n"
7a932b74 5590 "multiplexing_api:%s\r\n"
0d7170a4 5591 "process_id:%ld\r\n"
682ac724 5592 "uptime_in_seconds:%ld\r\n"
5593 "uptime_in_days:%ld\r\n"
ed9b544e 5594 "connected_clients:%d\r\n"
5595 "connected_slaves:%d\r\n"
f86a74e9 5596 "blocked_clients:%d\r\n"
5fba9f71 5597 "used_memory:%zu\r\n"
ec6c7a1d 5598 "used_memory_human:%s\r\n"
ed9b544e 5599 "changes_since_last_save:%lld\r\n"
be2bb6b0 5600 "bgsave_in_progress:%d\r\n"
682ac724 5601 "last_save_time:%ld\r\n"
b3fad521 5602 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 5603 "total_connections_received:%lld\r\n"
5604 "total_commands_processed:%lld\r\n"
7d98e08c 5605 "vm_enabled:%d\r\n"
a0f643ea 5606 "role:%s\r\n"
ed9b544e 5607 ,REDIS_VERSION,
f1017b3f 5608 (sizeof(long) == 8) ? "64" : "32",
7a932b74 5609 aeGetApiName(),
0d7170a4 5610 (long) getpid(),
a0f643ea 5611 uptime,
5612 uptime/(3600*24),
ed9b544e 5613 listLength(server.clients)-listLength(server.slaves),
5614 listLength(server.slaves),
f86a74e9 5615 server.blockedclients,
ed9b544e 5616 server.usedmemory,
ec6c7a1d 5617 hmem,
ed9b544e 5618 server.dirty,
9d65a1bb 5619 server.bgsavechildpid != -1,
ed9b544e 5620 server.lastsave,
b3fad521 5621 server.bgrewritechildpid != -1,
ed9b544e 5622 server.stat_numconnections,
5623 server.stat_numcommands,
7d98e08c 5624 server.vm_enabled != 0,
a0f643ea 5625 server.masterhost == NULL ? "master" : "slave"
ed9b544e 5626 );
a0f643ea 5627 if (server.masterhost) {
5628 info = sdscatprintf(info,
5629 "master_host:%s\r\n"
5630 "master_port:%d\r\n"
5631 "master_link_status:%s\r\n"
5632 "master_last_io_seconds_ago:%d\r\n"
5633 ,server.masterhost,
5634 server.masterport,
5635 (server.replstate == REDIS_REPL_CONNECTED) ?
5636 "up" : "down",
f72b934d 5637 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 5638 );
5639 }
7d98e08c 5640 if (server.vm_enabled) {
1064ef87 5641 lockThreadedIO();
7d98e08c 5642 info = sdscatprintf(info,
5643 "vm_conf_max_memory:%llu\r\n"
5644 "vm_conf_page_size:%llu\r\n"
5645 "vm_conf_pages:%llu\r\n"
5646 "vm_stats_used_pages:%llu\r\n"
5647 "vm_stats_swapped_objects:%llu\r\n"
5648 "vm_stats_swappin_count:%llu\r\n"
5649 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 5650 "vm_stats_io_newjobs_len:%lu\r\n"
5651 "vm_stats_io_processing_len:%lu\r\n"
5652 "vm_stats_io_processed_len:%lu\r\n"
5653 "vm_stats_io_waiting_clients:%lu\r\n"
25fd2cb2 5654 "vm_stats_io_active_threads:%lu\r\n"
7d98e08c 5655 ,(unsigned long long) server.vm_max_memory,
5656 (unsigned long long) server.vm_page_size,
5657 (unsigned long long) server.vm_pages,
5658 (unsigned long long) server.vm_stats_used_pages,
5659 (unsigned long long) server.vm_stats_swapped_objects,
5660 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 5661 (unsigned long long) server.vm_stats_swapouts,
5662 (unsigned long) listLength(server.io_newjobs),
5663 (unsigned long) listLength(server.io_processing),
5664 (unsigned long) listLength(server.io_processed),
25fd2cb2 5665 (unsigned long) listLength(server.io_clients),
5666 (unsigned long) server.io_active_threads
7d98e08c 5667 );
1064ef87 5668 unlockThreadedIO();
7d98e08c 5669 }
c3cb078d 5670 for (j = 0; j < server.dbnum; j++) {
5671 long long keys, vkeys;
5672
5673 keys = dictSize(server.db[j].dict);
5674 vkeys = dictSize(server.db[j].expires);
5675 if (keys || vkeys) {
9d65a1bb 5676 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 5677 j, keys, vkeys);
5678 }
5679 }
1c85b79f 5680 return info;
5681}
5682
5683static void infoCommand(redisClient *c) {
5684 sds info = genRedisInfoString();
83c6a618 5685 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
5686 (unsigned long)sdslen(info)));
ed9b544e 5687 addReplySds(c,info);
70003d28 5688 addReply(c,shared.crlf);
ed9b544e 5689}
5690
3305306f 5691static void monitorCommand(redisClient *c) {
5692 /* ignore MONITOR if aleady slave or in monitor mode */
5693 if (c->flags & REDIS_SLAVE) return;
5694
5695 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
5696 c->slaveseldb = 0;
6b47e12e 5697 listAddNodeTail(server.monitors,c);
3305306f 5698 addReply(c,shared.ok);
5699}
5700
5701/* ================================= Expire ================================= */
5702static int removeExpire(redisDb *db, robj *key) {
5703 if (dictDelete(db->expires,key) == DICT_OK) {
5704 return 1;
5705 } else {
5706 return 0;
5707 }
5708}
5709
5710static int setExpire(redisDb *db, robj *key, time_t when) {
5711 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
5712 return 0;
5713 } else {
5714 incrRefCount(key);
5715 return 1;
5716 }
5717}
5718
bb32ede5 5719/* Return the expire time of the specified key, or -1 if no expire
5720 * is associated with this key (i.e. the key is non volatile) */
5721static time_t getExpire(redisDb *db, robj *key) {
5722 dictEntry *de;
5723
5724 /* No expire? return ASAP */
5725 if (dictSize(db->expires) == 0 ||
5726 (de = dictFind(db->expires,key)) == NULL) return -1;
5727
5728 return (time_t) dictGetEntryVal(de);
5729}
5730
3305306f 5731static int expireIfNeeded(redisDb *db, robj *key) {
5732 time_t when;
5733 dictEntry *de;
5734
5735 /* No expire? return ASAP */
5736 if (dictSize(db->expires) == 0 ||
5737 (de = dictFind(db->expires,key)) == NULL) return 0;
5738
5739 /* Lookup the expire */
5740 when = (time_t) dictGetEntryVal(de);
5741 if (time(NULL) <= when) return 0;
5742
5743 /* Delete the key */
5744 dictDelete(db->expires,key);
5745 return dictDelete(db->dict,key) == DICT_OK;
5746}
5747
5748static int deleteIfVolatile(redisDb *db, robj *key) {
5749 dictEntry *de;
5750
5751 /* No expire? return ASAP */
5752 if (dictSize(db->expires) == 0 ||
5753 (de = dictFind(db->expires,key)) == NULL) return 0;
5754
5755 /* Delete the key */
0c66a471 5756 server.dirty++;
3305306f 5757 dictDelete(db->expires,key);
5758 return dictDelete(db->dict,key) == DICT_OK;
5759}
5760
802e8373 5761static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
3305306f 5762 dictEntry *de;
3305306f 5763
802e8373 5764 de = dictFind(c->db->dict,key);
3305306f 5765 if (de == NULL) {
5766 addReply(c,shared.czero);
5767 return;
5768 }
43e5ccdf 5769 if (seconds < 0) {
5770 if (deleteKey(c->db,key)) server.dirty++;
5771 addReply(c, shared.cone);
3305306f 5772 return;
5773 } else {
5774 time_t when = time(NULL)+seconds;
802e8373 5775 if (setExpire(c->db,key,when)) {
3305306f 5776 addReply(c,shared.cone);
77423026 5777 server.dirty++;
5778 } else {
3305306f 5779 addReply(c,shared.czero);
77423026 5780 }
3305306f 5781 return;
5782 }
5783}
5784
802e8373 5785static void expireCommand(redisClient *c) {
5786 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
5787}
5788
5789static void expireatCommand(redisClient *c) {
5790 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
5791}
5792
fd88489a 5793static void ttlCommand(redisClient *c) {
5794 time_t expire;
5795 int ttl = -1;
5796
5797 expire = getExpire(c->db,c->argv[1]);
5798 if (expire != -1) {
5799 ttl = (int) (expire-time(NULL));
5800 if (ttl < 0) ttl = -1;
5801 }
5802 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
5803}
5804
6e469882 5805/* ================================ MULTI/EXEC ============================== */
5806
5807/* Client state initialization for MULTI/EXEC */
5808static void initClientMultiState(redisClient *c) {
5809 c->mstate.commands = NULL;
5810 c->mstate.count = 0;
5811}
5812
5813/* Release all the resources associated with MULTI/EXEC state */
5814static void freeClientMultiState(redisClient *c) {
5815 int j;
5816
5817 for (j = 0; j < c->mstate.count; j++) {
5818 int i;
5819 multiCmd *mc = c->mstate.commands+j;
5820
5821 for (i = 0; i < mc->argc; i++)
5822 decrRefCount(mc->argv[i]);
5823 zfree(mc->argv);
5824 }
5825 zfree(c->mstate.commands);
5826}
5827
5828/* Add a new command into the MULTI commands queue */
5829static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
5830 multiCmd *mc;
5831 int j;
5832
5833 c->mstate.commands = zrealloc(c->mstate.commands,
5834 sizeof(multiCmd)*(c->mstate.count+1));
5835 mc = c->mstate.commands+c->mstate.count;
5836 mc->cmd = cmd;
5837 mc->argc = c->argc;
5838 mc->argv = zmalloc(sizeof(robj*)*c->argc);
5839 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
5840 for (j = 0; j < c->argc; j++)
5841 incrRefCount(mc->argv[j]);
5842 c->mstate.count++;
5843}
5844
5845static void multiCommand(redisClient *c) {
5846 c->flags |= REDIS_MULTI;
36c548f0 5847 addReply(c,shared.ok);
6e469882 5848}
5849
5850static void execCommand(redisClient *c) {
5851 int j;
5852 robj **orig_argv;
5853 int orig_argc;
5854
5855 if (!(c->flags & REDIS_MULTI)) {
5856 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
5857 return;
5858 }
5859
5860 orig_argv = c->argv;
5861 orig_argc = c->argc;
5862 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
5863 for (j = 0; j < c->mstate.count; j++) {
5864 c->argc = c->mstate.commands[j].argc;
5865 c->argv = c->mstate.commands[j].argv;
5866 call(c,c->mstate.commands[j].cmd);
5867 }
5868 c->argv = orig_argv;
5869 c->argc = orig_argc;
5870 freeClientMultiState(c);
5871 initClientMultiState(c);
5872 c->flags &= (~REDIS_MULTI);
5873}
5874
4409877e 5875/* =========================== Blocking Operations ========================= */
5876
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocked on those keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
5905
5906/* Set a client in blocking mode for the specified key, with the specified
5907 * timeout */
b177fd30 5908static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 5909 dictEntry *de;
5910 list *l;
b177fd30 5911 int j;
4409877e 5912
b177fd30 5913 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
5914 c->blockingkeysnum = numkeys;
4409877e 5915 c->blockingto = timeout;
b177fd30 5916 for (j = 0; j < numkeys; j++) {
5917 /* Add the key in the client structure, to map clients -> keys */
5918 c->blockingkeys[j] = keys[j];
5919 incrRefCount(keys[j]);
4409877e 5920
b177fd30 5921 /* And in the other "side", to map keys -> clients */
5922 de = dictFind(c->db->blockingkeys,keys[j]);
5923 if (de == NULL) {
5924 int retval;
5925
5926 /* For every key we take a list of clients blocked for it */
5927 l = listCreate();
5928 retval = dictAdd(c->db->blockingkeys,keys[j],l);
5929 incrRefCount(keys[j]);
5930 assert(retval == DICT_OK);
5931 } else {
5932 l = dictGetEntryVal(de);
5933 }
5934 listAddNodeTail(l,c);
4409877e 5935 }
b177fd30 5936 /* Mark the client as a blocked client */
4409877e 5937 c->flags |= REDIS_BLOCKED;
5938 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
f86a74e9 5939 server.blockedclients++;
4409877e 5940}
5941
5942/* Unblock a client that's waiting in a blocking operation such as BLPOP */
5943static void unblockClient(redisClient *c) {
5944 dictEntry *de;
5945 list *l;
b177fd30 5946 int j;
4409877e 5947
b177fd30 5948 assert(c->blockingkeys != NULL);
5949 /* The client may wait for multiple keys, so unblock it for every key. */
5950 for (j = 0; j < c->blockingkeysnum; j++) {
5951 /* Remove this client from the list of clients waiting for this key. */
5952 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
5953 assert(de != NULL);
5954 l = dictGetEntryVal(de);
5955 listDelNode(l,listSearchKey(l,c));
5956 /* If the list is empty we need to remove it to avoid wasting memory */
5957 if (listLength(l) == 0)
5958 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
5959 decrRefCount(c->blockingkeys[j]);
5960 }
5961 /* Cleanup the client structure */
5962 zfree(c->blockingkeys);
5963 c->blockingkeys = NULL;
4409877e 5964 c->flags &= (~REDIS_BLOCKED);
f86a74e9 5965 server.blockedclients--;
4409877e 5966 /* Ok now we are ready to get read events from socket, note that we
5967 * can't trap errors here as it's possible that unblockClients() is
5968 * called from freeClient() itself, and the only thing we can do
5969 * if we failed to register the READABLE event is to kill the client.
5970 * Still the following function should never fail in the real world as
5971 * we are sure the file descriptor is sane, and we exit on out of mem. */
5972 aeCreateFileEvent(server.el, c->fd, AE_READABLE, readQueryFromClient, c);
5973 /* As a final step we want to process data if there is some command waiting
5974 * in the input buffer. Note that this is safe even if unblockClient()
5975 * gets called from freeClient() because freeClient() will be smart
5976 * enough to call this function *after* c->querybuf was set to NULL. */
5977 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
5978}
5979
5980/* This should be called from any function PUSHing into lists.
5981 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
5982 * 'ele' is the element pushed.
5983 *
5984 * If the function returns 0 there was no client waiting for a list push
5985 * against this key.
5986 *
5987 * If the function returns 1 there was a client waiting for a list push
5988 * against this key, the element was passed to this client thus it's not
5989 * needed to actually add it to the list and the caller should return asap. */
5990static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
5991 struct dictEntry *de;
5992 redisClient *receiver;
5993 list *l;
5994 listNode *ln;
5995
5996 de = dictFind(c->db->blockingkeys,key);
5997 if (de == NULL) return 0;
5998 l = dictGetEntryVal(de);
5999 ln = listFirst(l);
6000 assert(ln != NULL);
6001 receiver = ln->value;
4409877e 6002
b177fd30 6003 addReplySds(receiver,sdsnew("*2\r\n"));
6004 addReplyBulkLen(receiver,key);
6005 addReply(receiver,key);
6006 addReply(receiver,shared.crlf);
4409877e 6007 addReplyBulkLen(receiver,ele);
6008 addReply(receiver,ele);
6009 addReply(receiver,shared.crlf);
6010 unblockClient(receiver);
6011 return 1;
6012}
6013
6014/* Blocking RPOP/LPOP */
6015static void blockingPopGenericCommand(redisClient *c, int where) {
6016 robj *o;
6017 time_t timeout;
b177fd30 6018 int j;
4409877e 6019
b177fd30 6020 for (j = 1; j < c->argc-1; j++) {
6021 o = lookupKeyWrite(c->db,c->argv[j]);
6022 if (o != NULL) {
6023 if (o->type != REDIS_LIST) {
6024 addReply(c,shared.wrongtypeerr);
4409877e 6025 return;
b177fd30 6026 } else {
6027 list *list = o->ptr;
6028 if (listLength(list) != 0) {
6029 /* If the list contains elements fall back to the usual
6030 * non-blocking POP operation */
6031 robj *argv[2], **orig_argv;
6032 int orig_argc;
6033
6034 /* We need to alter the command arguments before to call
6035 * popGenericCommand() as the command takes a single key. */
6036 orig_argv = c->argv;
6037 orig_argc = c->argc;
6038 argv[1] = c->argv[j];
6039 c->argv = argv;
6040 c->argc = 2;
6041
6042 /* Also the return value is different, we need to output
6043 * the multi bulk reply header and the key name. The
6044 * "real" command will add the last element (the value)
6045 * for us. If this souds like an hack to you it's just
6046 * because it is... */
6047 addReplySds(c,sdsnew("*2\r\n"));
6048 addReplyBulkLen(c,argv[1]);
6049 addReply(c,argv[1]);
6050 addReply(c,shared.crlf);
6051 popGenericCommand(c,where);
6052
6053 /* Fix the client structure with the original stuff */
6054 c->argv = orig_argv;
6055 c->argc = orig_argc;
6056 return;
6057 }
4409877e 6058 }
6059 }
6060 }
6061 /* If the list is empty or the key does not exists we must block */
b177fd30 6062 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 6063 if (timeout > 0) timeout += time(NULL);
b177fd30 6064 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 6065}
6066
6067static void blpopCommand(redisClient *c) {
6068 blockingPopGenericCommand(c,REDIS_HEAD);
6069}
6070
6071static void brpopCommand(redisClient *c) {
6072 blockingPopGenericCommand(c,REDIS_TAIL);
6073}
6074
ed9b544e 6075/* =============================== Replication ============================= */
6076
a4d1ba9a 6077static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 6078 ssize_t nwritten, ret = size;
6079 time_t start = time(NULL);
6080
6081 timeout++;
6082 while(size) {
6083 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
6084 nwritten = write(fd,ptr,size);
6085 if (nwritten == -1) return -1;
6086 ptr += nwritten;
6087 size -= nwritten;
6088 }
6089 if ((time(NULL)-start) > timeout) {
6090 errno = ETIMEDOUT;
6091 return -1;
6092 }
6093 }
6094 return ret;
6095}
6096
a4d1ba9a 6097static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 6098 ssize_t nread, totread = 0;
6099 time_t start = time(NULL);
6100
6101 timeout++;
6102 while(size) {
6103 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
6104 nread = read(fd,ptr,size);
6105 if (nread == -1) return -1;
6106 ptr += nread;
6107 size -= nread;
6108 totread += nread;
6109 }
6110 if ((time(NULL)-start) > timeout) {
6111 errno = ETIMEDOUT;
6112 return -1;
6113 }
6114 }
6115 return totread;
6116}
6117
/* Read a line from 'fd' one byte at a time using syncRead() and store it,
 * null terminated, into 'ptr', a buffer of 'size' bytes.
 *
 * The trailing newline is not stored, and a carriage return right before
 * it is stripped as well. At most 'size'-1 characters are stored: if the
 * buffer fills up before a newline is seen the function returns what was
 * read so far. 'timeout' is passed unchanged to every syncRead() call, so
 * it applies to each single byte.
 *
 * Returns the number of characters stored before the newline was found
 * (the count includes a stripped carriage return), or -1 if syncRead()
 * fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    /* No room even for the terminator: nothing can be stored. */
    if (size <= 0) return 0;
    *ptr = '\0';

    size--; /* Reserve one byte for the null terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* Account for the byte just stored, so that a line longer than
         * the buffer can never be written past its end. */
        size--;
    }
    return nread;
}
6138
6139static void syncCommand(redisClient *c) {
40d224a9 6140 /* ignore SYNC if aleady slave or in monitor mode */
6141 if (c->flags & REDIS_SLAVE) return;
6142
6143 /* SYNC can't be issued when the server has pending data to send to
6144 * the client about already issued commands. We need a fresh reply
6145 * buffer registering the differences between the BGSAVE and the current
6146 * dataset, so that we can copy to other slaves if needed. */
6147 if (listLength(c->reply) != 0) {
6148 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
6149 return;
6150 }
6151
6152 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
6153 /* Here we need to check if there is a background saving operation
6154 * in progress, or if it is required to start one */
9d65a1bb 6155 if (server.bgsavechildpid != -1) {
40d224a9 6156 /* Ok a background save is in progress. Let's check if it is a good
6157 * one for replication, i.e. if there is another slave that is
6158 * registering differences since the server forked to save */
6159 redisClient *slave;
6160 listNode *ln;
c7df85a4 6161 listIter li;
40d224a9 6162
c7df85a4 6163 listRewind(server.slaves,&li);
6164 while((ln = listNext(&li))) {
40d224a9 6165 slave = ln->value;
6166 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 6167 }
6168 if (ln) {
6169 /* Perfect, the server is already registering differences for
6170 * another slave. Set the right state, and copy the buffer. */
6171 listRelease(c->reply);
6172 c->reply = listDup(slave->reply);
40d224a9 6173 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6174 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
6175 } else {
6176 /* No way, we need to wait for the next BGSAVE in order to
6177 * register differences */
6178 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
6179 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
6180 }
6181 } else {
6182 /* Ok we don't have a BGSAVE in progress, let's start one */
6183 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
6184 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
6185 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
6186 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
6187 return;
6188 }
6189 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6190 }
6208b3a7 6191 c->repldbfd = -1;
40d224a9 6192 c->flags |= REDIS_SLAVE;
6193 c->slaveseldb = 0;
6b47e12e 6194 listAddNodeTail(server.slaves,c);
40d224a9 6195 return;
6196}
6197
6208b3a7 6198static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
6199 redisClient *slave = privdata;
6200 REDIS_NOTUSED(el);
6201 REDIS_NOTUSED(mask);
6202 char buf[REDIS_IOBUF_LEN];
6203 ssize_t nwritten, buflen;
6204
6205 if (slave->repldboff == 0) {
6206 /* Write the bulk write count before to transfer the DB. In theory here
6207 * we don't know how much room there is in the output buffer of the
6208 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
6209 * operations) will never be smaller than the few bytes we need. */
6210 sds bulkcount;
6211
6212 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
6213 slave->repldbsize);
6214 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
6215 {
6216 sdsfree(bulkcount);
6217 freeClient(slave);
6218 return;
6219 }
6220 sdsfree(bulkcount);
6221 }
6222 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
6223 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
6224 if (buflen <= 0) {
6225 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
6226 (buflen == 0) ? "premature EOF" : strerror(errno));
6227 freeClient(slave);
6228 return;
6229 }
6230 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 6231 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 6232 strerror(errno));
6233 freeClient(slave);
6234 return;
6235 }
6236 slave->repldboff += nwritten;
6237 if (slave->repldboff == slave->repldbsize) {
6238 close(slave->repldbfd);
6239 slave->repldbfd = -1;
6240 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
6241 slave->replstate = REDIS_REPL_ONLINE;
6242 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 6243 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 6244 freeClient(slave);
6245 return;
6246 }
6247 addReplySds(slave,sdsempty());
6248 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
6249 }
6250}
ed9b544e 6251
a3b21203 6252/* This function is called at the end of every backgrond saving.
6253 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
6254 * otherwise REDIS_ERR is passed to the function.
6255 *
6256 * The goal of this function is to handle slaves waiting for a successful
6257 * background saving in order to perform non-blocking synchronization. */
6258static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 6259 listNode *ln;
6260 int startbgsave = 0;
c7df85a4 6261 listIter li;
ed9b544e 6262
c7df85a4 6263 listRewind(server.slaves,&li);
6264 while((ln = listNext(&li))) {
6208b3a7 6265 redisClient *slave = ln->value;
ed9b544e 6266
6208b3a7 6267 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
6268 startbgsave = 1;
6269 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
6270 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 6271 struct redis_stat buf;
6208b3a7 6272
6273 if (bgsaveerr != REDIS_OK) {
6274 freeClient(slave);
6275 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
6276 continue;
6277 }
6278 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 6279 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 6280 freeClient(slave);
6281 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
6282 continue;
6283 }
6284 slave->repldboff = 0;
6285 slave->repldbsize = buf.st_size;
6286 slave->replstate = REDIS_REPL_SEND_BULK;
6287 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 6288 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 6289 freeClient(slave);
6290 continue;
6291 }
6292 }
ed9b544e 6293 }
6208b3a7 6294 if (startbgsave) {
6295 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 6296 listIter li;
6297
6298 listRewind(server.slaves,&li);
6208b3a7 6299 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 6300 while((ln = listNext(&li))) {
6208b3a7 6301 redisClient *slave = ln->value;
ed9b544e 6302
6208b3a7 6303 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
6304 freeClient(slave);
6305 }
6306 }
6307 }
ed9b544e 6308}
6309
6310static int syncWithMaster(void) {
d0ccebcf 6311 char buf[1024], tmpfile[256], authcmd[1024];
ed9b544e 6312 int dumpsize;
6313 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
6314 int dfd;
6315
6316 if (fd == -1) {
6317 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
6318 strerror(errno));
6319 return REDIS_ERR;
6320 }
d0ccebcf 6321
6322 /* AUTH with the master if required. */
6323 if(server.masterauth) {
6324 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
6325 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
6326 close(fd);
6327 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
6328 strerror(errno));
6329 return REDIS_ERR;
6330 }
6331 /* Read the AUTH result. */
6332 if (syncReadLine(fd,buf,1024,3600) == -1) {
6333 close(fd);
6334 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
6335 strerror(errno));
6336 return REDIS_ERR;
6337 }
6338 if (buf[0] != '+') {
6339 close(fd);
6340 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
6341 return REDIS_ERR;
6342 }
6343 }
6344
ed9b544e 6345 /* Issue the SYNC command */
6346 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
6347 close(fd);
6348 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
6349 strerror(errno));
6350 return REDIS_ERR;
6351 }
6352 /* Read the bulk write count */
8c4d91fc 6353 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 6354 close(fd);
6355 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
6356 strerror(errno));
6357 return REDIS_ERR;
6358 }
4aa701c1 6359 if (buf[0] != '$') {
6360 close(fd);
6361 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
6362 return REDIS_ERR;
6363 }
c937aa89 6364 dumpsize = atoi(buf+1);
ed9b544e 6365 redisLog(REDIS_NOTICE,"Receiving %d bytes data dump from MASTER",dumpsize);
6366 /* Read the bulk write data on a temp file */
6367 snprintf(tmpfile,256,"temp-%d.%ld.rdb",(int)time(NULL),(long int)random());
6368 dfd = open(tmpfile,O_CREAT|O_WRONLY,0644);
6369 if (dfd == -1) {
6370 close(fd);
6371 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
6372 return REDIS_ERR;
6373 }
6374 while(dumpsize) {
6375 int nread, nwritten;
6376
6377 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
6378 if (nread == -1) {
6379 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
6380 strerror(errno));
6381 close(fd);
6382 close(dfd);
6383 return REDIS_ERR;
6384 }
6385 nwritten = write(dfd,buf,nread);
6386 if (nwritten == -1) {
6387 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
6388 close(fd);
6389 close(dfd);
6390 return REDIS_ERR;
6391 }
6392 dumpsize -= nread;
6393 }
6394 close(dfd);
6395 if (rename(tmpfile,server.dbfilename) == -1) {
6396 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
6397 unlink(tmpfile);
6398 close(fd);
6399 return REDIS_ERR;
6400 }
6401 emptyDb();
f78fd11b 6402 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 6403 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
6404 close(fd);
6405 return REDIS_ERR;
6406 }
6407 server.master = createClient(fd);
6408 server.master->flags |= REDIS_MASTER;
179b3952 6409 server.master->authenticated = 1;
ed9b544e 6410 server.replstate = REDIS_REPL_CONNECTED;
6411 return REDIS_OK;
6412}
6413
321b0e13 6414static void slaveofCommand(redisClient *c) {
6415 if (!strcasecmp(c->argv[1]->ptr,"no") &&
6416 !strcasecmp(c->argv[2]->ptr,"one")) {
6417 if (server.masterhost) {
6418 sdsfree(server.masterhost);
6419 server.masterhost = NULL;
6420 if (server.master) freeClient(server.master);
6421 server.replstate = REDIS_REPL_NONE;
6422 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
6423 }
6424 } else {
6425 sdsfree(server.masterhost);
6426 server.masterhost = sdsdup(c->argv[1]->ptr);
6427 server.masterport = atoi(c->argv[2]->ptr);
6428 if (server.master) freeClient(server.master);
6429 server.replstate = REDIS_REPL_CONNECT;
6430 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
6431 server.masterhost, server.masterport);
6432 }
6433 addReply(c,shared.ok);
6434}
6435
3fd78bcd 6436/* ============================ Maxmemory directive ======================== */
6437
a5819310 6438/* Try to free one object form the pre-allocated objects free list.
6439 * This is useful under low mem conditions as by default we take 1 million
6440 * free objects allocated. On success REDIS_OK is returned, otherwise
6441 * REDIS_ERR. */
6442static int tryFreeOneObjectFromFreelist(void) {
f870935d 6443 robj *o;
6444
a5819310 6445 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
6446 if (listLength(server.objfreelist)) {
6447 listNode *head = listFirst(server.objfreelist);
6448 o = listNodeValue(head);
6449 listDelNode(server.objfreelist,head);
6450 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
6451 zfree(o);
6452 return REDIS_OK;
6453 } else {
6454 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
6455 return REDIS_ERR;
6456 }
f870935d 6457}
6458
3fd78bcd 6459/* This function gets called when 'maxmemory' is set on the config file to limit
6460 * the max memory used by the server, and we are out of memory.
6461 * This function will try to, in order:
6462 *
6463 * - Free objects from the free list
6464 * - Try to remove keys with an EXPIRE set
6465 *
6466 * It is not possible to free enough memory to reach used-memory < maxmemory
6467 * the server will start refusing commands that will enlarge even more the
6468 * memory usage.
6469 */
6470static void freeMemoryIfNeeded(void) {
6471 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 6472 int j, k, freed = 0;
6473
6474 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
6475 for (j = 0; j < server.dbnum; j++) {
6476 int minttl = -1;
6477 robj *minkey = NULL;
6478 struct dictEntry *de;
6479
6480 if (dictSize(server.db[j].expires)) {
6481 freed = 1;
6482 /* From a sample of three keys drop the one nearest to
6483 * the natural expire */
6484 for (k = 0; k < 3; k++) {
6485 time_t t;
6486
6487 de = dictGetRandomKey(server.db[j].expires);
6488 t = (time_t) dictGetEntryVal(de);
6489 if (minttl == -1 || t < minttl) {
6490 minkey = dictGetEntryKey(de);
6491 minttl = t;
3fd78bcd 6492 }
3fd78bcd 6493 }
a5819310 6494 deleteKey(server.db+j,minkey);
3fd78bcd 6495 }
3fd78bcd 6496 }
a5819310 6497 if (!freed) return; /* nothing to free... */
3fd78bcd 6498 }
6499}
6500
f80dff62 6501/* ============================== Append Only file ========================== */
6502
6503static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
6504 sds buf = sdsempty();
6505 int j;
6506 ssize_t nwritten;
6507 time_t now;
6508 robj *tmpargv[3];
6509
6510 /* The DB this command was targetting is not the same as the last command
6511 * we appendend. To issue a SELECT command is needed. */
6512 if (dictid != server.appendseldb) {
6513 char seldb[64];
6514
6515 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 6516 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 6517 (unsigned long)strlen(seldb),seldb);
f80dff62 6518 server.appendseldb = dictid;
6519 }
6520
6521 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
6522 * EXPIREs into EXPIREATs calls */
6523 if (cmd->proc == expireCommand) {
6524 long when;
6525
6526 tmpargv[0] = createStringObject("EXPIREAT",8);
6527 tmpargv[1] = argv[1];
6528 incrRefCount(argv[1]);
6529 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
6530 tmpargv[2] = createObject(REDIS_STRING,
6531 sdscatprintf(sdsempty(),"%ld",when));
6532 argv = tmpargv;
6533 }
6534
6535 /* Append the actual command */
6536 buf = sdscatprintf(buf,"*%d\r\n",argc);
6537 for (j = 0; j < argc; j++) {
6538 robj *o = argv[j];
6539
9d65a1bb 6540 o = getDecodedObject(o);
83c6a618 6541 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
f80dff62 6542 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
6543 buf = sdscatlen(buf,"\r\n",2);
9d65a1bb 6544 decrRefCount(o);
f80dff62 6545 }
6546
6547 /* Free the objects from the modified argv for EXPIREAT */
6548 if (cmd->proc == expireCommand) {
6549 for (j = 0; j < 3; j++)
6550 decrRefCount(argv[j]);
6551 }
6552
6553 /* We want to perform a single write. This should be guaranteed atomic
6554 * at least if the filesystem we are writing is a real physical one.
6555 * While this will save us against the server being killed I don't think
6556 * there is much to do about the whole server stopping for power problems
6557 * or alike */
6558 nwritten = write(server.appendfd,buf,sdslen(buf));
6559 if (nwritten != (signed)sdslen(buf)) {
6560 /* Ooops, we are in troubles. The best thing to do for now is
6561 * to simply exit instead to give the illusion that everything is
6562 * working as expected. */
6563 if (nwritten == -1) {
6564 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
6565 } else {
6566 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
6567 }
6568 exit(1);
6569 }
85a83172 6570 /* If a background append only file rewriting is in progress we want to
6571 * accumulate the differences between the child DB and the current one
6572 * in a buffer, so that when the child process will do its work we
6573 * can append the differences to the new append only file. */
6574 if (server.bgrewritechildpid != -1)
6575 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
6576
6577 sdsfree(buf);
f80dff62 6578 now = time(NULL);
6579 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
6580 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
6581 now-server.lastfsync > 1))
6582 {
6583 fsync(server.appendfd); /* Let's try to get this data on the disk */
6584 server.lastfsync = now;
6585 }
6586}
6587
6588/* In Redis commands are always executed in the context of a client, so in
6589 * order to load the append only file we need to create a fake client. */
6590static struct redisClient *createFakeClient(void) {
6591 struct redisClient *c = zmalloc(sizeof(*c));
6592
6593 selectDb(c,0);
6594 c->fd = -1;
6595 c->querybuf = sdsempty();
6596 c->argc = 0;
6597 c->argv = NULL;
6598 c->flags = 0;
9387d17d 6599 /* We set the fake client as a slave waiting for the synchronization
6600 * so that Redis will not try to send replies to this client. */
6601 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 6602 c->reply = listCreate();
6603 listSetFreeMethod(c->reply,decrRefCount);
6604 listSetDupMethod(c->reply,dupClientReplyValue);
6605 return c;
6606}
6607
6608static void freeFakeClient(struct redisClient *c) {
6609 sdsfree(c->querybuf);
6610 listRelease(c->reply);
6611 zfree(c);
6612}
6613
6614/* Replay the append log file. On error REDIS_OK is returned. On non fatal
6615 * error (the append only file is zero-length) REDIS_ERR is returned. On
6616 * fatal error an error message is logged and the program exists. */
6617int loadAppendOnlyFile(char *filename) {
6618 struct redisClient *fakeClient;
6619 FILE *fp = fopen(filename,"r");
6620 struct redis_stat sb;
b492cf00 6621 unsigned long long loadedkeys = 0;
f80dff62 6622
6623 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
6624 return REDIS_ERR;
6625
6626 if (fp == NULL) {
6627 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
6628 exit(1);
6629 }
6630
6631 fakeClient = createFakeClient();
6632 while(1) {
6633 int argc, j;
6634 unsigned long len;
6635 robj **argv;
6636 char buf[128];
6637 sds argsds;
6638 struct redisCommand *cmd;
6639
6640 if (fgets(buf,sizeof(buf),fp) == NULL) {
6641 if (feof(fp))
6642 break;
6643 else
6644 goto readerr;
6645 }
6646 if (buf[0] != '*') goto fmterr;
6647 argc = atoi(buf+1);
6648 argv = zmalloc(sizeof(robj*)*argc);
6649 for (j = 0; j < argc; j++) {
6650 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
6651 if (buf[0] != '$') goto fmterr;
6652 len = strtol(buf+1,NULL,10);
6653 argsds = sdsnewlen(NULL,len);
0f151ef1 6654 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 6655 argv[j] = createObject(REDIS_STRING,argsds);
6656 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
6657 }
6658
6659 /* Command lookup */
6660 cmd = lookupCommand(argv[0]->ptr);
6661 if (!cmd) {
6662 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
6663 exit(1);
6664 }
6665 /* Try object sharing and encoding */
6666 if (server.shareobjects) {
6667 int j;
6668 for(j = 1; j < argc; j++)
6669 argv[j] = tryObjectSharing(argv[j]);
6670 }
6671 if (cmd->flags & REDIS_CMD_BULK)
6672 tryObjectEncoding(argv[argc-1]);
6673 /* Run the command in the context of a fake client */
6674 fakeClient->argc = argc;
6675 fakeClient->argv = argv;
6676 cmd->proc(fakeClient);
6677 /* Discard the reply objects list from the fake client */
6678 while(listLength(fakeClient->reply))
6679 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
6680 /* Clean up, ready for the next command */
6681 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
6682 zfree(argv);
b492cf00 6683 /* Handle swapping while loading big datasets when VM is on */
6684 loadedkeys++;
6685 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
6686 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 6687 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 6688 }
6689 }
f80dff62 6690 }
6691 fclose(fp);
6692 freeFakeClient(fakeClient);
6693 return REDIS_OK;
6694
6695readerr:
6696 if (feof(fp)) {
6697 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
6698 } else {
6699 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
6700 }
6701 exit(1);
6702fmterr:
6703 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
6704 exit(1);
6705}
6706
9d65a1bb 6707/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
6708static int fwriteBulk(FILE *fp, robj *obj) {
6709 char buf[128];
b9bc0eef 6710 int decrrc = 0;
6711
f2d9f50f 6712 /* Avoid the incr/decr ref count business if possible to help
6713 * copy-on-write (we are often in a child process when this function
6714 * is called).
6715 * Also makes sure that key objects don't get incrRefCount-ed when VM
6716 * is enabled */
6717 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 6718 obj = getDecodedObject(obj);
6719 decrrc = 1;
6720 }
9d65a1bb 6721 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
6722 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 6723 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
6724 goto err;
9d65a1bb 6725 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 6726 if (decrrc) decrRefCount(obj);
9d65a1bb 6727 return 1;
6728err:
b9bc0eef 6729 if (decrrc) decrRefCount(obj);
9d65a1bb 6730 return 0;
6731}
6732
/* Write the double 'd' to 'fp' using the bulk format:
 * $<count>\r\n<payload>\r\n
 * The payload is the number formatted with "%.17g".
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload buffer already includes the trailing CRLF, that is not
     * part of the announced length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    return (fwrite(payload,paylen,1,fp) == 0) ? 0 : 1;
}
6743
/* Write the long 'l' to 'fp' using the bulk format:
 * $<count>\r\n<payload>\r\n
 * The payload is the decimal representation of the number.
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload buffer already includes the trailing CRLF, that is not
     * part of the announced length. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    return (fwrite(payload,paylen,1,fp) == 0) ? 0 : 1;
}
6754
6755/* Write a sequence of commands able to fully rebuild the dataset into
6756 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
6757static int rewriteAppendOnlyFile(char *filename) {
6758 dictIterator *di = NULL;
6759 dictEntry *de;
6760 FILE *fp;
6761 char tmpfile[256];
6762 int j;
6763 time_t now = time(NULL);
6764
6765 /* Note that we have to use a different temp name here compared to the
6766 * one used by rewriteAppendOnlyFileBackground() function. */
6767 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
6768 fp = fopen(tmpfile,"w");
6769 if (!fp) {
6770 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
6771 return REDIS_ERR;
6772 }
6773 for (j = 0; j < server.dbnum; j++) {
6774 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
6775 redisDb *db = server.db+j;
6776 dict *d = db->dict;
6777 if (dictSize(d) == 0) continue;
6778 di = dictGetIterator(d);
6779 if (!di) {
6780 fclose(fp);
6781 return REDIS_ERR;
6782 }
6783
6784 /* SELECT the new DB */
6785 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 6786 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 6787
6788 /* Iterate this DB writing every entry */
6789 while((de = dictNext(di)) != NULL) {
e7546c63 6790 robj *key, *o;
6791 time_t expiretime;
6792 int swapped;
6793
6794 key = dictGetEntryKey(de);
b9bc0eef 6795 /* If the value for this key is swapped, load a preview in memory.
6796 * We use a "swapped" flag to remember if we need to free the
6797 * value object instead to just increment the ref count anyway
6798 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 6799 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
6800 key->storage == REDIS_VM_SWAPPING) {
e7546c63 6801 o = dictGetEntryVal(de);
6802 swapped = 0;
6803 } else {
6804 o = vmPreviewObject(key);
e7546c63 6805 swapped = 1;
6806 }
6807 expiretime = getExpire(db,key);
9d65a1bb 6808
6809 /* Save the key and associated value */
9d65a1bb 6810 if (o->type == REDIS_STRING) {
6811 /* Emit a SET command */
6812 char cmd[]="*3\r\n$3\r\nSET\r\n";
6813 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6814 /* Key and value */
6815 if (fwriteBulk(fp,key) == 0) goto werr;
6816 if (fwriteBulk(fp,o) == 0) goto werr;
6817 } else if (o->type == REDIS_LIST) {
6818 /* Emit the RPUSHes needed to rebuild the list */
6819 list *list = o->ptr;
6820 listNode *ln;
c7df85a4 6821 listIter li;
9d65a1bb 6822
c7df85a4 6823 listRewind(list,&li);
6824 while((ln = listNext(&li))) {
9d65a1bb 6825 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
6826 robj *eleobj = listNodeValue(ln);
6827
6828 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6829 if (fwriteBulk(fp,key) == 0) goto werr;
6830 if (fwriteBulk(fp,eleobj) == 0) goto werr;
6831 }
6832 } else if (o->type == REDIS_SET) {
6833 /* Emit the SADDs needed to rebuild the set */
6834 dict *set = o->ptr;
6835 dictIterator *di = dictGetIterator(set);
6836 dictEntry *de;
6837
6838 while((de = dictNext(di)) != NULL) {
6839 char cmd[]="*3\r\n$4\r\nSADD\r\n";
6840 robj *eleobj = dictGetEntryKey(de);
6841
6842 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6843 if (fwriteBulk(fp,key) == 0) goto werr;
6844 if (fwriteBulk(fp,eleobj) == 0) goto werr;
6845 }
6846 dictReleaseIterator(di);
6847 } else if (o->type == REDIS_ZSET) {
6848 /* Emit the ZADDs needed to rebuild the sorted set */
6849 zset *zs = o->ptr;
6850 dictIterator *di = dictGetIterator(zs->dict);
6851 dictEntry *de;
6852
6853 while((de = dictNext(di)) != NULL) {
6854 char cmd[]="*4\r\n$4\r\nZADD\r\n";
6855 robj *eleobj = dictGetEntryKey(de);
6856 double *score = dictGetEntryVal(de);
6857
6858 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6859 if (fwriteBulk(fp,key) == 0) goto werr;
6860 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
6861 if (fwriteBulk(fp,eleobj) == 0) goto werr;
6862 }
6863 dictReleaseIterator(di);
6864 } else {
dfc5e96c 6865 redisAssert(0 != 0);
9d65a1bb 6866 }
6867 /* Save the expire time */
6868 if (expiretime != -1) {
e96e4fbf 6869 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 6870 /* If this key is already expired skip it */
6871 if (expiretime < now) continue;
6872 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
6873 if (fwriteBulk(fp,key) == 0) goto werr;
6874 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
6875 }
b9bc0eef 6876 if (swapped) decrRefCount(o);
9d65a1bb 6877 }
6878 dictReleaseIterator(di);
6879 }
6880
6881 /* Make sure data will not remain on the OS's output buffers */
6882 fflush(fp);
6883 fsync(fileno(fp));
6884 fclose(fp);
6885
6886 /* Use RENAME to make sure the DB file is changed atomically only
6887 * if the generate DB file is ok. */
6888 if (rename(tmpfile,filename) == -1) {
6889 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
6890 unlink(tmpfile);
6891 return REDIS_ERR;
6892 }
6893 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
6894 return REDIS_OK;
6895
6896werr:
6897 fclose(fp);
6898 unlink(tmpfile);
e96e4fbf 6899 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 6900 if (di) dictReleaseIterator(di);
6901 return REDIS_ERR;
6902}
6903
6904/* This is how rewriting of the append only file in background works:
6905 *
6906 * 1) The user calls BGREWRITEAOF
6907 * 2) Redis calls this function, that forks():
6908 * 2a) the child rewrite the append only file in a temp file.
6909 * 2b) the parent accumulates differences in server.bgrewritebuf.
6910 * 3) When the child finished '2a' exists.
6911 * 4) The parent will trap the exit code, if it's OK, will append the
6912 * data accumulated into server.bgrewritebuf into the temp file, and
6913 * finally will rename(2) the temp file in the actual file name.
6914 * The the new file is reopened as the new append only file. Profit!
6915 */
6916static int rewriteAppendOnlyFileBackground(void) {
6917 pid_t childpid;
6918
6919 if (server.bgrewritechildpid != -1) return REDIS_ERR;
4ee9488d 6920 if (server.vm_enabled) waitZeroActiveThreads();
9d65a1bb 6921 if ((childpid = fork()) == 0) {
6922 /* Child */
6923 char tmpfile[256];
6924 close(server.fd);
6925
6926 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
6927 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
6928 exit(0);
6929 } else {
6930 exit(1);
6931 }
6932 } else {
6933 /* Parent */
6934 if (childpid == -1) {
6935 redisLog(REDIS_WARNING,
6936 "Can't rewrite append only file in background: fork: %s",
6937 strerror(errno));
6938 return REDIS_ERR;
6939 }
6940 redisLog(REDIS_NOTICE,
6941 "Background append only file rewriting started by pid %d",childpid);
6942 server.bgrewritechildpid = childpid;
85a83172 6943 /* We set appendseldb to -1 in order to force the next call to the
6944 * feedAppendOnlyFile() to issue a SELECT command, so the differences
6945 * accumulated by the parent into server.bgrewritebuf will start
6946 * with a SELECT statement and it will be safe to merge. */
6947 server.appendseldb = -1;
9d65a1bb 6948 return REDIS_OK;
6949 }
6950 return REDIS_OK; /* unreached */
6951}
6952
6953static void bgrewriteaofCommand(redisClient *c) {
6954 if (server.bgrewritechildpid != -1) {
6955 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
6956 return;
6957 }
6958 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 6959 char *status = "+Background append only file rewriting started\r\n";
6960 addReplySds(c,sdsnew(status));
9d65a1bb 6961 } else {
6962 addReply(c,shared.err);
6963 }
6964}
6965
/* Remove the temp file named "temp-rewriteaof-bg-<childpid>.aof" from the
 * current directory. Errors from unlink(2) are ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
6972
996cb5f7 6973/* Virtual Memory is composed mainly of two subsystems:
6974 * - Blocking Virtual Memory
6975 * - Threaded Virtual Memory I/O
6976 * The two parts are not fully decoupled, but functions are split among two
6977 * different sections of the source code (delimited by comments) in order to
6978 * make more clear what functionality is about the blocking VM and what about
6979 * the threaded (not blocking) VM.
6980 *
6981 * Redis VM design:
6982 *
6983 * Redis VM is a blocking VM (one that blocks reading swapped values from
6984 * disk into memory when a value swapped out is needed in memory) that is made
6985 * unblocking by trying to examine the command argument vector in order to
6986 * load in background values that will likely be needed in order to exec
6987 * the command. The command is executed only once all the relevant keys
6988 * are loaded into memory.
6989 *
6990 * This is basically almost as simple as a blocking VM, but almost as parallel
6991 * as a fully non-blocking VM.
6992 */
6993
6994/* =================== Virtual Memory - Blocking Side ====================== */
75680a3c 6995static void vmInit(void) {
6996 off_t totsize;
996cb5f7 6997 int pipefds[2];
bcaa7a4f 6998 size_t stacksize;
75680a3c 6999
4ad37480 7000 if (server.vm_max_threads != 0)
7001 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
7002
75680a3c 7003 server.vm_fp = fopen("/tmp/redisvm","w+b");
7004 if (server.vm_fp == NULL) {
7005 redisLog(REDIS_WARNING,"Impossible to open the swap file. Exiting.");
7006 exit(1);
7007 }
7008 server.vm_fd = fileno(server.vm_fp);
7009 server.vm_next_page = 0;
7010 server.vm_near_pages = 0;
7d98e08c 7011 server.vm_stats_used_pages = 0;
7012 server.vm_stats_swapped_objects = 0;
7013 server.vm_stats_swapouts = 0;
7014 server.vm_stats_swapins = 0;
75680a3c 7015 totsize = server.vm_pages*server.vm_page_size;
7016 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
7017 if (ftruncate(server.vm_fd,totsize) == -1) {
7018 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
7019 strerror(errno));
7020 exit(1);
7021 } else {
7022 redisLog(REDIS_NOTICE,"Swap file allocated with success");
7023 }
7d30035d 7024 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 7025 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 7026 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 7027 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
75680a3c 7028 /* Try to remove the swap file, so the OS will really delete it from the
7029 * file system when Redis exists. */
7030 unlink("/tmp/redisvm");
92f8e882 7031
996cb5f7 7032 /* Initialize threaded I/O (used by Virtual Memory) */
7033 server.io_newjobs = listCreate();
7034 server.io_processing = listCreate();
7035 server.io_processed = listCreate();
92f8e882 7036 server.io_clients = listCreate();
7037 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 7038 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
7039 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 7040 server.io_active_threads = 0;
996cb5f7 7041 if (pipe(pipefds) == -1) {
7042 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
7043 ,strerror(errno));
7044 exit(1);
7045 }
7046 server.io_ready_pipe_read = pipefds[0];
7047 server.io_ready_pipe_write = pipefds[1];
7048 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 7049 /* LZF requires a lot of stack */
7050 pthread_attr_init(&server.io_threads_attr);
7051 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
7052 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
7053 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 7054 /* Listen for events in the threaded I/O pipe */
7055 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
7056 vmThreadedIOCompletedJob, NULL) == AE_ERR)
7057 oom("creating file event");
75680a3c 7058}
7059
06224fec 7060/* Mark the page as used */
7061static void vmMarkPageUsed(off_t page) {
7062 off_t byte = page/8;
7063 int bit = page&7;
7064 server.vm_bitmap[byte] |= 1<<bit;
f870935d 7065 redisLog(REDIS_DEBUG,"Mark used: %lld (byte:%lld bit:%d)\n",
7066 (long long)page, (long long)byte, bit);
06224fec 7067}
7068
7069/* Mark N contiguous pages as used, with 'page' being the first. */
7070static void vmMarkPagesUsed(off_t page, off_t count) {
7071 off_t j;
7072
7073 for (j = 0; j < count; j++)
7d30035d 7074 vmMarkPageUsed(page+j);
7d98e08c 7075 server.vm_stats_used_pages += count;
06224fec 7076}
7077
7078/* Mark the page as free */
7079static void vmMarkPageFree(off_t page) {
7080 off_t byte = page/8;
7081 int bit = page&7;
7082 server.vm_bitmap[byte] &= ~(1<<bit);
7083}
7084
7085/* Mark N contiguous pages as free, with 'page' being the first. */
7086static void vmMarkPagesFree(off_t page, off_t count) {
7087 off_t j;
7088
7089 for (j = 0; j < count; j++)
7d30035d 7090 vmMarkPageFree(page+j);
7d98e08c 7091 server.vm_stats_used_pages -= count;
06224fec 7092}
7093
7094/* Test if the page is free */
7095static int vmFreePage(off_t page) {
7096 off_t byte = page/8;
7097 int bit = page&7;
7d30035d 7098 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 7099}
7100
7101/* Find N contiguous free pages storing the first page of the cluster in *first.
3a66edc7 7102 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
7103 * REDIS_ERR is returned.
06224fec 7104 *
7105 * This function uses a simple algorithm: we try to allocate
7106 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
7107 * again from the start of the swap file searching for free spaces.
7108 *
7109 * If it looks pretty clear that there are no free pages near our offset
7110 * we try to find less populated places doing a forward jump of
7111 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
7112 * without hurry, and then we jump again and so forth...
7113 *
7114 * This function can be improved using a free list to avoid to guess
7115 * too much, since we could collect data about freed pages.
7116 *
7117 * note: I implemented this function just after watching an episode of
7118 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
7119 */
c7df85a4 7120static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 7121 off_t base, offset = 0, since_jump = 0, numfree = 0;
7122
7123 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
7124 server.vm_near_pages = 0;
7125 server.vm_next_page = 0;
7126 }
7127 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
7128 base = server.vm_next_page;
7129
7130 while(offset < server.vm_pages) {
7131 off_t this = base+offset;
7132
f870935d 7133 redisLog(REDIS_DEBUG, "THIS: %lld (%c)\n", (long long) this, vmFreePage(this) ? 'F' : 'X');
06224fec 7134 /* If we overflow, restart from page zero */
7135 if (this >= server.vm_pages) {
7136 this -= server.vm_pages;
7137 if (this == 0) {
7138 /* Just overflowed, what we found on tail is no longer
7139 * interesting, as it's no longer contiguous. */
7140 numfree = 0;
7141 }
7142 }
7143 if (vmFreePage(this)) {
7144 /* This is a free page */
7145 numfree++;
7146 /* Already got N free pages? Return to the caller, with success */
7147 if (numfree == n) {
7d30035d 7148 *first = this-(n-1);
7149 server.vm_next_page = this+1;
3a66edc7 7150 return REDIS_OK;
06224fec 7151 }
7152 } else {
7153 /* The current one is not a free page */
7154 numfree = 0;
7155 }
7156
7157 /* Fast-forward if the current page is not free and we already
7158 * searched enough near this place. */
7159 since_jump++;
7160 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
7161 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
7162 since_jump = 0;
7163 /* Note that even if we rewind after the jump, we are don't need
7164 * to make sure numfree is set to zero as we only jump *if* it
7165 * is set to zero. */
7166 } else {
7167 /* Otherwise just check the next page */
7168 offset++;
7169 }
7170 }
3a66edc7 7171 return REDIS_ERR;
7172}
7173
a5819310 7174/* Write the specified object at the specified page of the swap file */
7175static int vmWriteObjectOnSwap(robj *o, off_t page) {
7176 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7177 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
7178 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7179 redisLog(REDIS_WARNING,
7180 "Critical VM problem in vmSwapObjectBlocking(): can't seek: %s",
7181 strerror(errno));
7182 return REDIS_ERR;
7183 }
7184 rdbSaveObject(server.vm_fp,o);
7185 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7186 return REDIS_OK;
7187}
7188
3a66edc7 7189/* Swap the 'val' object relative to 'key' into disk. Store all the information
7190 * needed to later retrieve the object into the key object.
7191 * If we can't find enough contiguous empty pages to swap the object on disk
7192 * REDIS_ERR is returned. */
a69a0c9c 7193static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 7194 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 7195 off_t page;
7196
7197 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 7198 assert(key->refcount == 1);
3a66edc7 7199 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 7200 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 7201 key->vm.page = page;
7202 key->vm.usedpages = pages;
7203 key->storage = REDIS_VM_SWAPPED;
d894161b 7204 key->vtype = val->type;
3a66edc7 7205 decrRefCount(val); /* Deallocate the object from memory. */
7206 vmMarkPagesUsed(page,pages);
7d30035d 7207 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
7208 (unsigned char*) key->ptr,
7209 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 7210 server.vm_stats_swapped_objects++;
7211 server.vm_stats_swapouts++;
0841cc92 7212 fflush(server.vm_fp);
3a66edc7 7213 return REDIS_OK;
7214}
7215
a5819310 7216static robj *vmReadObjectFromSwap(off_t page, int type) {
7217 robj *o;
3a66edc7 7218
a5819310 7219 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
7220 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 7221 redisLog(REDIS_WARNING,
7222 "Unrecoverable VM problem in vmLoadObject(): can't seek: %s",
7223 strerror(errno));
7224 exit(1);
7225 }
a5819310 7226 o = rdbLoadObject(type,server.vm_fp);
7227 if (o == NULL) {
3a66edc7 7228 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmLoadObject(): can't load object from swap file: %s", strerror(errno));
7229 exit(1);
7230 }
a5819310 7231 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
7232 return o;
7233}
7234
7235/* Load the value object relative to the 'key' object from swap to memory.
7236 * The newly allocated object is returned.
7237 *
7238 * If preview is true the unserialized object is returned to the caller but
7239 * no changes are made to the key object, nor the pages are marked as freed */
7240static robj *vmGenericLoadObject(robj *key, int preview) {
7241 robj *val;
7242
7243 redisAssert(key->storage == REDIS_VM_SWAPPED);
7244 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 7245 if (!preview) {
7246 key->storage = REDIS_VM_MEMORY;
7247 key->vm.atime = server.unixtime;
7248 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
7249 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
7250 (unsigned char*) key->ptr);
7d98e08c 7251 server.vm_stats_swapped_objects--;
38aba9a1 7252 } else {
7253 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
7254 (unsigned char*) key->ptr);
7e69548d 7255 }
7d98e08c 7256 server.vm_stats_swapins++;
3a66edc7 7257 return val;
06224fec 7258}
7259
7e69548d 7260/* Plain object loading, from swap to memory */
7261static robj *vmLoadObject(robj *key) {
996cb5f7 7262 /* If we are loading the object in background, stop it, we
7263 * need to load this object synchronously ASAP. */
7264 if (key->storage == REDIS_VM_LOADING)
7265 vmCancelThreadedIOJob(key);
7e69548d 7266 return vmGenericLoadObject(key,0);
7267}
7268
7269/* Just load the value on disk, without to modify the key.
7270 * This is useful when we want to perform some operation on the value
7271 * without to really bring it from swap to memory, like while saving the
7272 * dataset or rewriting the append only log. */
7273static robj *vmPreviewObject(robj *key) {
7274 return vmGenericLoadObject(key,1);
7275}
7276
4ef8de8a 7277/* How a good candidate is this object for swapping?
7278 * The better candidate it is, the greater the returned value.
7279 *
7280 * Currently we try to perform a fast estimation of the object size in
7281 * memory, and combine it with aging informations.
7282 *
7283 * Basically swappability = idle-time * log(estimated size)
7284 *
7285 * Bigger objects are preferred over smaller objects, but not
7286 * proportionally, this is why we use the logarithm. This algorithm is
7287 * just a first try and will probably be tuned later. */
7288static double computeObjectSwappability(robj *o) {
7289 time_t age = server.unixtime - o->vm.atime;
7290 long asize = 0;
7291 list *l;
7292 dict *d;
7293 struct dictEntry *de;
7294 int z;
7295
7296 if (age <= 0) return 0;
7297 switch(o->type) {
7298 case REDIS_STRING:
7299 if (o->encoding != REDIS_ENCODING_RAW) {
7300 asize = sizeof(*o);
7301 } else {
7302 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
7303 }
7304 break;
7305 case REDIS_LIST:
7306 l = o->ptr;
7307 listNode *ln = listFirst(l);
7308
7309 asize = sizeof(list);
7310 if (ln) {
7311 robj *ele = ln->value;
7312 long elesize;
7313
7314 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7315 (sizeof(*o)+sdslen(ele->ptr)) :
7316 sizeof(*o);
7317 asize += (sizeof(listNode)+elesize)*listLength(l);
7318 }
7319 break;
7320 case REDIS_SET:
7321 case REDIS_ZSET:
7322 z = (o->type == REDIS_ZSET);
7323 d = z ? ((zset*)o->ptr)->dict : o->ptr;
7324
7325 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
7326 if (z) asize += sizeof(zset)-sizeof(dict);
7327 if (dictSize(d)) {
7328 long elesize;
7329 robj *ele;
7330
7331 de = dictGetRandomKey(d);
7332 ele = dictGetEntryKey(de);
7333 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
7334 (sizeof(*o)+sdslen(ele->ptr)) :
7335 sizeof(*o);
7336 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
7337 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
7338 }
7339 break;
7340 }
7341 return (double)asize*log(1+asize);
7342}
7343
7344/* Try to swap an object that's a good candidate for swapping.
7345 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 7346 * to swap any object at all.
7347 *
7348 * If 'usethreaded' is true, Redis will try to swap the object in background
7349 * using I/O threads. */
7350static int vmSwapOneObject(int usethreads) {
4ef8de8a 7351 int j, i;
7352 struct dictEntry *best = NULL;
7353 double best_swappability = 0;
b9bc0eef 7354 redisDb *best_db = NULL;
4ef8de8a 7355 robj *key, *val;
7356
7357 for (j = 0; j < server.dbnum; j++) {
7358 redisDb *db = server.db+j;
e3cadb8a 7359 int maxtries = 1000;
4ef8de8a 7360
7361 if (dictSize(db->dict) == 0) continue;
7362 for (i = 0; i < 5; i++) {
7363 dictEntry *de;
7364 double swappability;
7365
e3cadb8a 7366 if (maxtries) maxtries--;
4ef8de8a 7367 de = dictGetRandomKey(db->dict);
7368 key = dictGetEntryKey(de);
7369 val = dictGetEntryVal(de);
1064ef87 7370 /* Only swap objects that are currently in memory.
7371 *
7372 * Also don't swap shared objects if threaded VM is on, as we
7373 * try to ensure that the main thread does not touch the
7374 * object while the I/O thread is using it, but we can't
7375 * control other keys without adding additional mutex. */
7376 if (key->storage != REDIS_VM_MEMORY ||
7377 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 7378 if (maxtries) i--; /* don't count this try */
7379 continue;
7380 }
4ef8de8a 7381 swappability = computeObjectSwappability(val);
7382 if (!best || swappability > best_swappability) {
7383 best = de;
7384 best_swappability = swappability;
b9bc0eef 7385 best_db = db;
4ef8de8a 7386 }
7387 }
7388 }
e3cadb8a 7389 if (best == NULL) {
7390 redisLog(REDIS_DEBUG,"No swappable key found!");
7391 return REDIS_ERR;
7392 }
4ef8de8a 7393 key = dictGetEntryKey(best);
7394 val = dictGetEntryVal(best);
7395
e3cadb8a 7396 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 7397 key->ptr, best_swappability);
7398
7399 /* Unshare the key if needed */
7400 if (key->refcount > 1) {
7401 robj *newkey = dupStringObject(key);
7402 decrRefCount(key);
7403 key = dictGetEntryKey(best) = newkey;
7404 }
7405 /* Swap it */
a69a0c9c 7406 if (usethreads) {
b9bc0eef 7407 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 7408 return REDIS_OK;
7409 } else {
a69a0c9c 7410 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7411 dictGetEntryVal(best) = NULL;
7412 return REDIS_OK;
7413 } else {
7414 return REDIS_ERR;
7415 }
4ef8de8a 7416 }
7417}
7418
/* Swap out one object synchronously in the calling thread.
 * Returns what vmSwapOneObject() returns: REDIS_OK or REDIS_ERR. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
7422
/* Queue the swap out of one object to the I/O threads.
 * Returns what vmSwapOneObject() returns: REDIS_OK or REDIS_ERR. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
7426
7e69548d 7427/* Return true if it's safe to swap out objects in a given moment.
7428 * Basically we don't want to swap objects out while there is a BGSAVE
7429 * or a BGAEOREWRITE running in backgroud. */
7430static int vmCanSwapOut(void) {
7431 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
7432}
7433
1b03836c 7434/* Delete a key if swapped. Returns 1 if the key was found, was swapped
7435 * and was deleted. Otherwise 0 is returned. */
7436static int deleteIfSwapped(redisDb *db, robj *key) {
7437 dictEntry *de;
7438 robj *foundkey;
7439
7440 if ((de = dictFind(db->dict,key)) == NULL) return 0;
7441 foundkey = dictGetEntryKey(de);
7442 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
7443 deleteKey(db,key);
7444 return 1;
7445}
7446
996cb5f7 7447/* =================== Virtual Memory - Threaded I/O ======================= */
7448
b9bc0eef 7449static void freeIOJob(iojob *j) {
7450 if (j->type == REDIS_IOJOB_PREPARE_SWAP ||
7451 j->type == REDIS_IOJOB_DO_SWAP)
7452 decrRefCount(j->val);
7453 decrRefCount(j->key);
7454 zfree(j);
7455}
7456
996cb5f7 7457/* Every time a thread finished a Job, it writes a byte into the write side
7458 * of an unix pipe in order to "awake" the main thread, and this function
7459 * is called. */
7460static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
7461 int mask)
7462{
7463 char buf[1];
7464 int retval;
c953f24b 7465 int processed = 0;
996cb5f7 7466 REDIS_NOTUSED(el);
7467 REDIS_NOTUSED(mask);
7468 REDIS_NOTUSED(privdata);
7469
7470 /* For every byte we read in the read side of the pipe, there is one
7471 * I/O job completed to process. */
7472 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 7473 iojob *j;
7474 listNode *ln;
7475 robj *key;
7476 struct dictEntry *de;
7477
996cb5f7 7478 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 7479
7480 /* Get the processed element (the oldest one) */
7481 lockThreadedIO();
1064ef87 7482 assert(listLength(server.io_processed) != 0);
b9bc0eef 7483 ln = listFirst(server.io_processed);
7484 j = ln->value;
7485 listDelNode(server.io_processed,ln);
7486 unlockThreadedIO();
7487 /* If this job is marked as canceled, just ignore it */
7488 if (j->canceled) {
7489 freeIOJob(j);
7490 continue;
7491 }
7492 /* Post process it in the main thread, as there are things we
7493 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 7494 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 7495 de = dictFind(j->db->dict,j->key);
7496 assert(de != NULL);
7497 key = dictGetEntryKey(de);
7498 if (j->type == REDIS_IOJOB_LOAD) {
7499 /* Key loaded, bring it at home */
7500 key->storage = REDIS_VM_MEMORY;
7501 key->vm.atime = server.unixtime;
7502 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
7503 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
7504 (unsigned char*) key->ptr);
7505 server.vm_stats_swapped_objects--;
7506 server.vm_stats_swapins++;
7507 freeIOJob(j);
7508 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
7509 /* Now we know the amount of pages required to swap this object.
7510 * Let's find some space for it, and queue this task again
7511 * rebranded as REDIS_IOJOB_DO_SWAP. */
7512 if (vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR) {
7513 /* Ooops... no space! */
7514 freeIOJob(j);
7515 } else {
c7df85a4 7516 /* Note that we need to mark this pages as used now,
7517 * if the job will be canceled, we'll mark them as freed
7518 * again. */
7519 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 7520 j->type = REDIS_IOJOB_DO_SWAP;
7521 lockThreadedIO();
7522 queueIOJob(j);
7523 unlockThreadedIO();
7524 }
7525 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
7526 robj *val;
7527
7528 /* Key swapped. We can finally free some memory. */
6c96ba7d 7529 if (key->storage != REDIS_VM_SWAPPING) {
7530 printf("key->storage: %d\n",key->storage);
7531 printf("key->name: %s\n",(char*)key->ptr);
7532 printf("key->refcount: %d\n",key->refcount);
7533 printf("val: %p\n",(void*)j->val);
7534 printf("val->type: %d\n",j->val->type);
7535 printf("val->ptr: %s\n",(char*)j->val->ptr);
7536 }
7537 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 7538 val = dictGetEntryVal(de);
7539 key->vm.page = j->page;
7540 key->vm.usedpages = j->pages;
7541 key->storage = REDIS_VM_SWAPPED;
7542 key->vtype = j->val->type;
7543 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 7544 dictGetEntryVal(de) = NULL;
b9bc0eef 7545 redisLog(REDIS_DEBUG,
7546 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
7547 (unsigned char*) key->ptr,
7548 (unsigned long long) j->page, (unsigned long long) j->pages);
7549 server.vm_stats_swapped_objects++;
7550 server.vm_stats_swapouts++;
7551 freeIOJob(j);
f11b8647 7552 /* Put a few more swap requests in queue if we are still
7553 * out of memory */
7554 if (zmalloc_used_memory() > server.vm_max_memory) {
7555 int more = 1;
7556 while(more) {
7557 lockThreadedIO();
7558 more = listLength(server.io_newjobs) <
7559 (unsigned) server.vm_max_threads;
7560 unlockThreadedIO();
7561 /* Don't waste CPU time if swappable objects are rare. */
7562 if (vmSwapOneObjectThreaded() == REDIS_ERR) break;
7563 }
7564 }
b9bc0eef 7565 }
c953f24b 7566 processed++;
7567 if (processed == REDIS_MAX_COMPLETED_JOBS_PROCESSED) return;
996cb5f7 7568 }
7569 if (retval < 0 && errno != EAGAIN) {
7570 redisLog(REDIS_WARNING,
7571 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
7572 strerror(errno));
7573 }
7574}
7575
7576static void lockThreadedIO(void) {
7577 pthread_mutex_lock(&server.io_mutex);
7578}
7579
7580static void unlockThreadedIO(void) {
7581 pthread_mutex_unlock(&server.io_mutex);
7582}
7583
7584/* Remove the specified object from the threaded I/O queue if still not
7585 * processed, otherwise make sure to flag it as canceled. */
7586static void vmCancelThreadedIOJob(robj *o) {
7587 list *lists[3] = {
6c96ba7d 7588 server.io_newjobs, /* 0 */
7589 server.io_processing, /* 1 */
7590 server.io_processed /* 2 */
996cb5f7 7591 };
7592 int i;
7593
7594 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 7595again:
996cb5f7 7596 lockThreadedIO();
7597 /* Search for a matching key in one of the queues */
7598 for (i = 0; i < 3; i++) {
7599 listNode *ln;
c7df85a4 7600 listIter li;
996cb5f7 7601
c7df85a4 7602 listRewind(lists[i],&li);
7603 while ((ln = listNext(&li)) != NULL) {
996cb5f7 7604 iojob *job = ln->value;
7605
6c96ba7d 7606 if (job->canceled) continue; /* Skip this, already canceled. */
996cb5f7 7607 if (compareStringObjects(job->key,o) == 0) {
2e111efe 7608 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (LIST ID %d)\n",
7609 (void*)job, (char*)o->ptr, i);
427a2153 7610 /* Mark the pages as free since the swap didn't happened
7611 * or happened but is now discarded. */
7612 if (job->type == REDIS_IOJOB_DO_SWAP)
7613 vmMarkPagesFree(job->page,job->pages);
7614 /* Cancel the job. It depends on the list the job is
7615 * living in. */
996cb5f7 7616 switch(i) {
7617 case 0: /* io_newjobs */
6c96ba7d 7618 /* If the job was yet not processed the best thing to do
996cb5f7 7619 * is to remove it from the queue at all */
6c96ba7d 7620 freeIOJob(job);
996cb5f7 7621 listDelNode(lists[i],ln);
7622 break;
7623 case 1: /* io_processing */
2e111efe 7624 /* Oh Shi- the thread is messing with the Job, and
7625 * probably with the object if this is a
7626 * PREPARE_SWAP or DO_SWAP job. Better to wait for the
7627 * job to move into the next queue... */
7628 if (job->type != REDIS_IOJOB_LOAD) {
7629 /* Yes, we try again and again until the job
7630 * is completed. */
7631 unlockThreadedIO();
7632 /* But let's wait some time for the I/O thread
7633 * to finish with this job. After all this condition
7634 * should be very rare. */
7635 usleep(1);
7636 goto again;
7637 } else {
7638 job->canceled = 1;
7639 break;
7640 }
996cb5f7 7641 case 2: /* io_processed */
2e111efe 7642 /* The job was already processed, that's easy...
7643 * just mark it as canceled so that we'll ignore it
7644 * when processing completed jobs. */
996cb5f7 7645 job->canceled = 1;
7646 break;
7647 }
c7df85a4 7648 /* Finally we have to adjust the storage type of the object
7649 * in order to "UNDO" the operaiton. */
996cb5f7 7650 if (o->storage == REDIS_VM_LOADING)
7651 o->storage = REDIS_VM_SWAPPED;
7652 else if (o->storage == REDIS_VM_SWAPPING)
7653 o->storage = REDIS_VM_MEMORY;
7654 unlockThreadedIO();
7655 return;
7656 }
7657 }
7658 }
7659 unlockThreadedIO();
7660 assert(1 != 1); /* We should never reach this */
7661}
7662
b9bc0eef 7663static void *IOThreadEntryPoint(void *arg) {
7664 iojob *j;
7665 listNode *ln;
7666 REDIS_NOTUSED(arg);
7667
7668 pthread_detach(pthread_self());
7669 while(1) {
7670 /* Get a new job to process */
7671 lockThreadedIO();
7672 if (listLength(server.io_newjobs) == 0) {
b04a5df9 7673#ifdef REDIS_HELGRIND_FRIENDLY
7674 /* No new jobs? Wait and retry, because to be Helgrind
7675 * (valgrind --tool=helgrind) what's needed is to take
7676 * the same threads running instead to create/destroy threads
7677 * as needed (otherwise valgrind will fail) */
7678 unlockThreadedIO();
7679 usleep(1); /* Give some time for the I/O thread to work. */
7680 continue;
7681#endif
b9bc0eef 7682 /* No new jobs in queue, exit. */
b74880b4 7683 redisLog(REDIS_DEBUG,"Thread %lld exiting, nothing to do",
b9bc0eef 7684 (long long) pthread_self());
7685 server.io_active_threads--;
7686 unlockThreadedIO();
7687 return NULL;
7688 }
7689 ln = listFirst(server.io_newjobs);
7690 j = ln->value;
7691 listDelNode(server.io_newjobs,ln);
7692 /* Add the job in the processing queue */
7693 j->thread = pthread_self();
7694 listAddNodeTail(server.io_processing,j);
7695 ln = listLast(server.io_processing); /* We use ln later to remove it */
7696 unlockThreadedIO();
b74880b4 7697 redisLog(REDIS_DEBUG,"Thread %lld got a new job (type %d): %p about key '%s'",
6c96ba7d 7698 (long long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 7699
7700 /* Process the Job */
7701 if (j->type == REDIS_IOJOB_LOAD) {
7702 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
7703 FILE *fp = fopen("/dev/null","w+");
7704 j->pages = rdbSavedObjectPages(j->val,fp);
7705 fclose(fp);
7706 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 7707 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
7708 j->canceled = 1;
b9bc0eef 7709 }
7710
7711 /* Done: insert the job into the processed queue */
b74880b4 7712 redisLog(REDIS_DEBUG,"Thread %lld completed the job: %p (key %s)",
6c96ba7d 7713 (long long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 7714 lockThreadedIO();
7715 listDelNode(server.io_processing,ln);
7716 listAddNodeTail(server.io_processed,j);
7717 unlockThreadedIO();
7718
7719 /* Signal the main thread there is new stuff to process */
7720 assert(write(server.io_ready_pipe_write,"x",1) == 1);
7721 }
7722 return NULL; /* never reached */
7723}
7724
7725static void spawnIOThread(void) {
7726 pthread_t thread;
7727
bcaa7a4f 7728 pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL);
b9bc0eef 7729 server.io_active_threads++;
7730}
7731
4ee9488d 7732/* We need to wait for the last thread to exit before we are able to
7733 * fork() in order to BGSAVE or BGREWRITEAOF. */
7734static void waitZeroActiveThreads(void) {
7735 while(1) {
7736 lockThreadedIO();
7737 if (server.io_active_threads == 0) {
7738 unlockThreadedIO();
7739 return;
7740 }
7741 unlockThreadedIO();
7742 usleep(10000); /* 10 milliseconds */
7743 }
7744}
7745
b9bc0eef 7746/* This function must be called while with threaded IO locked */
7747static void queueIOJob(iojob *j) {
6c96ba7d 7748 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
7749 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 7750 listAddNodeTail(server.io_newjobs,j);
7751 if (server.io_active_threads < server.vm_max_threads)
7752 spawnIOThread();
7753}
7754
7755static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
7756 iojob *j;
7757
7758 assert(key->storage == REDIS_VM_MEMORY);
7759 assert(key->refcount == 1);
7760
7761 j = zmalloc(sizeof(*j));
7762 j->type = REDIS_IOJOB_PREPARE_SWAP;
7763 j->db = db;
7764 j->key = dupStringObject(key);
7765 j->val = val;
7766 incrRefCount(val);
7767 j->canceled = 0;
7768 j->thread = (pthread_t) -1;
f11b8647 7769 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 7770
7771 lockThreadedIO();
7772 queueIOJob(j);
7773 unlockThreadedIO();
7774 return REDIS_OK;
7775}
7776
7f957c92 7777/* ================================= Debugging ============================== */
7778
7779static void debugCommand(redisClient *c) {
7780 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
7781 *((char*)-1) = 'x';
210e29f7 7782 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
7783 if (rdbSave(server.dbfilename) != REDIS_OK) {
7784 addReply(c,shared.err);
7785 return;
7786 }
7787 emptyDb();
7788 if (rdbLoad(server.dbfilename) != REDIS_OK) {
7789 addReply(c,shared.err);
7790 return;
7791 }
7792 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
7793 addReply(c,shared.ok);
71c2b467 7794 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
7795 emptyDb();
7796 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
7797 addReply(c,shared.err);
7798 return;
7799 }
7800 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
7801 addReply(c,shared.ok);
333298da 7802 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
7803 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
7804 robj *key, *val;
7805
7806 if (!de) {
7807 addReply(c,shared.nokeyerr);
7808 return;
7809 }
7810 key = dictGetEntryKey(de);
7811 val = dictGetEntryVal(de);
b9bc0eef 7812 if (server.vm_enabled && (key->storage == REDIS_VM_MEMORY ||
7813 key->storage == REDIS_VM_SWAPPING)) {
ace06542 7814 addReplySds(c,sdscatprintf(sdsempty(),
7815 "+Key at:%p refcount:%d, value at:%p refcount:%d "
7816 "encoding:%d serializedlength:%lld\r\n",
682ac724 7817 (void*)key, key->refcount, (void*)val, val->refcount,
b9bc0eef 7818 val->encoding, rdbSavedObjectLen(val,NULL)));
ace06542 7819 } else {
7820 addReplySds(c,sdscatprintf(sdsempty(),
7821 "+Key at:%p refcount:%d, value swapped at: page %llu "
7822 "using %llu pages\r\n",
7823 (void*)key, key->refcount, (unsigned long long) key->vm.page,
7824 (unsigned long long) key->vm.usedpages));
7825 }
7d30035d 7826 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
7827 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
7828 robj *key, *val;
7829
7830 if (!server.vm_enabled) {
7831 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
7832 return;
7833 }
7834 if (!de) {
7835 addReply(c,shared.nokeyerr);
7836 return;
7837 }
7838 key = dictGetEntryKey(de);
7839 val = dictGetEntryVal(de);
4ef8de8a 7840 /* If the key is shared we want to create a copy */
7841 if (key->refcount > 1) {
7842 robj *newkey = dupStringObject(key);
7843 decrRefCount(key);
7844 key = dictGetEntryKey(de) = newkey;
7845 }
7846 /* Swap it */
7d30035d 7847 if (key->storage != REDIS_VM_MEMORY) {
7848 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 7849 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 7850 dictGetEntryVal(de) = NULL;
7851 addReply(c,shared.ok);
7852 } else {
7853 addReply(c,shared.err);
7854 }
7f957c92 7855 } else {
333298da 7856 addReplySds(c,sdsnew(
7d30035d 7857 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 7858 }
7859}
56906eef 7860
6c96ba7d 7861static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 7862 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
6c96ba7d 7863 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
dfc5e96c 7864#ifdef HAVE_BACKTRACE
7865 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
7866 *((char*)-1) = 'x';
7867#endif
7868}
7869
bcfc686d 7870/* =================================== Main! ================================ */
56906eef 7871
bcfc686d 7872#ifdef __linux__
/* Return the content of /proc/sys/vm/overcommit_memory converted to an
 * integer with atoi(), or -1 if the file can't be opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *procfile;

    procfile = fopen("/proc/sys/vm/overcommit_memory","r");
    if (procfile == NULL) return -1;

    got = fgets(line,sizeof(line),procfile);
    fclose(procfile);

    return (got == NULL) ? -1 : atoi(line);
}
7886
7887void linuxOvercommitMemoryWarning(void) {
7888 if (linuxOvercommitMemoryValue() == 0) {
7889 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
7890 }
7891}
7892#endif /* __linux__ */
7893
7894static void daemonize(void) {
7895 int fd;
7896 FILE *fp;
7897
7898 if (fork() != 0) exit(0); /* parent exits */
7899 setsid(); /* create a new session */
7900
7901 /* Every output goes to /dev/null. If Redis is daemonized but
7902 * the 'logfile' is set to 'stdout' in the configuration file
7903 * it will not log at all. */
7904 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
7905 dup2(fd, STDIN_FILENO);
7906 dup2(fd, STDOUT_FILENO);
7907 dup2(fd, STDERR_FILENO);
7908 if (fd > STDERR_FILENO) close(fd);
7909 }
7910 /* Try to write the pid file */
7911 fp = fopen(server.pidfile,"w");
7912 if (fp) {
7913 fprintf(fp,"%d\n",getpid());
7914 fclose(fp);
56906eef 7915 }
56906eef 7916}
7917
bcfc686d 7918int main(int argc, char **argv) {
7919 initServerConfig();
7920 if (argc == 2) {
7921 resetServerSaveParams();
7922 loadServerConfig(argv[1]);
7923 } else if (argc > 2) {
7924 fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
7925 exit(1);
7926 } else {
7927 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
7928 }
bcfc686d 7929 if (server.daemonize) daemonize();
71c54b21 7930 initServer();
bcfc686d 7931 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
7932#ifdef __linux__
7933 linuxOvercommitMemoryWarning();
7934#endif
7935 if (server.appendonly) {
7936 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
7937 redisLog(REDIS_NOTICE,"DB loaded from append only file");
7938 } else {
7939 if (rdbLoad(server.dbfilename) == REDIS_OK)
7940 redisLog(REDIS_NOTICE,"DB loaded from disk");
7941 }
bcfc686d 7942 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
7943 aeMain(server.el);
7944 aeDeleteEventLoop(server.el);
7945 return 0;
7946}
7947
7948/* ============================= Backtrace support ========================= */
7949
7950#ifdef HAVE_BACKTRACE
7951static char *findFuncName(void *pointer, unsigned long *offset);
7952
56906eef 7953static void *getMcontextEip(ucontext_t *uc) {
7954#if defined(__FreeBSD__)
7955 return (void*) uc->uc_mcontext.mc_eip;
7956#elif defined(__dietlibc__)
7957 return (void*) uc->uc_mcontext.eip;
06db1f50 7958#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 7959 #if __x86_64__
7960 return (void*) uc->uc_mcontext->__ss.__rip;
7961 #else
56906eef 7962 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 7963 #endif
06db1f50 7964#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 7965 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 7966 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 7967 #else
7968 return (void*) uc->uc_mcontext->__ss.__eip;
7969 #endif
c04c9ac9 7970#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
7971 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 7972#elif defined(__ia64__) /* Linux IA64 */
7973 return (void*) uc->uc_mcontext.sc_ip;
7974#else
7975 return NULL;
56906eef 7976#endif
7977}
7978
7979static void segvHandler(int sig, siginfo_t *info, void *secret) {
7980 void *trace[100];
7981 char **messages = NULL;
7982 int i, trace_size = 0;
7983 unsigned long offset=0;
56906eef 7984 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 7985 sds infostring;
56906eef 7986 REDIS_NOTUSED(info);
7987
7988 redisLog(REDIS_WARNING,
7989 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 7990 infostring = genRedisInfoString();
7991 redisLog(REDIS_WARNING, "%s",infostring);
7992 /* It's not safe to sdsfree() the returned string under memory
7993 * corruption conditions. Let it leak as we are going to abort */
56906eef 7994
7995 trace_size = backtrace(trace, 100);
de96dbfe 7996 /* overwrite sigaction with caller's address */
b91cf5ef 7997 if (getMcontextEip(uc) != NULL) {
7998 trace[1] = getMcontextEip(uc);
7999 }
56906eef 8000 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 8001
d76412d1 8002 for (i=1; i<trace_size; ++i) {
56906eef 8003 char *fn = findFuncName(trace[i], &offset), *p;
8004
8005 p = strchr(messages[i],'+');
8006 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
8007 redisLog(REDIS_WARNING,"%s", messages[i]);
8008 } else {
8009 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
8010 }
8011 }
b177fd30 8012 /* free(messages); Don't call free() with possibly corrupted memory. */
56906eef 8013 exit(0);
fe3bbfbe 8014}
56906eef 8015
8016static void setupSigSegvAction(void) {
8017 struct sigaction act;
8018
8019 sigemptyset (&act.sa_mask);
8020 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
8021 * is used. Otherwise, sa_handler is used */
8022 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
8023 act.sa_sigaction = segvHandler;
8024 sigaction (SIGSEGV, &act, NULL);
8025 sigaction (SIGBUS, &act, NULL);
12fea928 8026 sigaction (SIGFPE, &act, NULL);
8027 sigaction (SIGILL, &act, NULL);
8028 sigaction (SIGBUS, &act, NULL);
e65fdc78 8029 return;
56906eef 8030}
e65fdc78 8031
bcfc686d 8032#include "staticsymbols.h"
8033/* This function try to convert a pointer into a function name. It's used in
8034 * oreder to provide a backtrace under segmentation fault that's able to
8035 * display functions declared as static (otherwise the backtrace is useless). */
8036static char *findFuncName(void *pointer, unsigned long *offset){
8037 int i, ret = -1;
8038 unsigned long off, minoff = 0;
ed9b544e 8039
bcfc686d 8040 /* Try to match against the Symbol with the smallest offset */
8041 for (i=0; symsTable[i].pointer; i++) {
8042 unsigned long lp = (unsigned long) pointer;
0bc03378 8043
bcfc686d 8044 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
8045 off=lp-symsTable[i].pointer;
8046 if (ret < 0 || off < minoff) {
8047 minoff=off;
8048 ret=i;
8049 }
8050 }
0bc03378 8051 }
bcfc686d 8052 if (ret == -1) return NULL;
8053 *offset = minoff;
8054 return symsTable[ret].name;
0bc03378 8055}
bcfc686d 8056#else /* HAVE_BACKTRACE */
/* Backtrace support is not available in this build: no signal handler is
 * installed. */
static void setupSigSegvAction(void) {
    return;
}
bcfc686d 8059#endif /* HAVE_BACKTRACE */
0bc03378 8060
ed9b544e 8061
ed9b544e 8062
bcfc686d 8063/* The End */
8064
8065
ed9b544e 8066