]> git.saurik.com Git - redis.git/blame - redis.c
TODO updated with syslog plans for 2.2
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
9005896c 30#define REDIS_VERSION "2.1.1"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
40#include <signal.h>
fbf9bcdb 41
42#ifdef HAVE_BACKTRACE
c9468bcf 43#include <execinfo.h>
44#include <ucontext.h>
fbf9bcdb 45#endif /* HAVE_BACKTRACE */
46
ed9b544e 47#include <sys/wait.h>
48#include <errno.h>
49#include <assert.h>
50#include <ctype.h>
51#include <stdarg.h>
52#include <inttypes.h>
53#include <arpa/inet.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#include <sys/time.h>
57#include <sys/resource.h>
2895e862 58#include <sys/uio.h>
f78fd11b 59#include <limits.h>
fb82e75c 60#include <float.h>
a7866db6 61#include <math.h>
92f8e882 62#include <pthread.h>
0bc1b2f6 63
64#if defined(__sun)
5043dff3 65#include "solarisfixes.h"
66#endif
ed9b544e 67
c9468bcf 68#include "redis.h"
ed9b544e 69#include "ae.h" /* Event driven programming library */
70#include "sds.h" /* Dynamic safe strings */
71#include "anet.h" /* Networking the easy way */
72#include "dict.h" /* Hash tables */
73#include "adlist.h" /* Linked lists */
74#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 75#include "lzf.h" /* LZF compression library */
76#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
ba798261 77#include "zipmap.h" /* Compact dictionary-alike data structure */
78#include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
5436146c 79#include "release.h" /* Release and/or git repository information */
ed9b544e 80
81/* Error codes */
82#define REDIS_OK 0
83#define REDIS_ERR -1
84
85/* Static server configuration */
86#define REDIS_SERVERPORT 6379 /* TCP port */
87#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 88#define REDIS_IOBUF_LEN 1024
ed9b544e 89#define REDIS_LOADBUF_LEN 1024
248ea310 90#define REDIS_STATIC_ARGS 8
ed9b544e 91#define REDIS_DEFAULT_DBNUM 16
92#define REDIS_CONFIGLINE_MAX 1024
93#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
94#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
8ca3e9d1 95#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
6f376729 96#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 97#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
98
/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
100#define REDIS_WRITEV_THRESHOLD 3
101/* Max number of iovecs used for each writev call */
102#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 103
104/* Hash table parameters */
105#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 106
107/* Command flags */
3fd78bcd 108#define REDIS_CMD_BULK 1 /* Bulk write command */
109#define REDIS_CMD_INLINE 2 /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
   this flag will return an error when the 'maxmemory' option is set in the
   config file and the server is using more than maxmemory bytes of memory.
   In short these commands are denied on low memory conditions. */
114#define REDIS_CMD_DENYOOM 4
4005fef1 115#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 116
117/* Object types */
118#define REDIS_STRING 0
119#define REDIS_LIST 1
120#define REDIS_SET 2
1812e024 121#define REDIS_ZSET 3
122#define REDIS_HASH 4
f78fd11b 123
/* Objects encoding. Some kinds of objects like Strings and Hashes can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of these values for this object. */
942a3961 127#define REDIS_ENCODING_RAW 0 /* Raw representation */
128#define REDIS_ENCODING_INT 1 /* Encoded as integer */
5234952b 129#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
130#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
942a3961 131
/* Human readable names of the object encodings, listed in the same order
 * as the REDIS_ENCODING_* values (0 = raw ... 3 = hashtable). */
static char *strencoding[] = {
    "raw",
    "int",
    "zipmap",
    "hashtable"
};
135
f78fd11b 136/* Object types only used for dumping to disk */
bb32ede5 137#define REDIS_EXPIRETIME 253
ed9b544e 138#define REDIS_SELECTDB 254
139#define REDIS_EOF 255
140
/* Defines related to the dump file format. To store 32 bits lengths for short
 * keys requires a lot of space, so we check the most significant 2 bits of
 * the first byte to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: specially encoded object will follow. The six bits
 *           number specifies the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte, most DB keys, and many
 * values, will fit inside. */
f78fd11b 154#define REDIS_RDB_6BITLEN 0
155#define REDIS_RDB_14BITLEN 1
156#define REDIS_RDB_32BITLEN 2
17be1a4a 157#define REDIS_RDB_ENCVAL 3
f78fd11b 158#define REDIS_RDB_LENERR UINT_MAX
159
/* When a length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * according to the following defines: */
163#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
164#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
165#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 166#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
a4d1ba9a 167
75680a3c 168/* Virtual memory object->where field. */
169#define REDIS_VM_MEMORY 0 /* The object is on memory */
170#define REDIS_VM_SWAPPED 1 /* The object is on disk */
171#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
172#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
173
/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
176#define REDIS_VM_MAX_NEAR_PAGES 65536
177#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 178#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 179#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
c953f24b 184#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 185
ed9b544e 186/* Client flags */
d5d55fc3 187#define REDIS_SLAVE 1 /* This client is a slave server */
188#define REDIS_MASTER 2 /* This client is a master server */
189#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
190#define REDIS_MULTI 8 /* This client is in a MULTI context */
191#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
192#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
37ab76c9 193#define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
ed9b544e 194
40d224a9 195/* Slave replication state - slave side */
ed9b544e 196#define REDIS_REPL_NONE 0 /* No active replication */
197#define REDIS_REPL_CONNECT 1 /* Must connect to master */
198#define REDIS_REPL_CONNECTED 2 /* Connected to master */
199
40d224a9 200/* Slave replication state - from the point of view of master
201 * Note that in SEND_BULK and ONLINE state the slave receives new updates
202 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
203 * to start the next background saving in order to send updates to it. */
204#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
205#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
206#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
207#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
208
ed9b544e 209/* List related stuff */
210#define REDIS_HEAD 0
211#define REDIS_TAIL 1
212
213/* Sort operations */
214#define REDIS_SORT_GET 0
443c6409 215#define REDIS_SORT_ASC 1
216#define REDIS_SORT_DESC 2
ed9b544e 217#define REDIS_SORTKEY_MAX 1024
218
219/* Log levels */
220#define REDIS_DEBUG 0
f870935d 221#define REDIS_VERBOSE 1
222#define REDIS_NOTICE 2
223#define REDIS_WARNING 3
ed9b544e 224
/* Anti-warning macro: evaluates V and discards the result, so an otherwise
 * unused variable or parameter does not trigger a compiler warning.
 * The argument is parenthesized so that a full expression (e.g. a+b) is
 * cast to void as a whole instead of only its first operand. */
#define REDIS_NOTUSED(V) ((void) (V))
227
6b47e12e 228#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
229#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 230
48f0308a 231/* Append only defines */
232#define APPENDFSYNC_NO 0
233#define APPENDFSYNC_ALWAYS 1
234#define APPENDFSYNC_EVERYSEC 2
235
cbba7dd7 236/* Hashes related defaults */
237#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
238#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
239
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): when the expression _e is false, calls _redisAssert()
 * with the expression source text, file and line, then terminates the
 * process with _exit(1). When _e is true it evaluates to (void)0.
 *
 * redisPanic(_e): unconditionally calls _redisPanic() with the *stringified*
 * argument (the literal source text of _e), file and line, then _exit(1).
 * The whole expansion is parenthesized so the comma expression stays a
 * single operand wherever the macro is used. */
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
dfc5e96c 245
ed9b544e 246/*================================= Data types ============================== */
247
248/* A redis object, that is a type able to hold a string / list / set */
75680a3c 249
/* The VM object structure: where an object lives in the swap file and when
 * it was last accessed.
 *
 * NOTE(review): the trailing 'vm' declarator also defines a file-scope
 * variable of this type. It looks unintended, but it is kept because code
 * outside this view may reference it -- confirm before removing. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
} vm;
256
257/* The actual Redis Object */
ed9b544e 258typedef struct redisObject {
ed9b544e 259 void *ptr;
942a3961 260 unsigned char type;
261 unsigned char encoding;
d894161b 262 unsigned char storage; /* If this object is a key, where is the value?
263 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
264 unsigned char vtype; /* If this object is a key, and value is swapped out,
265 * this is the type of the swapped out object. */
ed9b544e 266 int refcount;
75680a3c 267 /* VM fields, this are only allocated if VM is active, otherwise the
268 * object allocation function will just allocate
269 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
270 * Redis without VM active will not have any overhead. */
271 struct redisObjectVM vm;
ed9b544e 272} robj;
273
/* Macro used to initialize a Redis string object allocated on the stack.
 *
 * _var is the robj lvalue to set up, _ptr the value stored in its 'ptr'
 * field. The object gets refcount 1, type REDIS_STRING and encoding
 * REDIS_ENCODING_RAW; 'storage' is set to REDIS_VM_MEMORY only when
 * server.vm_enabled is true.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * The expansion deliberately does NOT end with a semicolon: the caller
 * supplies it, so the macro behaves as a single statement and can be used
 * as the body of an if/else. Both arguments are parenthesized. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
285
3305306f 286typedef struct redisDb {
4409877e 287 dict *dict; /* The keyspace for this DB */
288 dict *expires; /* Timeout of keys with a timeout set */
37ab76c9 289 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 290 dict *io_keys; /* Keys with clients waiting for VM I/O */
37ab76c9 291 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
3305306f 292 int id;
293} redisDb;
294
6e469882 295/* Client MULTI/EXEC state */
296typedef struct multiCmd {
297 robj **argv;
298 int argc;
299 struct redisCommand *cmd;
300} multiCmd;
301
302typedef struct multiState {
303 multiCmd *commands; /* Array of MULTI commands */
304 int count; /* Total number of MULTI commands */
305} multiState;
306
ed9b544e 307/* With multiplexing we need to take per-clinet state.
308 * Clients are taken in a liked list. */
309typedef struct redisClient {
310 int fd;
3305306f 311 redisDb *db;
ed9b544e 312 int dictid;
313 sds querybuf;
e8a74421 314 robj **argv, **mbargv;
315 int argc, mbargc;
40d224a9 316 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 317 int multibulk; /* multi bulk command format active */
ed9b544e 318 list *reply;
319 int sentlen;
320 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 321 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 322 int slaveseldb; /* slave selected db, if this client is a slave */
323 int authenticated; /* when requirepass is non-NULL */
324 int replstate; /* replication state if this is a slave */
325 int repldbfd; /* replication DB file descriptor */
6e469882 326 long repldboff; /* replication DB file offset */
40d224a9 327 off_t repldbsize; /* replication DB file size */
6e469882 328 multiState mstate; /* MULTI/EXEC state */
37ab76c9 329 robj **blocking_keys; /* The key we are waiting to terminate a blocking
4409877e 330 * operation such as BLPOP. Otherwise NULL. */
37ab76c9 331 int blocking_keys_num; /* Number of blocking keys */
4409877e 332 time_t blockingto; /* Blocking operation timeout. If UNIX current time
333 * is >= blockingto then the operation timed out. */
92f8e882 334 list *io_keys; /* Keys this client is waiting to be loaded from the
335 * swap file in order to continue. */
37ab76c9 336 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
ffc6b7f8 337 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
338 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 339} redisClient;
340
/* A save point, expressed as a (seconds, changes) pair. */
struct saveparam {
    time_t seconds;     /* Time component of the save point */
    int changes;        /* Change-count component of the save point */
};
345
346/* Global server state structure */
347struct redisServer {
348 int port;
349 int fd;
3305306f 350 redisDb *db;
ed9b544e 351 long long dirty; /* changes to DB from the last save */
352 list *clients;
87eca727 353 list *slaves, *monitors;
ed9b544e 354 char neterr[ANET_ERR_LEN];
355 aeEventLoop *el;
356 int cronloops; /* number of times the cron function run */
357 list *objfreelist; /* A list of freed objects to avoid malloc() */
358 time_t lastsave; /* Unix time of last save succeeede */
ed9b544e 359 /* Fields used only for stats */
360 time_t stat_starttime; /* server start time */
361 long long stat_numcommands; /* number of processed commands */
362 long long stat_numconnections; /* number of connections received */
2a6a2ed1 363 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 364 /* Configuration */
365 int verbosity;
366 int glueoutputbuf;
367 int maxidletime;
368 int dbnum;
369 int daemonize;
44b38ef4 370 int appendonly;
48f0308a 371 int appendfsync;
38db9171 372 int no_appendfsync_on_rewrite;
fab43727 373 int shutdown_asap;
48f0308a 374 time_t lastfsync;
44b38ef4 375 int appendfd;
376 int appendseldb;
ed329fcf 377 char *pidfile;
9f3c422c 378 pid_t bgsavechildpid;
9d65a1bb 379 pid_t bgrewritechildpid;
380 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
28ed1f33 381 sds aofbuf; /* AOF buffer, written before entering the event loop */
ed9b544e 382 struct saveparam *saveparams;
383 int saveparamslen;
384 char *logfile;
385 char *bindaddr;
386 char *dbfilename;
44b38ef4 387 char *appendfilename;
abcb223e 388 char *requirepass;
121f70cf 389 int rdbcompression;
8ca3e9d1 390 int activerehashing;
ed9b544e 391 /* Replication related */
392 int isslave;
d0ccebcf 393 char *masterauth;
ed9b544e 394 char *masterhost;
395 int masterport;
40d224a9 396 redisClient *master; /* client that is master for this slave */
ed9b544e 397 int replstate;
285add55 398 unsigned int maxclients;
4ef8de8a 399 unsigned long long maxmemory;
d5d55fc3 400 unsigned int blpop_blocked_clients;
401 unsigned int vm_blocked_clients;
ed9b544e 402 /* Sort parameters - qsort_r() is only available under BSD so we
403 * have to take this state global, in order to pass it to sortCompare() */
404 int sort_desc;
405 int sort_alpha;
406 int sort_bypattern;
75680a3c 407 /* Virtual memory configuration */
408 int vm_enabled;
054e426d 409 char *vm_swap_file;
75680a3c 410 off_t vm_page_size;
411 off_t vm_pages;
4ef8de8a 412 unsigned long long vm_max_memory;
cbba7dd7 413 /* Hashes config */
414 size_t hash_max_zipmap_entries;
415 size_t hash_max_zipmap_value;
75680a3c 416 /* Virtual memory state */
417 FILE *vm_fp;
418 int vm_fd;
419 off_t vm_next_page; /* Next probably empty page */
420 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 421 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 422 time_t unixtime; /* Unix time sampled every second. */
92f8e882 423 /* Virtual memory I/O threads stuff */
92f8e882 424 /* An I/O thread process an element taken from the io_jobs queue and
996cb5f7 425 * put the result of the operation in the io_done list. While the
426 * job is being processed, it's put on io_processing queue. */
427 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
428 list *io_processing; /* List of VM I/O jobs being processed */
429 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 430 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 431 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 432 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
433 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 434 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 435 int io_active_threads; /* Number of running I/O threads */
436 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 437 /* Our main thread is blocked on the event loop, locking for sockets ready
438 * to be read or written, so when a threaded I/O operation is ready to be
439 * processed by the main thread, the I/O thread will use a unix pipe to
440 * awake the main thread. The followings are the two pipe FDs. */
441 int io_ready_pipe_read;
442 int io_ready_pipe_write;
7d98e08c 443 /* Virtual memory stats */
444 unsigned long long vm_stats_used_pages;
445 unsigned long long vm_stats_swapped_objects;
446 unsigned long long vm_stats_swapouts;
447 unsigned long long vm_stats_swapins;
befec3cd 448 /* Pubsub */
ffc6b7f8 449 dict *pubsub_channels; /* Map channels to list of subscribed clients */
450 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 451 /* Misc */
b9bc0eef 452 FILE *devnull;
ed9b544e 453};
454
ffc6b7f8 455typedef struct pubsubPattern {
456 redisClient *client;
457 robj *pattern;
458} pubsubPattern;
459
ed9b544e 460typedef void redisCommandProc(redisClient *c);
ca1788b5 461typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
ed9b544e 462struct redisCommand {
463 char *name;
464 redisCommandProc *proc;
465 int arity;
466 int flags;
76583ea4
PN
467 /* Use a function to determine which keys need to be loaded
468 * in the background prior to executing this command. Takes precedence
469 * over vm_firstkey and others, ignored when NULL */
ca1788b5 470 redisVmPreloadProc *vm_preload_proc;
7c775e09 471 /* What keys should be loaded in background when calling this command? */
472 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
473 int vm_lastkey; /* THe last argument that's a key */
474 int vm_keystep; /* The step between first and last key */
ed9b544e 475};
476
/* Associates a function name with a value stored as an unsigned long. */
struct redisFunctionSym {
    char *name;             /* Function name */
    unsigned long pointer;  /* Value paired with the name */
};
481
ed9b544e 482typedef struct _redisSortObject {
483 robj *obj;
484 union {
485 double score;
486 robj *cmpobj;
487 } u;
488} redisSortObject;
489
490typedef struct _redisSortOperation {
491 int type;
492 robj *pattern;
493} redisSortOperation;
494
6b47e12e 495/* ZSETs use a specialized version of Skiplists */
496
497typedef struct zskiplistNode {
498 struct zskiplistNode **forward;
e3870fab 499 struct zskiplistNode *backward;
912b9165 500 unsigned int *span;
6b47e12e 501 double score;
502 robj *obj;
503} zskiplistNode;
504
/* The skiplist itself: its two end nodes, its length and its level. */
typedef struct zskiplist {
    struct zskiplistNode *header;
    struct zskiplistNode *tail;
    unsigned long length;
    int level;
} zskiplist;
510
1812e024 511typedef struct zset {
512 dict *dict;
6b47e12e 513 zskiplist *zsl;
1812e024 514} zset;
515
6b47e12e 516/* Our shared "common" objects */
517
05df7621 518#define REDIS_SHARED_INTEGERS 10000
ed9b544e 519struct sharedObjectsStruct {
c937aa89 520 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 521 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 522 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
523 *outofrangeerr, *plus,
ed9b544e 524 *select0, *select1, *select2, *select3, *select4,
befec3cd 525 *select5, *select6, *select7, *select8, *select9,
c8d0ea0e 526 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
527 *mbulk4, *psubscribebulk, *punsubscribebulk,
528 *integers[REDIS_SHARED_INTEGERS];
ed9b544e 529} shared;
530
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
536
92f8e882 537/* VM threaded I/O request message */
b9bc0eef 538#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
539#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
540#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 541typedef struct iojob {
996cb5f7 542 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 543 redisDb *db;/* Redis database */
92f8e882 544 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 545 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
92f8e882 546 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
547 off_t page; /* Swap page where to read/write the object */
248ea310 548 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 549 int canceled; /* True if this command was canceled by blocking side of VM */
550 pthread_t thread; /* ID of the thread processing this entry */
551} iojob;
92f8e882 552
ed9b544e 553/*================================ Prototypes =============================== */
554
555static void freeStringObject(robj *o);
556static void freeListObject(robj *o);
557static void freeSetObject(robj *o);
558static void decrRefCount(void *o);
559static robj *createObject(int type, void *ptr);
560static void freeClient(redisClient *c);
f78fd11b 561static int rdbLoad(char *filename);
ed9b544e 562static void addReply(redisClient *c, robj *obj);
563static void addReplySds(redisClient *c, sds s);
564static void incrRefCount(robj *o);
f78fd11b 565static int rdbSaveBackground(char *filename);
ed9b544e 566static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 567static robj *dupStringObject(robj *o);
248ea310 568static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
dd142b9c 569static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
28ed1f33 570static void flushAppendOnlyFile(void);
44b38ef4 571static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 572static int syncWithMaster(void);
05df7621 573static robj *tryObjectEncoding(robj *o);
9d65a1bb 574static robj *getDecodedObject(robj *o);
3305306f 575static int removeExpire(redisDb *db, robj *key);
576static int expireIfNeeded(redisDb *db, robj *key);
577static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 578static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 579static int deleteKey(redisDb *db, robj *key);
bb32ede5 580static time_t getExpire(redisDb *db, robj *key);
581static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 582static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 583static void freeMemoryIfNeeded(void);
de96dbfe 584static int processCommand(redisClient *c);
56906eef 585static void setupSigSegvAction(void);
a3b21203 586static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 587static void aofRemoveTempFile(pid_t childpid);
0ea663ea 588static size_t stringObjectLen(robj *o);
638e42ac 589static void processInputBuffer(redisClient *c);
6b47e12e 590static zskiplist *zslCreate(void);
fd8ccf44 591static void zslFree(zskiplist *zsl);
2b59cfdf 592static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 593static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 594static void initClientMultiState(redisClient *c);
595static void freeClientMultiState(redisClient *c);
596static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 597static void unblockClientWaitingData(redisClient *c);
4409877e 598static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 599static void vmInit(void);
a35ddf12 600static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 601static robj *vmLoadObject(robj *key);
7e69548d 602static robj *vmPreviewObject(robj *key);
a69a0c9c 603static int vmSwapOneObjectBlocking(void);
604static int vmSwapOneObjectThreaded(void);
7e69548d 605static int vmCanSwapOut(void);
a5819310 606static int tryFreeOneObjectFromFreelist(void);
996cb5f7 607static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
608static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
609static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 610static void lockThreadedIO(void);
611static void unlockThreadedIO(void);
612static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
613static void freeIOJob(iojob *j);
614static void queueIOJob(iojob *j);
a5819310 615static int vmWriteObjectOnSwap(robj *o, off_t page);
616static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 617static void waitEmptyIOJobsQueue(void);
618static void vmReopenSwapFile(void);
970e10bb 619static int vmFreePage(off_t page);
ca1788b5 620static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
3805e04f 621static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
0a6f3f0f 622static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
d5d55fc3 623static int dontWaitForSwappedKey(redisClient *c, robj *key);
624static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
625static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
626static struct redisCommand *lookupCommand(char *name);
627static void call(redisClient *c, struct redisCommand *cmd);
628static void resetClient(redisClient *c);
ada386b2 629static void convertToRealHash(robj *o);
ffc6b7f8 630static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
631static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
632static void freePubsubPattern(void *p);
633static int listMatchPubsubPattern(void *a, void *b);
634static int compareStringObjects(robj *a, robj *b);
bf028098 635static int equalStringObjects(robj *a, robj *b);
/* Takes no arguments: declared with (void) so the compiler has a real
 * prototype to check calls against (an empty () list declares none). */
static void usage(void);
8f63ddca 637static int rewriteAppendOnlyFileBackground(void);
242a64f3 638static int vmSwapObjectBlocking(robj *key, robj *val);
/* Takes no arguments and returns an int status: declared with (void) so
 * the compiler has a real prototype to check calls against (an empty ()
 * list declares none). */
static int prepareForShutdown(void);
37ab76c9 640static void touchWatchedKey(redisDb *db, robj *key);
9b30e1a2 641static void touchWatchedKeysOnFlush(int dbid);
37ab76c9 642static void unwatchAllKeys(redisClient *c);
ed9b544e 643
abcb223e 644static void authCommand(redisClient *c);
ed9b544e 645static void pingCommand(redisClient *c);
646static void echoCommand(redisClient *c);
647static void setCommand(redisClient *c);
648static void setnxCommand(redisClient *c);
526d00a5 649static void setexCommand(redisClient *c);
ed9b544e 650static void getCommand(redisClient *c);
651static void delCommand(redisClient *c);
652static void existsCommand(redisClient *c);
653static void incrCommand(redisClient *c);
654static void decrCommand(redisClient *c);
655static void incrbyCommand(redisClient *c);
656static void decrbyCommand(redisClient *c);
657static void selectCommand(redisClient *c);
658static void randomkeyCommand(redisClient *c);
659static void keysCommand(redisClient *c);
660static void dbsizeCommand(redisClient *c);
661static void lastsaveCommand(redisClient *c);
662static void saveCommand(redisClient *c);
663static void bgsaveCommand(redisClient *c);
9d65a1bb 664static void bgrewriteaofCommand(redisClient *c);
ed9b544e 665static void shutdownCommand(redisClient *c);
666static void moveCommand(redisClient *c);
667static void renameCommand(redisClient *c);
668static void renamenxCommand(redisClient *c);
669static void lpushCommand(redisClient *c);
670static void rpushCommand(redisClient *c);
671static void lpopCommand(redisClient *c);
672static void rpopCommand(redisClient *c);
673static void llenCommand(redisClient *c);
674static void lindexCommand(redisClient *c);
675static void lrangeCommand(redisClient *c);
676static void ltrimCommand(redisClient *c);
677static void typeCommand(redisClient *c);
678static void lsetCommand(redisClient *c);
679static void saddCommand(redisClient *c);
680static void sremCommand(redisClient *c);
a4460ef4 681static void smoveCommand(redisClient *c);
ed9b544e 682static void sismemberCommand(redisClient *c);
683static void scardCommand(redisClient *c);
12fea928 684static void spopCommand(redisClient *c);
2abb95a9 685static void srandmemberCommand(redisClient *c);
ed9b544e 686static void sinterCommand(redisClient *c);
687static void sinterstoreCommand(redisClient *c);
40d224a9 688static void sunionCommand(redisClient *c);
689static void sunionstoreCommand(redisClient *c);
f4f56e1d 690static void sdiffCommand(redisClient *c);
691static void sdiffstoreCommand(redisClient *c);
ed9b544e 692static void syncCommand(redisClient *c);
693static void flushdbCommand(redisClient *c);
694static void flushallCommand(redisClient *c);
695static void sortCommand(redisClient *c);
696static void lremCommand(redisClient *c);
0f5f7e9a 697static void rpoplpushcommand(redisClient *c);
ed9b544e 698static void infoCommand(redisClient *c);
70003d28 699static void mgetCommand(redisClient *c);
87eca727 700static void monitorCommand(redisClient *c);
3305306f 701static void expireCommand(redisClient *c);
802e8373 702static void expireatCommand(redisClient *c);
f6b141c5 703static void getsetCommand(redisClient *c);
fd88489a 704static void ttlCommand(redisClient *c);
321b0e13 705static void slaveofCommand(redisClient *c);
7f957c92 706static void debugCommand(redisClient *c);
f6b141c5 707static void msetCommand(redisClient *c);
708static void msetnxCommand(redisClient *c);
fd8ccf44 709static void zaddCommand(redisClient *c);
7db723ad 710static void zincrbyCommand(redisClient *c);
cc812361 711static void zrangeCommand(redisClient *c);
50c55df5 712static void zrangebyscoreCommand(redisClient *c);
f44dd428 713static void zcountCommand(redisClient *c);
e3870fab 714static void zrevrangeCommand(redisClient *c);
3c41331e 715static void zcardCommand(redisClient *c);
1b7106e7 716static void zremCommand(redisClient *c);
6e333bbe 717static void zscoreCommand(redisClient *c);
1807985b 718static void zremrangebyscoreCommand(redisClient *c);
6e469882 719static void multiCommand(redisClient *c);
720static void execCommand(redisClient *c);
18b6cb76 721static void discardCommand(redisClient *c);
4409877e 722static void blpopCommand(redisClient *c);
723static void brpopCommand(redisClient *c);
4b00bebd 724static void appendCommand(redisClient *c);
39191553 725static void substrCommand(redisClient *c);
69d95c3e 726static void zrankCommand(redisClient *c);
798d9e55 727static void zrevrankCommand(redisClient *c);
978c2c94 728static void hsetCommand(redisClient *c);
1f1c7695 729static void hsetnxCommand(redisClient *c);
978c2c94 730static void hgetCommand(redisClient *c);
09aeb579
PN
731static void hmsetCommand(redisClient *c);
732static void hmgetCommand(redisClient *c);
07efaf74 733static void hdelCommand(redisClient *c);
92b27fe9 734static void hlenCommand(redisClient *c);
9212eafd 735static void zremrangebyrankCommand(redisClient *c);
5d373da9 736static void zunionstoreCommand(redisClient *c);
737static void zinterstoreCommand(redisClient *c);
78409a0f 738static void hkeysCommand(redisClient *c);
739static void hvalsCommand(redisClient *c);
740static void hgetallCommand(redisClient *c);
a86f14b1 741static void hexistsCommand(redisClient *c);
500ece7c 742static void configCommand(redisClient *c);
01426b05 743static void hincrbyCommand(redisClient *c);
befec3cd 744static void subscribeCommand(redisClient *c);
745static void unsubscribeCommand(redisClient *c);
ffc6b7f8 746static void psubscribeCommand(redisClient *c);
747static void punsubscribeCommand(redisClient *c);
befec3cd 748static void publishCommand(redisClient *c);
37ab76c9 749static void watchCommand(redisClient *c);
750static void unwatchCommand(redisClient *c);
f6b141c5 751
ed9b544e 752/*================================= Globals ================================= */
753
754/* Global vars */
755static struct redisServer server; /* server global state */
1a132bbc 756static struct redisCommand *commandTable;
1a132bbc 757static struct redisCommand readonlyCommandTable[] = {
76583ea4
PN
758 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
760 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
526d00a5 761 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
762 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
763 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
765 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
766 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
767 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
768 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
769 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
770 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
771 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
772 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
778 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
781 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
782 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
783 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
784 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
785 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
786 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
790 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
791 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
792 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
793 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
794 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
795 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
796 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
798 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
799 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
800 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
5d373da9 801 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
802 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
803 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
806 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
807 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
808 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
809 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
810 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
811 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
1f1c7695 812 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 813 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
d33278d1 814 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 815 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
01426b05 816 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
817 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
818 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
819 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
820 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
821 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 822 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
823 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
824 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
825 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
826 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
827 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
828 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
831 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
832 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
833 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
837 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
838 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
840 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
842 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
843 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
844 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
845 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
846 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
3805e04f 847 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
848 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
849 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
850 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
851 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
852 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
853 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
854 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
855 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
856 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
857 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 858 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 859 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
860 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 861 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
862 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 863 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
37ab76c9 864 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
d55d5c5d 865 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
ed9b544e 866};
bcfc686d 867
ed9b544e 868/*============================ Utility functions ============================ */
869
/* Glob-style pattern matching on length-delimited buffers.
 *
 * Supported syntax: '*' (any run of characters, possibly empty), '?' (exactly
 * one character), '[...]' (character class with 'a-z' ranges, leading '^'
 * for negation and '\' to escape a member) and '\x' (literal x).
 *
 * pattern/patternLen: the pattern and its length in bytes.
 * string/stringLen:   the subject and its length in bytes.
 * nocase:             when non-zero the comparison ignores case via tolower().
 *
 * Returns 1 on match, 0 otherwise.
 *
 * Every read of pattern[] and string[] is bounded by the corresponding
 * length, so neither buffer has to be nul terminated, and a construct that
 * must consume one character ('[...]' or a literal) never matches an empty
 * subject. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of stars: "***" behaves exactly like "*". */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing star matches whatever is left */
            /* Try to match the rest of the pattern at every offset. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one character of the subject. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen >= 2 && pattern[0] == '\\') {
                    /* Escaped member: compare the next byte literally. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * advance after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs a character to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Subject exhausted: only trailing stars may remain. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
992
/* Convenience wrapper around stringmatchlen() for nul terminated strings:
 * both lengths are taken with strlen(). Returns 1 on match, 0 otherwise. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
996
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-', a run of decimal digits and an optional
 * unit, matched case insensitively:
 *
 *   (none), b -> 1
 *   k         -> 1000          kb -> 1024
 *   m         -> 1000*1000     mb -> 1024*1024
 *   g         -> 1000^3        gb -> 1024^3
 *
 * On parsing error, if err is not NULL, *err is set to 1, otherwise it's
 * set to 0. With an unknown unit the bare number is returned (multiplier 1).
 * If the digits do not fit the internal buffer LLONG_MAX is returned, and if
 * the multiplication would overflow the result saturates to LLONG_MAX or
 * LLONG_MIN; both cases are reported as errors. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* <ctype.h> functions need a value representable as unsigned char. */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    /* mul is always positive: check the range before multiplying, because
     * signed overflow is undefined behavior. */
    if (val > LLONG_MAX/mul || val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return (val < 0) ? LLONG_MIN : LLONG_MAX;
    }
    return val*mul;
}
1043
/* Convert a long long into a string.
 *
 * s:     destination buffer.
 * len:   size of the destination buffer in bytes, nul terminator included.
 * value: the number to convert.
 *
 * Returns the number of characters stored in s (terminator excluded). When
 * the buffer is too small the output is truncated to the leading len-1
 * characters, so the return value can be shorter than the full
 * representation. With len == 0 nothing is written and 0 is returned. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Negate in unsigned arithmetic: -value overflows for LLONG_MIN. */
    v = (value < 0) ? -(unsigned long long)value : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1067
56906eef 1068static void redisLog(int level, const char *fmt, ...) {
ed9b544e 1069 va_list ap;
1070 FILE *fp;
1071
1072 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1073 if (!fp) return;
1074
1075 va_start(ap, fmt);
1076 if (level >= server.verbosity) {
6766f45e 1077 char *c = ".-*#";
1904ecc1 1078 char buf[64];
1079 time_t now;
1080
1081 now = time(NULL);
6c9385e0 1082 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 1083 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 1084 vfprintf(fp, fmt, ap);
1085 fprintf(fp,"\n");
1086 fflush(fp);
1087 }
1088 va_end(ap);
1089
1090 if (server.logfile) fclose(fp);
1091}
1092
1093/*====================== Hash table type implementation ==================== */
1094
1095/* This is a hash table type that uses the SDS dynamic strings library as
1096 * keys and redis objects as values (objects can hold SDS strings,
1097 * lists, sets). */
1098
/* Dict destructor callback that hands the pointer straight to zfree().
 * The private data argument is not used. */
static void dictVanillaFree(void *privdata, void *val)
{
    zfree(val);
    DICT_NOTUSED(privdata);
}
1104
4409877e 1105static void dictListDestructor(void *privdata, void *val)
1106{
1107 DICT_NOTUSED(privdata);
1108 listRelease((list*)val);
1109}
1110
ed9b544e 1111static int sdsDictKeyCompare(void *privdata, const void *key1,
1112 const void *key2)
1113{
1114 int l1,l2;
1115 DICT_NOTUSED(privdata);
1116
1117 l1 = sdslen((sds)key1);
1118 l2 = sdslen((sds)key2);
1119 if (l1 != l2) return 0;
1120 return memcmp(key1, key2, l1) == 0;
1121}
1122
/* Dict destructor callback for redis objects: drops one reference with
 * decrRefCount(). NULL is accepted and ignored, since the values of swapped
 * out keys are set to NULL. The private data argument is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    if (val != NULL)
        decrRefCount(val);
}
1130
942a3961 1131static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1132 const void *key2)
1133{
1134 const robj *o1 = key1, *o2 = key2;
1135 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1136}
1137
942a3961 1138static unsigned int dictObjHash(const void *key) {
ed9b544e 1139 const robj *o = key;
1140 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1141}
1142
942a3961 1143static int dictEncObjKeyCompare(void *privdata, const void *key1,
1144 const void *key2)
1145{
9d65a1bb 1146 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1147 int cmp;
942a3961 1148
2a1198b4 1149 if (o1->encoding == REDIS_ENCODING_INT &&
dc05abde 1150 o2->encoding == REDIS_ENCODING_INT)
1151 return o1->ptr == o2->ptr;
2a1198b4 1152
9d65a1bb 1153 o1 = getDecodedObject(o1);
1154 o2 = getDecodedObject(o2);
1155 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1156 decrRefCount(o1);
1157 decrRefCount(o2);
1158 return cmp;
942a3961 1159}
1160
1161static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1162 robj *o = (robj*) key;
942a3961 1163
ed9e4966 1164 if (o->encoding == REDIS_ENCODING_RAW) {
1165 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1166 } else {
1167 if (o->encoding == REDIS_ENCODING_INT) {
1168 char buf[32];
1169 int len;
1170
ee14da56 1171 len = ll2string(buf,32,(long)o->ptr);
ed9e4966 1172 return dictGenHashFunction((unsigned char*)buf, len);
1173 } else {
1174 unsigned int hash;
1175
1176 o = getDecodedObject(o);
1177 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1178 decrRefCount(o);
1179 return hash;
1180 }
1181 }
942a3961 1182}
1183
f2d9f50f 1184/* Sets type and expires */
ed9b544e 1185static dictType setDictType = {
942a3961 1186 dictEncObjHash, /* hash function */
ed9b544e 1187 NULL, /* key dup */
1188 NULL, /* val dup */
942a3961 1189 dictEncObjKeyCompare, /* key compare */
ed9b544e 1190 dictRedisObjectDestructor, /* key destructor */
1191 NULL /* val destructor */
1192};
1193
f2d9f50f 1194/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1812e024 1195static dictType zsetDictType = {
1196 dictEncObjHash, /* hash function */
1197 NULL, /* key dup */
1198 NULL, /* val dup */
1199 dictEncObjKeyCompare, /* key compare */
1200 dictRedisObjectDestructor, /* key destructor */
da0a1620 1201 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1202};
1203
f2d9f50f 1204/* Db->dict */
5234952b 1205static dictType dbDictType = {
942a3961 1206 dictObjHash, /* hash function */
ed9b544e 1207 NULL, /* key dup */
1208 NULL, /* val dup */
942a3961 1209 dictObjKeyCompare, /* key compare */
ed9b544e 1210 dictRedisObjectDestructor, /* key destructor */
1211 dictRedisObjectDestructor /* val destructor */
1212};
1213
f2d9f50f 1214/* Db->expires */
1215static dictType keyptrDictType = {
1216 dictObjHash, /* hash function */
1217 NULL, /* key dup */
1218 NULL, /* val dup */
1219 dictObjKeyCompare, /* key compare */
1220 dictRedisObjectDestructor, /* key destructor */
1221 NULL /* val destructor */
1222};
1223
5234952b 1224/* Hash type hash table (note that small hashes are represented with zimpaps) */
1225static dictType hashDictType = {
1226 dictEncObjHash, /* hash function */
1227 NULL, /* key dup */
1228 NULL, /* val dup */
1229 dictEncObjKeyCompare, /* key compare */
1230 dictRedisObjectDestructor, /* key destructor */
1231 dictRedisObjectDestructor /* val destructor */
1232};
1233
4409877e 1234/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1235 * lists as values. It's used for blocking operations (BLPOP) and to
1236 * map swapped keys to a list of clients waiting for this keys to be loaded. */
4409877e 1237static dictType keylistDictType = {
1238 dictObjHash, /* hash function */
1239 NULL, /* key dup */
1240 NULL, /* val dup */
1241 dictObjKeyCompare, /* key compare */
1242 dictRedisObjectDestructor, /* key destructor */
1243 dictListDestructor /* val destructor */
1244};
1245
42ab0172
AO
1246static void version();
1247
ed9b544e 1248/* ========================= Random utility functions ======================= */
1249
1250/* Redis generally does not try to recover from out of memory conditions
1251 * when allocating objects or strings, it is not clear if it will be possible
1252 * to report this condition to the client since the networking layer itself
1253 * is based on heap allocation for send buffers, so we simply abort.
1254 * At least the code will be simpler to read... */
1255static void oom(const char *msg) {
71c54b21 1256 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1257 sleep(1);
1258 abort();
1259}
1260
1261/* ====================== Redis server networking stuff ===================== */
56906eef 1262static void closeTimedoutClients(void) {
ed9b544e 1263 redisClient *c;
ed9b544e 1264 listNode *ln;
1265 time_t now = time(NULL);
c7df85a4 1266 listIter li;
ed9b544e 1267
c7df85a4 1268 listRewind(server.clients,&li);
1269 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1270 c = listNodeValue(ln);
f86a74e9 1271 if (server.maxidletime &&
1272 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1273 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1274 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1275 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1276 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1277 {
f870935d 1278 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1279 freeClient(c);
f86a74e9 1280 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1281 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1282 addReply(c,shared.nullmultibulk);
b0d8747d 1283 unblockClientWaitingData(c);
f86a74e9 1284 }
ed9b544e 1285 }
1286 }
ed9b544e 1287}
1288
12fea928 1289static int htNeedsResize(dict *dict) {
1290 long long size, used;
1291
1292 size = dictSlots(dict);
1293 used = dictSize(dict);
1294 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1295 (used*100/size < REDIS_HT_MINFILL));
1296}
1297
0bc03378 1298/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1299 * we resize the hash table to save memory */
56906eef 1300static void tryResizeHashTables(void) {
0bc03378 1301 int j;
1302
1303 for (j = 0; j < server.dbnum; j++) {
5413c40d 1304 if (htNeedsResize(server.db[j].dict))
0bc03378 1305 dictResize(server.db[j].dict);
12fea928 1306 if (htNeedsResize(server.db[j].expires))
1307 dictResize(server.db[j].expires);
0bc03378 1308 }
1309}
1310
8ca3e9d1 1311/* Our hash table implementation performs rehashing incrementally while
1312 * we write/read from the hash table. Still if the server is idle, the hash
1313 * table will use two tables for a long time. So we try to use 1 millisecond
1314 * of CPU time at every serverCron() loop in order to rehash some key. */
1315static void incrementallyRehash(void) {
1316 int j;
1317
1318 for (j = 0; j < server.dbnum; j++) {
1319 if (dictIsRehashing(server.db[j].dict)) {
1320 dictRehashMilliseconds(server.db[j].dict,1);
1321 break; /* already used our millisecond for this loop... */
1322 }
1323 }
1324}
1325
9d65a1bb 1326/* A background saving child (BGSAVE) terminated its work. Handle this. */
1327void backgroundSaveDoneHandler(int statloc) {
1328 int exitcode = WEXITSTATUS(statloc);
1329 int bysignal = WIFSIGNALED(statloc);
1330
1331 if (!bysignal && exitcode == 0) {
1332 redisLog(REDIS_NOTICE,
1333 "Background saving terminated with success");
1334 server.dirty = 0;
1335 server.lastsave = time(NULL);
1336 } else if (!bysignal && exitcode != 0) {
1337 redisLog(REDIS_WARNING, "Background saving error");
1338 } else {
1339 redisLog(REDIS_WARNING,
454eea7c 1340 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1341 rdbRemoveTempFile(server.bgsavechildpid);
1342 }
1343 server.bgsavechildpid = -1;
1344 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1345 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1346 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1347}
1348
1349/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1350 * Handle this. */
1351void backgroundRewriteDoneHandler(int statloc) {
1352 int exitcode = WEXITSTATUS(statloc);
1353 int bysignal = WIFSIGNALED(statloc);
1354
1355 if (!bysignal && exitcode == 0) {
1356 int fd;
1357 char tmpfile[256];
1358
1359 redisLog(REDIS_NOTICE,
1360 "Background append only file rewriting terminated with success");
1361 /* Now it's time to flush the differences accumulated by the parent */
1362 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1363 fd = open(tmpfile,O_WRONLY|O_APPEND);
1364 if (fd == -1) {
1365 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1366 goto cleanup;
1367 }
1368 /* Flush our data... */
1369 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1370 (signed) sdslen(server.bgrewritebuf)) {
1371 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1372 close(fd);
1373 goto cleanup;
1374 }
b32627cd 1375 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1376 /* Now our work is to rename the temp file into the stable file. And
1377 * switch the file descriptor used by the server for append only. */
1378 if (rename(tmpfile,server.appendfilename) == -1) {
1379 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1380 close(fd);
1381 goto cleanup;
1382 }
1383 /* Mission completed... almost */
1384 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1385 if (server.appendfd != -1) {
1386 /* If append only is actually enabled... */
1387 close(server.appendfd);
1388 server.appendfd = fd;
d5d23dab 1389 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
85a83172 1390 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1391 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1392 } else {
1393 /* If append only is disabled we just generate a dump in this
1394 * format. Why not? */
1395 close(fd);
1396 }
1397 } else if (!bysignal && exitcode != 0) {
1398 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1399 } else {
1400 redisLog(REDIS_WARNING,
454eea7c 1401 "Background append only file rewriting terminated by signal %d",
1402 WTERMSIG(statloc));
9d65a1bb 1403 }
1404cleanup:
1405 sdsfree(server.bgrewritebuf);
1406 server.bgrewritebuf = sdsempty();
1407 aofRemoveTempFile(server.bgrewritechildpid);
1408 server.bgrewritechildpid = -1;
1409}
1410
884d4b39 1411/* This function is called once a background process of some kind terminates,
1412 * as we want to avoid resizing the hash tables when there is a child in order
1413 * to play well with copy-on-write (otherwise when a resize happens lots of
1414 * memory pages are copied). The goal of this function is to update the ability
1415 * for dict.c to resize the hash tables accordingly to the fact we have o not
1416 * running childs. */
1417static void updateDictResizePolicy(void) {
1418 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1419 dictEnableResize();
1420 else
1421 dictDisableResize();
1422}
1423
56906eef 1424static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1425 int j, loops = server.cronloops++;
ed9b544e 1426 REDIS_NOTUSED(eventLoop);
1427 REDIS_NOTUSED(id);
1428 REDIS_NOTUSED(clientData);
1429
3a66edc7 1430 /* We take a cached value of the unix time in the global state because
1431 * with virtual memory and aging there is to store the current time
1432 * in objects at every object access, and accuracy is not needed.
1433 * To access a global var is faster than calling time(NULL) */
1434 server.unixtime = time(NULL);
1435
fab43727 1436 /* We received a SIGTERM, shutting down here in a safe way, as it is
1437 * not ok doing so inside the signal handler. */
1438 if (server.shutdown_asap) {
1439 if (prepareForShutdown() == REDIS_OK) exit(0);
1440 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1441 }
1442
0bc03378 1443 /* Show some info about non-empty databases */
ed9b544e 1444 for (j = 0; j < server.dbnum; j++) {
dec423d9 1445 long long size, used, vkeys;
94754ccc 1446
3305306f 1447 size = dictSlots(server.db[j].dict);
1448 used = dictSize(server.db[j].dict);
94754ccc 1449 vkeys = dictSize(server.db[j].expires);
1763929f 1450 if (!(loops % 50) && (used || vkeys)) {
f870935d 1451 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1452 /* dictPrintStats(server.dict); */
ed9b544e 1453 }
ed9b544e 1454 }
1455
0bc03378 1456 /* We don't want to resize the hash tables while a bacground saving
1457 * is in progress: the saving child is created using fork() that is
1458 * implemented with a copy-on-write semantic in most modern systems, so
1459 * if we resize the HT while there is the saving child at work actually
1460 * a lot of memory movements in the parent will cause a lot of pages
1461 * copied. */
8ca3e9d1 1462 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1463 if (!(loops % 10)) tryResizeHashTables();
1464 if (server.activerehashing) incrementallyRehash();
884d4b39 1465 }
0bc03378 1466
ed9b544e 1467 /* Show information about connected clients */
1763929f 1468 if (!(loops % 50)) {
bdcb92f2 1469 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1470 listLength(server.clients)-listLength(server.slaves),
1471 listLength(server.slaves),
bdcb92f2 1472 zmalloc_used_memory());
ed9b544e 1473 }
1474
1475 /* Close connections of timedout clients */
1763929f 1476 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1477 closeTimedoutClients();
1478
9d65a1bb 1479 /* Check if a background saving or AOF rewrite in progress terminated */
1480 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1481 int statloc;
9d65a1bb 1482 pid_t pid;
1483
1484 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1485 if (pid == server.bgsavechildpid) {
1486 backgroundSaveDoneHandler(statloc);
ed9b544e 1487 } else {
9d65a1bb 1488 backgroundRewriteDoneHandler(statloc);
ed9b544e 1489 }
884d4b39 1490 updateDictResizePolicy();
ed9b544e 1491 }
1492 } else {
1493 /* If there is not a background saving in progress check if
1494 * we have to save now */
1495 time_t now = time(NULL);
1496 for (j = 0; j < server.saveparamslen; j++) {
1497 struct saveparam *sp = server.saveparams+j;
1498
1499 if (server.dirty >= sp->changes &&
1500 now-server.lastsave > sp->seconds) {
1501 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1502 sp->changes, sp->seconds);
f78fd11b 1503 rdbSaveBackground(server.dbfilename);
ed9b544e 1504 break;
1505 }
1506 }
1507 }
94754ccc 1508
f2324293 1509 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1510 * will use few CPU cycles if there are few expiring keys, otherwise
1511 * it will get more aggressive to avoid that too much memory is used by
1512 * keys that can be removed from the keyspace. */
94754ccc 1513 for (j = 0; j < server.dbnum; j++) {
f2324293 1514 int expired;
94754ccc 1515 redisDb *db = server.db+j;
94754ccc 1516
f2324293 1517 /* Continue to expire if at the end of the cycle more than 25%
1518 * of the keys were expired. */
1519 do {
4ef8de8a 1520 long num = dictSize(db->expires);
94754ccc 1521 time_t now = time(NULL);
1522
f2324293 1523 expired = 0;
94754ccc 1524 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1525 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1526 while (num--) {
1527 dictEntry *de;
1528 time_t t;
1529
1530 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1531 t = (time_t) dictGetEntryVal(de);
1532 if (now > t) {
1533 deleteKey(db,dictGetEntryKey(de));
f2324293 1534 expired++;
2a6a2ed1 1535 server.stat_expiredkeys++;
94754ccc 1536 }
1537 }
f2324293 1538 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1539 }
1540
4ef8de8a 1541 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1542 * is enabled. Try to free objects from the free list first. */
7e69548d 1543 if (vmCanSwapOut()) {
1544 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1545 server.vm_max_memory)
1546 {
72e9fd40 1547 int retval;
1548
a5819310 1549 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1550 retval = (server.vm_max_threads == 0) ?
1551 vmSwapOneObjectBlocking() :
1552 vmSwapOneObjectThreaded();
1763929f 1553 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1554 zmalloc_used_memory() >
1555 (server.vm_max_memory+server.vm_max_memory/10))
1556 {
1557 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1558 }
72e9fd40 1559 /* Note that when using threaded I/O we free just one object,
1560 * because anyway when the I/O thread in charge to swap this
1561 * object out will finish, the handler of completed jobs
1562 * will try to swap more objects if we are still out of memory. */
1563 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1564 }
1565 }
1566
ed9b544e 1567 /* Check if we should connect to a MASTER */
1763929f 1568 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1569 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1570 if (syncWithMaster() == REDIS_OK) {
1571 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
8f63ddca 1572 if (server.appendonly) rewriteAppendOnlyFileBackground();
ed9b544e 1573 }
1574 }
1763929f 1575 return 100;
ed9b544e 1576}
1577
d5d55fc3 1578/* This function gets called every time Redis is entering the
1579 * main loop of the event driven library, that is, before to sleep
1580 * for ready file descriptors. */
1581static void beforeSleep(struct aeEventLoop *eventLoop) {
1582 REDIS_NOTUSED(eventLoop);
1583
28ed1f33 1584 /* Awake clients that got all the swapped keys they requested */
d5d55fc3 1585 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1586 listIter li;
1587 listNode *ln;
1588
1589 listRewind(server.io_ready_clients,&li);
1590 while((ln = listNext(&li))) {
1591 redisClient *c = ln->value;
1592 struct redisCommand *cmd;
1593
1594 /* Resume the client. */
1595 listDelNode(server.io_ready_clients,ln);
1596 c->flags &= (~REDIS_IO_WAIT);
1597 server.vm_blocked_clients--;
1598 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1599 readQueryFromClient, c);
1600 cmd = lookupCommand(c->argv[0]->ptr);
1601 assert(cmd != NULL);
1602 call(c,cmd);
1603 resetClient(c);
1604 /* There may be more data to process in the input buffer. */
1605 if (c->querybuf && sdslen(c->querybuf) > 0)
1606 processInputBuffer(c);
1607 }
1608 }
28ed1f33 1609 /* Write the AOF buffer on disk */
1610 flushAppendOnlyFile();
d5d55fc3 1611}
1612
ed9b544e 1613static void createSharedObjects(void) {
05df7621 1614 int j;
1615
ed9b544e 1616 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1617 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1618 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1619 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1620 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1621 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1622 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1623 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1624 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1625 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1626 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1627 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1628 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1629 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1630 "-ERR no such key\r\n"));
ed9b544e 1631 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1632 "-ERR syntax error\r\n"));
c937aa89 1633 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1634 "-ERR source and destination objects are the same\r\n"));
1635 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1636 "-ERR index out of range\r\n"));
ed9b544e 1637 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1638 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1639 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1640 shared.select0 = createStringObject("select 0\r\n",10);
1641 shared.select1 = createStringObject("select 1\r\n",10);
1642 shared.select2 = createStringObject("select 2\r\n",10);
1643 shared.select3 = createStringObject("select 3\r\n",10);
1644 shared.select4 = createStringObject("select 4\r\n",10);
1645 shared.select5 = createStringObject("select 5\r\n",10);
1646 shared.select6 = createStringObject("select 6\r\n",10);
1647 shared.select7 = createStringObject("select 7\r\n",10);
1648 shared.select8 = createStringObject("select 8\r\n",10);
1649 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1650 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
c8d0ea0e 1651 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
befec3cd 1652 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1653 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1654 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1655 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1656 shared.mbulk3 = createStringObject("*3\r\n",4);
c8d0ea0e 1657 shared.mbulk4 = createStringObject("*4\r\n",4);
05df7621 1658 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1659 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1660 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1661 }
ed9b544e 1662}
1663
1664static void appendServerSaveParams(time_t seconds, int changes) {
1665 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1666 server.saveparams[server.saveparamslen].seconds = seconds;
1667 server.saveparams[server.saveparamslen].changes = changes;
1668 server.saveparamslen++;
1669}
1670
bcfc686d 1671static void resetServerSaveParams() {
ed9b544e 1672 zfree(server.saveparams);
1673 server.saveparams = NULL;
1674 server.saveparamslen = 0;
1675}
1676
1677static void initServerConfig() {
1678 server.dbnum = REDIS_DEFAULT_DBNUM;
1679 server.port = REDIS_SERVERPORT;
f870935d 1680 server.verbosity = REDIS_VERBOSE;
ed9b544e 1681 server.maxidletime = REDIS_MAXIDLETIME;
1682 server.saveparams = NULL;
1683 server.logfile = NULL; /* NULL = log on standard output */
1684 server.bindaddr = NULL;
1685 server.glueoutputbuf = 1;
1686 server.daemonize = 0;
44b38ef4 1687 server.appendonly = 0;
1b677732 1688 server.appendfsync = APPENDFSYNC_EVERYSEC;
38db9171 1689 server.no_appendfsync_on_rewrite = 0;
48f0308a 1690 server.lastfsync = time(NULL);
44b38ef4 1691 server.appendfd = -1;
1692 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1693 server.pidfile = zstrdup("/var/run/redis.pid");
1694 server.dbfilename = zstrdup("dump.rdb");
1695 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1696 server.requirepass = NULL;
b0553789 1697 server.rdbcompression = 1;
8ca3e9d1 1698 server.activerehashing = 1;
285add55 1699 server.maxclients = 0;
d5d55fc3 1700 server.blpop_blocked_clients = 0;
3fd78bcd 1701 server.maxmemory = 0;
75680a3c 1702 server.vm_enabled = 0;
054e426d 1703 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1704 server.vm_page_size = 256; /* 256 bytes per page */
1705 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1706 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1707 server.vm_max_threads = 4;
d5d55fc3 1708 server.vm_blocked_clients = 0;
cbba7dd7 1709 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1710 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
fab43727 1711 server.shutdown_asap = 0;
75680a3c 1712
bcfc686d 1713 resetServerSaveParams();
ed9b544e 1714
1715 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1716 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1717 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1718 /* Replication related */
1719 server.isslave = 0;
d0ccebcf 1720 server.masterauth = NULL;
ed9b544e 1721 server.masterhost = NULL;
1722 server.masterport = 6379;
1723 server.master = NULL;
1724 server.replstate = REDIS_REPL_NONE;
a7866db6 1725
1726 /* Double constants initialization */
1727 R_Zero = 0.0;
1728 R_PosInf = 1.0/R_Zero;
1729 R_NegInf = -1.0/R_Zero;
1730 R_Nan = R_Zero/R_Zero;
ed9b544e 1731}
1732
1733static void initServer() {
1734 int j;
1735
1736 signal(SIGHUP, SIG_IGN);
1737 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1738 setupSigSegvAction();
ed9b544e 1739
b9bc0eef 1740 server.devnull = fopen("/dev/null","w");
1741 if (server.devnull == NULL) {
1742 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1743 exit(1);
1744 }
ed9b544e 1745 server.clients = listCreate();
1746 server.slaves = listCreate();
87eca727 1747 server.monitors = listCreate();
ed9b544e 1748 server.objfreelist = listCreate();
1749 createSharedObjects();
1750 server.el = aeCreateEventLoop();
3305306f 1751 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1752 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1753 if (server.fd == -1) {
1754 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1755 exit(1);
1756 }
3305306f 1757 for (j = 0; j < server.dbnum; j++) {
5234952b 1758 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1759 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
37ab76c9 1760 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1761 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1762 if (server.vm_enabled)
1763 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1764 server.db[j].id = j;
1765 }
ffc6b7f8 1766 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1767 server.pubsub_patterns = listCreate();
1768 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1769 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1770 server.cronloops = 0;
9f3c422c 1771 server.bgsavechildpid = -1;
9d65a1bb 1772 server.bgrewritechildpid = -1;
1773 server.bgrewritebuf = sdsempty();
28ed1f33 1774 server.aofbuf = sdsempty();
ed9b544e 1775 server.lastsave = time(NULL);
1776 server.dirty = 0;
ed9b544e 1777 server.stat_numcommands = 0;
1778 server.stat_numconnections = 0;
2a6a2ed1 1779 server.stat_expiredkeys = 0;
ed9b544e 1780 server.stat_starttime = time(NULL);
3a66edc7 1781 server.unixtime = time(NULL);
d8f8b666 1782 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1783 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1784 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1785
1786 if (server.appendonly) {
3bb225d6 1787 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1788 if (server.appendfd == -1) {
1789 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1790 strerror(errno));
1791 exit(1);
1792 }
1793 }
75680a3c 1794
1795 if (server.vm_enabled) vmInit();
ed9b544e 1796}
1797
1798/* Empty the whole database */
ca37e9cd 1799static long long emptyDb() {
ed9b544e 1800 int j;
ca37e9cd 1801 long long removed = 0;
ed9b544e 1802
3305306f 1803 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1804 removed += dictSize(server.db[j].dict);
3305306f 1805 dictEmpty(server.db[j].dict);
1806 dictEmpty(server.db[j].expires);
1807 }
ca37e9cd 1808 return removed;
ed9b544e 1809}
1810
/* Convert a "yes"/"no" string (compared case insensitively) into 1/0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1816
ed9b544e 1817/* I agree, this is a very rudimental way to load a configuration...
1818 will improve later if the config gets more complex */
1819static void loadServerConfig(char *filename) {
c9a111ac 1820 FILE *fp;
ed9b544e 1821 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1822 int linenum = 0;
1823 sds line = NULL;
c9a111ac 1824
1825 if (filename[0] == '-' && filename[1] == '\0')
1826 fp = stdin;
1827 else {
1828 if ((fp = fopen(filename,"r")) == NULL) {
9a22de82 1829 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
c9a111ac 1830 exit(1);
1831 }
ed9b544e 1832 }
c9a111ac 1833
ed9b544e 1834 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1835 sds *argv;
1836 int argc, j;
1837
1838 linenum++;
1839 line = sdsnew(buf);
1840 line = sdstrim(line," \t\r\n");
1841
1842 /* Skip comments and blank lines*/
1843 if (line[0] == '#' || line[0] == '\0') {
1844 sdsfree(line);
1845 continue;
1846 }
1847
1848 /* Split into arguments */
1849 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1850 sdstolower(argv[0]);
1851
1852 /* Execute config directives */
bb0b03a3 1853 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1854 server.maxidletime = atoi(argv[1]);
0150db36 1855 if (server.maxidletime < 0) {
ed9b544e 1856 err = "Invalid timeout value"; goto loaderr;
1857 }
bb0b03a3 1858 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1859 server.port = atoi(argv[1]);
1860 if (server.port < 1 || server.port > 65535) {
1861 err = "Invalid port"; goto loaderr;
1862 }
bb0b03a3 1863 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1864 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1865 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1866 int seconds = atoi(argv[1]);
1867 int changes = atoi(argv[2]);
1868 if (seconds < 1 || changes < 0) {
1869 err = "Invalid save parameters"; goto loaderr;
1870 }
1871 appendServerSaveParams(seconds,changes);
bb0b03a3 1872 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1873 if (chdir(argv[1]) == -1) {
1874 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1875 argv[1], strerror(errno));
1876 exit(1);
1877 }
bb0b03a3 1878 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1879 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1880 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1881 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1882 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1883 else {
1884 err = "Invalid log level. Must be one of debug, notice, warning";
1885 goto loaderr;
1886 }
bb0b03a3 1887 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1888 FILE *logfp;
ed9b544e 1889
1890 server.logfile = zstrdup(argv[1]);
bb0b03a3 1891 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1892 zfree(server.logfile);
1893 server.logfile = NULL;
1894 }
1895 if (server.logfile) {
1896 /* Test if we are able to open the file. The server will not
1897 * be able to abort just for this problem later... */
c9a111ac 1898 logfp = fopen(server.logfile,"a");
1899 if (logfp == NULL) {
ed9b544e 1900 err = sdscatprintf(sdsempty(),
1901 "Can't open the log file: %s", strerror(errno));
1902 goto loaderr;
1903 }
c9a111ac 1904 fclose(logfp);
ed9b544e 1905 }
bb0b03a3 1906 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1907 server.dbnum = atoi(argv[1]);
1908 if (server.dbnum < 1) {
1909 err = "Invalid number of databases"; goto loaderr;
1910 }
b3f83f12
JZ
1911 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1912 loadServerConfig(argv[1]);
285add55 1913 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1914 server.maxclients = atoi(argv[1]);
3fd78bcd 1915 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
2b619329 1916 server.maxmemory = memtoll(argv[1],NULL);
bb0b03a3 1917 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1918 server.masterhost = sdsnew(argv[1]);
1919 server.masterport = atoi(argv[2]);
1920 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1921 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1922 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1923 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1924 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1925 err = "argument must be 'yes' or 'no'"; goto loaderr;
1926 }
121f70cf 1927 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1928 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
8ca3e9d1 1929 err = "argument must be 'yes' or 'no'"; goto loaderr;
1930 }
1931 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1932 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
121f70cf 1933 err = "argument must be 'yes' or 'no'"; goto loaderr;
1934 }
bb0b03a3 1935 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1936 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1937 err = "argument must be 'yes' or 'no'"; goto loaderr;
1938 }
44b38ef4 1939 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1940 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1941 err = "argument must be 'yes' or 'no'"; goto loaderr;
1942 }
f3b52411
PN
1943 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1944 zfree(server.appendfilename);
1945 server.appendfilename = zstrdup(argv[1]);
38db9171 1946 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
1947 && argc == 2) {
1948 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
1949 err = "argument must be 'yes' or 'no'"; goto loaderr;
1950 }
48f0308a 1951 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1952 if (!strcasecmp(argv[1],"no")) {
48f0308a 1953 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1954 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1955 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1956 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1957 server.appendfsync = APPENDFSYNC_EVERYSEC;
1958 } else {
1959 err = "argument must be 'no', 'always' or 'everysec'";
1960 goto loaderr;
1961 }
bb0b03a3 1962 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1963 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1964 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 1965 zfree(server.pidfile);
054e426d 1966 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1967 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 1968 zfree(server.dbfilename);
054e426d 1969 server.dbfilename = zstrdup(argv[1]);
75680a3c 1970 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1971 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1972 err = "argument must be 'yes' or 'no'"; goto loaderr;
1973 }
054e426d 1974 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1975 zfree(server.vm_swap_file);
054e426d 1976 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1977 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2b619329 1978 server.vm_max_memory = memtoll(argv[1],NULL);
4ef8de8a 1979 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2b619329 1980 server.vm_page_size = memtoll(argv[1], NULL);
4ef8de8a 1981 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2b619329 1982 server.vm_pages = memtoll(argv[1], NULL);
92f8e882 1983 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1984 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 1985 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2b619329 1986 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
cbba7dd7 1987 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2b619329 1988 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
ed9b544e 1989 } else {
1990 err = "Bad directive or wrong number of arguments"; goto loaderr;
1991 }
1992 for (j = 0; j < argc; j++)
1993 sdsfree(argv[j]);
1994 zfree(argv);
1995 sdsfree(line);
1996 }
c9a111ac 1997 if (fp != stdin) fclose(fp);
ed9b544e 1998 return;
1999
2000loaderr:
2001 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2002 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2003 fprintf(stderr, ">>> '%s'\n", line);
2004 fprintf(stderr, "%s\n", err);
2005 exit(1);
2006}
2007
2008static void freeClientArgv(redisClient *c) {
2009 int j;
2010
2011 for (j = 0; j < c->argc; j++)
2012 decrRefCount(c->argv[j]);
e8a74421 2013 for (j = 0; j < c->mbargc; j++)
2014 decrRefCount(c->mbargv[j]);
ed9b544e 2015 c->argc = 0;
e8a74421 2016 c->mbargc = 0;
ed9b544e 2017}
2018
2019static void freeClient(redisClient *c) {
2020 listNode *ln;
2021
4409877e 2022 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 2023 * call, we have to set querybuf to NULL *before* to call
2024 * unblockClientWaitingData() to avoid processInputBuffer() will get
2025 * called. Also it is important to remove the file events after
2026 * this, because this call adds the READABLE event. */
4409877e 2027 sdsfree(c->querybuf);
2028 c->querybuf = NULL;
2029 if (c->flags & REDIS_BLOCKED)
b0d8747d 2030 unblockClientWaitingData(c);
4409877e 2031
37ab76c9 2032 /* UNWATCH all the keys */
2033 unwatchAllKeys(c);
2034 listRelease(c->watched_keys);
ffc6b7f8 2035 /* Unsubscribe from all the pubsub channels */
2036 pubsubUnsubscribeAllChannels(c,0);
2037 pubsubUnsubscribeAllPatterns(c,0);
2038 dictRelease(c->pubsub_channels);
2039 listRelease(c->pubsub_patterns);
befec3cd 2040 /* Obvious cleanup */
ed9b544e 2041 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2042 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 2043 listRelease(c->reply);
2044 freeClientArgv(c);
2045 close(c->fd);
92f8e882 2046 /* Remove from the list of clients */
ed9b544e 2047 ln = listSearchKey(server.clients,c);
dfc5e96c 2048 redisAssert(ln != NULL);
ed9b544e 2049 listDelNode(server.clients,ln);
37ab76c9 2050 /* Remove from the list of clients that are now ready to be restarted
2051 * after waiting for swapped keys */
d5d55fc3 2052 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2053 ln = listSearchKey(server.io_ready_clients,c);
2054 if (ln) {
2055 listDelNode(server.io_ready_clients,ln);
2056 server.vm_blocked_clients--;
2057 }
2058 }
37ab76c9 2059 /* Remove from the list of clients waiting for swapped keys */
d5d55fc3 2060 while (server.vm_enabled && listLength(c->io_keys)) {
2061 ln = listFirst(c->io_keys);
2062 dontWaitForSwappedKey(c,ln->value);
92f8e882 2063 }
b3e3d0d7 2064 listRelease(c->io_keys);
befec3cd 2065 /* Master/slave cleanup */
ed9b544e 2066 if (c->flags & REDIS_SLAVE) {
6208b3a7 2067 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2068 close(c->repldbfd);
87eca727 2069 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2070 ln = listSearchKey(l,c);
dfc5e96c 2071 redisAssert(ln != NULL);
87eca727 2072 listDelNode(l,ln);
ed9b544e 2073 }
2074 if (c->flags & REDIS_MASTER) {
2075 server.master = NULL;
2076 server.replstate = REDIS_REPL_CONNECT;
2077 }
befec3cd 2078 /* Release memory */
93ea3759 2079 zfree(c->argv);
e8a74421 2080 zfree(c->mbargv);
6e469882 2081 freeClientMultiState(c);
ed9b544e 2082 zfree(c);
2083}
2084
cc30e368 2085#define GLUEREPLY_UP_TO (1024)
ed9b544e 2086static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 2087 int copylen = 0;
2088 char buf[GLUEREPLY_UP_TO];
6208b3a7 2089 listNode *ln;
c7df85a4 2090 listIter li;
ed9b544e 2091 robj *o;
2092
c7df85a4 2093 listRewind(c->reply,&li);
2094 while((ln = listNext(&li))) {
c28b42ac 2095 int objlen;
2096
ed9b544e 2097 o = ln->value;
c28b42ac 2098 objlen = sdslen(o->ptr);
2099 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2100 memcpy(buf+copylen,o->ptr,objlen);
2101 copylen += objlen;
ed9b544e 2102 listDelNode(c->reply,ln);
c28b42ac 2103 } else {
2104 if (copylen == 0) return;
2105 break;
ed9b544e 2106 }
ed9b544e 2107 }
c28b42ac 2108 /* Now the output buffer is empty, add the new single element */
2109 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2110 listAddNodeHead(c->reply,o);
ed9b544e 2111}
2112
2113static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2114 redisClient *c = privdata;
2115 int nwritten = 0, totwritten = 0, objlen;
2116 robj *o;
2117 REDIS_NOTUSED(el);
2118 REDIS_NOTUSED(mask);
2119
2895e862 2120 /* Use writev() if we have enough buffers to send */
7ea870c0 2121 if (!server.glueoutputbuf &&
e0a62c7f 2122 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 2123 !(c->flags & REDIS_MASTER))
2895e862 2124 {
2125 sendReplyToClientWritev(el, fd, privdata, mask);
2126 return;
2127 }
2895e862 2128
ed9b544e 2129 while(listLength(c->reply)) {
c28b42ac 2130 if (server.glueoutputbuf && listLength(c->reply) > 1)
2131 glueReplyBuffersIfNeeded(c);
2132
ed9b544e 2133 o = listNodeValue(listFirst(c->reply));
2134 objlen = sdslen(o->ptr);
2135
2136 if (objlen == 0) {
2137 listDelNode(c->reply,listFirst(c->reply));
2138 continue;
2139 }
2140
2141 if (c->flags & REDIS_MASTER) {
6f376729 2142 /* Don't reply to a master */
ed9b544e 2143 nwritten = objlen - c->sentlen;
2144 } else {
a4d1ba9a 2145 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2146 if (nwritten <= 0) break;
2147 }
2148 c->sentlen += nwritten;
2149 totwritten += nwritten;
2150 /* If we fully sent the object on head go to the next one */
2151 if (c->sentlen == objlen) {
2152 listDelNode(c->reply,listFirst(c->reply));
2153 c->sentlen = 0;
2154 }
6f376729 2155 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2156 * bytes, in a single threaded server it's a good idea to serve
6f376729 2157 * other clients as well, even if a very large request comes from
2158 * super fast link that is always able to accept data (in real world
12f9d551 2159 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2160 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2161 }
2162 if (nwritten == -1) {
2163 if (errno == EAGAIN) {
2164 nwritten = 0;
2165 } else {
f870935d 2166 redisLog(REDIS_VERBOSE,
ed9b544e 2167 "Error writing to client: %s", strerror(errno));
2168 freeClient(c);
2169 return;
2170 }
2171 }
2172 if (totwritten > 0) c->lastinteraction = time(NULL);
2173 if (listLength(c->reply) == 0) {
2174 c->sentlen = 0;
2175 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2176 }
2177}
2178
2895e862 2179static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2180{
2181 redisClient *c = privdata;
2182 int nwritten = 0, totwritten = 0, objlen, willwrite;
2183 robj *o;
2184 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2185 int offset, ion = 0;
2186 REDIS_NOTUSED(el);
2187 REDIS_NOTUSED(mask);
2188
2189 listNode *node;
2190 while (listLength(c->reply)) {
2191 offset = c->sentlen;
2192 ion = 0;
2193 willwrite = 0;
2194
2195 /* fill-in the iov[] array */
2196 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2197 o = listNodeValue(node);
2198 objlen = sdslen(o->ptr);
2199
e0a62c7f 2200 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2201 break;
2202
2203 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2204 break; /* no more iovecs */
2205
2206 iov[ion].iov_base = ((char*)o->ptr) + offset;
2207 iov[ion].iov_len = objlen - offset;
2208 willwrite += objlen - offset;
2209 offset = 0; /* just for the first item */
2210 ion++;
2211 }
2212
2213 if(willwrite == 0)
2214 break;
2215
2216 /* write all collected blocks at once */
2217 if((nwritten = writev(fd, iov, ion)) < 0) {
2218 if (errno != EAGAIN) {
f870935d 2219 redisLog(REDIS_VERBOSE,
2895e862 2220 "Error writing to client: %s", strerror(errno));
2221 freeClient(c);
2222 return;
2223 }
2224 break;
2225 }
2226
2227 totwritten += nwritten;
2228 offset = c->sentlen;
2229
2230 /* remove written robjs from c->reply */
2231 while (nwritten && listLength(c->reply)) {
2232 o = listNodeValue(listFirst(c->reply));
2233 objlen = sdslen(o->ptr);
2234
2235 if(nwritten >= objlen - offset) {
2236 listDelNode(c->reply, listFirst(c->reply));
2237 nwritten -= objlen - offset;
2238 c->sentlen = 0;
2239 } else {
2240 /* partial write */
2241 c->sentlen += nwritten;
2242 break;
2243 }
2244 offset = 0;
2245 }
2246 }
2247
e0a62c7f 2248 if (totwritten > 0)
2895e862 2249 c->lastinteraction = time(NULL);
2250
2251 if (listLength(c->reply) == 0) {
2252 c->sentlen = 0;
2253 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2254 }
2255}
2256
1a132bbc
PN
2257static int qsortRedisCommands(const void *r1, const void *r2) {
2258 return strcasecmp(
2259 ((struct redisCommand*)r1)->name,
2260 ((struct redisCommand*)r2)->name);
2261}
2262
2263static void sortCommandTable() {
1a132bbc
PN
2264 /* Copy and sort the read-only version of the command table */
2265 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2266 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
d55d5c5d 2267 qsort(commandTable,
2268 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2269 sizeof(struct redisCommand),qsortRedisCommands);
1a132bbc
PN
2270}
2271
ed9b544e 2272static struct redisCommand *lookupCommand(char *name) {
1a132bbc
PN
2273 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2274 return bsearch(
2275 &tmp,
2276 commandTable,
d55d5c5d 2277 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
1a132bbc
PN
2278 sizeof(struct redisCommand),
2279 qsortRedisCommands);
ed9b544e 2280}
2281
2282/* resetClient prepare the client to process the next command */
2283static void resetClient(redisClient *c) {
2284 freeClientArgv(c);
2285 c->bulklen = -1;
e8a74421 2286 c->multibulk = 0;
ed9b544e 2287}
2288
6e469882 2289/* Call() is the core of Redis execution of a command */
2290static void call(redisClient *c, struct redisCommand *cmd) {
2291 long long dirty;
2292
2293 dirty = server.dirty;
2294 cmd->proc(c);
4005fef1 2295 dirty = server.dirty-dirty;
2296
2297 if (server.appendonly && dirty)
6e469882 2298 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2299 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2300 listLength(server.slaves))
248ea310 2301 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2302 if (listLength(server.monitors))
dd142b9c 2303 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2304 server.stat_numcommands++;
2305}
2306
ed9b544e 2307/* If this function gets called we already read a whole
2308 * command, argments are in the client argv/argc fields.
2309 * processCommand() execute the command or prepare the
2310 * server for a bulk read from the client.
2311 *
2312 * If 1 is returned the client is still alive and valid and
2313 * and other operations can be performed by the caller. Otherwise
2314 * if 0 is returned the client was destroied (i.e. after QUIT). */
2315static int processCommand(redisClient *c) {
2316 struct redisCommand *cmd;
ed9b544e 2317
3fd78bcd 2318 /* Free some memory if needed (maxmemory setting) */
2319 if (server.maxmemory) freeMemoryIfNeeded();
2320
e8a74421 2321 /* Handle the multi bulk command type. This is an alternative protocol
2322 * supported by Redis in order to receive commands that are composed of
2323 * multiple binary-safe "bulk" arguments. The latency of processing is
2324 * a bit higher but this allows things like multi-sets, so if this
2325 * protocol is used only for MSET and similar commands this is a big win. */
2326 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2327 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2328 if (c->multibulk <= 0) {
2329 resetClient(c);
2330 return 1;
2331 } else {
2332 decrRefCount(c->argv[c->argc-1]);
2333 c->argc--;
2334 return 1;
2335 }
2336 } else if (c->multibulk) {
2337 if (c->bulklen == -1) {
2338 if (((char*)c->argv[0]->ptr)[0] != '$') {
2339 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2340 resetClient(c);
2341 return 1;
2342 } else {
2343 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2344 decrRefCount(c->argv[0]);
2345 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2346 c->argc--;
2347 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2348 resetClient(c);
2349 return 1;
2350 }
2351 c->argc--;
2352 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2353 return 1;
2354 }
2355 } else {
2356 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2357 c->mbargv[c->mbargc] = c->argv[0];
2358 c->mbargc++;
2359 c->argc--;
2360 c->multibulk--;
2361 if (c->multibulk == 0) {
2362 robj **auxargv;
2363 int auxargc;
2364
2365 /* Here we need to swap the multi-bulk argc/argv with the
2366 * normal argc/argv of the client structure. */
2367 auxargv = c->argv;
2368 c->argv = c->mbargv;
2369 c->mbargv = auxargv;
2370
2371 auxargc = c->argc;
2372 c->argc = c->mbargc;
2373 c->mbargc = auxargc;
2374
2375 /* We need to set bulklen to something different than -1
2376 * in order for the code below to process the command without
2377 * to try to read the last argument of a bulk command as
2378 * a special argument. */
2379 c->bulklen = 0;
2380 /* continue below and process the command */
2381 } else {
2382 c->bulklen = -1;
2383 return 1;
2384 }
2385 }
2386 }
2387 /* -- end of multi bulk commands processing -- */
2388
ed9b544e 2389 /* The QUIT command is handled as a special case. Normal command
2390 * procs are unable to close the client connection safely */
bb0b03a3 2391 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2392 freeClient(c);
2393 return 0;
2394 }
d5d55fc3 2395
2396 /* Now lookup the command and check ASAP about trivial error conditions
2397 * such wrong arity, bad command name and so forth. */
ed9b544e 2398 cmd = lookupCommand(c->argv[0]->ptr);
2399 if (!cmd) {
2c14807b 2400 addReplySds(c,
2401 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2402 (char*)c->argv[0]->ptr));
ed9b544e 2403 resetClient(c);
2404 return 1;
2405 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2406 (c->argc < -cmd->arity)) {
454d4e43 2407 addReplySds(c,
2408 sdscatprintf(sdsempty(),
2409 "-ERR wrong number of arguments for '%s' command\r\n",
2410 cmd->name));
ed9b544e 2411 resetClient(c);
2412 return 1;
2413 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2414 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2415 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2416
2417 decrRefCount(c->argv[c->argc-1]);
2418 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2419 c->argc--;
2420 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2421 resetClient(c);
2422 return 1;
2423 }
2424 c->argc--;
2425 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2426 /* It is possible that the bulk read is already in the
8d0490e7 2427 * buffer. Check this condition and handle it accordingly.
2428 * This is just a fast path, alternative to call processInputBuffer().
2429 * It's a good idea since the code is small and this condition
2430 * happens most of the times. */
ed9b544e 2431 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2432 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2433 c->argc++;
2434 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2435 } else {
d5d55fc3 2436 /* Otherwise return... there is to read the last argument
2437 * from the socket. */
ed9b544e 2438 return 1;
2439 }
2440 }
942a3961 2441 /* Let's try to encode the bulk object to save space. */
2442 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2443 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2444
e63943a4 2445 /* Check if the user is authenticated */
2446 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2447 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2448 resetClient(c);
2449 return 1;
2450 }
2451
b61a28fe 2452 /* Handle the maxmemory directive */
2453 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2454 zmalloc_used_memory() > server.maxmemory)
2455 {
2456 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2457 resetClient(c);
2458 return 1;
2459 }
2460
d6cc8867 2461 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
e6cca5db 2462 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2463 &&
ffc6b7f8 2464 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2465 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2466 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2467 resetClient(c);
2468 return 1;
2469 }
2470
ed9b544e 2471 /* Exec the command */
6531c94d 2472 if (c->flags & REDIS_MULTI &&
2473 cmd->proc != execCommand && cmd->proc != discardCommand &&
2474 cmd->proc != multiCommand && cmd->proc != watchCommand)
2475 {
6e469882 2476 queueMultiCommand(c,cmd);
2477 addReply(c,shared.queued);
2478 } else {
d5d55fc3 2479 if (server.vm_enabled && server.vm_max_threads > 0 &&
0a6f3f0f 2480 blockClientOnSwappedKeys(c,cmd)) return 1;
6e469882 2481 call(c,cmd);
2482 }
ed9b544e 2483
2484 /* Prepare the client for the next command */
ed9b544e 2485 resetClient(c);
2486 return 1;
2487}
2488
248ea310 2489static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2490 listNode *ln;
c7df85a4 2491 listIter li;
ed9b544e 2492 int outc = 0, j;
93ea3759 2493 robj **outv;
248ea310 2494 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2495 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2496 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2497 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2498 robj *lenobj;
93ea3759 2499
2500 if (argc <= REDIS_STATIC_ARGS) {
2501 outv = static_outv;
2502 } else {
248ea310 2503 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2504 }
248ea310 2505
2506 lenobj = createObject(REDIS_STRING,
2507 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2508 lenobj->refcount = 0;
2509 outv[outc++] = lenobj;
ed9b544e 2510 for (j = 0; j < argc; j++) {
248ea310 2511 lenobj = createObject(REDIS_STRING,
2512 sdscatprintf(sdsempty(),"$%lu\r\n",
2513 (unsigned long) stringObjectLen(argv[j])));
2514 lenobj->refcount = 0;
2515 outv[outc++] = lenobj;
ed9b544e 2516 outv[outc++] = argv[j];
248ea310 2517 outv[outc++] = shared.crlf;
ed9b544e 2518 }
ed9b544e 2519
40d224a9 2520 /* Increment all the refcounts at start and decrement at end in order to
2521 * be sure to free objects if there is no slave in a replication state
2522 * able to be feed with commands */
2523 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2524 listRewind(slaves,&li);
2525 while((ln = listNext(&li))) {
ed9b544e 2526 redisClient *slave = ln->value;
40d224a9 2527
2528 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2529 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2530
2531 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2532 if (slave->slaveseldb != dictid) {
2533 robj *selectcmd;
2534
2535 switch(dictid) {
2536 case 0: selectcmd = shared.select0; break;
2537 case 1: selectcmd = shared.select1; break;
2538 case 2: selectcmd = shared.select2; break;
2539 case 3: selectcmd = shared.select3; break;
2540 case 4: selectcmd = shared.select4; break;
2541 case 5: selectcmd = shared.select5; break;
2542 case 6: selectcmd = shared.select6; break;
2543 case 7: selectcmd = shared.select7; break;
2544 case 8: selectcmd = shared.select8; break;
2545 case 9: selectcmd = shared.select9; break;
2546 default:
2547 selectcmd = createObject(REDIS_STRING,
2548 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2549 selectcmd->refcount = 0;
2550 break;
2551 }
2552 addReply(slave,selectcmd);
2553 slave->slaveseldb = dictid;
2554 }
2555 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2556 }
40d224a9 2557 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2558 if (outv != static_outv) zfree(outv);
ed9b544e 2559}
2560
dd142b9c 2561static sds sdscatrepr(sds s, char *p, size_t len) {
2562 s = sdscatlen(s,"\"",1);
2563 while(len--) {
2564 switch(*p) {
2565 case '\\':
2566 case '"':
2567 s = sdscatprintf(s,"\\%c",*p);
2568 break;
2569 case '\n': s = sdscatlen(s,"\\n",1); break;
2570 case '\r': s = sdscatlen(s,"\\r",1); break;
2571 case '\t': s = sdscatlen(s,"\\t",1); break;
2572 case '\a': s = sdscatlen(s,"\\a",1); break;
2573 case '\b': s = sdscatlen(s,"\\b",1); break;
2574 default:
2575 if (isprint(*p))
2576 s = sdscatprintf(s,"%c",*p);
2577 else
2578 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2579 break;
2580 }
2581 p++;
2582 }
2583 return sdscatlen(s,"\"",1);
2584}
2585
2586static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2587 listNode *ln;
2588 listIter li;
2589 int j;
2590 sds cmdrepr = sdsnew("+");
2591 robj *cmdobj;
2592 struct timeval tv;
2593
2594 gettimeofday(&tv,NULL);
2595 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2596 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2597
2598 for (j = 0; j < argc; j++) {
2599 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2600 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2601 } else {
2602 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2603 sdslen(argv[j]->ptr));
2604 }
2605 if (j != argc-1)
2606 cmdrepr = sdscatlen(cmdrepr," ",1);
2607 }
2608 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2609 cmdobj = createObject(REDIS_STRING,cmdrepr);
2610
2611 listRewind(monitors,&li);
2612 while((ln = listNext(&li))) {
2613 redisClient *monitor = ln->value;
2614 addReply(monitor,cmdobj);
2615 }
2616 decrRefCount(cmdobj);
2617}
2618
638e42ac 2619static void processInputBuffer(redisClient *c) {
ed9b544e 2620again:
4409877e 2621 /* Before to process the input buffer, make sure the client is not
2622 * waitig for a blocking operation such as BLPOP. Note that the first
2623 * iteration the client is never blocked, otherwise the processInputBuffer
2624 * would not be called at all, but after the execution of the first commands
2625 * in the input buffer the client may be blocked, and the "goto again"
2626 * will try to reiterate. The following line will make it return asap. */
92f8e882 2627 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2628 if (c->bulklen == -1) {
2629 /* Read the first line of the query */
2630 char *p = strchr(c->querybuf,'\n');
2631 size_t querylen;
644fafa3 2632
ed9b544e 2633 if (p) {
2634 sds query, *argv;
2635 int argc, j;
e0a62c7f 2636
ed9b544e 2637 query = c->querybuf;
2638 c->querybuf = sdsempty();
2639 querylen = 1+(p-(query));
2640 if (sdslen(query) > querylen) {
2641 /* leave data after the first line of the query in the buffer */
2642 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2643 }
2644 *p = '\0'; /* remove "\n" */
2645 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2646 sdsupdatelen(query);
2647
2648 /* Now we can split the query in arguments */
ed9b544e 2649 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2650 sdsfree(query);
2651
2652 if (c->argv) zfree(c->argv);
2653 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2654
2655 for (j = 0; j < argc; j++) {
ed9b544e 2656 if (sdslen(argv[j])) {
2657 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2658 c->argc++;
2659 } else {
2660 sdsfree(argv[j]);
2661 }
2662 }
2663 zfree(argv);
7c49733c 2664 if (c->argc) {
2665 /* Execute the command. If the client is still valid
2666 * after processCommand() return and there is something
2667 * on the query buffer try to process the next command. */
2668 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2669 } else {
2670 /* Nothing to process, argc == 0. Just process the query
2671 * buffer if it's not empty or return to the caller */
2672 if (sdslen(c->querybuf)) goto again;
2673 }
ed9b544e 2674 return;
644fafa3 2675 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2676 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2677 freeClient(c);
2678 return;
2679 }
2680 } else {
2681 /* Bulk read handling. Note that if we are at this point
2682 the client already sent a command terminated with a newline,
2683 we are reading the bulk data that is actually the last
2684 argument of the command. */
2685 int qbl = sdslen(c->querybuf);
2686
2687 if (c->bulklen <= qbl) {
2688 /* Copy everything but the final CRLF as final argument */
2689 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2690 c->argc++;
2691 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2692 /* Process the command. If the client is still valid after
2693 * the processing and there is more data in the buffer
2694 * try to parse it. */
2695 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2696 return;
2697 }
2698 }
2699}
2700
638e42ac 2701static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2702 redisClient *c = (redisClient*) privdata;
2703 char buf[REDIS_IOBUF_LEN];
2704 int nread;
2705 REDIS_NOTUSED(el);
2706 REDIS_NOTUSED(mask);
2707
2708 nread = read(fd, buf, REDIS_IOBUF_LEN);
2709 if (nread == -1) {
2710 if (errno == EAGAIN) {
2711 nread = 0;
2712 } else {
f870935d 2713 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2714 freeClient(c);
2715 return;
2716 }
2717 } else if (nread == 0) {
f870935d 2718 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2719 freeClient(c);
2720 return;
2721 }
2722 if (nread) {
2723 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2724 c->lastinteraction = time(NULL);
2725 } else {
2726 return;
2727 }
168ac5c6 2728 processInputBuffer(c);
638e42ac 2729}
2730
ed9b544e 2731static int selectDb(redisClient *c, int id) {
2732 if (id < 0 || id >= server.dbnum)
2733 return REDIS_ERR;
3305306f 2734 c->db = &server.db[id];
ed9b544e 2735 return REDIS_OK;
2736}
2737
40d224a9 2738static void *dupClientReplyValue(void *o) {
2739 incrRefCount((robj*)o);
12d090d2 2740 return o;
40d224a9 2741}
2742
/* Match method installed on lists of string objects: returns the result
 * of equalStringObjects() applied to the two values. */
static int listMatchObjects(void *a, void *b) {
    int match = equalStringObjects(a,b);

    return match;
}
2746
ed9b544e 2747static redisClient *createClient(int fd) {
2748 redisClient *c = zmalloc(sizeof(*c));
2749
2750 anetNonBlock(NULL,fd);
2751 anetTcpNoDelay(NULL,fd);
2752 if (!c) return NULL;
2753 selectDb(c,0);
2754 c->fd = fd;
2755 c->querybuf = sdsempty();
2756 c->argc = 0;
93ea3759 2757 c->argv = NULL;
ed9b544e 2758 c->bulklen = -1;
e8a74421 2759 c->multibulk = 0;
2760 c->mbargc = 0;
2761 c->mbargv = NULL;
ed9b544e 2762 c->sentlen = 0;
2763 c->flags = 0;
2764 c->lastinteraction = time(NULL);
abcb223e 2765 c->authenticated = 0;
40d224a9 2766 c->replstate = REDIS_REPL_NONE;
6b47e12e 2767 c->reply = listCreate();
ed9b544e 2768 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2769 listSetDupMethod(c->reply,dupClientReplyValue);
37ab76c9 2770 c->blocking_keys = NULL;
2771 c->blocking_keys_num = 0;
92f8e882 2772 c->io_keys = listCreate();
87c68815 2773 c->watched_keys = listCreate();
92f8e882 2774 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2775 c->pubsub_channels = dictCreate(&setDictType,NULL);
2776 c->pubsub_patterns = listCreate();
2777 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2778 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2779 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2780 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2781 freeClient(c);
2782 return NULL;
2783 }
6b47e12e 2784 listAddNodeTail(server.clients,c);
6e469882 2785 initClientMultiState(c);
ed9b544e 2786 return c;
2787}
2788
2789static void addReply(redisClient *c, robj *obj) {
2790 if (listLength(c->reply) == 0 &&
6208b3a7 2791 (c->replstate == REDIS_REPL_NONE ||
2792 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2793 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2794 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2795
2796 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2797 obj = dupStringObject(obj);
2798 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2799 }
9d65a1bb 2800 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2801}
2802
2803static void addReplySds(redisClient *c, sds s) {
2804 robj *o = createObject(REDIS_STRING,s);
2805 addReply(c,o);
2806 decrRefCount(o);
2807}
2808
e2665397 2809static void addReplyDouble(redisClient *c, double d) {
2810 char buf[128];
2811
2812 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2813 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2814 (unsigned long) strlen(buf),buf));
e2665397 2815}
2816
aa7c2934
PN
2817static void addReplyLongLong(redisClient *c, long long ll) {
2818 char buf[128];
2819 size_t len;
2820
2821 if (ll == 0) {
2822 addReply(c,shared.czero);
2823 return;
2824 } else if (ll == 1) {
2825 addReply(c,shared.cone);
2826 return;
2827 }
482b672d 2828 buf[0] = ':';
2829 len = ll2string(buf+1,sizeof(buf)-1,ll);
2830 buf[len+1] = '\r';
2831 buf[len+2] = '\n';
2832 addReplySds(c,sdsnewlen(buf,len+3));
aa7c2934
PN
2833}
2834
92b27fe9 2835static void addReplyUlong(redisClient *c, unsigned long ul) {
2836 char buf[128];
2837 size_t len;
2838
dd88747b 2839 if (ul == 0) {
2840 addReply(c,shared.czero);
2841 return;
2842 } else if (ul == 1) {
2843 addReply(c,shared.cone);
2844 return;
2845 }
92b27fe9 2846 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2847 addReplySds(c,sdsnewlen(buf,len));
2848}
2849
942a3961 2850static void addReplyBulkLen(redisClient *c, robj *obj) {
482b672d 2851 size_t len, intlen;
2852 char buf[128];
942a3961 2853
2854 if (obj->encoding == REDIS_ENCODING_RAW) {
2855 len = sdslen(obj->ptr);
2856 } else {
2857 long n = (long)obj->ptr;
2858
e054afda 2859 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2860 len = 1;
2861 if (n < 0) {
2862 len++;
2863 n = -n;
2864 }
2865 while((n = n/10) != 0) {
2866 len++;
2867 }
2868 }
482b672d 2869 buf[0] = '$';
2870 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2871 buf[intlen+1] = '\r';
2872 buf[intlen+2] = '\n';
2873 addReplySds(c,sdsnewlen(buf,intlen+3));
942a3961 2874}
2875
dd88747b 2876static void addReplyBulk(redisClient *c, robj *obj) {
2877 addReplyBulkLen(c,obj);
2878 addReply(c,obj);
2879 addReply(c,shared.crlf);
2880}
2881
500ece7c 2882/* In the CONFIG command we need to add vanilla C string as bulk replies */
2883static void addReplyBulkCString(redisClient *c, char *s) {
2884 if (s == NULL) {
2885 addReply(c,shared.nullbulk);
2886 } else {
2887 robj *o = createStringObject(s,strlen(s));
2888 addReplyBulk(c,o);
2889 decrRefCount(o);
2890 }
2891}
2892
ed9b544e 2893static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2894 int cport, cfd;
2895 char cip[128];
285add55 2896 redisClient *c;
ed9b544e 2897 REDIS_NOTUSED(el);
2898 REDIS_NOTUSED(mask);
2899 REDIS_NOTUSED(privdata);
2900
2901 cfd = anetAccept(server.neterr, fd, cip, &cport);
2902 if (cfd == AE_ERR) {
f870935d 2903 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2904 return;
2905 }
f870935d 2906 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2907 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2908 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2909 close(cfd); /* May be already closed, just ingore errors */
2910 return;
2911 }
285add55 2912 /* If maxclient directive is set and this is one client more... close the
2913 * connection. Note that we create the client instead to check before
2914 * for this condition, since now the socket is already set in nonblocking
2915 * mode and we can send an error for free using the Kernel I/O */
2916 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2917 char *err = "-ERR max number of clients reached\r\n";
2918
2919 /* That's a best effort error message, don't check write errors */
fee803ba 2920 if (write(c->fd,err,strlen(err)) == -1) {
2921 /* Nothing to do, Just to avoid the warning... */
2922 }
285add55 2923 freeClient(c);
2924 return;
2925 }
ed9b544e 2926 server.stat_numconnections++;
2927}
2928
2929/* ======================= Redis objects implementation ===================== */
2930
2931static robj *createObject(int type, void *ptr) {
2932 robj *o;
2933
a5819310 2934 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2935 if (listLength(server.objfreelist)) {
2936 listNode *head = listFirst(server.objfreelist);
2937 o = listNodeValue(head);
2938 listDelNode(server.objfreelist,head);
a5819310 2939 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2940 } else {
75680a3c 2941 if (server.vm_enabled) {
a5819310 2942 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2943 o = zmalloc(sizeof(*o));
2944 } else {
2945 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2946 }
ed9b544e 2947 }
ed9b544e 2948 o->type = type;
942a3961 2949 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2950 o->ptr = ptr;
2951 o->refcount = 1;
3a66edc7 2952 if (server.vm_enabled) {
1064ef87 2953 /* Note that this code may run in the context of an I/O thread
2954 * and accessing to server.unixtime in theory is an error
2955 * (no locks). But in practice this is safe, and even if we read
2956 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2957 o->vm.atime = server.unixtime;
2958 o->storage = REDIS_VM_MEMORY;
2959 }
ed9b544e 2960 return o;
2961}
2962
2963static robj *createStringObject(char *ptr, size_t len) {
2964 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2965}
2966
3f973463
PN
2967static robj *createStringObjectFromLongLong(long long value) {
2968 robj *o;
2969 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2970 incrRefCount(shared.integers[value]);
2971 o = shared.integers[value];
2972 } else {
3f973463 2973 if (value >= LONG_MIN && value <= LONG_MAX) {
10dea8dc 2974 o = createObject(REDIS_STRING, NULL);
3f973463
PN
2975 o->encoding = REDIS_ENCODING_INT;
2976 o->ptr = (void*)((long)value);
2977 } else {
ee14da56 2978 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3f973463
PN
2979 }
2980 }
2981 return o;
2982}
2983
4ef8de8a 2984static robj *dupStringObject(robj *o) {
b9bc0eef 2985 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2986 return createStringObject(o->ptr,sdslen(o->ptr));
2987}
2988
ed9b544e 2989static robj *createListObject(void) {
2990 list *l = listCreate();
2991
ed9b544e 2992 listSetFreeMethod(l,decrRefCount);
2993 return createObject(REDIS_LIST,l);
2994}
2995
2996static robj *createSetObject(void) {
2997 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2998 return createObject(REDIS_SET,d);
2999}
3000
5234952b 3001static robj *createHashObject(void) {
3002 /* All the Hashes start as zipmaps. Will be automatically converted
3003 * into hash tables if there are enough elements or big elements
3004 * inside. */
3005 unsigned char *zm = zipmapNew();
3006 robj *o = createObject(REDIS_HASH,zm);
3007 o->encoding = REDIS_ENCODING_ZIPMAP;
3008 return o;
3009}
3010
1812e024 3011static robj *createZsetObject(void) {
6b47e12e 3012 zset *zs = zmalloc(sizeof(*zs));
3013
3014 zs->dict = dictCreate(&zsetDictType,NULL);
3015 zs->zsl = zslCreate();
3016 return createObject(REDIS_ZSET,zs);
1812e024 3017}
3018
ed9b544e 3019static void freeStringObject(robj *o) {
942a3961 3020 if (o->encoding == REDIS_ENCODING_RAW) {
3021 sdsfree(o->ptr);
3022 }
ed9b544e 3023}
3024
3025static void freeListObject(robj *o) {
3026 listRelease((list*) o->ptr);
3027}
3028
3029static void freeSetObject(robj *o) {
3030 dictRelease((dict*) o->ptr);
3031}
3032
fd8ccf44 3033static void freeZsetObject(robj *o) {
3034 zset *zs = o->ptr;
3035
3036 dictRelease(zs->dict);
3037 zslFree(zs->zsl);
3038 zfree(zs);
3039}
3040
ed9b544e 3041static void freeHashObject(robj *o) {
cbba7dd7 3042 switch (o->encoding) {
3043 case REDIS_ENCODING_HT:
3044 dictRelease((dict*) o->ptr);
3045 break;
3046 case REDIS_ENCODING_ZIPMAP:
3047 zfree(o->ptr);
3048 break;
3049 default:
f83c6cb5 3050 redisPanic("Unknown hash encoding type");
cbba7dd7 3051 break;
3052 }
ed9b544e 3053}
3054
3055static void incrRefCount(robj *o) {
3056 o->refcount++;
3057}
3058
3059static void decrRefCount(void *obj) {
3060 robj *o = obj;
94754ccc 3061
c651fd9e 3062 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
970e10bb 3063 /* Object is a key of a swapped out value, or in the process of being
3064 * loaded. */
996cb5f7 3065 if (server.vm_enabled &&
3066 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3067 {
996cb5f7 3068 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 3069 redisAssert(o->type == REDIS_STRING);
a35ddf12 3070 freeStringObject(o);
3071 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 3072 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 3073 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3074 !listAddNodeHead(server.objfreelist,o))
3075 zfree(o);
a5819310 3076 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 3077 server.vm_stats_swapped_objects--;
a35ddf12 3078 return;
3079 }
996cb5f7 3080 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 3081 if (--(o->refcount) == 0) {
996cb5f7 3082 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3083 vmCancelThreadedIOJob(obj);
ed9b544e 3084 switch(o->type) {
3085 case REDIS_STRING: freeStringObject(o); break;
3086 case REDIS_LIST: freeListObject(o); break;
3087 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 3088 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 3089 case REDIS_HASH: freeHashObject(o); break;
f83c6cb5 3090 default: redisPanic("Unknown object type"); break;
ed9b544e 3091 }
a5819310 3092 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 3093 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3094 !listAddNodeHead(server.objfreelist,o))
3095 zfree(o);
a5819310 3096 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3097 }
3098}
3099
942a3961 3100static robj *lookupKey(redisDb *db, robj *key) {
3101 dictEntry *de = dictFind(db->dict,key);
3a66edc7 3102 if (de) {
55cf8433 3103 robj *key = dictGetEntryKey(de);
3104 robj *val = dictGetEntryVal(de);
3a66edc7 3105
55cf8433 3106 if (server.vm_enabled) {
996cb5f7 3107 if (key->storage == REDIS_VM_MEMORY ||
3108 key->storage == REDIS_VM_SWAPPING)
3109 {
3110 /* If we were swapping the object out, stop it, this key
3111 * was requested. */
3112 if (key->storage == REDIS_VM_SWAPPING)
3113 vmCancelThreadedIOJob(key);
55cf8433 3114 /* Update the access time of the key for the aging algorithm. */
3115 key->vm.atime = server.unixtime;
3116 } else {
d5d55fc3 3117 int notify = (key->storage == REDIS_VM_LOADING);
3118
55cf8433 3119 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 3120 redisAssert(val == NULL);
55cf8433 3121 val = vmLoadObject(key);
3122 dictGetEntryVal(de) = val;
d5d55fc3 3123
3124 /* Clients blocked by the VM subsystem may be waiting for
3125 * this key... */
3126 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 3127 }
3128 }
3129 return val;
3a66edc7 3130 } else {
3131 return NULL;
3132 }
942a3961 3133}
3134
3135static robj *lookupKeyRead(redisDb *db, robj *key) {
3136 expireIfNeeded(db,key);
3137 return lookupKey(db,key);
3138}
3139
3140static robj *lookupKeyWrite(redisDb *db, robj *key) {
3141 deleteIfVolatile(db,key);
37ab76c9 3142 touchWatchedKey(db,key);
942a3961 3143 return lookupKey(db,key);
3144}
3145
92b27fe9 3146static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3147 robj *o = lookupKeyRead(c->db, key);
3148 if (!o) addReply(c,reply);
3149 return o;
3150}
3151
3152static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3153 robj *o = lookupKeyWrite(c->db, key);
3154 if (!o) addReply(c,reply);
3155 return o;
3156}
3157
3158static int checkType(redisClient *c, robj *o, int type) {
3159 if (o->type != type) {
3160 addReply(c,shared.wrongtypeerr);
3161 return 1;
3162 }
3163 return 0;
3164}
3165
942a3961 3166static int deleteKey(redisDb *db, robj *key) {
3167 int retval;
3168
3169 /* We need to protect key from destruction: after the first dictDelete()
3170 * it may happen that 'key' is no longer valid if we don't increment
3171 * it's count. This may happen when we get the object reference directly
3172 * from the hash table with dictRandomKey() or dict iterators */
3173 incrRefCount(key);
3174 if (dictSize(db->expires)) dictDelete(db->expires,key);
3175 retval = dictDelete(db->dict,key);
3176 decrRefCount(key);
3177
3178 return retval == DICT_OK;
3179}
3180
724a51b1 3181/* Check if the nul-terminated string 's' can be represented by a long
3182 * (that is, is a number that fits into long without any other space or
3183 * character before or after the digits).
3184 *
3185 * If so, the function returns REDIS_OK and *longval is set to the value
3186 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 3187static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 3188 char buf[32], *endptr;
3189 long value;
3190 int slen;
e0a62c7f 3191
724a51b1 3192 value = strtol(s, &endptr, 10);
3193 if (endptr[0] != '\0') return REDIS_ERR;
ee14da56 3194 slen = ll2string(buf,32,value);
724a51b1 3195
3196 /* If the number converted back into a string is not identical
3197 * then it's not possible to encode the string as integer */
f69f2cba 3198 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 3199 if (longval) *longval = value;
3200 return REDIS_OK;
3201}
3202
942a3961 3203/* Try to encode a string object in order to save space */
05df7621 3204static robj *tryObjectEncoding(robj *o) {
942a3961 3205 long value;
942a3961 3206 sds s = o->ptr;
3305306f 3207
942a3961 3208 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 3209 return o; /* Already encoded */
3305306f 3210
05df7621 3211 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 3212 * everywhere in the "object space" of Redis. Encoded objects can only
3213 * appear as "values" (and not, for instance, as keys) */
05df7621 3214 if (o->refcount > 1) return o;
3305306f 3215
942a3961 3216 /* Currently we try to encode only strings */
dfc5e96c 3217 redisAssert(o->type == REDIS_STRING);
94754ccc 3218
724a51b1 3219 /* Check if we can represent this string as a long integer */
05df7621 3220 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 3221
3222 /* Ok, this object can be encoded */
05df7621 3223 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3224 decrRefCount(o);
3225 incrRefCount(shared.integers[value]);
3226 return shared.integers[value];
3227 } else {
3228 o->encoding = REDIS_ENCODING_INT;
3229 sdsfree(o->ptr);
3230 o->ptr = (void*) value;
3231 return o;
3232 }
942a3961 3233}
3234
9d65a1bb 3235/* Get a decoded version of an encoded object (returned as a new object).
3236 * If the object is already raw-encoded just increment the ref count. */
3237static robj *getDecodedObject(robj *o) {
942a3961 3238 robj *dec;
e0a62c7f 3239
9d65a1bb 3240 if (o->encoding == REDIS_ENCODING_RAW) {
3241 incrRefCount(o);
3242 return o;
3243 }
942a3961 3244 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3245 char buf[32];
3246
ee14da56 3247 ll2string(buf,32,(long)o->ptr);
942a3961 3248 dec = createStringObject(buf,strlen(buf));
3249 return dec;
3250 } else {
08ee9b57 3251 redisPanic("Unknown encoding type");
942a3961 3252 }
3305306f 3253}
3254
d7f43c08 3255/* Compare two string objects via strcmp() or alike.
3256 * Note that the objects may be integer-encoded. In such a case we
ee14da56 3257 * use ll2string() to get a string representation of the numbers on the stack
1fd9bc8a 3258 * and compare the strings, it's much faster than calling getDecodedObject().
3259 *
3260 * Important note: if objects are not integer encoded, but binary-safe strings,
3261 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3262 * binary safe. */
724a51b1 3263static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3264 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3265 char bufa[128], bufb[128], *astr, *bstr;
3266 int bothsds = 1;
724a51b1 3267
e197b441 3268 if (a == b) return 0;
d7f43c08 3269 if (a->encoding != REDIS_ENCODING_RAW) {
ee14da56 3270 ll2string(bufa,sizeof(bufa),(long) a->ptr);
d7f43c08 3271 astr = bufa;
3272 bothsds = 0;
724a51b1 3273 } else {
d7f43c08 3274 astr = a->ptr;
724a51b1 3275 }
d7f43c08 3276 if (b->encoding != REDIS_ENCODING_RAW) {
ee14da56 3277 ll2string(bufb,sizeof(bufb),(long) b->ptr);
d7f43c08 3278 bstr = bufb;
3279 bothsds = 0;
3280 } else {
3281 bstr = b->ptr;
3282 }
3283 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3284}
3285
bf028098 3286/* Equal string objects return 1 if the two objects are the same from the
3287 * point of view of a string comparison, otherwise 0 is returned. Note that
3288 * this function is faster then checking for (compareStringObject(a,b) == 0)
3289 * because it can perform some more optimization. */
3290static int equalStringObjects(robj *a, robj *b) {
3291 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3292 return a->ptr == b->ptr;
3293 } else {
3294 return compareStringObjects(a,b) == 0;
3295 }
3296}
3297
0ea663ea 3298static size_t stringObjectLen(robj *o) {
dfc5e96c 3299 redisAssert(o->type == REDIS_STRING);
0ea663ea 3300 if (o->encoding == REDIS_ENCODING_RAW) {
3301 return sdslen(o->ptr);
3302 } else {
3303 char buf[32];
3304
ee14da56 3305 return ll2string(buf,32,(long)o->ptr);
0ea663ea 3306 }
3307}
3308
bd79a6bd
PN
3309static int getDoubleFromObject(robj *o, double *target) {
3310 double value;
682c73e8 3311 char *eptr;
bbe025e0 3312
bd79a6bd
PN
3313 if (o == NULL) {
3314 value = 0;
3315 } else {
3316 redisAssert(o->type == REDIS_STRING);
3317 if (o->encoding == REDIS_ENCODING_RAW) {
3318 value = strtod(o->ptr, &eptr);
682c73e8 3319 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3320 } else if (o->encoding == REDIS_ENCODING_INT) {
3321 value = (long)o->ptr;
3322 } else {
946342c1 3323 redisPanic("Unknown string encoding");
bd79a6bd
PN
3324 }
3325 }
3326
bd79a6bd
PN
3327 *target = value;
3328 return REDIS_OK;
3329}
bbe025e0 3330
bd79a6bd
PN
3331static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3332 double value;
3333 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3334 if (msg != NULL) {
3335 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3336 } else {
3337 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3338 }
bbe025e0
AM
3339 return REDIS_ERR;
3340 }
3341
bd79a6bd 3342 *target = value;
bbe025e0
AM
3343 return REDIS_OK;
3344}
3345
bd79a6bd
PN
3346static int getLongLongFromObject(robj *o, long long *target) {
3347 long long value;
682c73e8 3348 char *eptr;
bbe025e0 3349
bd79a6bd
PN
3350 if (o == NULL) {
3351 value = 0;
3352 } else {
3353 redisAssert(o->type == REDIS_STRING);
3354 if (o->encoding == REDIS_ENCODING_RAW) {
3355 value = strtoll(o->ptr, &eptr, 10);
682c73e8 3356 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3357 } else if (o->encoding == REDIS_ENCODING_INT) {
3358 value = (long)o->ptr;
3359 } else {
946342c1 3360 redisPanic("Unknown string encoding");
bd79a6bd
PN
3361 }
3362 }
3363
bd79a6bd
PN
3364 *target = value;
3365 return REDIS_OK;
3366}
bbe025e0 3367
bd79a6bd
PN
3368static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3369 long long value;
3370 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3371 if (msg != NULL) {
3372 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3373 } else {
3374 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3375 }
bbe025e0
AM
3376 return REDIS_ERR;
3377 }
3378
bd79a6bd 3379 *target = value;
bbe025e0
AM
3380 return REDIS_OK;
3381}
3382
bd79a6bd
PN
3383static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3384 long long value;
bbe025e0 3385
bd79a6bd
PN
3386 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3387 if (value < LONG_MIN || value > LONG_MAX) {
3388 if (msg != NULL) {
3389 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3390 } else {
3391 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3392 }
bbe025e0
AM
3393 return REDIS_ERR;
3394 }
3395
bd79a6bd 3396 *target = value;
bbe025e0
AM
3397 return REDIS_OK;
3398}
3399
06233c45 3400/*============================ RDB saving/loading =========================== */
ed9b544e 3401
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 if nothing
 * could be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3406
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte order
 * of the host. Returns 0 on success, -1 if nothing could be written. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;
    size_t written = fwrite(&stamp,4,1,fp);

    return written == 0 ? -1 : 0;
}
3412
e3566d4b 3413/* check rdbLoadLen() comments for more info */
f78fd11b 3414static int rdbSaveLen(FILE *fp, uint32_t len) {
3415 unsigned char buf[2];
3416
3417 if (len < (1<<6)) {
3418 /* Save a 6 bit len */
10c43610 3419 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3420 if (fwrite(buf,1,1,fp) == 0) return -1;
3421 } else if (len < (1<<14)) {
3422 /* Save a 14 bit len */
10c43610 3423 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3424 buf[1] = len&0xFF;
17be1a4a 3425 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3426 } else {
3427 /* Save a 32 bit len */
10c43610 3428 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3429 if (fwrite(buf,1,1,fp) == 0) return -1;
3430 len = htonl(len);
3431 if (fwrite(&len,4,1,fp) == 0) return -1;
3432 }
3433 return 0;
3434}
3435
32a66513 3436/* Encode 'value' as an integer if possible (if integer will fit the
3437 * supported range). If the function sucessful encoded the integer
3438 * then the (up to 5 bytes) encoded representation is written in the
3439 * string pointed by 'enc' and the length is returned. Otherwise
3440 * 0 is returned. */
3441static int rdbEncodeInteger(long long value, unsigned char *enc) {
e3566d4b 3442 /* Finally check if it fits in our ranges */
3443 if (value >= -(1<<7) && value <= (1<<7)-1) {
3444 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3445 enc[1] = value&0xFF;
3446 return 2;
3447 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3448 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3449 enc[1] = value&0xFF;
3450 enc[2] = (value>>8)&0xFF;
3451 return 3;
3452 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3453 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3454 enc[1] = value&0xFF;
3455 enc[2] = (value>>8)&0xFF;
3456 enc[3] = (value>>16)&0xFF;
3457 enc[4] = (value>>24)&0xFF;
3458 return 5;
3459 } else {
3460 return 0;
3461 }
3462}
3463
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes that are NOT required to be nul-terminated.
 * On success the encoded form is stored in 'enc' (up to 5 bytes) and its
 * length is returned, otherwise 0 is returned. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, buf[32], copy[32];

    /* Work on a private nul-terminated copy: strtoll() needs a terminator
     * and the caller may hand us a slice of a larger buffer (zipmap keys
     * and values for instance), so parsing 's' directly could read past
     * its end. Inputs that do not fit in the copy are longer than any
     * printed long long, hence can never round trip. */
    if (len == 0 || len >= sizeof(copy)) return 0;
    memcpy(copy,s,len);
    copy[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(copy, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3482
b1befe6a 3483static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3484 size_t comprlen, outlen;
774e3047 3485 unsigned char byte;
3486 void *out;
3487
3488 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3489 if (len <= 4) return 0;
3490 outlen = len-4;
3a2694c4 3491 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3492 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3493 if (comprlen == 0) {
88e85998 3494 zfree(out);
774e3047 3495 return 0;
3496 }
3497 /* Data compressed! Let's save it on disk */
3498 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3499 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3500 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3501 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3502 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3503 zfree(out);
774e3047 3504 return comprlen;
3505
3506writeerr:
88e85998 3507 zfree(out);
774e3047 3508 return -1;
3509}
3510
e3566d4b 3511/* Save a string objet as [len][data] on disk. If the object is a string
3512 * representation of an integer value we try to safe it in a special form */
b1befe6a 3513static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3514 int enclen;
10c43610 3515
774e3047 3516 /* Try integer encoding */
e3566d4b 3517 if (len <= 11) {
3518 unsigned char buf[5];
b1befe6a 3519 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3520 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3521 return 0;
3522 }
3523 }
774e3047 3524
3525 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3526 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3527 if (server.rdbcompression && len > 20) {
774e3047 3528 int retval;
3529
b1befe6a 3530 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3531 if (retval == -1) return -1;
3532 if (retval > 0) return 0;
3533 /* retval == 0 means data can't be compressed, save the old way */
3534 }
3535
3536 /* Store verbatim */
10c43610 3537 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3538 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3539 return 0;
3540}
3541
942a3961 3542/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3543static int rdbSaveStringObject(FILE *fp, robj *obj) {
3544 int retval;
942a3961 3545
32a66513 3546 /* Avoid to decode the object, then encode it again, if the
3547 * object is alrady integer encoded. */
3548 if (obj->encoding == REDIS_ENCODING_INT) {
3549 long val = (long) obj->ptr;
3550 unsigned char buf[5];
3551 int enclen;
3552
3553 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3554 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3555 return 0;
3556 }
3557 /* otherwise... fall throught and continue with the usual
3558 * code path. */
3559 }
3560
f2d9f50f 3561 /* Avoid incr/decr ref count business when possible.
3562 * This plays well with copy-on-write given that we are probably
3563 * in a child process (BGSAVE). Also this makes sure key objects
3564 * of swapped objects are not incRefCount-ed (an assert does not allow
3565 * this in order to avoid bugs) */
3566 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3567 obj = getDecodedObject(obj);
b1befe6a 3568 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3569 decrRefCount(obj);
3570 } else {
b1befe6a 3571 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3572 }
9d65a1bb 3573 return retval;
942a3961 3574}
3575
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The output starts at buf+1, so only sizeof(buf)-1 bytes
             * are available to the formatter. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3618
06233c45 3619/* Save a Redis object. */
3620static int rdbSaveObject(FILE *fp, robj *o) {
3621 if (o->type == REDIS_STRING) {
3622 /* Save a string value */
3623 if (rdbSaveStringObject(fp,o) == -1) return -1;
3624 } else if (o->type == REDIS_LIST) {
3625 /* Save a list value */
3626 list *list = o->ptr;
c7df85a4 3627 listIter li;
06233c45 3628 listNode *ln;
3629
06233c45 3630 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3631 listRewind(list,&li);
3632 while((ln = listNext(&li))) {
06233c45 3633 robj *eleobj = listNodeValue(ln);
3634
3635 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3636 }
3637 } else if (o->type == REDIS_SET) {
3638 /* Save a set value */
3639 dict *set = o->ptr;
3640 dictIterator *di = dictGetIterator(set);
3641 dictEntry *de;
3642
3643 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3644 while((de = dictNext(di)) != NULL) {
3645 robj *eleobj = dictGetEntryKey(de);
3646
3647 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3648 }
3649 dictReleaseIterator(di);
3650 } else if (o->type == REDIS_ZSET) {
3651 /* Save a set value */
3652 zset *zs = o->ptr;
3653 dictIterator *di = dictGetIterator(zs->dict);
3654 dictEntry *de;
3655
3656 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3657 while((de = dictNext(di)) != NULL) {
3658 robj *eleobj = dictGetEntryKey(de);
3659 double *score = dictGetEntryVal(de);
3660
3661 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3662 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3663 }
3664 dictReleaseIterator(di);
b1befe6a 3665 } else if (o->type == REDIS_HASH) {
3666 /* Save a hash value */
3667 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3668 unsigned char *p = zipmapRewind(o->ptr);
3669 unsigned int count = zipmapLen(o->ptr);
3670 unsigned char *key, *val;
3671 unsigned int klen, vlen;
3672
3673 if (rdbSaveLen(fp,count) == -1) return -1;
3674 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3675 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3676 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3677 }
3678 } else {
3679 dictIterator *di = dictGetIterator(o->ptr);
3680 dictEntry *de;
3681
3682 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3683 while((de = dictNext(di)) != NULL) {
3684 robj *key = dictGetEntryKey(de);
3685 robj *val = dictGetEntryVal(de);
3686
3687 if (rdbSaveStringObject(fp,key) == -1) return -1;
3688 if (rdbSaveStringObject(fp,val) == -1) return -1;
3689 }
3690 dictReleaseIterator(di);
3691 }
06233c45 3692 } else {
f83c6cb5 3693 redisPanic("Unknown object type");
06233c45 3694 }
3695 return 0;
3696}
3697
3698/* Return the length the object will have on disk if saved with
3699 * the rdbSaveObject() function. Currently we use a trick to get
3700 * this length with very little changes to the code. In the future
3701 * we could switch to a faster solution. */
b9bc0eef 3702static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3703 if (fp == NULL) fp = server.devnull;
06233c45 3704 rewind(fp);
3705 assert(rdbSaveObject(fp,o) != 1);
3706 return ftello(fp);
3707}
3708
06224fec 3709/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3710static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3711 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3712
06224fec 3713 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3714}
3715
ed9b544e 3716/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3717static int rdbSave(char *filename) {
ed9b544e 3718 dictIterator *di = NULL;
3719 dictEntry *de;
ed9b544e 3720 FILE *fp;
3721 char tmpfile[256];
3722 int j;
bb32ede5 3723 time_t now = time(NULL);
ed9b544e 3724
2316bb3b 3725 /* Wait for I/O therads to terminate, just in case this is a
3726 * foreground-saving, to avoid seeking the swap file descriptor at the
3727 * same time. */
3728 if (server.vm_enabled)
3729 waitEmptyIOJobsQueue();
3730
a3b21203 3731 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3732 fp = fopen(tmpfile,"w");
3733 if (!fp) {
3734 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3735 return REDIS_ERR;
3736 }
f78fd11b 3737 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3738 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3739 redisDb *db = server.db+j;
3740 dict *d = db->dict;
3305306f 3741 if (dictSize(d) == 0) continue;
ed9b544e 3742 di = dictGetIterator(d);
3743 if (!di) {
3744 fclose(fp);
3745 return REDIS_ERR;
3746 }
3747
3748 /* Write the SELECT DB opcode */
f78fd11b 3749 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3750 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3751
3752 /* Iterate this DB writing every entry */
3753 while((de = dictNext(di)) != NULL) {
3754 robj *key = dictGetEntryKey(de);
3755 robj *o = dictGetEntryVal(de);
bb32ede5 3756 time_t expiretime = getExpire(db,key);
3757
3758 /* Save the expire time */
3759 if (expiretime != -1) {
3760 /* If this key is already expired skip it */
3761 if (expiretime < now) continue;
3762 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3763 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3764 }
7e69548d 3765 /* Save the key and associated value. This requires special
3766 * handling if the value is swapped out. */
996cb5f7 3767 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3768 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3769 /* Save type, key, value */
3770 if (rdbSaveType(fp,o->type) == -1) goto werr;
3771 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3772 if (rdbSaveObject(fp,o) == -1) goto werr;
3773 } else {
996cb5f7 3774 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3775 robj *po;
7e69548d 3776 /* Get a preview of the object in memory */
3777 po = vmPreviewObject(key);
7e69548d 3778 /* Save type, key, value */
3779 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3780 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3781 if (rdbSaveObject(fp,po) == -1) goto werr;
3782 /* Remove the loaded object from memory */
3783 decrRefCount(po);
7e69548d 3784 }
ed9b544e 3785 }
3786 dictReleaseIterator(di);
3787 }
3788 /* EOF opcode */
f78fd11b 3789 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3790
3791 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3792 fflush(fp);
3793 fsync(fileno(fp));
3794 fclose(fp);
e0a62c7f 3795
ed9b544e 3796 /* Use RENAME to make sure the DB file is changed atomically only
3797 * if the generate DB file is ok. */
3798 if (rename(tmpfile,filename) == -1) {
325d1eb4 3799 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3800 unlink(tmpfile);
3801 return REDIS_ERR;
3802 }
3803 redisLog(REDIS_NOTICE,"DB saved on disk");
3804 server.dirty = 0;
3805 server.lastsave = time(NULL);
3806 return REDIS_OK;
3807
3808werr:
3809 fclose(fp);
3810 unlink(tmpfile);
3811 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3812 if (di) dictReleaseIterator(di);
3813 return REDIS_ERR;
3814}
3815
f78fd11b 3816static int rdbSaveBackground(char *filename) {
ed9b544e 3817 pid_t childpid;
3818
9d65a1bb 3819 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3820 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3821 if ((childpid = fork()) == 0) {
3822 /* Child */
054e426d 3823 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3824 close(server.fd);
f78fd11b 3825 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3826 _exit(0);
ed9b544e 3827 } else {
478c2c6f 3828 _exit(1);
ed9b544e 3829 }
3830 } else {
3831 /* Parent */
5a7c647e 3832 if (childpid == -1) {
3833 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3834 strerror(errno));
3835 return REDIS_ERR;
3836 }
ed9b544e 3837 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3838 server.bgsavechildpid = childpid;
884d4b39 3839 updateDictResizePolicy();
ed9b544e 3840 return REDIS_OK;
3841 }
3842 return REDIS_OK; /* unreached */
3843}
3844
/* Remove the temp file "temp-<pid>.rdb" named after the pid 'childpid'.
 * The result of unlink() is not checked. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3851
/* Read a one byte type from 'fp'. Returns the byte (0-255), or -1 if it
 * could not be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3857
/* Read a time saved by rdbSaveTime(): a 32 bit signed integer in host
 * byte order. Returns the time, or -1 if it could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,4,1,fp) != 0) return (time_t) stamp;
    return -1;
}
3863
e3566d4b 3864/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3865 * of this file for a description of how this are stored on disk.
3866 *
3867 * isencoded is set to 1 if the readed length is not actually a length but
3868 * an "encoding type", check the above comments for more info */
c78a8ccc 3869static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3870 unsigned char buf[2];
3871 uint32_t len;
c78a8ccc 3872 int type;
f78fd11b 3873
e3566d4b 3874 if (isencoded) *isencoded = 0;
c78a8ccc 3875 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3876 type = (buf[0]&0xC0)>>6;
3877 if (type == REDIS_RDB_6BITLEN) {
3878 /* Read a 6 bit len */
3879 return buf[0]&0x3F;
3880 } else if (type == REDIS_RDB_ENCVAL) {
3881 /* Read a 6 bit len encoding type */
3882 if (isencoded) *isencoded = 1;
3883 return buf[0]&0x3F;
3884 } else if (type == REDIS_RDB_14BITLEN) {
3885 /* Read a 14 bit len */
3886 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3887 return ((buf[0]&0x3F)<<8)|buf[1];
3888 } else {
3889 /* Read a 32 bit len */
f78fd11b 3890 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3891 return ntohl(len);
f78fd11b 3892 }
f78fd11b 3893}
3894
ad30aa60 3895/* Load an integer-encoded object from file 'fp', with the specified
3896 * encoding type 'enctype'. If encode is true the function may return
3897 * an integer-encoded object as reply, otherwise the returned object
3898 * will always be encoded as a raw string. */
3899static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
e3566d4b 3900 unsigned char enc[4];
3901 long long val;
3902
3903 if (enctype == REDIS_RDB_ENC_INT8) {
3904 if (fread(enc,1,1,fp) == 0) return NULL;
3905 val = (signed char)enc[0];
3906 } else if (enctype == REDIS_RDB_ENC_INT16) {
3907 uint16_t v;
3908 if (fread(enc,2,1,fp) == 0) return NULL;
3909 v = enc[0]|(enc[1]<<8);
3910 val = (int16_t)v;
3911 } else if (enctype == REDIS_RDB_ENC_INT32) {
3912 uint32_t v;
3913 if (fread(enc,4,1,fp) == 0) return NULL;
3914 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3915 val = (int32_t)v;
3916 } else {
3917 val = 0; /* anti-warning */
f83c6cb5 3918 redisPanic("Unknown RDB integer encoding type");
e3566d4b 3919 }
ad30aa60 3920 if (encode)
3921 return createStringObjectFromLongLong(val);
3922 else
3923 return createObject(REDIS_STRING,sdsfromlonglong(val));
e3566d4b 3924}
3925
c78a8ccc 3926static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3927 unsigned int len, clen;
3928 unsigned char *c = NULL;
3929 sds val = NULL;
3930
c78a8ccc 3931 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3932 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3933 if ((c = zmalloc(clen)) == NULL) goto err;
3934 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3935 if (fread(c,clen,1,fp) == 0) goto err;
3936 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3937 zfree(c);
88e85998 3938 return createObject(REDIS_STRING,val);
3939err:
3940 zfree(c);
3941 sdsfree(val);
3942 return NULL;
3943}
3944
ad30aa60 3945static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
e3566d4b 3946 int isencoded;
3947 uint32_t len;
f78fd11b 3948 sds val;
3949
c78a8ccc 3950 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3951 if (isencoded) {
3952 switch(len) {
3953 case REDIS_RDB_ENC_INT8:
3954 case REDIS_RDB_ENC_INT16:
3955 case REDIS_RDB_ENC_INT32:
ad30aa60 3956 return rdbLoadIntegerObject(fp,len,encode);
88e85998 3957 case REDIS_RDB_ENC_LZF:
bdcb92f2 3958 return rdbLoadLzfStringObject(fp);
e3566d4b 3959 default:
f83c6cb5 3960 redisPanic("Unknown RDB encoding type");
e3566d4b 3961 }
3962 }
3963
f78fd11b 3964 if (len == REDIS_RDB_LENERR) return NULL;
3965 val = sdsnewlen(NULL,len);
3966 if (len && fread(val,len,1,fp) == 0) {
3967 sdsfree(val);
3968 return NULL;
3969 }
bdcb92f2 3970 return createObject(REDIS_STRING,val);
f78fd11b 3971}
3972
ad30aa60 3973static robj *rdbLoadStringObject(FILE *fp) {
3974 return rdbGenericLoadStringObject(fp,0);
3975}
3976
3977static robj *rdbLoadEncodedStringObject(FILE *fp) {
3978 return rdbGenericLoadStringObject(fp,1);
3979}
3980
a7866db6 3981/* For information about double serialization check rdbSaveDoubleValue() */
3982static int rdbLoadDoubleValue(FILE *fp, double *val) {
3983 char buf[128];
3984 unsigned char len;
3985
3986 if (fread(&len,1,1,fp) == 0) return -1;
3987 switch(len) {
3988 case 255: *val = R_NegInf; return 0;
3989 case 254: *val = R_PosInf; return 0;
3990 case 253: *val = R_Nan; return 0;
3991 default:
3992 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3993 buf[len] = '\0';
a7866db6 3994 sscanf(buf, "%lg", val);
3995 return 0;
3996 }
3997}
3998
c78a8ccc 3999/* Load a Redis object of the specified type from the specified file.
4000 * On success a newly allocated object is returned, otherwise NULL. */
4001static robj *rdbLoadObject(int type, FILE *fp) {
4002 robj *o;
4003
bcd11906 4004 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 4005 if (type == REDIS_STRING) {
4006 /* Read string value */
ad30aa60 4007 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4008 o = tryObjectEncoding(o);
c78a8ccc 4009 } else if (type == REDIS_LIST || type == REDIS_SET) {
4010 /* Read list/set value */
4011 uint32_t listlen;
4012
4013 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4014 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 4015 /* It's faster to expand the dict to the right size asap in order
4016 * to avoid rehashing */
4017 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
4018 dictExpand(o->ptr,listlen);
c78a8ccc 4019 /* Load every single element of the list/set */
4020 while(listlen--) {
4021 robj *ele;
4022
ad30aa60 4023 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4024 ele = tryObjectEncoding(ele);
c78a8ccc 4025 if (type == REDIS_LIST) {
4026 listAddNodeTail((list*)o->ptr,ele);
4027 } else {
4028 dictAdd((dict*)o->ptr,ele,NULL);
4029 }
4030 }
4031 } else if (type == REDIS_ZSET) {
4032 /* Read list/set value */
ada386b2 4033 size_t zsetlen;
c78a8ccc 4034 zset *zs;
4035
4036 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4037 o = createZsetObject();
4038 zs = o->ptr;
4039 /* Load every single element of the list/set */
4040 while(zsetlen--) {
4041 robj *ele;
4042 double *score = zmalloc(sizeof(double));
4043
ad30aa60 4044 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4045 ele = tryObjectEncoding(ele);
c78a8ccc 4046 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4047 dictAdd(zs->dict,ele,score);
4048 zslInsert(zs->zsl,*score,ele);
4049 incrRefCount(ele); /* added to skiplist */
4050 }
ada386b2 4051 } else if (type == REDIS_HASH) {
4052 size_t hashlen;
4053
4054 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4055 o = createHashObject();
4056 /* Too many entries? Use an hash table. */
4057 if (hashlen > server.hash_max_zipmap_entries)
4058 convertToRealHash(o);
4059 /* Load every key/value, then set it into the zipmap or hash
4060 * table, as needed. */
4061 while(hashlen--) {
4062 robj *key, *val;
4063
4064 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4065 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4066 /* If we are using a zipmap and there are too big values
4067 * the object is converted to real hash table encoding. */
4068 if (o->encoding != REDIS_ENCODING_HT &&
4069 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4070 sdslen(val->ptr) > server.hash_max_zipmap_value))
4071 {
4072 convertToRealHash(o);
4073 }
4074
4075 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4076 unsigned char *zm = o->ptr;
4077
4078 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4079 val->ptr,sdslen(val->ptr),NULL);
4080 o->ptr = zm;
4081 decrRefCount(key);
4082 decrRefCount(val);
4083 } else {
05df7621 4084 key = tryObjectEncoding(key);
4085 val = tryObjectEncoding(val);
ada386b2 4086 dictAdd((dict*)o->ptr,key,val);
ada386b2 4087 }
4088 }
c78a8ccc 4089 } else {
f83c6cb5 4090 redisPanic("Unknown object type");
c78a8ccc 4091 }
4092 return o;
4093}
4094
f78fd11b 4095static int rdbLoad(char *filename) {
ed9b544e 4096 FILE *fp;
f78fd11b 4097 uint32_t dbid;
bb32ede5 4098 int type, retval, rdbver;
585af7e2 4099 int swap_all_values = 0;
3305306f 4100 dict *d = server.db[0].dict;
bb32ede5 4101 redisDb *db = server.db+0;
f78fd11b 4102 char buf[1024];
242a64f3 4103 time_t expiretime, now = time(NULL);
b492cf00 4104 long long loadedkeys = 0;
bb32ede5 4105
ed9b544e 4106 fp = fopen(filename,"r");
4107 if (!fp) return REDIS_ERR;
4108 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 4109 buf[9] = '\0';
4110 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 4111 fclose(fp);
4112 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4113 return REDIS_ERR;
4114 }
f78fd11b 4115 rdbver = atoi(buf+5);
c78a8ccc 4116 if (rdbver != 1) {
f78fd11b 4117 fclose(fp);
4118 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4119 return REDIS_ERR;
4120 }
ed9b544e 4121 while(1) {
585af7e2 4122 robj *key, *val;
ed9b544e 4123
585af7e2 4124 expiretime = -1;
ed9b544e 4125 /* Read type. */
f78fd11b 4126 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 4127 if (type == REDIS_EXPIRETIME) {
4128 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4129 /* We read the time so we need to read the object type again */
4130 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4131 }
ed9b544e 4132 if (type == REDIS_EOF) break;
4133 /* Handle SELECT DB opcode as a special case */
4134 if (type == REDIS_SELECTDB) {
c78a8ccc 4135 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 4136 goto eoferr;
ed9b544e 4137 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 4138 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 4139 exit(1);
4140 }
bb32ede5 4141 db = server.db+dbid;
4142 d = db->dict;
ed9b544e 4143 continue;
4144 }
4145 /* Read key */
585af7e2 4146 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
c78a8ccc 4147 /* Read value */
585af7e2 4148 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
89e689c5 4149 /* Check if the key already expired */
4150 if (expiretime != -1 && expiretime < now) {
4151 decrRefCount(key);
4152 decrRefCount(val);
4153 continue;
4154 }
ed9b544e 4155 /* Add the new object in the hash table */
585af7e2 4156 retval = dictAdd(d,key,val);
ed9b544e 4157 if (retval == DICT_ERR) {
585af7e2 4158 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
ed9b544e 4159 exit(1);
4160 }
242a64f3 4161 loadedkeys++;
bb32ede5 4162 /* Set the expire time if needed */
89e689c5 4163 if (expiretime != -1) setExpire(db,key,expiretime);
242a64f3 4164
b492cf00 4165 /* Handle swapping while loading big datasets when VM is on */
242a64f3 4166
4167 /* If we detecter we are hopeless about fitting something in memory
4168 * we just swap every new key on disk. Directly...
4169 * Note that's important to check for this condition before resorting
4170 * to random sampling, otherwise we may try to swap already
4171 * swapped keys. */
585af7e2 4172 if (swap_all_values) {
4173 dictEntry *de = dictFind(d,key);
242a64f3 4174
4175 /* de may be NULL since the key already expired */
4176 if (de) {
585af7e2 4177 key = dictGetEntryKey(de);
4178 val = dictGetEntryVal(de);
242a64f3 4179
585af7e2 4180 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
242a64f3 4181 dictGetEntryVal(de) = NULL;
4182 }
4183 }
4184 continue;
4185 }
4186
4187 /* If we have still some hope of having some value fitting memory
4188 * then we try random sampling. */
585af7e2 4189 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
b492cf00 4190 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 4191 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 4192 }
242a64f3 4193 if (zmalloc_used_memory() > server.vm_max_memory)
585af7e2 4194 swap_all_values = 1; /* We are already using too much mem */
b492cf00 4195 }
ed9b544e 4196 }
4197 fclose(fp);
4198 return REDIS_OK;
4199
4200eoferr: /* unexpected end of file is handled here with a fatal exit */
f80dff62 4201 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 4202 exit(1);
4203 return REDIS_ERR; /* Just to avoid warning */
4204}
4205
b58ba105 4206/*================================== Shutdown =============================== */
fab43727 4207static int prepareForShutdown() {
b58ba105
AM
4208 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4209 /* Kill the saving child if there is a background saving in progress.
4210 We want to avoid race conditions, for instance our saving child may
4211 overwrite the synchronous saving did by SHUTDOWN. */
4212 if (server.bgsavechildpid != -1) {
4213 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4214 kill(server.bgsavechildpid,SIGKILL);
4215 rdbRemoveTempFile(server.bgsavechildpid);
4216 }
4217 if (server.appendonly) {
4218 /* Append only file: fsync() the AOF and exit */
b0bd87f6 4219 aof_fsync(server.appendfd);
b58ba105 4220 if (server.vm_enabled) unlink(server.vm_swap_file);
b58ba105
AM
4221 } else {
4222 /* Snapshotting. Perform a SYNC SAVE and exit */
4223 if (rdbSave(server.dbfilename) == REDIS_OK) {
4224 if (server.daemonize)
4225 unlink(server.pidfile);
4226 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
b58ba105
AM
4227 } else {
4228 /* Ooops.. error saving! The best we can do is to continue
4229 * operating. Note that if there was a background saving process,
4230 * in the next cron() Redis will be notified that the background
4231 * saving aborted, handling special stuff like slaves pending for
4232 * synchronization... */
4233 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
fab43727 4234 return REDIS_ERR;
b58ba105
AM
4235 }
4236 }
8513a757 4237 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
fab43727 4238 return REDIS_OK;
b58ba105
AM
4239}
4240
ed9b544e 4241/*================================== Commands =============================== */
4242
abcb223e 4243static void authCommand(redisClient *c) {
2e77c2ee 4244 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
4245 c->authenticated = 1;
4246 addReply(c,shared.ok);
4247 } else {
4248 c->authenticated = 0;
fa4c0aba 4249 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
4250 }
4251}
4252
ed9b544e 4253static void pingCommand(redisClient *c) {
4254 addReply(c,shared.pong);
4255}
4256
4257static void echoCommand(redisClient *c) {
dd88747b 4258 addReplyBulk(c,c->argv[1]);
ed9b544e 4259}
4260
4261/*=================================== Strings =============================== */
4262
526d00a5 4263static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
ed9b544e 4264 int retval;
10ce1276 4265 long seconds = 0; /* initialized to avoid an harmness warning */
ed9b544e 4266
526d00a5 4267 if (expire) {
4268 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4269 return;
4270 if (seconds <= 0) {
4271 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4272 return;
4273 }
4274 }
4275
37ab76c9 4276 touchWatchedKey(c->db,key);
526d00a5 4277 if (nx) deleteIfVolatile(c->db,key);
4278 retval = dictAdd(c->db->dict,key,val);
ed9b544e 4279 if (retval == DICT_ERR) {
4280 if (!nx) {
1b03836c 4281 /* If the key is about a swapped value, we want a new key object
4282 * to overwrite the old. So we delete the old key in the database.
4283 * This will also make sure that swap pages about the old object
4284 * will be marked as free. */
526d00a5 4285 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4286 incrRefCount(key);
4287 dictReplace(c->db->dict,key,val);
4288 incrRefCount(val);
ed9b544e 4289 } else {
c937aa89 4290 addReply(c,shared.czero);
ed9b544e 4291 return;
4292 }
4293 } else {
526d00a5 4294 incrRefCount(key);
4295 incrRefCount(val);
ed9b544e 4296 }
4297 server.dirty++;
526d00a5 4298 removeExpire(c->db,key);
4299 if (expire) setExpire(c->db,key,time(NULL)+seconds);
c937aa89 4300 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 4301}
4302
4303static void setCommand(redisClient *c) {
526d00a5 4304 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
ed9b544e 4305}
4306
4307static void setnxCommand(redisClient *c) {
526d00a5 4308 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4309}
4310
4311static void setexCommand(redisClient *c) {
4312 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
ed9b544e 4313}
4314
322fc7d8 4315static int getGenericCommand(redisClient *c) {
dd88747b 4316 robj *o;
e0a62c7f 4317
dd88747b 4318 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 4319 return REDIS_OK;
dd88747b 4320
4321 if (o->type != REDIS_STRING) {
4322 addReply(c,shared.wrongtypeerr);
4323 return REDIS_ERR;
ed9b544e 4324 } else {
dd88747b 4325 addReplyBulk(c,o);
4326 return REDIS_OK;
ed9b544e 4327 }
4328}
4329
322fc7d8 4330static void getCommand(redisClient *c) {
4331 getGenericCommand(c);
4332}
4333
f6b141c5 4334static void getsetCommand(redisClient *c) {
322fc7d8 4335 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 4336 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4337 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4338 } else {
4339 incrRefCount(c->argv[1]);
4340 }
4341 incrRefCount(c->argv[2]);
4342 server.dirty++;
4343 removeExpire(c->db,c->argv[1]);
4344}
4345
70003d28 4346static void mgetCommand(redisClient *c) {
70003d28 4347 int j;
e0a62c7f 4348
c937aa89 4349 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 4350 for (j = 1; j < c->argc; j++) {
3305306f 4351 robj *o = lookupKeyRead(c->db,c->argv[j]);
4352 if (o == NULL) {
c937aa89 4353 addReply(c,shared.nullbulk);
70003d28 4354 } else {
70003d28 4355 if (o->type != REDIS_STRING) {
c937aa89 4356 addReply(c,shared.nullbulk);
70003d28 4357 } else {
dd88747b 4358 addReplyBulk(c,o);
70003d28 4359 }
4360 }
4361 }
4362}
4363
6c446631 4364static void msetGenericCommand(redisClient *c, int nx) {
906573e7 4365 int j, busykeys = 0;
6c446631 4366
4367 if ((c->argc % 2) == 0) {
454d4e43 4368 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 4369 return;
4370 }
4371 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4372 * set nothing at all if at least one already key exists. */
4373 if (nx) {
4374 for (j = 1; j < c->argc; j += 2) {
906573e7 4375 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4376 busykeys++;
6c446631 4377 }
4378 }
4379 }
906573e7 4380 if (busykeys) {
4381 addReply(c, shared.czero);
4382 return;
4383 }
6c446631 4384
4385 for (j = 1; j < c->argc; j += 2) {
4386 int retval;
4387
05df7621 4388 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
6c446631 4389 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4390 if (retval == DICT_ERR) {
4391 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4392 incrRefCount(c->argv[j+1]);
4393 } else {
4394 incrRefCount(c->argv[j]);
4395 incrRefCount(c->argv[j+1]);
4396 }
4397 removeExpire(c->db,c->argv[j]);
4398 }
4399 server.dirty += (c->argc-1)/2;
4400 addReply(c, nx ? shared.cone : shared.ok);
4401}
4402
4403static void msetCommand(redisClient *c) {
4404 msetGenericCommand(c,0);
4405}
4406
4407static void msetnxCommand(redisClient *c) {
4408 msetGenericCommand(c,1);
4409}
4410
d68ed120 4411static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4412 long long value;
4413 int retval;
4414 robj *o;
e0a62c7f 4415
3305306f 4416 o = lookupKeyWrite(c->db,c->argv[1]);
6485f293
PN
4417 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4418 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
ed9b544e 4419
4420 value += incr;
d6f4c262 4421 o = createStringObjectFromLongLong(value);
3305306f 4422 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 4423 if (retval == DICT_ERR) {
3305306f 4424 dictReplace(c->db->dict,c->argv[1],o);
4425 removeExpire(c->db,c->argv[1]);
ed9b544e 4426 } else {
4427 incrRefCount(c->argv[1]);
4428 }
4429 server.dirty++;
c937aa89 4430 addReply(c,shared.colon);
ed9b544e 4431 addReply(c,o);
4432 addReply(c,shared.crlf);
4433}
4434
4435static void incrCommand(redisClient *c) {
a4d1ba9a 4436 incrDecrCommand(c,1);
ed9b544e 4437}
4438
4439static void decrCommand(redisClient *c) {
a4d1ba9a 4440 incrDecrCommand(c,-1);
ed9b544e 4441}
4442
4443static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4444 long long incr;
4445
bd79a6bd 4446 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4447 incrDecrCommand(c,incr);
ed9b544e 4448}
4449
4450static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4451 long long incr;
4452
bd79a6bd 4453 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4454 incrDecrCommand(c,-incr);
ed9b544e 4455}
4456
4b00bebd 4457static void appendCommand(redisClient *c) {
4458 int retval;
4459 size_t totlen;
4460 robj *o;
4461
4462 o = lookupKeyWrite(c->db,c->argv[1]);
4463 if (o == NULL) {
4464 /* Create the key */
4465 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4466 incrRefCount(c->argv[1]);
4467 incrRefCount(c->argv[2]);
4468 totlen = stringObjectLen(c->argv[2]);
4469 } else {
4470 dictEntry *de;
e0a62c7f 4471
4b00bebd 4472 de = dictFind(c->db->dict,c->argv[1]);
4473 assert(de != NULL);
4474
4475 o = dictGetEntryVal(de);
4476 if (o->type != REDIS_STRING) {
4477 addReply(c,shared.wrongtypeerr);
4478 return;
4479 }
4480 /* If the object is specially encoded or shared we have to make
4481 * a copy */
4482 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4483 robj *decoded = getDecodedObject(o);
4484
4485 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4486 decrRefCount(decoded);
4487 dictReplace(c->db->dict,c->argv[1],o);
4488 }
4489 /* APPEND! */
4490 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4491 o->ptr = sdscatlen(o->ptr,
4492 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4493 } else {
4494 o->ptr = sdscatprintf(o->ptr, "%ld",
4495 (unsigned long) c->argv[2]->ptr);
4496 }
4497 totlen = sdslen(o->ptr);
4498 }
4499 server.dirty++;
4500 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4501}
4502
39191553 4503static void substrCommand(redisClient *c) {
4504 robj *o;
4505 long start = atoi(c->argv[2]->ptr);
4506 long end = atoi(c->argv[3]->ptr);
dd88747b 4507 size_t rangelen, strlen;
4508 sds range;
39191553 4509
dd88747b 4510 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4511 checkType(c,o,REDIS_STRING)) return;
39191553 4512
dd88747b 4513 o = getDecodedObject(o);
4514 strlen = sdslen(o->ptr);
8fe7fad7 4515
dd88747b 4516 /* convert negative indexes */
4517 if (start < 0) start = strlen+start;
4518 if (end < 0) end = strlen+end;
4519 if (start < 0) start = 0;
4520 if (end < 0) end = 0;
39191553 4521
dd88747b 4522 /* indexes sanity checks */
4523 if (start > end || (size_t)start >= strlen) {
4524 /* Out of range start or start > end result in null reply */
4525 addReply(c,shared.nullbulk);
4526 decrRefCount(o);
4527 return;
39191553 4528 }
dd88747b 4529 if ((size_t)end >= strlen) end = strlen-1;
4530 rangelen = (end-start)+1;
4531
4532 /* Return the result */
4533 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4534 range = sdsnewlen((char*)o->ptr+start,rangelen);
4535 addReplySds(c,range);
4536 addReply(c,shared.crlf);
4537 decrRefCount(o);
39191553 4538}
4539
ed9b544e 4540/* ========================= Type agnostic commands ========================= */
4541
4542static void delCommand(redisClient *c) {
5109cdff 4543 int deleted = 0, j;
4544
4545 for (j = 1; j < c->argc; j++) {
4546 if (deleteKey(c->db,c->argv[j])) {
37ab76c9 4547 touchWatchedKey(c->db,c->argv[j]);
5109cdff 4548 server.dirty++;
4549 deleted++;
4550 }
4551 }
482b672d 4552 addReplyLongLong(c,deleted);
ed9b544e 4553}
4554
4555static void existsCommand(redisClient *c) {
f4f06efc
PN
4556 expireIfNeeded(c->db,c->argv[1]);
4557 if (dictFind(c->db->dict,c->argv[1])) {
4558 addReply(c, shared.cone);
4559 } else {
4560 addReply(c, shared.czero);
4561 }
ed9b544e 4562}
4563
4564static void selectCommand(redisClient *c) {
4565 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4566
ed9b544e 4567 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4568 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4569 } else {
4570 addReply(c,shared.ok);
4571 }
4572}
4573
4574static void randomkeyCommand(redisClient *c) {
4575 dictEntry *de;
dc4be23e 4576 robj *key;
e0a62c7f 4577
3305306f 4578 while(1) {
4579 de = dictGetRandomKey(c->db->dict);
ce7bef07 4580 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4581 }
2b619329 4582
ed9b544e 4583 if (de == NULL) {
dc4be23e 4584 addReply(c,shared.nullbulk);
4585 return;
4586 }
4587
4588 key = dictGetEntryKey(de);
4589 if (server.vm_enabled) {
4590 key = dupStringObject(key);
4591 addReplyBulk(c,key);
4592 decrRefCount(key);
ed9b544e 4593 } else {
dc4be23e 4594 addReplyBulk(c,key);
ed9b544e 4595 }
4596}
4597
4598static void keysCommand(redisClient *c) {
4599 dictIterator *di;
4600 dictEntry *de;
4601 sds pattern = c->argv[1]->ptr;
4602 int plen = sdslen(pattern);
a3f9eec2 4603 unsigned long numkeys = 0;
ed9b544e 4604 robj *lenobj = createObject(REDIS_STRING,NULL);
4605
3305306f 4606 di = dictGetIterator(c->db->dict);
ed9b544e 4607 addReply(c,lenobj);
4608 decrRefCount(lenobj);
4609 while((de = dictNext(di)) != NULL) {
4610 robj *keyobj = dictGetEntryKey(de);
3305306f 4611
ed9b544e 4612 sds key = keyobj->ptr;
4613 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4614 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4615 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4616 addReplyBulk(c,keyobj);
3305306f 4617 numkeys++;
3305306f 4618 }
ed9b544e 4619 }
4620 }
4621 dictReleaseIterator(di);
a3f9eec2 4622 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4623}
4624
4625static void dbsizeCommand(redisClient *c) {
4626 addReplySds(c,
3305306f 4627 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4628}
4629
4630static void lastsaveCommand(redisClient *c) {
4631 addReplySds(c,
c937aa89 4632 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4633}
4634
4635static void typeCommand(redisClient *c) {
3305306f 4636 robj *o;
ed9b544e 4637 char *type;
3305306f 4638
4639 o = lookupKeyRead(c->db,c->argv[1]);
4640 if (o == NULL) {
c937aa89 4641 type = "+none";
ed9b544e 4642 } else {
ed9b544e 4643 switch(o->type) {
c937aa89 4644 case REDIS_STRING: type = "+string"; break;
4645 case REDIS_LIST: type = "+list"; break;
4646 case REDIS_SET: type = "+set"; break;
412a8bce 4647 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4648 case REDIS_HASH: type = "+hash"; break;
4649 default: type = "+unknown"; break;
ed9b544e 4650 }
4651 }
4652 addReplySds(c,sdsnew(type));
4653 addReply(c,shared.crlf);
4654}
4655
4656static void saveCommand(redisClient *c) {
9d65a1bb 4657 if (server.bgsavechildpid != -1) {
05557f6d 4658 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4659 return;
4660 }
f78fd11b 4661 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4662 addReply(c,shared.ok);
4663 } else {
4664 addReply(c,shared.err);
4665 }
4666}
4667
4668static void bgsaveCommand(redisClient *c) {
9d65a1bb 4669 if (server.bgsavechildpid != -1) {
ed9b544e 4670 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4671 return;
4672 }
f78fd11b 4673 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4674 char *status = "+Background saving started\r\n";
4675 addReplySds(c,sdsnew(status));
ed9b544e 4676 } else {
4677 addReply(c,shared.err);
4678 }
4679}
4680
4681static void shutdownCommand(redisClient *c) {
fab43727 4682 if (prepareForShutdown() == REDIS_OK)
4683 exit(0);
4684 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
ed9b544e 4685}
4686
4687static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4688 robj *o;
4689
4690 /* To use the same key as src and dst is probably an error */
4691 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4692 addReply(c,shared.sameobjecterr);
ed9b544e 4693 return;
4694 }
4695
dd88747b 4696 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4697 return;
dd88747b 4698
ed9b544e 4699 incrRefCount(o);
3305306f 4700 deleteIfVolatile(c->db,c->argv[2]);
4701 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4702 if (nx) {
4703 decrRefCount(o);
c937aa89 4704 addReply(c,shared.czero);
ed9b544e 4705 return;
4706 }
3305306f 4707 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4708 } else {
4709 incrRefCount(c->argv[2]);
4710 }
3305306f 4711 deleteKey(c->db,c->argv[1]);
b167f877 4712 touchWatchedKey(c->db,c->argv[2]);
ed9b544e 4713 server.dirty++;
c937aa89 4714 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4715}
4716
4717static void renameCommand(redisClient *c) {
4718 renameGenericCommand(c,0);
4719}
4720
4721static void renamenxCommand(redisClient *c) {
4722 renameGenericCommand(c,1);
4723}
4724
4725static void moveCommand(redisClient *c) {
3305306f 4726 robj *o;
4727 redisDb *src, *dst;
ed9b544e 4728 int srcid;
4729
4730 /* Obtain source and target DB pointers */
3305306f 4731 src = c->db;
4732 srcid = c->db->id;
ed9b544e 4733 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4734 addReply(c,shared.outofrangeerr);
ed9b544e 4735 return;
4736 }
3305306f 4737 dst = c->db;
4738 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4739
4740 /* If the user is moving using as target the same
4741 * DB as the source DB it is probably an error. */
4742 if (src == dst) {
c937aa89 4743 addReply(c,shared.sameobjecterr);
ed9b544e 4744 return;
4745 }
4746
4747 /* Check if the element exists and get a reference */
3305306f 4748 o = lookupKeyWrite(c->db,c->argv[1]);
4749 if (!o) {
c937aa89 4750 addReply(c,shared.czero);
ed9b544e 4751 return;
4752 }
4753
4754 /* Try to add the element to the target DB */
3305306f 4755 deleteIfVolatile(dst,c->argv[1]);
4756 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4757 addReply(c,shared.czero);
ed9b544e 4758 return;
4759 }
3305306f 4760 incrRefCount(c->argv[1]);
ed9b544e 4761 incrRefCount(o);
4762
4763 /* OK! key moved, free the entry in the source DB */
3305306f 4764 deleteKey(src,c->argv[1]);
ed9b544e 4765 server.dirty++;
c937aa89 4766 addReply(c,shared.cone);
ed9b544e 4767}
4768
4769/* =================================== Lists ================================ */
4770static void pushGenericCommand(redisClient *c, int where) {
4771 robj *lobj;
ed9b544e 4772 list *list;
3305306f 4773
4774 lobj = lookupKeyWrite(c->db,c->argv[1]);
4775 if (lobj == NULL) {
95242ab5 4776 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4777 addReply(c,shared.cone);
95242ab5 4778 return;
4779 }
ed9b544e 4780 lobj = createListObject();
4781 list = lobj->ptr;
4782 if (where == REDIS_HEAD) {
6b47e12e 4783 listAddNodeHead(list,c->argv[2]);
ed9b544e 4784 } else {
6b47e12e 4785 listAddNodeTail(list,c->argv[2]);
ed9b544e 4786 }
3305306f 4787 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4788 incrRefCount(c->argv[1]);
4789 incrRefCount(c->argv[2]);
4790 } else {
ed9b544e 4791 if (lobj->type != REDIS_LIST) {
4792 addReply(c,shared.wrongtypeerr);
4793 return;
4794 }
95242ab5 4795 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4796 addReply(c,shared.cone);
95242ab5 4797 return;
4798 }
ed9b544e 4799 list = lobj->ptr;
4800 if (where == REDIS_HEAD) {
6b47e12e 4801 listAddNodeHead(list,c->argv[2]);
ed9b544e 4802 } else {
6b47e12e 4803 listAddNodeTail(list,c->argv[2]);
ed9b544e 4804 }
4805 incrRefCount(c->argv[2]);
4806 }
4807 server.dirty++;
482b672d 4808 addReplyLongLong(c,listLength(list));
ed9b544e 4809}
4810
4811static void lpushCommand(redisClient *c) {
4812 pushGenericCommand(c,REDIS_HEAD);
4813}
4814
4815static void rpushCommand(redisClient *c) {
4816 pushGenericCommand(c,REDIS_TAIL);
4817}
4818
4819static void llenCommand(redisClient *c) {
3305306f 4820 robj *o;
ed9b544e 4821 list *l;
dd88747b 4822
4823 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4824 checkType(c,o,REDIS_LIST)) return;
e0a62c7f 4825
dd88747b 4826 l = o->ptr;
4827 addReplyUlong(c,listLength(l));
ed9b544e 4828}
4829
4830static void lindexCommand(redisClient *c) {
3305306f 4831 robj *o;
ed9b544e 4832 int index = atoi(c->argv[2]->ptr);
dd88747b 4833 list *list;
4834 listNode *ln;
4835
4836 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4837 checkType(c,o,REDIS_LIST)) return;
4838 list = o->ptr;
4839
4840 ln = listIndex(list, index);
4841 if (ln == NULL) {
c937aa89 4842 addReply(c,shared.nullbulk);
ed9b544e 4843 } else {
dd88747b 4844 robj *ele = listNodeValue(ln);
4845 addReplyBulk(c,ele);
ed9b544e 4846 }
4847}
4848
4849static void lsetCommand(redisClient *c) {
3305306f 4850 robj *o;
ed9b544e 4851 int index = atoi(c->argv[2]->ptr);
dd88747b 4852 list *list;
4853 listNode *ln;
4854
4855 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4856 checkType(c,o,REDIS_LIST)) return;
4857 list = o->ptr;
4858
4859 ln = listIndex(list, index);
4860 if (ln == NULL) {
4861 addReply(c,shared.outofrangeerr);
ed9b544e 4862 } else {
dd88747b 4863 robj *ele = listNodeValue(ln);
ed9b544e 4864
dd88747b 4865 decrRefCount(ele);
4866 listNodeValue(ln) = c->argv[3];
4867 incrRefCount(c->argv[3]);
4868 addReply(c,shared.ok);
4869 server.dirty++;
ed9b544e 4870 }
4871}
4872
4873static void popGenericCommand(redisClient *c, int where) {
3305306f 4874 robj *o;
dd88747b 4875 list *list;
4876 listNode *ln;
3305306f 4877
dd88747b 4878 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4879 checkType(c,o,REDIS_LIST)) return;
4880 list = o->ptr;
ed9b544e 4881
dd88747b 4882 if (where == REDIS_HEAD)
4883 ln = listFirst(list);
4884 else
4885 ln = listLast(list);
ed9b544e 4886
dd88747b 4887 if (ln == NULL) {
4888 addReply(c,shared.nullbulk);
4889 } else {
4890 robj *ele = listNodeValue(ln);
4891 addReplyBulk(c,ele);
4892 listDelNode(list,ln);
3ea27d37 4893 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4894 server.dirty++;
ed9b544e 4895 }
4896}
4897
4898static void lpopCommand(redisClient *c) {
4899 popGenericCommand(c,REDIS_HEAD);
4900}
4901
4902static void rpopCommand(redisClient *c) {
4903 popGenericCommand(c,REDIS_TAIL);
4904}
4905
4906static void lrangeCommand(redisClient *c) {
3305306f 4907 robj *o;
ed9b544e 4908 int start = atoi(c->argv[2]->ptr);
4909 int end = atoi(c->argv[3]->ptr);
dd88747b 4910 int llen;
4911 int rangelen, j;
4912 list *list;
4913 listNode *ln;
4914 robj *ele;
4915
4e27f268 4916 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4917 || checkType(c,o,REDIS_LIST)) return;
dd88747b 4918 list = o->ptr;
4919 llen = listLength(list);
4920
4921 /* convert negative indexes */
4922 if (start < 0) start = llen+start;
4923 if (end < 0) end = llen+end;
4924 if (start < 0) start = 0;
4925 if (end < 0) end = 0;
4926
4927 /* indexes sanity checks */
4928 if (start > end || start >= llen) {
4929 /* Out of range start or start > end result in empty list */
4930 addReply(c,shared.emptymultibulk);
4931 return;
4932 }
4933 if (end >= llen) end = llen-1;
4934 rangelen = (end-start)+1;
3305306f 4935
dd88747b 4936 /* Return the result in form of a multi-bulk reply */
4937 ln = listIndex(list, start);
4938 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4939 for (j = 0; j < rangelen; j++) {
4940 ele = listNodeValue(ln);
4941 addReplyBulk(c,ele);
4942 ln = ln->next;
ed9b544e 4943 }
4944}
4945
4946static void ltrimCommand(redisClient *c) {
3305306f 4947 robj *o;
ed9b544e 4948 int start = atoi(c->argv[2]->ptr);
4949 int end = atoi(c->argv[3]->ptr);
dd88747b 4950 int llen;
4951 int j, ltrim, rtrim;
4952 list *list;
4953 listNode *ln;
4954
4955 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4956 checkType(c,o,REDIS_LIST)) return;
4957 list = o->ptr;
4958 llen = listLength(list);
4959
4960 /* convert negative indexes */
4961 if (start < 0) start = llen+start;
4962 if (end < 0) end = llen+end;
4963 if (start < 0) start = 0;
4964 if (end < 0) end = 0;
4965
4966 /* indexes sanity checks */
4967 if (start > end || start >= llen) {
4968 /* Out of range start or start > end result in empty list */
4969 ltrim = llen;
4970 rtrim = 0;
ed9b544e 4971 } else {
dd88747b 4972 if (end >= llen) end = llen-1;
4973 ltrim = start;
4974 rtrim = llen-end-1;
4975 }
ed9b544e 4976
dd88747b 4977 /* Remove list elements to perform the trim */
4978 for (j = 0; j < ltrim; j++) {
4979 ln = listFirst(list);
4980 listDelNode(list,ln);
4981 }
4982 for (j = 0; j < rtrim; j++) {
4983 ln = listLast(list);
4984 listDelNode(list,ln);
ed9b544e 4985 }
3ea27d37 4986 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4987 server.dirty++;
4988 addReply(c,shared.ok);
ed9b544e 4989}
4990
4991static void lremCommand(redisClient *c) {
3305306f 4992 robj *o;
dd88747b 4993 list *list;
4994 listNode *ln, *next;
4995 int toremove = atoi(c->argv[2]->ptr);
4996 int removed = 0;
4997 int fromtail = 0;
a4d1ba9a 4998
dd88747b 4999 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5000 checkType(c,o,REDIS_LIST)) return;
5001 list = o->ptr;
5002
5003 if (toremove < 0) {
5004 toremove = -toremove;
5005 fromtail = 1;
5006 }
5007 ln = fromtail ? list->tail : list->head;
5008 while (ln) {
5009 robj *ele = listNodeValue(ln);
5010
5011 next = fromtail ? ln->prev : ln->next;
bf028098 5012 if (equalStringObjects(ele,c->argv[3])) {
dd88747b 5013 listDelNode(list,ln);
5014 server.dirty++;
5015 removed++;
5016 if (toremove && removed == toremove) break;
ed9b544e 5017 }
dd88747b 5018 ln = next;
ed9b544e 5019 }
3ea27d37 5020 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5021 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 5022}
5023
12f9d551 5024/* This is the semantic of this command:
0f5f7e9a 5025 * RPOPLPUSH srclist dstlist:
12f9d551 5026 * IF LLEN(srclist) > 0
5027 * element = RPOP srclist
5028 * LPUSH dstlist element
5029 * RETURN element
5030 * ELSE
5031 * RETURN nil
5032 * END
5033 * END
5034 *
5035 * The idea is to be able to get an element from a list in a reliable way
5036 * since the element is not just returned but pushed against another list
5037 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5038 */
0f5f7e9a 5039static void rpoplpushcommand(redisClient *c) {
12f9d551 5040 robj *sobj;
dd88747b 5041 list *srclist;
5042 listNode *ln;
5043
5044 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5045 checkType(c,sobj,REDIS_LIST)) return;
5046 srclist = sobj->ptr;
5047 ln = listLast(srclist);
12f9d551 5048
dd88747b 5049 if (ln == NULL) {
12f9d551 5050 addReply(c,shared.nullbulk);
5051 } else {
dd88747b 5052 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5053 robj *ele = listNodeValue(ln);
5054 list *dstlist;
e20fb74f 5055
dd88747b 5056 if (dobj && dobj->type != REDIS_LIST) {
5057 addReply(c,shared.wrongtypeerr);
5058 return;
5059 }
12f9d551 5060
dd88747b 5061 /* Add the element to the target list (unless it's directly
5062 * passed to some BLPOP-ing client */
5063 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
5064 if (dobj == NULL) {
5065 /* Create the list if the key does not exist */
5066 dobj = createListObject();
5067 dictAdd(c->db->dict,c->argv[2],dobj);
5068 incrRefCount(c->argv[2]);
12f9d551 5069 }
dd88747b 5070 dstlist = dobj->ptr;
5071 listAddNodeHead(dstlist,ele);
5072 incrRefCount(ele);
12f9d551 5073 }
dd88747b 5074
5075 /* Send the element to the client as reply as well */
5076 addReplyBulk(c,ele);
5077
5078 /* Finally remove the element from the source list */
5079 listDelNode(srclist,ln);
3ea27d37 5080 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5081 server.dirty++;
12f9d551 5082 }
5083}
5084
ed9b544e 5085/* ==================================== Sets ================================ */
5086
5087static void saddCommand(redisClient *c) {
ed9b544e 5088 robj *set;
5089
3305306f 5090 set = lookupKeyWrite(c->db,c->argv[1]);
5091 if (set == NULL) {
ed9b544e 5092 set = createSetObject();
3305306f 5093 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 5094 incrRefCount(c->argv[1]);
5095 } else {
ed9b544e 5096 if (set->type != REDIS_SET) {
c937aa89 5097 addReply(c,shared.wrongtypeerr);
ed9b544e 5098 return;
5099 }
5100 }
5101 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5102 incrRefCount(c->argv[2]);
5103 server.dirty++;
c937aa89 5104 addReply(c,shared.cone);
ed9b544e 5105 } else {
c937aa89 5106 addReply(c,shared.czero);
ed9b544e 5107 }
5108}
5109
5110static void sremCommand(redisClient *c) {
3305306f 5111 robj *set;
ed9b544e 5112
dd88747b 5113 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5114 checkType(c,set,REDIS_SET)) return;
5115
5116 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5117 server.dirty++;
5118 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5119 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5120 addReply(c,shared.cone);
ed9b544e 5121 } else {
dd88747b 5122 addReply(c,shared.czero);
ed9b544e 5123 }
5124}
5125
a4460ef4 5126static void smoveCommand(redisClient *c) {
5127 robj *srcset, *dstset;
5128
5129 srcset = lookupKeyWrite(c->db,c->argv[1]);
5130 dstset = lookupKeyWrite(c->db,c->argv[2]);
5131
5132 /* If the source key does not exist return 0, if it's of the wrong type
5133 * raise an error */
5134 if (srcset == NULL || srcset->type != REDIS_SET) {
5135 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5136 return;
5137 }
5138 /* Error if the destination key is not a set as well */
5139 if (dstset && dstset->type != REDIS_SET) {
5140 addReply(c,shared.wrongtypeerr);
5141 return;
5142 }
5143 /* Remove the element from the source set */
5144 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5145 /* Key not found in the src set! return zero */
5146 addReply(c,shared.czero);
5147 return;
5148 }
3ea27d37 5149 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5150 deleteKey(c->db,c->argv[1]);
a4460ef4 5151 server.dirty++;
5152 /* Add the element to the destination set */
5153 if (!dstset) {
5154 dstset = createSetObject();
5155 dictAdd(c->db->dict,c->argv[2],dstset);
5156 incrRefCount(c->argv[2]);
5157 }
5158 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5159 incrRefCount(c->argv[3]);
5160 addReply(c,shared.cone);
5161}
5162
ed9b544e 5163static void sismemberCommand(redisClient *c) {
3305306f 5164 robj *set;
ed9b544e 5165
dd88747b 5166 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5167 checkType(c,set,REDIS_SET)) return;
5168
5169 if (dictFind(set->ptr,c->argv[2]))
5170 addReply(c,shared.cone);
5171 else
c937aa89 5172 addReply(c,shared.czero);
ed9b544e 5173}
5174
5175static void scardCommand(redisClient *c) {
3305306f 5176 robj *o;
ed9b544e 5177 dict *s;
dd88747b 5178
5179 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5180 checkType(c,o,REDIS_SET)) return;
e0a62c7f 5181
dd88747b 5182 s = o->ptr;
5183 addReplyUlong(c,dictSize(s));
ed9b544e 5184}
5185
12fea928 5186static void spopCommand(redisClient *c) {
5187 robj *set;
5188 dictEntry *de;
5189
dd88747b 5190 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5191 checkType(c,set,REDIS_SET)) return;
5192
5193 de = dictGetRandomKey(set->ptr);
5194 if (de == NULL) {
12fea928 5195 addReply(c,shared.nullbulk);
5196 } else {
dd88747b 5197 robj *ele = dictGetEntryKey(de);
12fea928 5198
dd88747b 5199 addReplyBulk(c,ele);
5200 dictDelete(set->ptr,ele);
5201 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5202 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5203 server.dirty++;
12fea928 5204 }
5205}
5206
2abb95a9 5207static void srandmemberCommand(redisClient *c) {
5208 robj *set;
5209 dictEntry *de;
5210
dd88747b 5211 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5212 checkType(c,set,REDIS_SET)) return;
5213
5214 de = dictGetRandomKey(set->ptr);
5215 if (de == NULL) {
2abb95a9 5216 addReply(c,shared.nullbulk);
5217 } else {
dd88747b 5218 robj *ele = dictGetEntryKey(de);
2abb95a9 5219
dd88747b 5220 addReplyBulk(c,ele);
2abb95a9 5221 }
5222}
5223
ed9b544e 5224static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5225 dict **d1 = (void*) s1, **d2 = (void*) s2;
5226
3305306f 5227 return dictSize(*d1)-dictSize(*d2);
ed9b544e 5228}
5229
682ac724 5230static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 5231 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5232 dictIterator *di;
5233 dictEntry *de;
5234 robj *lenobj = NULL, *dstset = NULL;
682ac724 5235 unsigned long j, cardinality = 0;
ed9b544e 5236
ed9b544e 5237 for (j = 0; j < setsnum; j++) {
5238 robj *setobj;
3305306f 5239
5240 setobj = dstkey ?
5241 lookupKeyWrite(c->db,setskeys[j]) :
5242 lookupKeyRead(c->db,setskeys[j]);
5243 if (!setobj) {
ed9b544e 5244 zfree(dv);
5faa6025 5245 if (dstkey) {
fdcaae84 5246 if (deleteKey(c->db,dstkey))
5247 server.dirty++;
0d36ded0 5248 addReply(c,shared.czero);
5faa6025 5249 } else {
4e27f268 5250 addReply(c,shared.emptymultibulk);
5faa6025 5251 }
ed9b544e 5252 return;
5253 }
ed9b544e 5254 if (setobj->type != REDIS_SET) {
5255 zfree(dv);
c937aa89 5256 addReply(c,shared.wrongtypeerr);
ed9b544e 5257 return;
5258 }
5259 dv[j] = setobj->ptr;
5260 }
5261 /* Sort sets from the smallest to largest, this will improve our
5262 * algorithm's performace */
5263 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5264
5265 /* The first thing we should output is the total number of elements...
5266 * since this is a multi-bulk write, but at this stage we don't know
5267 * the intersection set size, so we use a trick, append an empty object
5268 * to the output list and save the pointer to later modify it with the
5269 * right length */
5270 if (!dstkey) {
5271 lenobj = createObject(REDIS_STRING,NULL);
5272 addReply(c,lenobj);
5273 decrRefCount(lenobj);
5274 } else {
5275 /* If we have a target key where to store the resulting set
5276 * create this key with an empty set inside */
5277 dstset = createSetObject();
ed9b544e 5278 }
5279
5280 /* Iterate all the elements of the first (smallest) set, and test
5281 * the element against all the other sets, if at least one set does
5282 * not include the element it is discarded */
5283 di = dictGetIterator(dv[0]);
ed9b544e 5284
5285 while((de = dictNext(di)) != NULL) {
5286 robj *ele;
5287
5288 for (j = 1; j < setsnum; j++)
5289 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5290 if (j != setsnum)
5291 continue; /* at least one set does not contain the member */
5292 ele = dictGetEntryKey(de);
5293 if (!dstkey) {
dd88747b 5294 addReplyBulk(c,ele);
ed9b544e 5295 cardinality++;
5296 } else {
5297 dictAdd(dstset->ptr,ele,NULL);
5298 incrRefCount(ele);
5299 }
5300 }
5301 dictReleaseIterator(di);
5302
83cdfe18 5303 if (dstkey) {
3ea27d37 5304 /* Store the resulting set into the target, if the intersection
5305 * is not an empty set. */
83cdfe18 5306 deleteKey(c->db,dstkey);
3ea27d37 5307 if (dictSize((dict*)dstset->ptr) > 0) {
5308 dictAdd(c->db->dict,dstkey,dstset);
5309 incrRefCount(dstkey);
482b672d 5310 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5311 } else {
5312 decrRefCount(dstset);
d36c4e97 5313 addReply(c,shared.czero);
3ea27d37 5314 }
40d224a9 5315 server.dirty++;
d36c4e97 5316 } else {
5317 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 5318 }
ed9b544e 5319 zfree(dv);
5320}
5321
5322static void sinterCommand(redisClient *c) {
5323 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5324}
5325
5326static void sinterstoreCommand(redisClient *c) {
5327 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5328}
5329
/* Set-algebra operation selectors passed as the 'op' argument:
 * sunionDiffGenericCommand() branches on REDIS_OP_UNION / REDIS_OP_DIFF,
 * zunionInterGenericCommand() tests for REDIS_OP_INTER. */
f4f56e1d 5330#define REDIS_OP_UNION 0
5331#define REDIS_OP_DIFF 1
2830ca53 5332#define REDIS_OP_INTER 2
f4f56e1d 5333
5334static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 5335 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5336 dictIterator *di;
5337 dictEntry *de;
f4f56e1d 5338 robj *dstset = NULL;
40d224a9 5339 int j, cardinality = 0;
5340
40d224a9 5341 for (j = 0; j < setsnum; j++) {
5342 robj *setobj;
5343
5344 setobj = dstkey ?
5345 lookupKeyWrite(c->db,setskeys[j]) :
5346 lookupKeyRead(c->db,setskeys[j]);
5347 if (!setobj) {
5348 dv[j] = NULL;
5349 continue;
5350 }
5351 if (setobj->type != REDIS_SET) {
5352 zfree(dv);
5353 addReply(c,shared.wrongtypeerr);
5354 return;
5355 }
5356 dv[j] = setobj->ptr;
5357 }
5358
5359 /* We need a temp set object to store our union. If the dstkey
5360 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5361 * this set object will be the resulting object to set into the target key*/
5362 dstset = createSetObject();
5363
40d224a9 5364 /* Iterate all the elements of all the sets, add every element a single
5365 * time to the result set */
5366 for (j = 0; j < setsnum; j++) {
51829ed3 5367 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 5368 if (!dv[j]) continue; /* non existing keys are like empty sets */
5369
5370 di = dictGetIterator(dv[j]);
40d224a9 5371
5372 while((de = dictNext(di)) != NULL) {
5373 robj *ele;
5374
5375 /* dictAdd will not add the same element multiple times */
5376 ele = dictGetEntryKey(de);
f4f56e1d 5377 if (op == REDIS_OP_UNION || j == 0) {
5378 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5379 incrRefCount(ele);
40d224a9 5380 cardinality++;
5381 }
f4f56e1d 5382 } else if (op == REDIS_OP_DIFF) {
5383 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5384 cardinality--;
5385 }
40d224a9 5386 }
5387 }
5388 dictReleaseIterator(di);
51829ed3 5389
d36c4e97 5390 /* result set is empty? Exit asap. */
5391 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5392 }
5393
f4f56e1d 5394 /* Output the content of the resulting set, if not in STORE mode */
5395 if (!dstkey) {
5396 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5397 di = dictGetIterator(dstset->ptr);
f4f56e1d 5398 while((de = dictNext(di)) != NULL) {
5399 robj *ele;
5400
5401 ele = dictGetEntryKey(de);
dd88747b 5402 addReplyBulk(c,ele);
f4f56e1d 5403 }
5404 dictReleaseIterator(di);
d36c4e97 5405 decrRefCount(dstset);
83cdfe18
AG
5406 } else {
5407 /* If we have a target key where to store the resulting set
5408 * create this key with the result set inside */
5409 deleteKey(c->db,dstkey);
3ea27d37 5410 if (dictSize((dict*)dstset->ptr) > 0) {
5411 dictAdd(c->db->dict,dstkey,dstset);
5412 incrRefCount(dstkey);
482b672d 5413 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5414 } else {
5415 decrRefCount(dstset);
d36c4e97 5416 addReply(c,shared.czero);
3ea27d37 5417 }
40d224a9 5418 server.dirty++;
5419 }
5420 zfree(dv);
5421}
5422
5423static void sunionCommand(redisClient *c) {
f4f56e1d 5424 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 5425}
5426
5427static void sunionstoreCommand(redisClient *c) {
f4f56e1d 5428 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5429}
5430
5431static void sdiffCommand(redisClient *c) {
5432 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5433}
5434
5435static void sdiffstoreCommand(redisClient *c) {
5436 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 5437}
5438
6b47e12e 5439/* ==================================== ZSets =============================== */
5440
5441/* ZSETs are ordered sets using two data structures to hold the same elements
5442 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5443 * data structure.
5444 *
5445 * The elements are added to a hash table mapping Redis objects to scores.
5446 * At the same time the elements are added to a skip list mapping scores
5447 * to Redis objects (so objects are sorted by scores in this "view"). */
5448
5449/* This skiplist implementation is almost a C translation of the original
5450 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5451 * Alternative to Balanced Trees", modified in three ways:
5452 * a) this implementation allows for repeated values.
5453 * b) the comparison is not just by key (our 'score') but by satellite data.
5454 * c) there is a back pointer, so it's a doubly linked list with the back
5455 * pointers being only at "level 1". This allows to traverse the list
5456 * from tail to head, useful for ZREVRANGE. */
5457
5458static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5459 zskiplistNode *zn = zmalloc(sizeof(*zn));
5460
5461 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2f4dd7e0 5462 if (level > 1)
2b37892e 5463 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
2f4dd7e0 5464 else
5465 zn->span = NULL;
6b47e12e 5466 zn->score = score;
5467 zn->obj = obj;
5468 return zn;
5469}
5470
5471static zskiplist *zslCreate(void) {
5472 int j;
5473 zskiplist *zsl;
e0a62c7f 5474
6b47e12e 5475 zsl = zmalloc(sizeof(*zsl));
5476 zsl->level = 1;
cc812361 5477 zsl->length = 0;
6b47e12e 5478 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5479 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5480 zsl->header->forward[j] = NULL;
94e543b5 5481
5482 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5483 if (j < ZSKIPLIST_MAXLEVEL-1)
5484 zsl->header->span[j] = 0;
69d95c3e 5485 }
e3870fab 5486 zsl->header->backward = NULL;
5487 zsl->tail = NULL;
6b47e12e 5488 return zsl;
5489}
5490
fd8ccf44 5491static void zslFreeNode(zskiplistNode *node) {
5492 decrRefCount(node->obj);
ad807e6f 5493 zfree(node->forward);
69d95c3e 5494 zfree(node->span);
fd8ccf44 5495 zfree(node);
5496}
5497
5498static void zslFree(zskiplist *zsl) {
ad807e6f 5499 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5500
ad807e6f 5501 zfree(zsl->header->forward);
69d95c3e 5502 zfree(zsl->header->span);
ad807e6f 5503 zfree(zsl->header);
fd8ccf44 5504 while(node) {
599379dd 5505 next = node->forward[0];
fd8ccf44 5506 zslFreeNode(node);
5507 node = next;
5508 }
ad807e6f 5509 zfree(zsl);
fd8ccf44 5510}
5511
6b47e12e 5512static int zslRandomLevel(void) {
5513 int level = 1;
5514 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5515 level += 1;
10c2baa5 5516 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5517}
5518
5519static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5520 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5521 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5522 int i, level;
5523
5524 x = zsl->header;
5525 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5526 /* store rank that is crossed to reach the insert position */
5527 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5528
9d60e6e4 5529 while (x->forward[i] &&
5530 (x->forward[i]->score < score ||
5531 (x->forward[i]->score == score &&
69d95c3e 5532 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5533 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5534 x = x->forward[i];
69d95c3e 5535 }
6b47e12e 5536 update[i] = x;
5537 }
6b47e12e 5538 /* we assume the key is not already inside, since we allow duplicated
5539 * scores, and the re-insertion of score and redis object should never
5540 * happpen since the caller of zslInsert() should test in the hash table
5541 * if the element is already inside or not. */
5542 level = zslRandomLevel();
5543 if (level > zsl->level) {
69d95c3e 5544 for (i = zsl->level; i < level; i++) {
2b37892e 5545 rank[i] = 0;
6b47e12e 5546 update[i] = zsl->header;
2b37892e 5547 update[i]->span[i-1] = zsl->length;
69d95c3e 5548 }
6b47e12e 5549 zsl->level = level;
5550 }
5551 x = zslCreateNode(level,score,obj);
5552 for (i = 0; i < level; i++) {
5553 x->forward[i] = update[i]->forward[i];
5554 update[i]->forward[i] = x;
69d95c3e
PN
5555
5556 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5557 if (i > 0) {
5558 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5559 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5560 }
6b47e12e 5561 }
69d95c3e
PN
5562
5563 /* increment span for untouched levels */
5564 for (i = level; i < zsl->level; i++) {
2b37892e 5565 update[i]->span[i-1]++;
69d95c3e
PN
5566 }
5567
bb975144 5568 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5569 if (x->forward[0])
5570 x->forward[0]->backward = x;
5571 else
5572 zsl->tail = x;
cc812361 5573 zsl->length++;
6b47e12e 5574}
5575
84105336
PN
5576/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5577void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5578 int i;
5579 for (i = 0; i < zsl->level; i++) {
5580 if (update[i]->forward[i] == x) {
5581 if (i > 0) {
5582 update[i]->span[i-1] += x->span[i-1] - 1;
5583 }
5584 update[i]->forward[i] = x->forward[i];
5585 } else {
5586 /* invariant: i > 0, because update[0]->forward[0]
5587 * is always equal to x */
5588 update[i]->span[i-1] -= 1;
5589 }
5590 }
5591 if (x->forward[0]) {
5592 x->forward[0]->backward = x->backward;
5593 } else {
5594 zsl->tail = x->backward;
5595 }
5596 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5597 zsl->level--;
5598 zsl->length--;
5599}
5600
50c55df5 5601/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5602static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5603 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5604 int i;
5605
5606 x = zsl->header;
5607 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5608 while (x->forward[i] &&
5609 (x->forward[i]->score < score ||
5610 (x->forward[i]->score == score &&
5611 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5612 x = x->forward[i];
5613 update[i] = x;
5614 }
5615 /* We may have multiple elements with the same score, what we need
5616 * is to find the element with both the right score and object. */
5617 x = x->forward[0];
bf028098 5618 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
84105336 5619 zslDeleteNode(zsl, x, update);
9d60e6e4 5620 zslFreeNode(x);
9d60e6e4 5621 return 1;
5622 } else {
5623 return 0; /* not found */
e197b441 5624 }
5625 return 0; /* not found */
fd8ccf44 5626}
5627
1807985b 5628/* Delete all the elements with score between min and max from the skiplist.
5629 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5630 * Note that this function takes the reference to the hash table view of the
5631 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5632static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5633 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5634 unsigned long removed = 0;
5635 int i;
5636
5637 x = zsl->header;
5638 for (i = zsl->level-1; i >= 0; i--) {
5639 while (x->forward[i] && x->forward[i]->score < min)
5640 x = x->forward[i];
5641 update[i] = x;
5642 }
5643 /* We may have multiple elements with the same score, what we need
5644 * is to find the element with both the right score and object. */
5645 x = x->forward[0];
5646 while (x && x->score <= max) {
84105336
PN
5647 zskiplistNode *next = x->forward[0];
5648 zslDeleteNode(zsl, x, update);
1807985b 5649 dictDelete(dict,x->obj);
5650 zslFreeNode(x);
1807985b 5651 removed++;
5652 x = next;
5653 }
5654 return removed; /* not found */
5655}
1807985b 5656
9212eafd 5657/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5658 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5659static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5660 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5661 unsigned long traversed = 0, removed = 0;
5662 int i;
5663
9212eafd
PN
5664 x = zsl->header;
5665 for (i = zsl->level-1; i >= 0; i--) {
5666 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5667 traversed += i > 0 ? x->span[i-1] : 1;
5668 x = x->forward[i];
1807985b 5669 }
9212eafd
PN
5670 update[i] = x;
5671 }
5672
5673 traversed++;
5674 x = x->forward[0];
5675 while (x && traversed <= end) {
84105336
PN
5676 zskiplistNode *next = x->forward[0];
5677 zslDeleteNode(zsl, x, update);
1807985b 5678 dictDelete(dict,x->obj);
5679 zslFreeNode(x);
1807985b 5680 removed++;
9212eafd 5681 traversed++;
1807985b 5682 x = next;
5683 }
9212eafd 5684 return removed;
1807985b 5685}
5686
50c55df5 5687/* Find the first node having a score equal or greater than the specified one.
5688 * Returns NULL if there is no match. */
5689static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5690 zskiplistNode *x;
5691 int i;
5692
5693 x = zsl->header;
5694 for (i = zsl->level-1; i >= 0; i--) {
5695 while (x->forward[i] && x->forward[i]->score < score)
5696 x = x->forward[i];
5697 }
5698 /* We may have multiple elements with the same score, what we need
5699 * is to find the element with both the right score and object. */
5700 return x->forward[0];
5701}
5702
27b0ccca
PN
5703/* Find the rank for an element by both score and key.
5704 * Returns 0 when the element cannot be found, rank otherwise.
5705 * Note that the rank is 1-based due to the span of zsl->header to the
5706 * first element. */
5707static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5708 zskiplistNode *x;
5709 unsigned long rank = 0;
5710 int i;
5711
5712 x = zsl->header;
5713 for (i = zsl->level-1; i >= 0; i--) {
5714 while (x->forward[i] &&
5715 (x->forward[i]->score < score ||
5716 (x->forward[i]->score == score &&
5717 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5718 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5719 x = x->forward[i];
5720 }
5721
5722 /* x might be equal to zsl->header, so test if obj is non-NULL */
bf028098 5723 if (x->obj && equalStringObjects(x->obj,o)) {
27b0ccca
PN
5724 return rank;
5725 }
5726 }
5727 return 0;
5728}
5729
e74825c2
PN
5730/* Finds an element by its rank. The rank argument needs to be 1-based. */
5731zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5732 zskiplistNode *x;
5733 unsigned long traversed = 0;
5734 int i;
5735
5736 x = zsl->header;
5737 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 5738 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5739 {
a50ea45c 5740 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5741 x = x->forward[i];
5742 }
e74825c2
PN
5743 if (traversed == rank) {
5744 return x;
5745 }
5746 }
5747 return NULL;
5748}
5749
fd8ccf44 5750/* The actual Z-commands implementations */
5751
7db723ad 5752/* This generic command implements both ZADD and ZINCRBY.
e2665397 5753 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5754 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5755static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5756 robj *zsetobj;
5757 zset *zs;
5758 double *score;
5759
5fc9229c 5760 if (isnan(scoreval)) {
5761 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
5762 return;
5763 }
5764
e2665397 5765 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5766 if (zsetobj == NULL) {
5767 zsetobj = createZsetObject();
e2665397 5768 dictAdd(c->db->dict,key,zsetobj);
5769 incrRefCount(key);
fd8ccf44 5770 } else {
5771 if (zsetobj->type != REDIS_ZSET) {
5772 addReply(c,shared.wrongtypeerr);
5773 return;
5774 }
5775 }
fd8ccf44 5776 zs = zsetobj->ptr;
e2665397 5777
7db723ad 5778 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5779 * needs to handle the two different conditions. It's all about setting
5780 * '*score', that is, the new score to set, to the right value. */
5781 score = zmalloc(sizeof(double));
5782 if (doincrement) {
5783 dictEntry *de;
5784
5785 /* Read the old score. If the element was not present starts from 0 */
5786 de = dictFind(zs->dict,ele);
5787 if (de) {
5788 double *oldscore = dictGetEntryVal(de);
5789 *score = *oldscore + scoreval;
5790 } else {
5791 *score = scoreval;
5792 }
5fc9229c 5793 if (isnan(*score)) {
5794 addReplySds(c,
5795 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
5796 zfree(score);
5797 /* Note that we don't need to check if the zset may be empty and
5798 * should be removed here, as we can only obtain Nan as score if
5799 * there was already an element in the sorted set. */
5800 return;
5801 }
e2665397 5802 } else {
5803 *score = scoreval;
5804 }
5805
5806 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5807 * to both ZADD and ZINCRBY... */
e2665397 5808 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5809 /* case 1: New element */
e2665397 5810 incrRefCount(ele); /* added to hash */
5811 zslInsert(zs->zsl,*score,ele);
5812 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5813 server.dirty++;
e2665397 5814 if (doincrement)
e2665397 5815 addReplyDouble(c,*score);
91d71bfc 5816 else
5817 addReply(c,shared.cone);
fd8ccf44 5818 } else {
5819 dictEntry *de;
5820 double *oldscore;
e0a62c7f 5821
fd8ccf44 5822 /* case 2: Score update operation */
e2665397 5823 de = dictFind(zs->dict,ele);
dfc5e96c 5824 redisAssert(de != NULL);
fd8ccf44 5825 oldscore = dictGetEntryVal(de);
5826 if (*score != *oldscore) {
5827 int deleted;
5828
e2665397 5829 /* Remove and insert the element in the skip list with new score */
5830 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5831 redisAssert(deleted != 0);
e2665397 5832 zslInsert(zs->zsl,*score,ele);
5833 incrRefCount(ele);
5834 /* Update the score in the hash table */
5835 dictReplace(zs->dict,ele,score);
fd8ccf44 5836 server.dirty++;
2161a965 5837 } else {
5838 zfree(score);
fd8ccf44 5839 }
e2665397 5840 if (doincrement)
5841 addReplyDouble(c,*score);
5842 else
5843 addReply(c,shared.czero);
fd8ccf44 5844 }
5845}
5846
e2665397 5847static void zaddCommand(redisClient *c) {
5848 double scoreval;
5849
bd79a6bd 5850 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5851 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5852}
5853
7db723ad 5854static void zincrbyCommand(redisClient *c) {
e2665397 5855 double scoreval;
5856
bd79a6bd 5857 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5858 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5859}
5860
1b7106e7 5861static void zremCommand(redisClient *c) {
5862 robj *zsetobj;
5863 zset *zs;
dd88747b 5864 dictEntry *de;
5865 double *oldscore;
5866 int deleted;
1b7106e7 5867
dd88747b 5868 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5869 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 5870
dd88747b 5871 zs = zsetobj->ptr;
5872 de = dictFind(zs->dict,c->argv[2]);
5873 if (de == NULL) {
5874 addReply(c,shared.czero);
5875 return;
1b7106e7 5876 }
dd88747b 5877 /* Delete from the skiplist */
5878 oldscore = dictGetEntryVal(de);
5879 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5880 redisAssert(deleted != 0);
5881
5882 /* Delete from the hash table */
5883 dictDelete(zs->dict,c->argv[2]);
5884 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5885 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5886 server.dirty++;
5887 addReply(c,shared.cone);
1b7106e7 5888}
5889
1807985b 5890static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
5891 double min;
5892 double max;
dd88747b 5893 long deleted;
1807985b 5894 robj *zsetobj;
5895 zset *zs;
5896
bd79a6bd
PN
5897 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5898 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
bbe025e0 5899
dd88747b 5900 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5901 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 5902
dd88747b 5903 zs = zsetobj->ptr;
5904 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5905 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5906 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5907 server.dirty += deleted;
482b672d 5908 addReplyLongLong(c,deleted);
1807985b 5909}
5910
9212eafd 5911static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
5912 long start;
5913 long end;
dd88747b 5914 int llen;
5915 long deleted;
9212eafd
PN
5916 robj *zsetobj;
5917 zset *zs;
5918
bd79a6bd
PN
5919 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5920 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 5921
dd88747b 5922 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5923 checkType(c,zsetobj,REDIS_ZSET)) return;
5924 zs = zsetobj->ptr;
5925 llen = zs->zsl->length;
9212eafd 5926
dd88747b 5927 /* convert negative indexes */
5928 if (start < 0) start = llen+start;
5929 if (end < 0) end = llen+end;
5930 if (start < 0) start = 0;
5931 if (end < 0) end = 0;
9212eafd 5932
dd88747b 5933 /* indexes sanity checks */
5934 if (start > end || start >= llen) {
5935 addReply(c,shared.czero);
5936 return;
9212eafd 5937 }
dd88747b 5938 if (end >= llen) end = llen-1;
5939
5940 /* increment start and end because zsl*Rank functions
5941 * use 1-based rank */
5942 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5943 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5944 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5945 server.dirty += deleted;
482b672d 5946 addReplyLongLong(c, deleted);
9212eafd
PN
5947}
5948
8f92e768
PN
/* One input of zunionInterGenericCommand(): the dict of a source key plus
 * the weight applied to the values read from it. */
5949typedef struct {
5950 dict *dict; /* dict of the source set/sorted set, NULL when the key is missing */
5951 double weight; /* multiplier for the values of this input, defaults to 1.0 */
5952} zsetopsrc;
5953
5954static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5955 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5956 unsigned long size1, size2;
5957 size1 = d1->dict ? dictSize(d1->dict) : 0;
5958 size2 = d2->dict ? dictSize(d2->dict) : 0;
5959 return size1 - size2;
5960}
5961
d2764cd6
PN
/* Aggregation modes understood by zunionInterAggregate(): sum the values,
 * keep the smallest, or keep the largest. */
5962#define REDIS_AGGR_SUM 1
5963#define REDIS_AGGR_MIN 2
5964#define REDIS_AGGR_MAX 3
/* Value of a dict entry as a double: 1.0 when the entry stores a NULL value,
 * otherwise the double the value points to. Note that the argument is
 * evaluated more than once. */
bc000c1d 5965#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))
d2764cd6
PN
5966
5967inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5968 if (aggregate == REDIS_AGGR_SUM) {
5969 *target = *target + val;
5970 } else if (aggregate == REDIS_AGGR_MIN) {
5971 *target = val < *target ? val : *target;
5972 } else if (aggregate == REDIS_AGGR_MAX) {
5973 *target = val > *target ? val : *target;
5974 } else {
5975 /* safety net */
f83c6cb5 5976 redisPanic("Unknown ZUNION/INTER aggregate type");
d2764cd6
PN
5977 }
5978}
5979
2830ca53 5980static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
bc000c1d 5981 int i, j, setnum;
d2764cd6 5982 int aggregate = REDIS_AGGR_SUM;
8f92e768 5983 zsetopsrc *src;
2830ca53
PN
5984 robj *dstobj;
5985 zset *dstzset;
b287c9bb
PN
5986 dictIterator *di;
5987 dictEntry *de;
5988
bc000c1d
JC
5989 /* expect setnum input keys to be given */
5990 setnum = atoi(c->argv[2]->ptr);
5991 if (setnum < 1) {
5d373da9 5992 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
2830ca53 5993 return;
b287c9bb 5994 }
2830ca53
PN
5995
5996 /* test if the expected number of keys would overflow */
bc000c1d 5997 if (3+setnum > c->argc) {
b287c9bb
PN
5998 addReply(c,shared.syntaxerr);
5999 return;
6000 }
6001
2830ca53 6002 /* read keys to be used for input */
bc000c1d
JC
6003 src = zmalloc(sizeof(zsetopsrc) * setnum);
6004 for (i = 0, j = 3; i < setnum; i++, j++) {
6005 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6006 if (!obj) {
8f92e768 6007 src[i].dict = NULL;
b287c9bb 6008 } else {
bc000c1d
JC
6009 if (obj->type == REDIS_ZSET) {
6010 src[i].dict = ((zset*)obj->ptr)->dict;
6011 } else if (obj->type == REDIS_SET) {
6012 src[i].dict = (obj->ptr);
6013 } else {
8f92e768 6014 zfree(src);
b287c9bb
PN
6015 addReply(c,shared.wrongtypeerr);
6016 return;
6017 }
b287c9bb 6018 }
2830ca53
PN
6019
6020 /* default all weights to 1 */
8f92e768 6021 src[i].weight = 1.0;
b287c9bb
PN
6022 }
6023
2830ca53
PN
6024 /* parse optional extra arguments */
6025 if (j < c->argc) {
d2764cd6 6026 int remaining = c->argc - j;
b287c9bb 6027
2830ca53 6028 while (remaining) {
bc000c1d 6029 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 6030 j++; remaining--;
bc000c1d 6031 for (i = 0; i < setnum; i++, j++, remaining--) {
bd79a6bd 6032 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
bbe025e0 6033 return;
2830ca53 6034 }
d2764cd6
PN
6035 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6036 j++; remaining--;
6037 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6038 aggregate = REDIS_AGGR_SUM;
6039 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6040 aggregate = REDIS_AGGR_MIN;
6041 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6042 aggregate = REDIS_AGGR_MAX;
6043 } else {
6044 zfree(src);
6045 addReply(c,shared.syntaxerr);
6046 return;
6047 }
6048 j++; remaining--;
2830ca53 6049 } else {
8f92e768 6050 zfree(src);
2830ca53
PN
6051 addReply(c,shared.syntaxerr);
6052 return;
6053 }
6054 }
6055 }
b287c9bb 6056
d2764cd6
PN
6057 /* sort sets from the smallest to largest, this will improve our
6058 * algorithm's performance */
bc000c1d 6059 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
d2764cd6 6060
2830ca53
PN
6061 dstobj = createZsetObject();
6062 dstzset = dstobj->ptr;
6063
6064 if (op == REDIS_OP_INTER) {
8f92e768
PN
6065 /* skip going over all entries if the smallest zset is NULL or empty */
6066 if (src[0].dict && dictSize(src[0].dict) > 0) {
6067 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6068 * from small to large, all src[i > 0].dict are non-empty too */
6069 di = dictGetIterator(src[0].dict);
2830ca53 6070 while((de = dictNext(di)) != NULL) {
d2764cd6 6071 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6072 *score = src[0].weight * zunionInterDictValue(de);
2830ca53 6073
bc000c1d 6074 for (j = 1; j < setnum; j++) {
d2764cd6 6075 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6076 if (other) {
bc000c1d 6077 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6078 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6079 } else {
6080 break;
6081 }
6082 }
b287c9bb 6083
2830ca53 6084 /* skip entry when not present in every source dict */
bc000c1d 6085 if (j != setnum) {
2830ca53
PN
6086 zfree(score);
6087 } else {
6088 robj *o = dictGetEntryKey(de);
6089 dictAdd(dstzset->dict,o,score);
6090 incrRefCount(o); /* added to dictionary */
6091 zslInsert(dstzset->zsl,*score,o);
6092 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
6093 }
6094 }
2830ca53
PN
6095 dictReleaseIterator(di);
6096 }
6097 } else if (op == REDIS_OP_UNION) {
bc000c1d 6098 for (i = 0; i < setnum; i++) {
8f92e768 6099 if (!src[i].dict) continue;
2830ca53 6100
8f92e768 6101 di = dictGetIterator(src[i].dict);
2830ca53
PN
6102 while((de = dictNext(di)) != NULL) {
6103 /* skip key when already processed */
6104 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6105
d2764cd6 6106 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6107 *score = src[i].weight * zunionInterDictValue(de);
2830ca53 6108
d2764cd6
PN
6109 /* because the zsets are sorted by size, its only possible
6110 * for sets at larger indices to hold this entry */
bc000c1d 6111 for (j = (i+1); j < setnum; j++) {
d2764cd6 6112 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6113 if (other) {
bc000c1d 6114 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6115 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6116 }
6117 }
b287c9bb 6118
2830ca53
PN
6119 robj *o = dictGetEntryKey(de);
6120 dictAdd(dstzset->dict,o,score);
6121 incrRefCount(o); /* added to dictionary */
6122 zslInsert(dstzset->zsl,*score,o);
6123 incrRefCount(o); /* added to skiplist */
6124 }
6125 dictReleaseIterator(di);
b287c9bb 6126 }
2830ca53
PN
6127 } else {
6128 /* unknown operator */
6129 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
6130 }
6131
6132 deleteKey(c->db,dstkey);
3ea27d37 6133 if (dstzset->zsl->length) {
6134 dictAdd(c->db->dict,dstkey,dstobj);
6135 incrRefCount(dstkey);
482b672d 6136 addReplyLongLong(c, dstzset->zsl->length);
3ea27d37 6137 server.dirty++;
6138 } else {
8bca8773 6139 decrRefCount(dstobj);
3ea27d37 6140 addReply(c, shared.czero);
6141 }
8f92e768 6142 zfree(src);
b287c9bb
PN
6143}
6144
5d373da9 6145static void zunionstoreCommand(redisClient *c) {
2830ca53 6146 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
6147}
6148
5d373da9 6149static void zinterstoreCommand(redisClient *c) {
2830ca53 6150 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
6151}
6152
e3870fab 6153static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 6154 robj *o;
bbe025e0
AM
6155 long start;
6156 long end;
752da584 6157 int withscores = 0;
dd88747b 6158 int llen;
6159 int rangelen, j;
6160 zset *zsetobj;
6161 zskiplist *zsl;
6162 zskiplistNode *ln;
6163 robj *ele;
752da584 6164
bd79a6bd
PN
6165 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6166 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6167
752da584 6168 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6169 withscores = 1;
6170 } else if (c->argc >= 5) {
6171 addReply(c,shared.syntaxerr);
6172 return;
6173 }
cc812361 6174
4e27f268 6175 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6176 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 6177 zsetobj = o->ptr;
6178 zsl = zsetobj->zsl;
6179 llen = zsl->length;
cc812361 6180
dd88747b 6181 /* convert negative indexes */
6182 if (start < 0) start = llen+start;
6183 if (end < 0) end = llen+end;
6184 if (start < 0) start = 0;
6185 if (end < 0) end = 0;
cc812361 6186
dd88747b 6187 /* indexes sanity checks */
6188 if (start > end || start >= llen) {
6189 /* Out of range start or start > end result in empty list */
6190 addReply(c,shared.emptymultibulk);
6191 return;
6192 }
6193 if (end >= llen) end = llen-1;
6194 rangelen = (end-start)+1;
cc812361 6195
dd88747b 6196 /* check if starting point is trivial, before searching
6197 * the element in log(N) time */
6198 if (reverse) {
6199 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6200 } else {
6201 ln = start == 0 ?
6202 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6203 }
cc812361 6204
dd88747b 6205 /* Return the result in form of a multi-bulk reply */
6206 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6207 withscores ? (rangelen*2) : rangelen));
6208 for (j = 0; j < rangelen; j++) {
6209 ele = ln->obj;
6210 addReplyBulk(c,ele);
6211 if (withscores)
6212 addReplyDouble(c,ln->score);
6213 ln = reverse ? ln->backward : ln->forward[0];
cc812361 6214 }
6215}
6216
e3870fab 6217static void zrangeCommand(redisClient *c) {
6218 zrangeGenericCommand(c,0);
6219}
6220
6221static void zrevrangeCommand(redisClient *c) {
6222 zrangeGenericCommand(c,1);
6223}
6224
f44dd428 6225/* This command implements both ZRANGEBYSCORE and ZCOUNT.
6226 * If justcount is non-zero, just the count is returned. */
6227static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 6228 robj *o;
f44dd428 6229 double min, max;
6230 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 6231 int offset = 0, limit = -1;
0500ef27
SH
6232 int withscores = 0;
6233 int badsyntax = 0;
6234
f44dd428 6235 /* Parse the min-max interval. If one of the values is prefixed
6236 * by the "(" character, it's considered "open". For instance
6237 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6238 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6239 if (((char*)c->argv[2]->ptr)[0] == '(') {
6240 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6241 minex = 1;
6242 } else {
6243 min = strtod(c->argv[2]->ptr,NULL);
6244 }
6245 if (((char*)c->argv[3]->ptr)[0] == '(') {
6246 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6247 maxex = 1;
6248 } else {
6249 max = strtod(c->argv[3]->ptr,NULL);
6250 }
6251
6252 /* Parse "WITHSCORES": note that if the command was called with
6253 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6254 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 6255 if (c->argc == 5 || c->argc == 8) {
3a3978b1 6256 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6257 withscores = 1;
6258 else
6259 badsyntax = 1;
0500ef27 6260 }
3a3978b1 6261 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 6262 badsyntax = 1;
0500ef27 6263 if (badsyntax) {
454d4e43 6264 addReplySds(c,
6265 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 6266 return;
0500ef27
SH
6267 }
6268
f44dd428 6269 /* Parse "LIMIT" */
0500ef27 6270 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 6271 addReply(c,shared.syntaxerr);
6272 return;
0500ef27 6273 } else if (c->argc == (7 + withscores)) {
80181f78 6274 offset = atoi(c->argv[5]->ptr);
6275 limit = atoi(c->argv[6]->ptr);
0b13687c 6276 if (offset < 0) offset = 0;
80181f78 6277 }
50c55df5 6278
f44dd428 6279 /* Ok, lookup the key and get the range */
50c55df5 6280 o = lookupKeyRead(c->db,c->argv[1]);
6281 if (o == NULL) {
4e27f268 6282 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6283 } else {
6284 if (o->type != REDIS_ZSET) {
6285 addReply(c,shared.wrongtypeerr);
6286 } else {
6287 zset *zsetobj = o->ptr;
6288 zskiplist *zsl = zsetobj->zsl;
6289 zskiplistNode *ln;
f44dd428 6290 robj *ele, *lenobj = NULL;
6291 unsigned long rangelen = 0;
50c55df5 6292
f44dd428 6293 /* Get the first node with the score >= min, or with
6294 * score > min if 'minex' is true. */
50c55df5 6295 ln = zslFirstWithScore(zsl,min);
f44dd428 6296 while (minex && ln && ln->score == min) ln = ln->forward[0];
6297
50c55df5 6298 if (ln == NULL) {
6299 /* No element matching the speciifed interval */
f44dd428 6300 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6301 return;
6302 }
6303
6304 /* We don't know in advance how many matching elements there
6305 * are in the list, so we push this object that will represent
6306 * the multi-bulk length in the output buffer, and will "fix"
6307 * it later */
f44dd428 6308 if (!justcount) {
6309 lenobj = createObject(REDIS_STRING,NULL);
6310 addReply(c,lenobj);
6311 decrRefCount(lenobj);
6312 }
50c55df5 6313
f44dd428 6314 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 6315 if (offset) {
6316 offset--;
6317 ln = ln->forward[0];
6318 continue;
6319 }
6320 if (limit == 0) break;
f44dd428 6321 if (!justcount) {
6322 ele = ln->obj;
dd88747b 6323 addReplyBulk(c,ele);
f44dd428 6324 if (withscores)
6325 addReplyDouble(c,ln->score);
6326 }
50c55df5 6327 ln = ln->forward[0];
6328 rangelen++;
80181f78 6329 if (limit > 0) limit--;
50c55df5 6330 }
f44dd428 6331 if (justcount) {
482b672d 6332 addReplyLongLong(c,(long)rangelen);
f44dd428 6333 } else {
6334 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6335 withscores ? (rangelen*2) : rangelen);
6336 }
50c55df5 6337 }
6338 }
6339}
6340
f44dd428 6341static void zrangebyscoreCommand(redisClient *c) {
6342 genericZrangebyscoreCommand(c,0);
6343}
6344
6345static void zcountCommand(redisClient *c) {
6346 genericZrangebyscoreCommand(c,1);
6347}
6348
3c41331e 6349static void zcardCommand(redisClient *c) {
e197b441 6350 robj *o;
6351 zset *zs;
dd88747b 6352
6353 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6354 checkType(c,o,REDIS_ZSET)) return;
6355
6356 zs = o->ptr;
6357 addReplyUlong(c,zs->zsl->length);
e197b441 6358}
6359
6e333bbe 6360static void zscoreCommand(redisClient *c) {
6361 robj *o;
6362 zset *zs;
dd88747b 6363 dictEntry *de;
6364
6365 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6366 checkType(c,o,REDIS_ZSET)) return;
6367
6368 zs = o->ptr;
6369 de = dictFind(zs->dict,c->argv[2]);
6370 if (!de) {
96d8b4ee 6371 addReply(c,shared.nullbulk);
6e333bbe 6372 } else {
dd88747b 6373 double *score = dictGetEntryVal(de);
6e333bbe 6374
dd88747b 6375 addReplyDouble(c,*score);
6e333bbe 6376 }
6377}
6378
798d9e55 6379static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 6380 robj *o;
dd88747b 6381 zset *zs;
6382 zskiplist *zsl;
6383 dictEntry *de;
6384 unsigned long rank;
6385 double *score;
6386
6387 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6388 checkType(c,o,REDIS_ZSET)) return;
6389
6390 zs = o->ptr;
6391 zsl = zs->zsl;
6392 de = dictFind(zs->dict,c->argv[2]);
6393 if (!de) {
69d95c3e
PN
6394 addReply(c,shared.nullbulk);
6395 return;
6396 }
69d95c3e 6397
dd88747b 6398 score = dictGetEntryVal(de);
6399 rank = zslGetRank(zsl, *score, c->argv[2]);
6400 if (rank) {
6401 if (reverse) {
482b672d 6402 addReplyLongLong(c, zsl->length - rank);
27b0ccca 6403 } else {
482b672d 6404 addReplyLongLong(c, rank-1);
69d95c3e 6405 }
dd88747b 6406 } else {
6407 addReply(c,shared.nullbulk);
978c2c94 6408 }
6409}
6410
798d9e55
PN
6411static void zrankCommand(redisClient *c) {
6412 zrankGenericCommand(c, 0);
6413}
6414
6415static void zrevrankCommand(redisClient *c) {
6416 zrankGenericCommand(c, 1);
6417}
6418
/* ========================= Hashes utility functions ======================= */

/* Bit flags selecting which part of a hash entry is wanted. They are
 * tested with '&' by the hash iteration code and may be OR-ed together. */
#define REDIS_HASH_KEY 1
#define REDIS_HASH_VALUE 2
978c2c94 6422
7fb16bac
PN
6423/* Check the length of a number of objects to see if we need to convert a
6424 * zipmap to a real hash. Note that we only check string encoded objects
6425 * as their string length can be queried in constant time. */
6426static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6427 int i;
6428 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
978c2c94 6429
7fb16bac
PN
6430 for (i = start; i <= end; i++) {
6431 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6432 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6433 {
6434 convertToRealHash(subject);
978c2c94 6435 return;
6436 }
6437 }
7fb16bac 6438}
bae2c7ec 6439
97224de7
PN
6440/* Encode given objects in-place when the hash uses a dict. */
6441static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6442 if (subject->encoding == REDIS_ENCODING_HT) {
3f973463
PN
6443 if (o1) *o1 = tryObjectEncoding(*o1);
6444 if (o2) *o2 = tryObjectEncoding(*o2);
97224de7
PN
6445 }
6446}
6447
7fb16bac 6448/* Get the value from a hash identified by key. Returns either a string
a3f3af86
PN
6449 * object or NULL if the value cannot be found. The refcount of the object
6450 * is always increased by 1 when the value was found. */
7fb16bac
PN
6451static robj *hashGet(robj *o, robj *key) {
6452 robj *value = NULL;
978c2c94 6453 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7fb16bac
PN
6454 unsigned char *v;
6455 unsigned int vlen;
6456 key = getDecodedObject(key);
6457 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6458 value = createStringObject((char*)v,vlen);
6459 }
6460 decrRefCount(key);
6461 } else {
6462 dictEntry *de = dictFind(o->ptr,key);
6463 if (de != NULL) {
6464 value = dictGetEntryVal(de);
a3f3af86 6465 incrRefCount(value);
7fb16bac
PN
6466 }
6467 }
6468 return value;
6469}
978c2c94 6470
7fb16bac
PN
6471/* Test if the key exists in the given hash. Returns 1 if the key
6472 * exists and 0 when it doesn't. */
6473static int hashExists(robj *o, robj *key) {
6474 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6475 key = getDecodedObject(key);
6476 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6477 decrRefCount(key);
6478 return 1;
6479 }
6480 decrRefCount(key);
6481 } else {
6482 if (dictFind(o->ptr,key) != NULL) {
6483 return 1;
6484 }
6485 }
6486 return 0;
6487}
bae2c7ec 6488
7fb16bac
PN
6489/* Add an element, discard the old if the key already exists.
6490 * Return 0 on insert and 1 on update. */
feb8d7e6 6491static int hashSet(robj *o, robj *key, robj *value) {
7fb16bac
PN
6492 int update = 0;
6493 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6494 key = getDecodedObject(key);
6495 value = getDecodedObject(value);
6496 o->ptr = zipmapSet(o->ptr,
6497 key->ptr,sdslen(key->ptr),
6498 value->ptr,sdslen(value->ptr), &update);
6499 decrRefCount(key);
6500 decrRefCount(value);
6501
6502 /* Check if the zipmap needs to be upgraded to a real hash table */
6503 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
bae2c7ec 6504 convertToRealHash(o);
978c2c94 6505 } else {
7fb16bac
PN
6506 if (dictReplace(o->ptr,key,value)) {
6507 /* Insert */
6508 incrRefCount(key);
978c2c94 6509 } else {
7fb16bac 6510 /* Update */
978c2c94 6511 update = 1;
6512 }
7fb16bac 6513 incrRefCount(value);
978c2c94 6514 }
7fb16bac 6515 return update;
978c2c94 6516}
6517
7fb16bac
PN
6518/* Delete an element from a hash.
6519 * Return 1 on deleted and 0 on not found. */
6520static int hashDelete(robj *o, robj *key) {
6521 int deleted = 0;
6522 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6523 key = getDecodedObject(key);
6524 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6525 decrRefCount(key);
6526 } else {
6527 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6528 /* Always check if the dictionary needs a resize after a delete. */
6529 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
d33278d1 6530 }
7fb16bac
PN
6531 return deleted;
6532}
d33278d1 6533
7fb16bac 6534/* Return the number of elements in a hash. */
c811bb38 6535static unsigned long hashLength(robj *o) {
7fb16bac
PN
6536 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6537 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6538}
6539
6540/* Structure to hold hash iteration abstration. Note that iteration over
6541 * hashes involves both fields and values. Because it is possible that
6542 * not both are required, store pointers in the iterator to avoid
6543 * unnecessary memory allocation for fields/values. */
6544typedef struct {
6545 int encoding;
6546 unsigned char *zi;
6547 unsigned char *zk, *zv;
6548 unsigned int zklen, zvlen;
6549
6550 dictIterator *di;
6551 dictEntry *de;
6552} hashIterator;
6553
c44d3b56
PN
6554static hashIterator *hashInitIterator(robj *subject) {
6555 hashIterator *hi = zmalloc(sizeof(hashIterator));
7fb16bac
PN
6556 hi->encoding = subject->encoding;
6557 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6558 hi->zi = zipmapRewind(subject->ptr);
6559 } else if (hi->encoding == REDIS_ENCODING_HT) {
6560 hi->di = dictGetIterator(subject->ptr);
d33278d1 6561 } else {
7fb16bac 6562 redisAssert(NULL);
d33278d1 6563 }
c44d3b56 6564 return hi;
7fb16bac 6565}
d33278d1 6566
7fb16bac
PN
6567static void hashReleaseIterator(hashIterator *hi) {
6568 if (hi->encoding == REDIS_ENCODING_HT) {
6569 dictReleaseIterator(hi->di);
d33278d1 6570 }
c44d3b56 6571 zfree(hi);
7fb16bac 6572}
d33278d1 6573
7fb16bac
PN
6574/* Move to the next entry in the hash. Return REDIS_OK when the next entry
6575 * could be found and REDIS_ERR when the iterator reaches the end. */
c811bb38 6576static int hashNext(hashIterator *hi) {
7fb16bac
PN
6577 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6578 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6579 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6580 } else {
6581 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6582 }
6583 return REDIS_OK;
6584}
d33278d1 6585
0c390abc 6586/* Get key or value object at current iteration position.
a3f3af86 6587 * This increases the refcount of the field object by 1. */
c811bb38 6588static robj *hashCurrent(hashIterator *hi, int what) {
7fb16bac
PN
6589 robj *o;
6590 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6591 if (what & REDIS_HASH_KEY) {
6592 o = createStringObject((char*)hi->zk,hi->zklen);
6593 } else {
6594 o = createStringObject((char*)hi->zv,hi->zvlen);
d33278d1 6595 }
d33278d1 6596 } else {
7fb16bac
PN
6597 if (what & REDIS_HASH_KEY) {
6598 o = dictGetEntryKey(hi->de);
6599 } else {
6600 o = dictGetEntryVal(hi->de);
d33278d1 6601 }
a3f3af86 6602 incrRefCount(o);
d33278d1 6603 }
7fb16bac 6604 return o;
d33278d1
PN
6605}
6606
7fb16bac
PN
6607static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6608 robj *o = lookupKeyWrite(c->db,key);
01426b05
PN
6609 if (o == NULL) {
6610 o = createHashObject();
7fb16bac
PN
6611 dictAdd(c->db->dict,key,o);
6612 incrRefCount(key);
01426b05
PN
6613 } else {
6614 if (o->type != REDIS_HASH) {
6615 addReply(c,shared.wrongtypeerr);
7fb16bac 6616 return NULL;
01426b05
PN
6617 }
6618 }
7fb16bac
PN
6619 return o;
6620}
01426b05 6621
7fb16bac
PN
6622/* ============================= Hash commands ============================== */
6623static void hsetCommand(redisClient *c) {
6e9e463f 6624 int update;
7fb16bac 6625 robj *o;
bbe025e0 6626
7fb16bac
PN
6627 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6628 hashTryConversion(o,c->argv,2,3);
97224de7 6629 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6630 update = hashSet(o,c->argv[2],c->argv[3]);
6e9e463f 6631 addReply(c, update ? shared.czero : shared.cone);
7fb16bac
PN
6632 server.dirty++;
6633}
01426b05 6634
1f1c7695
PN
6635static void hsetnxCommand(redisClient *c) {
6636 robj *o;
6637 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6638 hashTryConversion(o,c->argv,2,3);
6639
6640 if (hashExists(o, c->argv[2])) {
6641 addReply(c, shared.czero);
01426b05 6642 } else {
97224de7 6643 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6644 hashSet(o,c->argv[2],c->argv[3]);
1f1c7695
PN
6645 addReply(c, shared.cone);
6646 server.dirty++;
6647 }
6648}
01426b05 6649
7fb16bac
PN
6650static void hmsetCommand(redisClient *c) {
6651 int i;
6652 robj *o;
01426b05 6653
7fb16bac
PN
6654 if ((c->argc % 2) == 1) {
6655 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6656 return;
6657 }
01426b05 6658
7fb16bac
PN
6659 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6660 hashTryConversion(o,c->argv,2,c->argc-1);
6661 for (i = 2; i < c->argc; i += 2) {
97224de7 6662 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
feb8d7e6 6663 hashSet(o,c->argv[i],c->argv[i+1]);
7fb16bac
PN
6664 }
6665 addReply(c, shared.ok);
edc2f63a 6666 server.dirty++;
7fb16bac
PN
6667}
6668
6669static void hincrbyCommand(redisClient *c) {
6670 long long value, incr;
6671 robj *o, *current, *new;
6672
bd79a6bd 6673 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7fb16bac
PN
6674 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6675 if ((current = hashGet(o,c->argv[2])) != NULL) {
946342c1
PN
6676 if (getLongLongFromObjectOrReply(c,current,&value,
6677 "hash value is not an integer") != REDIS_OK) {
6678 decrRefCount(current);
6679 return;
6680 }
a3f3af86 6681 decrRefCount(current);
7fb16bac
PN
6682 } else {
6683 value = 0;
01426b05
PN
6684 }
6685
7fb16bac 6686 value += incr;
3f973463
PN
6687 new = createStringObjectFromLongLong(value);
6688 hashTryObjectEncoding(o,&c->argv[2],NULL);
feb8d7e6 6689 hashSet(o,c->argv[2],new);
7fb16bac
PN
6690 decrRefCount(new);
6691 addReplyLongLong(c,value);
01426b05 6692 server.dirty++;
01426b05
PN
6693}
6694
978c2c94 6695static void hgetCommand(redisClient *c) {
7fb16bac 6696 robj *o, *value;
dd88747b 6697 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6698 checkType(c,o,REDIS_HASH)) return;
6699
7fb16bac
PN
6700 if ((value = hashGet(o,c->argv[2])) != NULL) {
6701 addReplyBulk(c,value);
a3f3af86 6702 decrRefCount(value);
dd88747b 6703 } else {
7fb16bac 6704 addReply(c,shared.nullbulk);
69d95c3e 6705 }
69d95c3e
PN
6706}
6707
09aeb579
PN
6708static void hmgetCommand(redisClient *c) {
6709 int i;
7fb16bac
PN
6710 robj *o, *value;
6711 o = lookupKeyRead(c->db,c->argv[1]);
6712 if (o != NULL && o->type != REDIS_HASH) {
6713 addReply(c,shared.wrongtypeerr);
09aeb579
PN
6714 }
6715
7fb16bac
PN
6716 /* Note the check for o != NULL happens inside the loop. This is
6717 * done because objects that cannot be found are considered to be
6718 * an empty hash. The reply should then be a series of NULLs. */
09aeb579 6719 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7fb16bac
PN
6720 for (i = 2; i < c->argc; i++) {
6721 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6722 addReplyBulk(c,value);
a3f3af86 6723 decrRefCount(value);
7fb16bac
PN
6724 } else {
6725 addReply(c,shared.nullbulk);
09aeb579
PN
6726 }
6727 }
6728}
6729
07efaf74 6730static void hdelCommand(redisClient *c) {
dd88747b 6731 robj *o;
dd88747b 6732 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6733 checkType(c,o,REDIS_HASH)) return;
07efaf74 6734
7fb16bac
PN
6735 if (hashDelete(o,c->argv[2])) {
6736 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6737 addReply(c,shared.cone);
6738 server.dirty++;
dd88747b 6739 } else {
7fb16bac 6740 addReply(c,shared.czero);
07efaf74 6741 }
6742}
6743
92b27fe9 6744static void hlenCommand(redisClient *c) {
6745 robj *o;
dd88747b 6746 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 6747 checkType(c,o,REDIS_HASH)) return;
6748
7fb16bac 6749 addReplyUlong(c,hashLength(o));
92b27fe9 6750}
6751
78409a0f 6752static void genericHgetallCommand(redisClient *c, int flags) {
7fb16bac 6753 robj *o, *lenobj, *obj;
78409a0f 6754 unsigned long count = 0;
c44d3b56 6755 hashIterator *hi;
78409a0f 6756
4e27f268 6757 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 6758 || checkType(c,o,REDIS_HASH)) return;
6759
6760 lenobj = createObject(REDIS_STRING,NULL);
6761 addReply(c,lenobj);
6762 decrRefCount(lenobj);
6763
c44d3b56
PN
6764 hi = hashInitIterator(o);
6765 while (hashNext(hi) != REDIS_ERR) {
7fb16bac 6766 if (flags & REDIS_HASH_KEY) {
c44d3b56 6767 obj = hashCurrent(hi,REDIS_HASH_KEY);
7fb16bac 6768 addReplyBulk(c,obj);
a3f3af86 6769 decrRefCount(obj);
7fb16bac 6770 count++;
78409a0f 6771 }
7fb16bac 6772 if (flags & REDIS_HASH_VALUE) {
c44d3b56 6773 obj = hashCurrent(hi,REDIS_HASH_VALUE);
7fb16bac 6774 addReplyBulk(c,obj);
a3f3af86 6775 decrRefCount(obj);
7fb16bac 6776 count++;
78409a0f 6777 }
78409a0f 6778 }
c44d3b56 6779 hashReleaseIterator(hi);
7fb16bac 6780
78409a0f 6781 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6782}
6783
6784static void hkeysCommand(redisClient *c) {
7fb16bac 6785 genericHgetallCommand(c,REDIS_HASH_KEY);
78409a0f 6786}
6787
6788static void hvalsCommand(redisClient *c) {
7fb16bac 6789 genericHgetallCommand(c,REDIS_HASH_VALUE);
78409a0f 6790}
6791
6792static void hgetallCommand(redisClient *c) {
7fb16bac 6793 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
78409a0f 6794}
6795
a86f14b1 6796static void hexistsCommand(redisClient *c) {
6797 robj *o;
a86f14b1 6798 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6799 checkType(c,o,REDIS_HASH)) return;
6800
7fb16bac 6801 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
a86f14b1 6802}
6803
ada386b2 6804static void convertToRealHash(robj *o) {
6805 unsigned char *key, *val, *p, *zm = o->ptr;
6806 unsigned int klen, vlen;
6807 dict *dict = dictCreate(&hashDictType,NULL);
6808
6809 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6810 p = zipmapRewind(zm);
6811 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6812 robj *keyobj, *valobj;
6813
6814 keyobj = createStringObject((char*)key,klen);
6815 valobj = createStringObject((char*)val,vlen);
05df7621 6816 keyobj = tryObjectEncoding(keyobj);
6817 valobj = tryObjectEncoding(valobj);
ada386b2 6818 dictAdd(dict,keyobj,valobj);
6819 }
6820 o->encoding = REDIS_ENCODING_HT;
6821 o->ptr = dict;
6822 zfree(zm);
6823}
6824
6b47e12e 6825/* ========================= Non type-specific commands ==================== */
6826
ed9b544e 6827static void flushdbCommand(redisClient *c) {
ca37e9cd 6828 server.dirty += dictSize(c->db->dict);
9b30e1a2 6829 touchWatchedKeysOnFlush(c->db->id);
3305306f 6830 dictEmpty(c->db->dict);
6831 dictEmpty(c->db->expires);
ed9b544e 6832 addReply(c,shared.ok);
ed9b544e 6833}
6834
6835static void flushallCommand(redisClient *c) {
9b30e1a2 6836 touchWatchedKeysOnFlush(-1);
ca37e9cd 6837 server.dirty += emptyDb();
ed9b544e 6838 addReply(c,shared.ok);
500ece7c 6839 if (server.bgsavechildpid != -1) {
6840 kill(server.bgsavechildpid,SIGKILL);
6841 rdbRemoveTempFile(server.bgsavechildpid);
6842 }
f78fd11b 6843 rdbSave(server.dbfilename);
ca37e9cd 6844 server.dirty++;
ed9b544e 6845}
6846
56906eef 6847static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 6848 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 6849 so->type = type;
6850 so->pattern = pattern;
6851 return so;
6852}
6853
6854/* Return the value associated to the key with a name obtained
55017f9d
PN
6855 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6856 * The returned object will always have its refcount increased by 1
6857 * when it is non-NULL. */
56906eef 6858static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6d7d1370 6859 char *p, *f;
ed9b544e 6860 sds spat, ssub;
6d7d1370
PN
6861 robj keyobj, fieldobj, *o;
6862 int prefixlen, sublen, postfixlen, fieldlen;
ed9b544e 6863 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6864 struct {
f1017b3f 6865 long len;
6866 long free;
ed9b544e 6867 char buf[REDIS_SORTKEY_MAX+1];
6d7d1370 6868 } keyname, fieldname;
ed9b544e 6869
28173a49 6870 /* If the pattern is "#" return the substitution object itself in order
6871 * to implement the "SORT ... GET #" feature. */
6872 spat = pattern->ptr;
6873 if (spat[0] == '#' && spat[1] == '\0') {
55017f9d 6874 incrRefCount(subst);
28173a49 6875 return subst;
6876 }
6877
6878 /* The substitution object may be specially encoded. If so we create
9d65a1bb 6879 * a decoded object on the fly. Otherwise getDecodedObject will just
6880 * increment the ref count, that we'll decrement later. */
6881 subst = getDecodedObject(subst);
942a3961 6882
ed9b544e 6883 ssub = subst->ptr;
6884 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6885 p = strchr(spat,'*');
ed5a857a 6886 if (!p) {
6887 decrRefCount(subst);
6888 return NULL;
6889 }
ed9b544e 6890
6d7d1370
PN
6891 /* Find out if we're dealing with a hash dereference. */
6892 if ((f = strstr(p+1, "->")) != NULL) {
6893 fieldlen = sdslen(spat)-(f-spat);
6894 /* this also copies \0 character */
6895 memcpy(fieldname.buf,f+2,fieldlen-1);
6896 fieldname.len = fieldlen-2;
6897 } else {
6898 fieldlen = 0;
6899 }
6900
ed9b544e 6901 prefixlen = p-spat;
6902 sublen = sdslen(ssub);
6d7d1370 6903 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
ed9b544e 6904 memcpy(keyname.buf,spat,prefixlen);
6905 memcpy(keyname.buf+prefixlen,ssub,sublen);
6906 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6907 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6908 keyname.len = prefixlen+sublen+postfixlen;
942a3961 6909 decrRefCount(subst);
6910
6d7d1370
PN
6911 /* Lookup substituted key */
6912 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6913 o = lookupKeyRead(db,&keyobj);
55017f9d
PN
6914 if (o == NULL) return NULL;
6915
6916 if (fieldlen > 0) {
6917 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6d7d1370 6918
705dad38
PN
6919 /* Retrieve value from hash by the field name. This operation
6920 * already increases the refcount of the returned object. */
6d7d1370
PN
6921 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6922 o = hashGet(o, &fieldobj);
705dad38 6923 } else {
55017f9d 6924 if (o->type != REDIS_STRING) return NULL;
b6f07345 6925
705dad38
PN
6926 /* Every object that this function returns needs to have its refcount
6927 * increased. sortCommand decreases it again. */
6928 incrRefCount(o);
6d7d1370
PN
6929 }
6930
6931 return o;
ed9b544e 6932}
6933
6934/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6935 * the additional parameter is not standard but a BSD-specific we have to
6936 * pass sorting parameters via the global 'server' structure */
6937static int sortCompare(const void *s1, const void *s2) {
6938 const redisSortObject *so1 = s1, *so2 = s2;
6939 int cmp;
6940
6941 if (!server.sort_alpha) {
6942 /* Numeric sorting. Here it's trivial as we precomputed scores */
6943 if (so1->u.score > so2->u.score) {
6944 cmp = 1;
6945 } else if (so1->u.score < so2->u.score) {
6946 cmp = -1;
6947 } else {
6948 cmp = 0;
6949 }
6950 } else {
6951 /* Alphanumeric sorting */
6952 if (server.sort_bypattern) {
6953 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6954 /* At least one compare object is NULL */
6955 if (so1->u.cmpobj == so2->u.cmpobj)
6956 cmp = 0;
6957 else if (so1->u.cmpobj == NULL)
6958 cmp = -1;
6959 else
6960 cmp = 1;
6961 } else {
6962 /* We have both the objects, use strcoll */
6963 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6964 }
6965 } else {
08ee9b57 6966 /* Compare elements directly. */
6967 cmp = compareStringObjects(so1->obj,so2->obj);
ed9b544e 6968 }
6969 }
6970 return server.sort_desc ? -cmp : cmp;
6971}
6972
6973/* The SORT command is the most complex command in Redis. Warning: this code
6974 * is optimized for speed and a bit less for readability */
/* Overview of the steps performed below:
 *  1. Look up c->argv[1]. A missing key gets an empty multi bulk reply, a key
 *     that is not a list, set or sorted set gets a wrong-type error.
 *  2. Parse the options ASC, DESC, ALPHA, LIMIT <start> <count>,
 *     STORE <key>, BY <pattern> and GET <pattern>. Anything else (or an
 *     option lacking its arguments) produces a syntax error reply.
 *  3. Copy every element of the container into 'vector'.
 *  4. Unless the BY pattern is constant (no '*'), compute the score or
 *     the comparison object of every element.
 *  5. Clamp the LIMIT range and sort with qsort() or pqsort().
 *  6. Either send the selected range to the client, or (STORE) save it
 *     as a list under 'storekey' and reply with the number of elements.
 *  7. Release the temporary references and the vector. */
6975static void sortCommand(redisClient *c) {
ed9b544e 6976 list *operations;
6977 int outputlen = 0;
6978 int desc = 0, alpha = 0;
6979 int limit_start = 0, limit_count = -1, start, end;
6980 int j, dontsort = 0, vectorlen;
6981 int getop = 0; /* GET operation counter */
443c6409 6982 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 6983 redisSortObject *vector; /* Resulting vector to sort */
6984
6985 /* Lookup the key to sort. It must be of the right types */
3305306f 6986 sortval = lookupKeyRead(c->db,c->argv[1]);
6987 if (sortval == NULL) {
4e27f268 6988 addReply(c,shared.emptymultibulk);
ed9b544e 6989 return;
6990 }
a5eb649b 6991 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6992 sortval->type != REDIS_ZSET)
6993 {
c937aa89 6994 addReply(c,shared.wrongtypeerr);
ed9b544e 6995 return;
6996 }
6997
6998 /* Create a list of operations to perform for every sorted element.
6999 * Operations can be GET/DEL/INCR/DECR */
/* Note: the parser below only ever queues REDIS_SORT_GET operations. */
7000 operations = listCreate();
092dac2a 7001 listSetFreeMethod(operations,zfree);
ed9b544e 7002 j = 2;
7003
7004 /* Now we need to protect sortval incrementing its count, in the future
7005 * SORT may have options able to overwrite/delete keys during the sorting
7006 * and the sorted key itself may get destroyed */
7007 incrRefCount(sortval);
7008
7009 /* The SORT command has an SQL-alike syntax, parse it */
7010 while(j < c->argc) {
/* Number of arguments that follow the option currently examined. */
7011 int leftargs = c->argc-j-1;
7012 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7013 desc = 0;
7014 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7015 desc = 1;
7016 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7017 alpha = 1;
7018 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7019 limit_start = atoi(c->argv[j+1]->ptr);
7020 limit_count = atoi(c->argv[j+2]->ptr);
7021 j+=2;
443c6409 7022 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7023 storekey = c->argv[j+1];
7024 j++;
ed9b544e 7025 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7026 sortby = c->argv[j+1];
7027 /* If the BY pattern does not contain '*', i.e. it is constant,
7028 * we don't need to sort nor to lookup the weight keys. */
7029 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7030 j++;
7031 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7032 listAddNodeTail(operations,createSortOperation(
7033 REDIS_SORT_GET,c->argv[j+1]));
7034 getop++;
7035 j++;
ed9b544e 7036 } else {
/* Unknown option, or a known option without enough arguments:
 * undo what was set up so far and report a syntax error. */
7037 decrRefCount(sortval);
7038 listRelease(operations);
c937aa89 7039 addReply(c,shared.syntaxerr);
ed9b544e 7040 return;
7041 }
7042 j++;
7043 }
7044
7045 /* Load the sorting vector with all the objects to sort */
a5eb649b 7046 switch(sortval->type) {
7047 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
7048 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7049 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
f83c6cb5 7050 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
a5eb649b 7051 }
ed9b544e 7052 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 7053 j = 0;
a5eb649b 7054
ed9b544e 7055 if (sortval->type == REDIS_LIST) {
7056 list *list = sortval->ptr;
6208b3a7 7057 listNode *ln;
c7df85a4 7058 listIter li;
6208b3a7 7059
c7df85a4 7060 listRewind(list,&li);
7061 while((ln = listNext(&li))) {
ed9b544e 7062 robj *ele = ln->value;
7063 vector[j].obj = ele;
7064 vector[j].u.score = 0;
7065 vector[j].u.cmpobj = NULL;
ed9b544e 7066 j++;
7067 }
7068 } else {
/* Sets and sorted sets: iterate the keys of the underlying dict. */
a5eb649b 7069 dict *set;
ed9b544e 7070 dictIterator *di;
7071 dictEntry *setele;
7072
a5eb649b 7073 if (sortval->type == REDIS_SET) {
7074 set = sortval->ptr;
7075 } else {
7076 zset *zs = sortval->ptr;
7077 set = zs->dict;
7078 }
7079
ed9b544e 7080 di = dictGetIterator(set);
ed9b544e 7081 while((setele = dictNext(di)) != NULL) {
7082 vector[j].obj = dictGetEntryKey(setele);
7083 vector[j].u.score = 0;
7084 vector[j].u.cmpobj = NULL;
7085 j++;
7086 }
7087 dictReleaseIterator(di);
7088 }
dfc5e96c 7089 redisAssert(j == vectorlen);
ed9b544e 7090
7091 /* Now it's time to load the right scores in the sorting vector */
7092 if (dontsort == 0) {
7093 for (j = 0; j < vectorlen; j++) {
6d7d1370 7094 robj *byval;
ed9b544e 7095 if (sortby) {
6d7d1370 7096 /* lookup value to sort by */
3305306f 7097 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
/* No weight found: the element keeps the values assigned when the
 * vector was loaded. */
705dad38 7098 if (!byval) continue;
ed9b544e 7099 } else {
6d7d1370
PN
7100 /* use object itself to sort by */
7101 byval = vector[j].obj;
7102 }
7103
7104 if (alpha) {
08ee9b57 7105 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6d7d1370
PN
7106 } else {
7107 if (byval->encoding == REDIS_ENCODING_RAW) {
7108 vector[j].u.score = strtod(byval->ptr,NULL);
16fa22f1 7109 } else if (byval->encoding == REDIS_ENCODING_INT) {
6d7d1370
PN
7110 /* Don't need to decode the object if it's
7111 * integer-encoded (the only encoding supported) so
7112 * far. We can just cast it */
16fa22f1
PN
7113 vector[j].u.score = (long)byval->ptr;
7114 } else {
7115 redisAssert(1 != 1);
942a3961 7116 }
ed9b544e 7117 }
6d7d1370 7118
705dad38
PN
7119 /* when the object was retrieved using lookupKeyByPattern,
7120 * its refcount needs to be decreased. */
7121 if (sortby) {
7122 decrRefCount(byval);
ed9b544e 7123 }
7124 }
7125 }
7126
7127 /* We are ready to sort the vector... perform a bit of sanity check
7128 * on the LIMIT option too. We'll use a partial version of quicksort. */
7129 start = (limit_start < 0) ? 0 : limit_start;
7130 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
/* A start past the last element yields an empty range (end < start). */
7131 if (start >= vectorlen) {
7132 start = vectorlen-1;
7133 end = vectorlen-2;
7134 }
7135 if (end >= vectorlen) end = vectorlen-1;
7136
7137 if (dontsort == 0) {
/* sortCompare() reads these options from the global server struct. */
7138 server.sort_desc = desc;
7139 server.sort_alpha = alpha;
7140 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 7141 if (sortby && (start != 0 || end != vectorlen-1))
7142 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7143 else
7144 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 7145 }
7146
7147 /* Send command output to the output buffer, performing the specified
7148 * GET/DEL/INCR/DECR operations if any. */
/* With GET options every selected element produces one output item per
 * GET, otherwise exactly one item (the element itself). */
7149 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 7150 if (storekey == NULL) {
7151 /* STORE option not specified, send the sorting result to client */
7152 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7153 for (j = start; j <= end; j++) {
7154 listNode *ln;
c7df85a4 7155 listIter li;
7156
dd88747b 7157 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 7158 listRewind(operations,&li);
7159 while((ln = listNext(&li))) {
443c6409 7160 redisSortOperation *sop = ln->value;
7161 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7162 vector[j].obj);
7163
7164 if (sop->type == REDIS_SORT_GET) {
55017f9d 7165 if (!val) {
443c6409 7166 addReply(c,shared.nullbulk);
7167 } else {
dd88747b 7168 addReplyBulk(c,val);
55017f9d 7169 decrRefCount(val);
443c6409 7170 }
7171 } else {
dfc5e96c 7172 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 7173 }
7174 }
ed9b544e 7175 }
443c6409 7176 } else {
7177 robj *listObject = createListObject();
7178 list *listPtr = (list*) listObject->ptr;
7179
7180 /* STORE option specified, set the sorting result as a List object */
7181 for (j = start; j <= end; j++) {
7182 listNode *ln;
c7df85a4 7183 listIter li;
7184
443c6409 7185 if (!getop) {
7186 listAddNodeTail(listPtr,vector[j].obj);
7187 incrRefCount(vector[j].obj);
7188 }
c7df85a4 7189 listRewind(operations,&li);
7190 while((ln = listNext(&li))) {
443c6409 7191 redisSortOperation *sop = ln->value;
7192 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7193 vector[j].obj);
7194
7195 if (sop->type == REDIS_SORT_GET) {
55017f9d 7196 if (!val) {
/* A missing GET value is stored as an empty string. */
443c6409 7197 listAddNodeTail(listPtr,createStringObject("",0));
7198 } else {
55017f9d
PN
7199 /* We should do a incrRefCount on val because it is
7200 * added to the list, but also a decrRefCount because
7201 * it is returned by lookupKeyByPattern. This results
7202 * in doing nothing at all. */
443c6409 7203 listAddNodeTail(listPtr,val);
443c6409 7204 }
ed9b544e 7205 } else {
dfc5e96c 7206 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 7207 }
ed9b544e 7208 }
ed9b544e 7209 }
/* NOTE(review): presumably dictReplace() returns non-zero when the key
 * was newly added, in which case the dict now references 'storekey' --
 * confirm against dict.c. */
121796f7 7210 if (dictReplace(c->db->dict,storekey,listObject)) {
7211 incrRefCount(storekey);
7212 }
443c6409 7213 /* Note: we add 1 because the DB is dirty anyway since even if the
7214 * SORT result is empty a new key is set and maybe the old content
7215 * replaced. */
7216 server.dirty += 1+outputlen;
7217 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 7218 }
7219
7220 /* Cleanup */
7221 decrRefCount(sortval);
7222 listRelease(operations);
7223 for (j = 0; j < vectorlen; j++) {
/* Comparison objects are only created in ALPHA mode (see above). */
16fa22f1 7224 if (alpha && vector[j].u.cmpobj)
ed9b544e 7225 decrRefCount(vector[j].u.cmpobj);
7226 }
7227 zfree(vector);
7228}
7229
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer; the longest string produced is well
 * under 16 bytes including the terminator. 'n' is the byte count.
 *
 * Values below 1024 are printed as a plain integer followed by 'B'.
 * Larger values are divided by the matching power of 1024 and printed
 * with two decimals followed by K, M, G, T, P or E. Every possible value
 * of 'n' is covered, so 's' always holds a valid string on return. */
static void bytesToHuman(char *s, unsigned long long n) {
    if (n < 1024ULL) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1ULL<<20)) {
        sprintf(s,"%.2fK",(double)n/(double)(1ULL<<10));
    } else if (n < (1ULL<<30)) {
        sprintf(s,"%.2fM",(double)n/(double)(1ULL<<20));
    } else if (n < (1ULL<<40)) {
        sprintf(s,"%.2fG",(double)n/(double)(1ULL<<30));
    } else if (n < (1ULL<<50)) {
        sprintf(s,"%.2fT",(double)n/(double)(1ULL<<40));
    } else if (n < (1ULL<<60)) {
        sprintf(s,"%.2fP",(double)n/(double)(1ULL<<50));
    } else {
        /* 2^60 and above: the largest unit an unsigned long long needs. */
        sprintf(s,"%.2fE",(double)n/(double)(1ULL<<60));
    }
}
7250
1c85b79f 7251/* Create the string returned by the INFO command. This is decoupled
7252 * by the INFO command itself as we need to report the same information
7253 * on memory corruption problems. */
/* The result is an sds string made of "name:value\r\n" lines:
 *  - a fixed set of general fields (version, uptime, clients, memory,
 *    persistence state, counters, pubsub, role);
 *  - master_* fields, only when server.masterhost is set;
 *  - vm_* fields, only when server.vm_enabled is set;
 *  - one "dbN:keys=...,expires=..." line for every database that has at
 *    least one key or one expire.
 * The string is returned to the caller. */
7254static sds genRedisInfoString(void) {
ed9b544e 7255 sds info;
7256 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 7257 int j;
ec6c7a1d 7258 char hmem[64];
55a8298f 7259
/* Human readable form of the used memory, printed next to the raw value. */
b72f6a4b 7260 bytesToHuman(hmem,zmalloc_used_memory());
/* NOTE(review): the arguments below must stay in the same order as the
 * conversion specifiers; the argument types are declared outside this
 * view, so the specifier/type pairing should be confirmed there. */
ed9b544e 7261 info = sdscatprintf(sdsempty(),
7262 "redis_version:%s\r\n"
5436146c
PN
7263 "redis_git_sha1:%s\r\n"
7264 "redis_git_dirty:%d\r\n"
f1017b3f 7265 "arch_bits:%s\r\n"
7a932b74 7266 "multiplexing_api:%s\r\n"
0d7170a4 7267 "process_id:%ld\r\n"
682ac724 7268 "uptime_in_seconds:%ld\r\n"
7269 "uptime_in_days:%ld\r\n"
ed9b544e 7270 "connected_clients:%d\r\n"
7271 "connected_slaves:%d\r\n"
f86a74e9 7272 "blocked_clients:%d\r\n"
5fba9f71 7273 "used_memory:%zu\r\n"
ec6c7a1d 7274 "used_memory_human:%s\r\n"
ed9b544e 7275 "changes_since_last_save:%lld\r\n"
be2bb6b0 7276 "bgsave_in_progress:%d\r\n"
682ac724 7277 "last_save_time:%ld\r\n"
b3fad521 7278 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 7279 "total_connections_received:%lld\r\n"
7280 "total_commands_processed:%lld\r\n"
2a6a2ed1 7281 "expired_keys:%lld\r\n"
3be2c9d7 7282 "hash_max_zipmap_entries:%zu\r\n"
7283 "hash_max_zipmap_value:%zu\r\n"
ffc6b7f8 7284 "pubsub_channels:%ld\r\n"
7285 "pubsub_patterns:%u\r\n"
7d98e08c 7286 "vm_enabled:%d\r\n"
a0f643ea 7287 "role:%s\r\n"
ed9b544e 7288 ,REDIS_VERSION,
5436146c 7289 REDIS_GIT_SHA1,
274e45e3 7290 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
f1017b3f 7291 (sizeof(long) == 8) ? "64" : "32",
7a932b74 7292 aeGetApiName(),
0d7170a4 7293 (long) getpid(),
a0f643ea 7294 uptime,
7295 uptime/(3600*24),
ed9b544e 7296 listLength(server.clients)-listLength(server.slaves),
7297 listLength(server.slaves),
d5d55fc3 7298 server.blpop_blocked_clients,
b72f6a4b 7299 zmalloc_used_memory(),
ec6c7a1d 7300 hmem,
ed9b544e 7301 server.dirty,
9d65a1bb 7302 server.bgsavechildpid != -1,
ed9b544e 7303 server.lastsave,
b3fad521 7304 server.bgrewritechildpid != -1,
ed9b544e 7305 server.stat_numconnections,
7306 server.stat_numcommands,
2a6a2ed1 7307 server.stat_expiredkeys,
55a8298f 7308 server.hash_max_zipmap_entries,
7309 server.hash_max_zipmap_value,
ffc6b7f8 7310 dictSize(server.pubsub_channels),
7311 listLength(server.pubsub_patterns),
7d98e08c 7312 server.vm_enabled != 0,
a0f643ea 7313 server.masterhost == NULL ? "master" : "slave"
ed9b544e 7314 );
/* Replication details, reported only when a master host is configured. */
a0f643ea 7315 if (server.masterhost) {
7316 info = sdscatprintf(info,
7317 "master_host:%s\r\n"
7318 "master_port:%d\r\n"
7319 "master_link_status:%s\r\n"
7320 "master_last_io_seconds_ago:%d\r\n"
7321 ,server.masterhost,
7322 server.masterport,
7323 (server.replstate == REDIS_REPL_CONNECTED) ?
7324 "up" : "down",
f72b934d 7325 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 7326 );
7327 }
/* Virtual memory statistics, read between lockThreadedIO() and
 * unlockThreadedIO(). */
7d98e08c 7328 if (server.vm_enabled) {
1064ef87 7329 lockThreadedIO();
7d98e08c 7330 info = sdscatprintf(info,
7331 "vm_conf_max_memory:%llu\r\n"
7332 "vm_conf_page_size:%llu\r\n"
7333 "vm_conf_pages:%llu\r\n"
7334 "vm_stats_used_pages:%llu\r\n"
7335 "vm_stats_swapped_objects:%llu\r\n"
7336 "vm_stats_swappin_count:%llu\r\n"
7337 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 7338 "vm_stats_io_newjobs_len:%lu\r\n"
7339 "vm_stats_io_processing_len:%lu\r\n"
7340 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 7341 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 7342 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 7343 ,(unsigned long long) server.vm_max_memory,
7344 (unsigned long long) server.vm_page_size,
7345 (unsigned long long) server.vm_pages,
7346 (unsigned long long) server.vm_stats_used_pages,
7347 (unsigned long long) server.vm_stats_swapped_objects,
7348 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 7349 (unsigned long long) server.vm_stats_swapouts,
7350 (unsigned long) listLength(server.io_newjobs),
7351 (unsigned long) listLength(server.io_processing),
7352 (unsigned long) listLength(server.io_processed),
d5d55fc3 7353 (unsigned long) server.io_active_threads,
7354 (unsigned long) server.vm_blocked_clients
7d98e08c 7355 );
1064ef87 7356 unlockThreadedIO();
7d98e08c 7357 }
/* Per-database key counts; databases with no keys and no expires are
 * not listed. */
c3cb078d 7358 for (j = 0; j < server.dbnum; j++) {
7359 long long keys, vkeys;
7360
7361 keys = dictSize(server.db[j].dict);
7362 vkeys = dictSize(server.db[j].expires);
7363 if (keys || vkeys) {
9d65a1bb 7364 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 7365 j, keys, vkeys);
7366 }
7367 }
1c85b79f 7368 return info;
7369}
7370
7371static void infoCommand(redisClient *c) {
7372 sds info = genRedisInfoString();
83c6a618 7373 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7374 (unsigned long)sdslen(info)));
ed9b544e 7375 addReplySds(c,info);
70003d28 7376 addReply(c,shared.crlf);
ed9b544e 7377}
7378
3305306f 7379static void monitorCommand(redisClient *c) {
7380 /* ignore MONITOR if aleady slave or in monitor mode */
7381 if (c->flags & REDIS_SLAVE) return;
7382
7383 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7384 c->slaveseldb = 0;
6b47e12e 7385 listAddNodeTail(server.monitors,c);
3305306f 7386 addReply(c,shared.ok);
7387}
7388
7389/* ================================= Expire ================================= */
7390static int removeExpire(redisDb *db, robj *key) {
7391 if (dictDelete(db->expires,key) == DICT_OK) {
7392 return 1;
7393 } else {
7394 return 0;
7395 }
7396}
7397
7398static int setExpire(redisDb *db, robj *key, time_t when) {
7399 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7400 return 0;
7401 } else {
7402 incrRefCount(key);
7403 return 1;
7404 }
7405}
7406
bb32ede5 7407/* Return the expire time of the specified key, or -1 if no expire
7408 * is associated with this key (i.e. the key is non volatile) */
7409static time_t getExpire(redisDb *db, robj *key) {
7410 dictEntry *de;
7411
7412 /* No expire? return ASAP */
7413 if (dictSize(db->expires) == 0 ||
7414 (de = dictFind(db->expires,key)) == NULL) return -1;
7415
7416 return (time_t) dictGetEntryVal(de);
7417}
7418
3305306f 7419static int expireIfNeeded(redisDb *db, robj *key) {
7420 time_t when;
7421 dictEntry *de;
7422
7423 /* No expire? return ASAP */
7424 if (dictSize(db->expires) == 0 ||
7425 (de = dictFind(db->expires,key)) == NULL) return 0;
7426
7427 /* Lookup the expire */
7428 when = (time_t) dictGetEntryVal(de);
7429 if (time(NULL) <= when) return 0;
7430
7431 /* Delete the key */
7432 dictDelete(db->expires,key);
2a6a2ed1 7433 server.stat_expiredkeys++;
3305306f 7434 return dictDelete(db->dict,key) == DICT_OK;
7435}
7436
7437static int deleteIfVolatile(redisDb *db, robj *key) {
7438 dictEntry *de;
7439
7440 /* No expire? return ASAP */
7441 if (dictSize(db->expires) == 0 ||
7442 (de = dictFind(db->expires,key)) == NULL) return 0;
7443
7444 /* Delete the key */
0c66a471 7445 server.dirty++;
2a6a2ed1 7446 server.stat_expiredkeys++;
3305306f 7447 dictDelete(db->expires,key);
7448 return dictDelete(db->dict,key) == DICT_OK;
7449}
7450
bbe025e0 7451static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 7452 dictEntry *de;
bbe025e0
AM
7453 time_t seconds;
7454
bd79a6bd 7455 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
bbe025e0
AM
7456
7457 seconds -= offset;
3305306f 7458
802e8373 7459 de = dictFind(c->db->dict,key);
3305306f 7460 if (de == NULL) {
7461 addReply(c,shared.czero);
7462 return;
7463 }
d4dd6556 7464 if (seconds <= 0) {
43e5ccdf 7465 if (deleteKey(c->db,key)) server.dirty++;
7466 addReply(c, shared.cone);
3305306f 7467 return;
7468 } else {
7469 time_t when = time(NULL)+seconds;
802e8373 7470 if (setExpire(c->db,key,when)) {
3305306f 7471 addReply(c,shared.cone);
77423026 7472 server.dirty++;
7473 } else {
3305306f 7474 addReply(c,shared.czero);
77423026 7475 }
3305306f 7476 return;
7477 }
7478}
7479
802e8373 7480static void expireCommand(redisClient *c) {
bbe025e0 7481 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 7482}
7483
7484static void expireatCommand(redisClient *c) {
bbe025e0 7485 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 7486}
7487
fd88489a 7488static void ttlCommand(redisClient *c) {
7489 time_t expire;
7490 int ttl = -1;
7491
7492 expire = getExpire(c->db,c->argv[1]);
7493 if (expire != -1) {
7494 ttl = (int) (expire-time(NULL));
7495 if (ttl < 0) ttl = -1;
7496 }
7497 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7498}
7499
6e469882 7500/* ================================ MULTI/EXEC ============================== */
7501
7502/* Client state initialization for MULTI/EXEC */
7503static void initClientMultiState(redisClient *c) {
7504 c->mstate.commands = NULL;
7505 c->mstate.count = 0;
7506}
7507
7508/* Release all the resources associated with MULTI/EXEC state */
7509static void freeClientMultiState(redisClient *c) {
7510 int j;
7511
7512 for (j = 0; j < c->mstate.count; j++) {
7513 int i;
7514 multiCmd *mc = c->mstate.commands+j;
7515
7516 for (i = 0; i < mc->argc; i++)
7517 decrRefCount(mc->argv[i]);
7518 zfree(mc->argv);
7519 }
7520 zfree(c->mstate.commands);
7521}
7522
7523/* Add a new command into the MULTI commands queue */
7524static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7525 multiCmd *mc;
7526 int j;
7527
7528 c->mstate.commands = zrealloc(c->mstate.commands,
7529 sizeof(multiCmd)*(c->mstate.count+1));
7530 mc = c->mstate.commands+c->mstate.count;
7531 mc->cmd = cmd;
7532 mc->argc = c->argc;
7533 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7534 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7535 for (j = 0; j < c->argc; j++)
7536 incrRefCount(mc->argv[j]);
7537 c->mstate.count++;
7538}
7539
7540static void multiCommand(redisClient *c) {
6531c94d 7541 if (c->flags & REDIS_MULTI) {
7542 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7543 return;
7544 }
6e469882 7545 c->flags |= REDIS_MULTI;
36c548f0 7546 addReply(c,shared.ok);
6e469882 7547}
7548
18b6cb76
DJ
7549static void discardCommand(redisClient *c) {
7550 if (!(c->flags & REDIS_MULTI)) {
7551 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7552 return;
7553 }
7554
7555 freeClientMultiState(c);
7556 initClientMultiState(c);
7557 c->flags &= (~REDIS_MULTI);
7558 addReply(c,shared.ok);
7559}
7560
66c8853f 7561/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7562 * implememntation for more information. */
7563static void execCommandReplicateMulti(redisClient *c) {
7564 struct redisCommand *cmd;
7565 robj *multistring = createStringObject("MULTI",5);
7566
7567 cmd = lookupCommand("multi");
7568 if (server.appendonly)
7569 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7570 if (listLength(server.slaves))
7571 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7572 decrRefCount(multistring);
7573}
7574
6e469882 7575static void execCommand(redisClient *c) {
7576 int j;
7577 robj **orig_argv;
7578 int orig_argc;
7579
7580 if (!(c->flags & REDIS_MULTI)) {
7581 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7582 return;
7583 }
7584
37ab76c9 7585 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7586 * A failed EXEC will return a multi bulk nil object. */
7587 if (c->flags & REDIS_DIRTY_CAS) {
7588 freeClientMultiState(c);
7589 initClientMultiState(c);
7590 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7591 unwatchAllKeys(c);
7592 addReply(c,shared.nullmultibulk);
7593 return;
7594 }
7595
66c8853f 7596 /* Replicate a MULTI request now that we are sure the block is executed.
7597 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7598 * both the AOF and the replication link will have the same consistency
7599 * and atomicity guarantees. */
7600 execCommandReplicateMulti(c);
7601
7602 /* Exec all the queued commands */
1ad4d316 7603 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
6e469882 7604 orig_argv = c->argv;
7605 orig_argc = c->argc;
7606 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7607 for (j = 0; j < c->mstate.count; j++) {
7608 c->argc = c->mstate.commands[j].argc;
7609 c->argv = c->mstate.commands[j].argv;
7610 call(c,c->mstate.commands[j].cmd);
7611 }
7612 c->argv = orig_argv;
7613 c->argc = orig_argc;
7614 freeClientMultiState(c);
7615 initClientMultiState(c);
1ad4d316 7616 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
66c8853f 7617 /* Make sure the EXEC command is always replicated / AOF, since we
7618 * always send the MULTI command (we can't know beforehand if the
7619 * next operations will contain at least a modification to the DB). */
7620 server.dirty++;
6e469882 7621}
7622
4409877e 7623/* =========================== Blocking Operations ========================= */
7624
7625/* Currently Redis blocking operations support is limited to list POP ops,
7626 * so the current implementation is not fully generic, but it is also not
7627 * completely specific so it will not require a rewrite to support new
7628 * kind of blocking operations in the future.
7629 *
7630 * Still it's important to note that list blocking operations can be already
7631 * used as a notification mechanism in order to implement other blocking
7632 * operations at application level, so there must be a very strong evidence
7633 * of usefulness and generality before new blocking operations are implemented.
7634 *
7635 * This is how the current blocking POP works, we use BLPOP as example:
7636 * - If the user calls BLPOP and the key exists and contains a non empty list
7637 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7638 * if there is not to block.
7639 * - If instead BLPOP is called and the key does not exists or the list is
7640 * empty we need to block. In order to do so we remove the notification for
7641 * new data to read in the client socket (so that we'll not serve new
7642 * requests if the blocking request is not served). Also we put the client
37ab76c9 7643 * in a dictionary (db->blocking_keys) mapping keys to a list of clients
4409877e 7644 * blocking for these keys.
7645 * - If a PUSH operation against a key with blocked clients waiting is
7646 * performed, we serve the first in the list: basically instead to push
7647 * the new element inside the list we return it to the (first / oldest)
7648 * blocking client, unblock the client, and remove it from the list.
7649 *
7650 * The above comment and the source code should be enough in order to understand
7651 * the implementation and modify / fix it later.
7652 */
7653
7654/* Set a client in blocking mode for the specified key, with the specified
7655 * timeout */
b177fd30 7656static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 7657 dictEntry *de;
7658 list *l;
b177fd30 7659 int j;
4409877e 7660
37ab76c9 7661 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7662 c->blocking_keys_num = numkeys;
4409877e 7663 c->blockingto = timeout;
b177fd30 7664 for (j = 0; j < numkeys; j++) {
7665 /* Add the key in the client structure, to map clients -> keys */
37ab76c9 7666 c->blocking_keys[j] = keys[j];
b177fd30 7667 incrRefCount(keys[j]);
4409877e 7668
b177fd30 7669 /* And in the other "side", to map keys -> clients */
37ab76c9 7670 de = dictFind(c->db->blocking_keys,keys[j]);
b177fd30 7671 if (de == NULL) {
7672 int retval;
7673
7674 /* For every key we take a list of clients blocked for it */
7675 l = listCreate();
37ab76c9 7676 retval = dictAdd(c->db->blocking_keys,keys[j],l);
b177fd30 7677 incrRefCount(keys[j]);
7678 assert(retval == DICT_OK);
7679 } else {
7680 l = dictGetEntryVal(de);
7681 }
7682 listAddNodeTail(l,c);
4409877e 7683 }
b177fd30 7684 /* Mark the client as a blocked client */
4409877e 7685 c->flags |= REDIS_BLOCKED;
d5d55fc3 7686 server.blpop_blocked_clients++;
4409877e 7687}
7688
7689/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 7690static void unblockClientWaitingData(redisClient *c) {
4409877e 7691 dictEntry *de;
7692 list *l;
b177fd30 7693 int j;
4409877e 7694
37ab76c9 7695 assert(c->blocking_keys != NULL);
b177fd30 7696 /* The client may wait for multiple keys, so unblock it for every key. */
37ab76c9 7697 for (j = 0; j < c->blocking_keys_num; j++) {
b177fd30 7698 /* Remove this client from the list of clients waiting for this key. */
37ab76c9 7699 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
b177fd30 7700 assert(de != NULL);
7701 l = dictGetEntryVal(de);
7702 listDelNode(l,listSearchKey(l,c));
7703 /* If the list is empty we need to remove it to avoid wasting memory */
7704 if (listLength(l) == 0)
37ab76c9 7705 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
7706 decrRefCount(c->blocking_keys[j]);
b177fd30 7707 }
7708 /* Cleanup the client structure */
37ab76c9 7709 zfree(c->blocking_keys);
7710 c->blocking_keys = NULL;
4409877e 7711 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 7712 server.blpop_blocked_clients--;
5921aa36 7713 /* We want to process data if there is some command waiting
b0d8747d 7714 * in the input buffer. Note that this is safe even if
7715 * unblockClientWaitingData() gets called from freeClient() because
7716 * freeClient() will be smart enough to call this function
7717 * *after* c->querybuf was set to NULL. */
4409877e 7718 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7719}
7720
7721/* This should be called from any function PUSHing into lists.
7722 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7723 * 'ele' is the element pushed.
7724 *
7725 * If the function returns 0 there was no client waiting for a list push
7726 * against this key.
7727 *
7728 * If the function returns 1 there was a client waiting for a list push
7729 * against this key, the element was passed to this client thus it's not
7730 * needed to actually add it to the list and the caller should return asap. */
7731static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7732 struct dictEntry *de;
7733 redisClient *receiver;
7734 list *l;
7735 listNode *ln;
7736
37ab76c9 7737 de = dictFind(c->db->blocking_keys,key);
4409877e 7738 if (de == NULL) return 0;
7739 l = dictGetEntryVal(de);
7740 ln = listFirst(l);
7741 assert(ln != NULL);
7742 receiver = ln->value;
4409877e 7743
b177fd30 7744 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 7745 addReplyBulk(receiver,key);
7746 addReplyBulk(receiver,ele);
b0d8747d 7747 unblockClientWaitingData(receiver);
4409877e 7748 return 1;
7749}
7750
7751/* Blocking RPOP/LPOP */
7752static void blockingPopGenericCommand(redisClient *c, int where) {
7753 robj *o;
7754 time_t timeout;
b177fd30 7755 int j;
4409877e 7756
b177fd30 7757 for (j = 1; j < c->argc-1; j++) {
7758 o = lookupKeyWrite(c->db,c->argv[j]);
7759 if (o != NULL) {
7760 if (o->type != REDIS_LIST) {
7761 addReply(c,shared.wrongtypeerr);
4409877e 7762 return;
b177fd30 7763 } else {
7764 list *list = o->ptr;
7765 if (listLength(list) != 0) {
7766 /* If the list contains elements fall back to the usual
7767 * non-blocking POP operation */
7768 robj *argv[2], **orig_argv;
7769 int orig_argc;
e0a62c7f 7770
b177fd30 7771 /* We need to alter the command arguments before to call
7772 * popGenericCommand() as the command takes a single key. */
7773 orig_argv = c->argv;
7774 orig_argc = c->argc;
7775 argv[1] = c->argv[j];
7776 c->argv = argv;
7777 c->argc = 2;
7778
7779 /* Also the return value is different, we need to output
7780 * the multi bulk reply header and the key name. The
7781 * "real" command will add the last element (the value)
7782 * for us. If this souds like an hack to you it's just
7783 * because it is... */
7784 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 7785 addReplyBulk(c,argv[1]);
b177fd30 7786 popGenericCommand(c,where);
7787
7788 /* Fix the client structure with the original stuff */
7789 c->argv = orig_argv;
7790 c->argc = orig_argc;
7791 return;
7792 }
4409877e 7793 }
7794 }
7795 }
7796 /* If the list is empty or the key does not exists we must block */
b177fd30 7797 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 7798 if (timeout > 0) timeout += time(NULL);
b177fd30 7799 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 7800}
7801
7802static void blpopCommand(redisClient *c) {
7803 blockingPopGenericCommand(c,REDIS_HEAD);
7804}
7805
7806static void brpopCommand(redisClient *c) {
7807 blockingPopGenericCommand(c,REDIS_TAIL);
7808}
7809
ed9b544e 7810/* =============================== Replication ============================= */
7811
a4d1ba9a 7812static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7813 ssize_t nwritten, ret = size;
7814 time_t start = time(NULL);
7815
7816 timeout++;
7817 while(size) {
7818 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7819 nwritten = write(fd,ptr,size);
7820 if (nwritten == -1) return -1;
7821 ptr += nwritten;
7822 size -= nwritten;
7823 }
7824 if ((time(NULL)-start) > timeout) {
7825 errno = ETIMEDOUT;
7826 return -1;
7827 }
7828 }
7829 return ret;
7830}
7831
a4d1ba9a 7832static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7833 ssize_t nread, totread = 0;
7834 time_t start = time(NULL);
7835
7836 timeout++;
7837 while(size) {
7838 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7839 nread = read(fd,ptr,size);
7840 if (nread == -1) return -1;
7841 ptr += nread;
7842 size -= nread;
7843 totread += nread;
7844 }
7845 if ((time(NULL)-start) > timeout) {
7846 errno = ETIMEDOUT;
7847 return -1;
7848 }
7849 }
7850 return totread;
7851}
7852
/* Read a line from 'fd' into 'ptr', one byte at a time through
 * syncRead() (each byte is read with the given 'timeout').
 *
 * 'size' is the capacity of the buffer: at most size-1 characters are
 * stored and the buffer is always NUL terminated. Reading stops at the
 * first '\n', which is not stored; a '\r' right before it is replaced by
 * the terminator (it is still counted in the return value).
 *
 * Returns the number of characters stored before the newline, or before
 * the buffer became full. Returns -1 if syncRead() fails, or, with errno
 * set to EINVAL, when 'size' is not positive. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) {
        errno = EINVAL;
        return -1;
    }
    *ptr = '\0';

    size--; /* Keep one byte for the terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        }
        *ptr++ = c;
        *ptr = '\0';
        nread++;
        /* Account for the stored byte, otherwise a line longer than the
         * buffer would be written past its end. */
        size--;
    }
    return nread;
}
7873
7874static void syncCommand(redisClient *c) {
40d224a9 7875 /* ignore SYNC if aleady slave or in monitor mode */
7876 if (c->flags & REDIS_SLAVE) return;
7877
7878 /* SYNC can't be issued when the server has pending data to send to
7879 * the client about already issued commands. We need a fresh reply
7880 * buffer registering the differences between the BGSAVE and the current
7881 * dataset, so that we can copy to other slaves if needed. */
7882 if (listLength(c->reply) != 0) {
7883 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7884 return;
7885 }
7886
7887 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7888 /* Here we need to check if there is a background saving operation
7889 * in progress, or if it is required to start one */
9d65a1bb 7890 if (server.bgsavechildpid != -1) {
40d224a9 7891 /* Ok a background save is in progress. Let's check if it is a good
7892 * one for replication, i.e. if there is another slave that is
7893 * registering differences since the server forked to save */
7894 redisClient *slave;
7895 listNode *ln;
c7df85a4 7896 listIter li;
40d224a9 7897
c7df85a4 7898 listRewind(server.slaves,&li);
7899 while((ln = listNext(&li))) {
40d224a9 7900 slave = ln->value;
7901 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 7902 }
7903 if (ln) {
7904 /* Perfect, the server is already registering differences for
7905 * another slave. Set the right state, and copy the buffer. */
7906 listRelease(c->reply);
7907 c->reply = listDup(slave->reply);
40d224a9 7908 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7909 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7910 } else {
7911 /* No way, we need to wait for the next BGSAVE in order to
7912 * register differences */
7913 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7914 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7915 }
7916 } else {
7917 /* Ok we don't have a BGSAVE in progress, let's start one */
7918 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7919 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7920 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7921 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7922 return;
7923 }
7924 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7925 }
6208b3a7 7926 c->repldbfd = -1;
40d224a9 7927 c->flags |= REDIS_SLAVE;
7928 c->slaveseldb = 0;
6b47e12e 7929 listAddNodeTail(server.slaves,c);
40d224a9 7930 return;
7931}
7932
6208b3a7 7933static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7934 redisClient *slave = privdata;
7935 REDIS_NOTUSED(el);
7936 REDIS_NOTUSED(mask);
7937 char buf[REDIS_IOBUF_LEN];
7938 ssize_t nwritten, buflen;
7939
7940 if (slave->repldboff == 0) {
7941 /* Write the bulk write count before to transfer the DB. In theory here
7942 * we don't know how much room there is in the output buffer of the
7943 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7944 * operations) will never be smaller than the few bytes we need. */
7945 sds bulkcount;
7946
7947 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7948 slave->repldbsize);
7949 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7950 {
7951 sdsfree(bulkcount);
7952 freeClient(slave);
7953 return;
7954 }
7955 sdsfree(bulkcount);
7956 }
7957 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7958 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7959 if (buflen <= 0) {
7960 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7961 (buflen == 0) ? "premature EOF" : strerror(errno));
7962 freeClient(slave);
7963 return;
7964 }
7965 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 7966 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 7967 strerror(errno));
7968 freeClient(slave);
7969 return;
7970 }
7971 slave->repldboff += nwritten;
7972 if (slave->repldboff == slave->repldbsize) {
7973 close(slave->repldbfd);
7974 slave->repldbfd = -1;
7975 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7976 slave->replstate = REDIS_REPL_ONLINE;
7977 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 7978 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 7979 freeClient(slave);
7980 return;
7981 }
7982 addReplySds(slave,sdsempty());
7983 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7984 }
7985}
ed9b544e 7986
a3b21203 7987/* This function is called at the end of every backgrond saving.
7988 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7989 * otherwise REDIS_ERR is passed to the function.
7990 *
7991 * The goal of this function is to handle slaves waiting for a successful
7992 * background saving in order to perform non-blocking synchronization. */
7993static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 7994 listNode *ln;
7995 int startbgsave = 0;
c7df85a4 7996 listIter li;
ed9b544e 7997
c7df85a4 7998 listRewind(server.slaves,&li);
7999 while((ln = listNext(&li))) {
6208b3a7 8000 redisClient *slave = ln->value;
ed9b544e 8001
6208b3a7 8002 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8003 startbgsave = 1;
8004 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8005 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 8006 struct redis_stat buf;
e0a62c7f 8007
6208b3a7 8008 if (bgsaveerr != REDIS_OK) {
8009 freeClient(slave);
8010 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8011 continue;
8012 }
8013 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 8014 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 8015 freeClient(slave);
8016 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8017 continue;
8018 }
8019 slave->repldboff = 0;
8020 slave->repldbsize = buf.st_size;
8021 slave->replstate = REDIS_REPL_SEND_BULK;
8022 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 8023 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 8024 freeClient(slave);
8025 continue;
8026 }
8027 }
ed9b544e 8028 }
6208b3a7 8029 if (startbgsave) {
8030 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 8031 listIter li;
8032
8033 listRewind(server.slaves,&li);
6208b3a7 8034 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 8035 while((ln = listNext(&li))) {
6208b3a7 8036 redisClient *slave = ln->value;
ed9b544e 8037
6208b3a7 8038 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8039 freeClient(slave);
8040 }
8041 }
8042 }
ed9b544e 8043}
8044
8045static int syncWithMaster(void) {
d0ccebcf 8046 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 8047 long dumpsize;
ed9b544e 8048 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 8049 int dfd, maxtries = 5;
ed9b544e 8050
8051 if (fd == -1) {
8052 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8053 strerror(errno));
8054 return REDIS_ERR;
8055 }
d0ccebcf 8056
8057 /* AUTH with the master if required. */
8058 if(server.masterauth) {
8059 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8060 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8061 close(fd);
8062 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8063 strerror(errno));
8064 return REDIS_ERR;
8065 }
8066 /* Read the AUTH result. */
8067 if (syncReadLine(fd,buf,1024,3600) == -1) {
8068 close(fd);
8069 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8070 strerror(errno));
8071 return REDIS_ERR;
8072 }
8073 if (buf[0] != '+') {
8074 close(fd);
8075 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8076 return REDIS_ERR;
8077 }
8078 }
8079
ed9b544e 8080 /* Issue the SYNC command */
8081 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8082 close(fd);
8083 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8084 strerror(errno));
8085 return REDIS_ERR;
8086 }
8087 /* Read the bulk write count */
8c4d91fc 8088 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 8089 close(fd);
8090 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8091 strerror(errno));
8092 return REDIS_ERR;
8093 }
4aa701c1 8094 if (buf[0] != '$') {
8095 close(fd);
8096 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8097 return REDIS_ERR;
8098 }
18e61fa2 8099 dumpsize = strtol(buf+1,NULL,10);
8100 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 8101 /* Read the bulk write data on a temp file */
8c5abee8 8102 while(maxtries--) {
8103 snprintf(tmpfile,256,
8104 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8105 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8106 if (dfd != -1) break;
5de9ad7c 8107 sleep(1);
8c5abee8 8108 }
ed9b544e 8109 if (dfd == -1) {
8110 close(fd);
8111 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8112 return REDIS_ERR;
8113 }
8114 while(dumpsize) {
8115 int nread, nwritten;
8116
8117 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8118 if (nread == -1) {
8119 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8120 strerror(errno));
8121 close(fd);
8122 close(dfd);
8123 return REDIS_ERR;
8124 }
8125 nwritten = write(dfd,buf,nread);
8126 if (nwritten == -1) {
8127 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8128 close(fd);
8129 close(dfd);
8130 return REDIS_ERR;
8131 }
8132 dumpsize -= nread;
8133 }
8134 close(dfd);
8135 if (rename(tmpfile,server.dbfilename) == -1) {
8136 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8137 unlink(tmpfile);
8138 close(fd);
8139 return REDIS_ERR;
8140 }
8141 emptyDb();
f78fd11b 8142 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 8143 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8144 close(fd);
8145 return REDIS_ERR;
8146 }
8147 server.master = createClient(fd);
8148 server.master->flags |= REDIS_MASTER;
179b3952 8149 server.master->authenticated = 1;
ed9b544e 8150 server.replstate = REDIS_REPL_CONNECTED;
8151 return REDIS_OK;
8152}
8153
321b0e13 8154static void slaveofCommand(redisClient *c) {
8155 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8156 !strcasecmp(c->argv[2]->ptr,"one")) {
8157 if (server.masterhost) {
8158 sdsfree(server.masterhost);
8159 server.masterhost = NULL;
8160 if (server.master) freeClient(server.master);
8161 server.replstate = REDIS_REPL_NONE;
8162 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8163 }
8164 } else {
8165 sdsfree(server.masterhost);
8166 server.masterhost = sdsdup(c->argv[1]->ptr);
8167 server.masterport = atoi(c->argv[2]->ptr);
8168 if (server.master) freeClient(server.master);
8169 server.replstate = REDIS_REPL_CONNECT;
8170 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8171 server.masterhost, server.masterport);
8172 }
8173 addReply(c,shared.ok);
8174}
8175
3fd78bcd 8176/* ============================ Maxmemory directive ======================== */
8177
a5819310 8178/* Try to free one object form the pre-allocated objects free list.
8179 * This is useful under low mem conditions as by default we take 1 million
8180 * free objects allocated. On success REDIS_OK is returned, otherwise
8181 * REDIS_ERR. */
8182static int tryFreeOneObjectFromFreelist(void) {
f870935d 8183 robj *o;
8184
a5819310 8185 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8186 if (listLength(server.objfreelist)) {
8187 listNode *head = listFirst(server.objfreelist);
8188 o = listNodeValue(head);
8189 listDelNode(server.objfreelist,head);
8190 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8191 zfree(o);
8192 return REDIS_OK;
8193 } else {
8194 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8195 return REDIS_ERR;
8196 }
f870935d 8197}
8198
3fd78bcd 8199/* This function gets called when 'maxmemory' is set on the config file to limit
8200 * the max memory used by the server, and we are out of memory.
8201 * This function will try to, in order:
8202 *
8203 * - Free objects from the free list
8204 * - Try to remove keys with an EXPIRE set
8205 *
8206 * It is not possible to free enough memory to reach used-memory < maxmemory
8207 * the server will start refusing commands that will enlarge even more the
8208 * memory usage.
8209 */
8210static void freeMemoryIfNeeded(void) {
8211 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 8212 int j, k, freed = 0;
8213
8214 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8215 for (j = 0; j < server.dbnum; j++) {
8216 int minttl = -1;
8217 robj *minkey = NULL;
8218 struct dictEntry *de;
8219
8220 if (dictSize(server.db[j].expires)) {
8221 freed = 1;
8222 /* From a sample of three keys drop the one nearest to
8223 * the natural expire */
8224 for (k = 0; k < 3; k++) {
8225 time_t t;
8226
8227 de = dictGetRandomKey(server.db[j].expires);
8228 t = (time_t) dictGetEntryVal(de);
8229 if (minttl == -1 || t < minttl) {
8230 minkey = dictGetEntryKey(de);
8231 minttl = t;
3fd78bcd 8232 }
3fd78bcd 8233 }
a5819310 8234 deleteKey(server.db+j,minkey);
3fd78bcd 8235 }
3fd78bcd 8236 }
a5819310 8237 if (!freed) return; /* nothing to free... */
3fd78bcd 8238 }
8239}
8240
f80dff62 8241/* ============================== Append Only file ========================== */
8242
28ed1f33 8243/* Write the append only file buffer on disk.
8244 *
8245 * Since we are required to write the AOF before replying to the client,
8246 * and the only way the client socket can get a write is entering when the
8247 * the event loop, we accumulate all the AOF writes in a memory
8248 * buffer and write it on disk using this function just before entering
8249 * the event loop again. */
8250static void flushAppendOnlyFile(void) {
8251 time_t now;
8252 ssize_t nwritten;
8253
8254 if (sdslen(server.aofbuf) == 0) return;
8255
8256 /* We want to perform a single write. This should be guaranteed atomic
8257 * at least if the filesystem we are writing is a real physical one.
8258 * While this will save us against the server being killed I don't think
8259 * there is much to do about the whole server stopping for power problems
8260 * or alike */
8261 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8262 if (nwritten != (signed)sdslen(server.aofbuf)) {
8263 /* Ooops, we are in troubles. The best thing to do for now is
8264 * aborting instead of giving the illusion that everything is
8265 * working as expected. */
8266 if (nwritten == -1) {
8267 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8268 } else {
8269 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8270 }
8271 exit(1);
8272 }
8273 sdsfree(server.aofbuf);
8274 server.aofbuf = sdsempty();
8275
38db9171 8276 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8277 * childs performing heavy I/O on disk. */
8278 if (server.no_appendfsync_on_rewrite &&
8279 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8280 return;
28ed1f33 8281 /* Fsync if needed */
8282 now = time(NULL);
8283 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8284 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8285 now-server.lastfsync > 1))
8286 {
8287 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8288 * flushing metadata. */
8289 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8290 server.lastfsync = now;
8291 }
8292}
8293
9376e434
PN
8294static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8295 int j;
8296 buf = sdscatprintf(buf,"*%d\r\n",argc);
8297 for (j = 0; j < argc; j++) {
8298 robj *o = getDecodedObject(argv[j]);
8299 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8300 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8301 buf = sdscatlen(buf,"\r\n",2);
8302 decrRefCount(o);
8303 }
8304 return buf;
8305}
8306
8307static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8308 int argc = 3;
8309 long when;
8310 robj *argv[3];
8311
8312 /* Make sure we can use strtol */
8313 seconds = getDecodedObject(seconds);
8314 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8315 decrRefCount(seconds);
8316
8317 argv[0] = createStringObject("EXPIREAT",8);
8318 argv[1] = key;
8319 argv[2] = createObject(REDIS_STRING,
8320 sdscatprintf(sdsempty(),"%ld",when));
8321 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8322 decrRefCount(argv[0]);
8323 decrRefCount(argv[2]);
8324 return buf;
8325}
8326
f80dff62 8327static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8328 sds buf = sdsempty();
f80dff62 8329 robj *tmpargv[3];
8330
8331 /* The DB this command was targetting is not the same as the last command
8332 * we appendend. To issue a SELECT command is needed. */
8333 if (dictid != server.appendseldb) {
8334 char seldb[64];
8335
8336 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 8337 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 8338 (unsigned long)strlen(seldb),seldb);
f80dff62 8339 server.appendseldb = dictid;
8340 }
8341
f80dff62 8342 if (cmd->proc == expireCommand) {
9376e434
PN
8343 /* Translate EXPIRE into EXPIREAT */
8344 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8345 } else if (cmd->proc == setexCommand) {
8346 /* Translate SETEX to SET and EXPIREAT */
8347 tmpargv[0] = createStringObject("SET",3);
f80dff62 8348 tmpargv[1] = argv[1];
9376e434
PN
8349 tmpargv[2] = argv[3];
8350 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8351 decrRefCount(tmpargv[0]);
8352 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8353 } else {
8354 buf = catAppendOnlyGenericCommand(buf,argc,argv);
f80dff62 8355 }
8356
28ed1f33 8357 /* Append to the AOF buffer. This will be flushed on disk just before
8358 * of re-entering the event loop, so before the client will get a
8359 * positive reply about the operation performed. */
8360 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8361
85a83172 8362 /* If a background append only file rewriting is in progress we want to
8363 * accumulate the differences between the child DB and the current one
8364 * in a buffer, so that when the child process will do its work we
8365 * can append the differences to the new append only file. */
8366 if (server.bgrewritechildpid != -1)
8367 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8368
8369 sdsfree(buf);
f80dff62 8370}
8371
8372/* In Redis commands are always executed in the context of a client, so in
8373 * order to load the append only file we need to create a fake client. */
8374static struct redisClient *createFakeClient(void) {
8375 struct redisClient *c = zmalloc(sizeof(*c));
8376
8377 selectDb(c,0);
8378 c->fd = -1;
8379 c->querybuf = sdsempty();
8380 c->argc = 0;
8381 c->argv = NULL;
8382 c->flags = 0;
9387d17d 8383 /* We set the fake client as a slave waiting for the synchronization
8384 * so that Redis will not try to send replies to this client. */
8385 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 8386 c->reply = listCreate();
8387 listSetFreeMethod(c->reply,decrRefCount);
8388 listSetDupMethod(c->reply,dupClientReplyValue);
4132ad8d 8389 initClientMultiState(c);
f80dff62 8390 return c;
8391}
8392
8393static void freeFakeClient(struct redisClient *c) {
8394 sdsfree(c->querybuf);
8395 listRelease(c->reply);
4132ad8d 8396 freeClientMultiState(c);
f80dff62 8397 zfree(c);
8398}
8399
8400/* Replay the append log file. On error REDIS_OK is returned. On non fatal
8401 * error (the append only file is zero-length) REDIS_ERR is returned. On
8402 * fatal error an error message is logged and the program exists. */
8403int loadAppendOnlyFile(char *filename) {
8404 struct redisClient *fakeClient;
8405 FILE *fp = fopen(filename,"r");
8406 struct redis_stat sb;
b492cf00 8407 unsigned long long loadedkeys = 0;
4132ad8d 8408 int appendonly = server.appendonly;
f80dff62 8409
8410 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8411 return REDIS_ERR;
8412
8413 if (fp == NULL) {
8414 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8415 exit(1);
8416 }
8417
4132ad8d
PN
8418 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8419 * to the same file we're about to read. */
8420 server.appendonly = 0;
8421
f80dff62 8422 fakeClient = createFakeClient();
8423 while(1) {
8424 int argc, j;
8425 unsigned long len;
8426 robj **argv;
8427 char buf[128];
8428 sds argsds;
8429 struct redisCommand *cmd;
8430
8431 if (fgets(buf,sizeof(buf),fp) == NULL) {
8432 if (feof(fp))
8433 break;
8434 else
8435 goto readerr;
8436 }
8437 if (buf[0] != '*') goto fmterr;
8438 argc = atoi(buf+1);
8439 argv = zmalloc(sizeof(robj*)*argc);
8440 for (j = 0; j < argc; j++) {
8441 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8442 if (buf[0] != '$') goto fmterr;
8443 len = strtol(buf+1,NULL,10);
8444 argsds = sdsnewlen(NULL,len);
0f151ef1 8445 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 8446 argv[j] = createObject(REDIS_STRING,argsds);
8447 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8448 }
8449
8450 /* Command lookup */
8451 cmd = lookupCommand(argv[0]->ptr);
8452 if (!cmd) {
8453 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8454 exit(1);
8455 }
bdcb92f2 8456 /* Try object encoding */
f80dff62 8457 if (cmd->flags & REDIS_CMD_BULK)
05df7621 8458 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 8459 /* Run the command in the context of a fake client */
8460 fakeClient->argc = argc;
8461 fakeClient->argv = argv;
8462 cmd->proc(fakeClient);
8463 /* Discard the reply objects list from the fake client */
8464 while(listLength(fakeClient->reply))
8465 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8466 /* Clean up, ready for the next command */
8467 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8468 zfree(argv);
b492cf00 8469 /* Handle swapping while loading big datasets when VM is on */
8470 loadedkeys++;
8471 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8472 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 8473 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 8474 }
8475 }
f80dff62 8476 }
4132ad8d
PN
8477
8478 /* This point can only be reached when EOF is reached without errors.
8479 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8480 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8481
f80dff62 8482 fclose(fp);
8483 freeFakeClient(fakeClient);
4132ad8d 8484 server.appendonly = appendonly;
f80dff62 8485 return REDIS_OK;
8486
8487readerr:
8488 if (feof(fp)) {
8489 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8490 } else {
8491 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8492 }
8493 exit(1);
8494fmterr:
8495 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8496 exit(1);
8497}
8498
9d65a1bb 8499/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
9c8e3cee 8500static int fwriteBulkObject(FILE *fp, robj *obj) {
9d65a1bb 8501 char buf[128];
b9bc0eef 8502 int decrrc = 0;
8503
f2d9f50f 8504 /* Avoid the incr/decr ref count business if possible to help
8505 * copy-on-write (we are often in a child process when this function
8506 * is called).
8507 * Also makes sure that key objects don't get incrRefCount-ed when VM
8508 * is enabled */
8509 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 8510 obj = getDecodedObject(obj);
8511 decrrc = 1;
8512 }
9d65a1bb 8513 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8514 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 8515 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8516 goto err;
9d65a1bb 8517 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 8518 if (decrrc) decrRefCount(obj);
9d65a1bb 8519 return 1;
8520err:
b9bc0eef 8521 if (decrrc) decrRefCount(obj);
9d65a1bb 8522 return 0;
8523}
8524
/* Write a binary-safe string into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes of payload (it may contain null bytes; no
 * payload is written when 'len' is 0).
 * Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: it must be formatted with %lu, not %ld. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8536
9d65a1bb 8537/* Write a double value in bulk format $<count>\r\n<payload>\r\n */
8538static int fwriteBulkDouble(FILE *fp, double d) {
8539 char buf[128], dbuf[128];
8540
8541 snprintf(dbuf,sizeof(dbuf),"%.17g\r\n",d);
8542 snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)strlen(dbuf)-2);
8543 if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
8544 if (fwrite(dbuf,strlen(dbuf),1,fp) == 0) return 0;
8545 return 1;
8546}
8547
8548/* Write a long value in bulk format $<count>\r\n<payload>\r\n */
8549static int fwriteBulkLong(FILE *fp, long l) {
8550 char buf[128], lbuf[128];
8551
8552 snprintf(lbuf,sizeof(lbuf),"%ld\r\n",l);
8553 snprintf(buf,sizeof(buf),"$%lu\r\n",(unsigned long)strlen(lbuf)-2);
8554 if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
8555 if (fwrite(lbuf,strlen(lbuf),1,fp) == 0) return 0;
8556 return 1;
8557}
8558
8559/* Write a sequence of commands able to fully rebuild the dataset into
8560 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8561static int rewriteAppendOnlyFile(char *filename) {
8562 dictIterator *di = NULL;
8563 dictEntry *de;
8564 FILE *fp;
8565 char tmpfile[256];
8566 int j;
8567 time_t now = time(NULL);
8568
8569 /* Note that we have to use a different temp name here compared to the
8570 * one used by rewriteAppendOnlyFileBackground() function. */
8571 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8572 fp = fopen(tmpfile,"w");
8573 if (!fp) {
8574 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8575 return REDIS_ERR;
8576 }
8577 for (j = 0; j < server.dbnum; j++) {
8578 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8579 redisDb *db = server.db+j;
8580 dict *d = db->dict;
8581 if (dictSize(d) == 0) continue;
8582 di = dictGetIterator(d);
8583 if (!di) {
8584 fclose(fp);
8585 return REDIS_ERR;
8586 }
8587
8588 /* SELECT the new DB */
8589 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 8590 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 8591
8592 /* Iterate this DB writing every entry */
8593 while((de = dictNext(di)) != NULL) {
e7546c63 8594 robj *key, *o;
8595 time_t expiretime;
8596 int swapped;
8597
8598 key = dictGetEntryKey(de);
b9bc0eef 8599 /* If the value for this key is swapped, load a preview in memory.
8600 * We use a "swapped" flag to remember if we need to free the
8601 * value object instead to just increment the ref count anyway
8602 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 8603 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8604 key->storage == REDIS_VM_SWAPPING) {
e7546c63 8605 o = dictGetEntryVal(de);
8606 swapped = 0;
8607 } else {
8608 o = vmPreviewObject(key);
e7546c63 8609 swapped = 1;
8610 }
8611 expiretime = getExpire(db,key);
9d65a1bb 8612
8613 /* Save the key and associated value */
9d65a1bb 8614 if (o->type == REDIS_STRING) {
8615 /* Emit a SET command */
8616 char cmd[]="*3\r\n$3\r\nSET\r\n";
8617 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8618 /* Key and value */
9c8e3cee 8619 if (fwriteBulkObject(fp,key) == 0) goto werr;
8620 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 8621 } else if (o->type == REDIS_LIST) {
8622 /* Emit the RPUSHes needed to rebuild the list */
8623 list *list = o->ptr;
8624 listNode *ln;
c7df85a4 8625 listIter li;
9d65a1bb 8626
c7df85a4 8627 listRewind(list,&li);
8628 while((ln = listNext(&li))) {
9d65a1bb 8629 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8630 robj *eleobj = listNodeValue(ln);
8631
8632 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8633 if (fwriteBulkObject(fp,key) == 0) goto werr;
8634 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8635 }
8636 } else if (o->type == REDIS_SET) {
8637 /* Emit the SADDs needed to rebuild the set */
8638 dict *set = o->ptr;
8639 dictIterator *di = dictGetIterator(set);
8640 dictEntry *de;
8641
8642 while((de = dictNext(di)) != NULL) {
8643 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8644 robj *eleobj = dictGetEntryKey(de);
8645
8646 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8647 if (fwriteBulkObject(fp,key) == 0) goto werr;
8648 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8649 }
8650 dictReleaseIterator(di);
8651 } else if (o->type == REDIS_ZSET) {
8652 /* Emit the ZADDs needed to rebuild the sorted set */
8653 zset *zs = o->ptr;
8654 dictIterator *di = dictGetIterator(zs->dict);
8655 dictEntry *de;
8656
8657 while((de = dictNext(di)) != NULL) {
8658 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8659 robj *eleobj = dictGetEntryKey(de);
8660 double *score = dictGetEntryVal(de);
8661
8662 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8663 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8664 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 8665 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8666 }
8667 dictReleaseIterator(di);
9c8e3cee 8668 } else if (o->type == REDIS_HASH) {
8669 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8670
8671 /* Emit the HSETs needed to rebuild the hash */
8672 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8673 unsigned char *p = zipmapRewind(o->ptr);
8674 unsigned char *field, *val;
8675 unsigned int flen, vlen;
8676
8677 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8678 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8679 if (fwriteBulkObject(fp,key) == 0) goto werr;
8680 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8681 return -1;
8682 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8683 return -1;
8684 }
8685 } else {
8686 dictIterator *di = dictGetIterator(o->ptr);
8687 dictEntry *de;
8688
8689 while((de = dictNext(di)) != NULL) {
8690 robj *field = dictGetEntryKey(de);
8691 robj *val = dictGetEntryVal(de);
8692
8693 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8694 if (fwriteBulkObject(fp,key) == 0) goto werr;
8695 if (fwriteBulkObject(fp,field) == -1) return -1;
8696 if (fwriteBulkObject(fp,val) == -1) return -1;
8697 }
8698 dictReleaseIterator(di);
8699 }
9d65a1bb 8700 } else {
f83c6cb5 8701 redisPanic("Unknown object type");
9d65a1bb 8702 }
8703 /* Save the expire time */
8704 if (expiretime != -1) {
e96e4fbf 8705 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 8706 /* If this key is already expired skip it */
8707 if (expiretime < now) continue;
8708 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8709 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8710 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8711 }
b9bc0eef 8712 if (swapped) decrRefCount(o);
9d65a1bb 8713 }
8714 dictReleaseIterator(di);
8715 }
8716
8717 /* Make sure data will not remain on the OS's output buffers */
8718 fflush(fp);
b0bd87f6 8719 aof_fsync(fileno(fp));
9d65a1bb 8720 fclose(fp);
e0a62c7f 8721
9d65a1bb 8722 /* Use RENAME to make sure the DB file is changed atomically only
8723 * if the generate DB file is ok. */
8724 if (rename(tmpfile,filename) == -1) {
8725 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8726 unlink(tmpfile);
8727 return REDIS_ERR;
8728 }
8729 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8730 return REDIS_OK;
8731
8732werr:
8733 fclose(fp);
8734 unlink(tmpfile);
e96e4fbf 8735 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 8736 if (di) dictReleaseIterator(di);
8737 return REDIS_ERR;
8738}
8739
8740/* This is how rewriting of the append only file in background works:
8741 *
8742 * 1) The user calls BGREWRITEAOF
8743 * 2) Redis calls this function, that forks():
8744 * 2a) the child rewrite the append only file in a temp file.
8745 * 2b) the parent accumulates differences in server.bgrewritebuf.
8746 * 3) When the child finished '2a' exists.
8747 * 4) The parent will trap the exit code, if it's OK, will append the
8748 * data accumulated into server.bgrewritebuf into the temp file, and
8749 * finally will rename(2) the temp file in the actual file name.
8750 * The the new file is reopened as the new append only file. Profit!
8751 */
8752static int rewriteAppendOnlyFileBackground(void) {
8753 pid_t childpid;
8754
8755 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 8756 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 8757 if ((childpid = fork()) == 0) {
8758 /* Child */
8759 char tmpfile[256];
9d65a1bb 8760
054e426d 8761 if (server.vm_enabled) vmReopenSwapFile();
8762 close(server.fd);
9d65a1bb 8763 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8764 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 8765 _exit(0);
9d65a1bb 8766 } else {
478c2c6f 8767 _exit(1);
9d65a1bb 8768 }
8769 } else {
8770 /* Parent */
8771 if (childpid == -1) {
8772 redisLog(REDIS_WARNING,
8773 "Can't rewrite append only file in background: fork: %s",
8774 strerror(errno));
8775 return REDIS_ERR;
8776 }
8777 redisLog(REDIS_NOTICE,
8778 "Background append only file rewriting started by pid %d",childpid);
8779 server.bgrewritechildpid = childpid;
884d4b39 8780 updateDictResizePolicy();
85a83172 8781 /* We set appendseldb to -1 in order to force the next call to the
8782 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8783 * accumulated by the parent into server.bgrewritebuf will start
8784 * with a SELECT statement and it will be safe to merge. */
8785 server.appendseldb = -1;
9d65a1bb 8786 return REDIS_OK;
8787 }
8788 return REDIS_OK; /* unreached */
8789}
8790
8791static void bgrewriteaofCommand(redisClient *c) {
8792 if (server.bgrewritechildpid != -1) {
8793 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8794 return;
8795 }
8796 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 8797 char *status = "+Background append only file rewriting started\r\n";
8798 addReplySds(c,sdsnew(status));
9d65a1bb 8799 } else {
8800 addReply(c,shared.err);
8801 }
8802}
8803
/* Unlink the temp file named after the given background rewrite child pid
 * ("temp-rewriteaof-bg-<pid>.aof"). The unlink(2) result is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int) childpid);
    unlink(path);
}
8810
996cb5f7 8811/* Virtual Memory is composed mainly of two subsystems:
8812 * - Blocking Virtual Memory
8813 * - Threaded Virtual Memory I/O
8814 * The two parts are not fully decoupled, but functions are split among two
8815 * different sections of the source code (delimited by comments) in order to
8816 * make more clear what functionality is about the blocking VM and what about
8817 * the threaded (not blocking) VM.
8818 *
8819 * Redis VM design:
8820 *
8821 * Redis VM is a blocking VM (one that blocks reading swapped values from
8822 * disk into memory when a value swapped out is needed in memory) that is made
8823 * unblocking by trying to examine the command argument vector in order to
8824 * load in background values that will likely be needed in order to exec
8825 * the command. The command is executed only once all the relevant keys
8826 * are loaded into memory.
8827 *
8828 * This basically is almost as simple as a blocking VM, but almost as parallel
8829 * as a fully non-blocking VM.
8830 */
8831
2e5eb04e 8832/* Called when the user switches from "appendonly yes" to "appendonly no"
8833 * at runtime using the CONFIG command. */
8834static void stopAppendOnly(void) {
8835 flushAppendOnlyFile();
b0bd87f6 8836 aof_fsync(server.appendfd);
2e5eb04e 8837 close(server.appendfd);
8838
8839 server.appendfd = -1;
8840 server.appendseldb = -1;
8841 server.appendonly = 0;
8842 /* rewrite operation in progress? kill it, wait child exit */
8843 if (server.bgsavechildpid != -1) {
8844 int statloc;
8845
30dd89b6 8846 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8847 wait3(&statloc,0,NULL);
2e5eb04e 8848 /* reset the buffer accumulating changes while the child saves */
8849 sdsfree(server.bgrewritebuf);
8850 server.bgrewritebuf = sdsempty();
30dd89b6 8851 server.bgsavechildpid = -1;
2e5eb04e 8852 }
8853}
8854
8855/* Called when the user switches from "appendonly no" to "appendonly yes"
8856 * at runtime using the CONFIG command. */
8857static int startAppendOnly(void) {
8858 server.appendonly = 1;
8859 server.lastfsync = time(NULL);
8860 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8861 if (server.appendfd == -1) {
8862 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8863 return REDIS_ERR;
8864 }
8865 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8866 server.appendonly = 0;
8867 close(server.appendfd);
8868 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8869 return REDIS_ERR;
8870 }
8871 return REDIS_OK;
8872}
8873
996cb5f7 8874/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 8875
75680a3c 8876static void vmInit(void) {
8877 off_t totsize;
996cb5f7 8878 int pipefds[2];
bcaa7a4f 8879 size_t stacksize;
8b5bb414 8880 struct flock fl;
75680a3c 8881
4ad37480 8882 if (server.vm_max_threads != 0)
8883 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8884
054e426d 8885 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8b5bb414 8886 /* Try to open the old swap file, otherwise create it */
6fa987e3 8887 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8888 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8889 }
75680a3c 8890 if (server.vm_fp == NULL) {
6fa987e3 8891 redisLog(REDIS_WARNING,
8b5bb414 8892 "Can't open the swap file: %s. Exiting.",
6fa987e3 8893 strerror(errno));
75680a3c 8894 exit(1);
8895 }
8896 server.vm_fd = fileno(server.vm_fp);
8b5bb414 8897 /* Lock the swap file for writing, this is useful in order to avoid
8898 * another instance to use the same swap file for a config error. */
8899 fl.l_type = F_WRLCK;
8900 fl.l_whence = SEEK_SET;
8901 fl.l_start = fl.l_len = 0;
8902 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8903 redisLog(REDIS_WARNING,
8904 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8905 exit(1);
8906 }
8907 /* Initialize */
75680a3c 8908 server.vm_next_page = 0;
8909 server.vm_near_pages = 0;
7d98e08c 8910 server.vm_stats_used_pages = 0;
8911 server.vm_stats_swapped_objects = 0;
8912 server.vm_stats_swapouts = 0;
8913 server.vm_stats_swapins = 0;
75680a3c 8914 totsize = server.vm_pages*server.vm_page_size;
8915 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8916 if (ftruncate(server.vm_fd,totsize) == -1) {
8917 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8918 strerror(errno));
8919 exit(1);
8920 } else {
8921 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8922 }
7d30035d 8923 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 8924 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 8925 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 8926 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 8927
996cb5f7 8928 /* Initialize threaded I/O (used by Virtual Memory) */
8929 server.io_newjobs = listCreate();
8930 server.io_processing = listCreate();
8931 server.io_processed = listCreate();
d5d55fc3 8932 server.io_ready_clients = listCreate();
92f8e882 8933 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 8934 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8935 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 8936 server.io_active_threads = 0;
996cb5f7 8937 if (pipe(pipefds) == -1) {
8938 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8939 ,strerror(errno));
8940 exit(1);
8941 }
8942 server.io_ready_pipe_read = pipefds[0];
8943 server.io_ready_pipe_write = pipefds[1];
8944 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 8945 /* LZF requires a lot of stack */
8946 pthread_attr_init(&server.io_threads_attr);
8947 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8948 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8949 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 8950 /* Listen for events in the threaded I/O pipe */
8951 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8952 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8953 oom("creating file event");
75680a3c 8954}
8955
06224fec 8956/* Mark the page as used */
8957static void vmMarkPageUsed(off_t page) {
8958 off_t byte = page/8;
8959 int bit = page&7;
970e10bb 8960 redisAssert(vmFreePage(page) == 1);
06224fec 8961 server.vm_bitmap[byte] |= 1<<bit;
8962}
8963
8964/* Mark N contiguous pages as used, with 'page' being the first. */
8965static void vmMarkPagesUsed(off_t page, off_t count) {
8966 off_t j;
8967
8968 for (j = 0; j < count; j++)
7d30035d 8969 vmMarkPageUsed(page+j);
7d98e08c 8970 server.vm_stats_used_pages += count;
7c775e09 8971 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8972 (long long)count, (long long)page);
06224fec 8973}
8974
8975/* Mark the page as free */
8976static void vmMarkPageFree(off_t page) {
8977 off_t byte = page/8;
8978 int bit = page&7;
970e10bb 8979 redisAssert(vmFreePage(page) == 0);
06224fec 8980 server.vm_bitmap[byte] &= ~(1<<bit);
8981}
8982
8983/* Mark N contiguous pages as free, with 'page' being the first. */
8984static void vmMarkPagesFree(off_t page, off_t count) {
8985 off_t j;
8986
8987 for (j = 0; j < count; j++)
7d30035d 8988 vmMarkPageFree(page+j);
7d98e08c 8989 server.vm_stats_used_pages -= count;
7c775e09 8990 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8991 (long long)count, (long long)page);
06224fec 8992}
8993
8994/* Test if the page is free */
8995static int vmFreePage(off_t page) {
8996 off_t byte = page/8;
8997 int bit = page&7;
7d30035d 8998 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 8999}
9000
9001/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 9002 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 9003 * REDIS_ERR is returned.
06224fec 9004 *
9005 * This function uses a simple algorithm: we try to allocate
9006 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9007 * again from the start of the swap file searching for free spaces.
9008 *
9009 * If it looks pretty clear that there are no free pages near our offset
9010 * we try to find less populated places doing a forward jump of
9011 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9012 * without hurry, and then we jump again and so forth...
e0a62c7f 9013 *
06224fec 9014 * This function can be improved using a free list to avoid to guess
9015 * too much, since we could collect data about freed pages.
9016 *
9017 * note: I implemented this function just after watching an episode of
9018 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
/*
 * Side effects: server.vm_near_pages is incremented (or reset together with
 * server.vm_next_page) on every call; on success server.vm_next_page is set
 * to the page following the returned cluster. The pages found are NOT marked
 * as used here: nothing in this function writes to the page bitmap.
 */
9019 */
c7df85a4 9020static int vmFindContiguousPages(off_t *first, off_t n) {
/* base: page the scan starts from; offset: distance from base, bounded by
 * server.vm_pages; since_jump: pages tested since the last random jump;
 * numfree: length of the current run of free pages. */
06224fec 9021 off_t base, offset = 0, since_jump = 0, numfree = 0;
9022
9023 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9024 server.vm_near_pages = 0;
9025 server.vm_next_page = 0;
9026 }
9027 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9028 base = server.vm_next_page;
9029
9030 while(offset < server.vm_pages) {
9031 off_t this = base+offset;
9032
9033 /* If we overflow, restart from page zero */
9034 if (this >= server.vm_pages) {
9035 this -= server.vm_pages;
9036 if (this == 0) {
9037 /* Just overflowed, what we found on tail is no longer
9038 * interesting, as it's no longer contiguous. */
9039 numfree = 0;
9040 }
9041 }
9042 if (vmFreePage(this)) {
9043 /* This is a free page */
9044 numfree++;
9045 /* Already got N free pages? Return to the caller, with success */
9046 if (numfree == n) {
/* 'this' is the last page of the run, so the run starts n-1 pages before. */
7d30035d 9047 *first = this-(n-1);
9048 server.vm_next_page = this+1;
7c775e09 9049 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 9050 return REDIS_OK;
06224fec 9051 }
9052 } else {
9053 /* The current one is not a free page */
9054 numfree = 0;
9055 }
9056
9057 /* Fast-forward if the current page is not free and we already
9058 * searched enough near this place. */
9059 since_jump++;
9060 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9061 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9062 since_jump = 0;
9063 /* Note that even if we rewind after the jump, we don't need
9064 * to make sure numfree is set to zero as we only jump *if* it
9065 * is set to zero. */
9066 } else {
9067 /* Otherwise just check the next page */
9068 offset++;
9069 }
9070 }
/* The whole offset range was consumed without finding a run of n pages. */
3a66edc7 9071 return REDIS_ERR;
9072}
9073
a5819310 9074/* Write the specified object at the specified page of the swap file */
9075static int vmWriteObjectOnSwap(robj *o, off_t page) {
9076 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9077 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9078 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9079 redisLog(REDIS_WARNING,
9ebed7cf 9080 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 9081 strerror(errno));
9082 return REDIS_ERR;
9083 }
9084 rdbSaveObject(server.vm_fp,o);
ba76a8f9 9085 fflush(server.vm_fp);
a5819310 9086 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9087 return REDIS_OK;
9088}
9089
3a66edc7 9090/* Swap the 'val' object relative to 'key' into disk. Store all the information
9091 * needed to later retrieve the object into the key object.
9092 * If we can't find enough contiguous empty pages to swap the object on disk
9093 * REDIS_ERR is returned. */
a69a0c9c 9094static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 9095 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 9096 off_t page;
9097
9098 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 9099 assert(key->refcount == 1);
3a66edc7 9100 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 9101 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 9102 key->vm.page = page;
9103 key->vm.usedpages = pages;
9104 key->storage = REDIS_VM_SWAPPED;
d894161b 9105 key->vtype = val->type;
3a66edc7 9106 decrRefCount(val); /* Deallocate the object from memory. */
9107 vmMarkPagesUsed(page,pages);
7d30035d 9108 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
9109 (unsigned char*) key->ptr,
9110 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 9111 server.vm_stats_swapped_objects++;
9112 server.vm_stats_swapouts++;
3a66edc7 9113 return REDIS_OK;
9114}
9115
a5819310 9116static robj *vmReadObjectFromSwap(off_t page, int type) {
9117 robj *o;
3a66edc7 9118
a5819310 9119 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9120 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 9121 redisLog(REDIS_WARNING,
d5d55fc3 9122 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 9123 strerror(errno));
478c2c6f 9124 _exit(1);
3a66edc7 9125 }
a5819310 9126 o = rdbLoadObject(type,server.vm_fp);
9127 if (o == NULL) {
d5d55fc3 9128 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 9129 _exit(1);
3a66edc7 9130 }
a5819310 9131 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9132 return o;
9133}
9134
9135/* Load the value object relative to the 'key' object from swap to memory.
9136 * The newly allocated object is returned.
9137 *
9138 * If preview is true the unserialized object is returned to the caller but
9139 * no changes are made to the key object, nor the pages are marked as freed */
9140static robj *vmGenericLoadObject(robj *key, int preview) {
9141 robj *val;
9142
d5d55fc3 9143 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 9144 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 9145 if (!preview) {
9146 key->storage = REDIS_VM_MEMORY;
9147 key->vm.atime = server.unixtime;
9148 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9149 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
9150 (unsigned char*) key->ptr);
7d98e08c 9151 server.vm_stats_swapped_objects--;
38aba9a1 9152 } else {
9153 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
9154 (unsigned char*) key->ptr);
7e69548d 9155 }
7d98e08c 9156 server.vm_stats_swapins++;
3a66edc7 9157 return val;
06224fec 9158}
9159
7e69548d 9160/* Plain object loading, from swap to memory */
9161static robj *vmLoadObject(robj *key) {
996cb5f7 9162 /* If we are loading the object in background, stop it, we
9163 * need to load this object synchronously ASAP. */
9164 if (key->storage == REDIS_VM_LOADING)
9165 vmCancelThreadedIOJob(key);
7e69548d 9166 return vmGenericLoadObject(key,0);
9167}
9168
9169/* Just load the value on disk, without to modify the key.
9170 * This is useful when we want to perform some operation on the value
9171 * without to really bring it from swap to memory, like while saving the
9172 * dataset or rewriting the append only log. */
9173static robj *vmPreviewObject(robj *key) {
9174 return vmGenericLoadObject(key,1);
9175}
9176
4ef8de8a 9177/* How a good candidate is this object for swapping?
9178 * The better candidate it is, the greater the returned value.
9179 *
9180 * Currently we try to perform a fast estimation of the object size in
9181 * memory, and combine it with aging informations.
9182 *
9183 * Basically swappability = idle-time * log(estimated size)
9184 *
9185 * Bigger objects are preferred over smaller objects, but not
9186 * proportionally, this is why we use the logarithm. This algorithm is
9187 * just a first try and will probably be tuned later. */
9188static double computeObjectSwappability(robj *o) {
9189 time_t age = server.unixtime - o->vm.atime;
9190 long asize = 0;
9191 list *l;
9192 dict *d;
9193 struct dictEntry *de;
9194 int z;
9195
9196 if (age <= 0) return 0;
9197 switch(o->type) {
9198 case REDIS_STRING:
9199 if (o->encoding != REDIS_ENCODING_RAW) {
9200 asize = sizeof(*o);
9201 } else {
9202 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9203 }
9204 break;
9205 case REDIS_LIST:
9206 l = o->ptr;
9207 listNode *ln = listFirst(l);
9208
9209 asize = sizeof(list);
9210 if (ln) {
9211 robj *ele = ln->value;
9212 long elesize;
9213
9214 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9215 (sizeof(*o)+sdslen(ele->ptr)) :
9216 sizeof(*o);
9217 asize += (sizeof(listNode)+elesize)*listLength(l);
9218 }
9219 break;
9220 case REDIS_SET:
9221 case REDIS_ZSET:
9222 z = (o->type == REDIS_ZSET);
9223 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9224
9225 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9226 if (z) asize += sizeof(zset)-sizeof(dict);
9227 if (dictSize(d)) {
9228 long elesize;
9229 robj *ele;
9230
9231 de = dictGetRandomKey(d);
9232 ele = dictGetEntryKey(de);
9233 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9234 (sizeof(*o)+sdslen(ele->ptr)) :
9235 sizeof(*o);
9236 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9237 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9238 }
9239 break;
a97b9060 9240 case REDIS_HASH:
9241 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9242 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9243 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9244 unsigned int klen, vlen;
9245 unsigned char *key, *val;
9246
9247 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9248 klen = 0;
9249 vlen = 0;
9250 }
9251 asize = len*(klen+vlen+3);
9252 } else if (o->encoding == REDIS_ENCODING_HT) {
9253 d = o->ptr;
9254 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9255 if (dictSize(d)) {
9256 long elesize;
9257 robj *ele;
9258
9259 de = dictGetRandomKey(d);
9260 ele = dictGetEntryKey(de);
9261 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9262 (sizeof(*o)+sdslen(ele->ptr)) :
9263 sizeof(*o);
9264 ele = dictGetEntryVal(de);
9265 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9266 (sizeof(*o)+sdslen(ele->ptr)) :
9267 sizeof(*o);
9268 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9269 }
9270 }
9271 break;
4ef8de8a 9272 }
c8c72447 9273 return (double)age*log(1+asize);
4ef8de8a 9274}
9275
9276/* Try to swap an object that's a good candidate for swapping.
9277 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 9278 * to swap any object at all.
9279 *
9280 * If 'usethreads' is true, Redis will try to swap the object in background
9281 * using I/O threads. */
/* Candidate selection: for every non-empty DB a handful of random keys is
 * sampled, and the entry with the highest computeObjectSwappability() score
 * across all the DBs is the one that gets swapped.
 *
 * Note that in the threaded case REDIS_OK is returned right after calling
 * vmSwapObjectThreaded(), whose result is not inspected here. */
9282static int vmSwapOneObject(int usethreads) {
4ef8de8a 9283 int j, i;
9284 struct dictEntry *best = NULL;
9285 double best_swappability = 0;
b9bc0eef 9286 redisDb *best_db = NULL;
4ef8de8a 9287 robj *key, *val;
9288
9289 for (j = 0; j < server.dbnum; j++) {
9290 redisDb *db = server.db+j;
b72f6a4b 9291 /* Why maxtries is set to 100?
9292 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9293 * are swappable objects */
b0d8747d 9294 int maxtries = 100;
4ef8de8a 9295
9296 if (dictSize(db->dict) == 0) continue;
/* Up to 5 accepted samples per DB; rejected samples are retried for free
 * until the maxtries budget is exhausted (see the i-- below). */
9297 for (i = 0; i < 5; i++) {
9298 dictEntry *de;
9299 double swappability;
9300
e3cadb8a 9301 if (maxtries) maxtries--;
4ef8de8a 9302 de = dictGetRandomKey(db->dict);
9303 key = dictGetEntryKey(de);
9304 val = dictGetEntryVal(de);
1064ef87 9305 /* Only swap objects that are currently in memory.
9306 *
9307 * Also don't swap shared objects if threaded VM is on, as we
9308 * try to ensure that the main thread does not touch the
9309 * object while the I/O thread is using it, but we can't
9310 * control other keys without adding additional mutex. */
9311 if (key->storage != REDIS_VM_MEMORY ||
9312 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 9313 if (maxtries) i--; /* don't count this try */
9314 continue;
9315 }
4ef8de8a 9316 swappability = computeObjectSwappability(val);
9317 if (!best || swappability > best_swappability) {
9318 best = de;
9319 best_swappability = swappability;
b9bc0eef 9320 best_db = db;
4ef8de8a 9321 }
9322 }
9323 }
/* No in-memory (and, with threaded VM, unshared) value was sampled. */
7c775e09 9324 if (best == NULL) return REDIS_ERR;
4ef8de8a 9325 key = dictGetEntryKey(best);
9326 val = dictGetEntryVal(best);
9327
e3cadb8a 9328 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 9329 key->ptr, best_swappability);
9330
9331 /* Unshare the key if needed */
/* The swap location is recorded inside the key object itself, so the dict
 * entry gets a private copy of a key that is referenced elsewhere. */
9332 if (key->refcount > 1) {
9333 robj *newkey = dupStringObject(key);
9334 decrRefCount(key);
9335 key = dictGetEntryKey(best) = newkey;
9336 }
9337 /* Swap it */
a69a0c9c 9338 if (usethreads) {
b9bc0eef 9339 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 9340 return REDIS_OK;
9341 } else {
a69a0c9c 9342 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
/* The value object was released by the swap: clear the entry's value. */
9343 dictGetEntryVal(best) = NULL;
9344 return REDIS_OK;
9345 } else {
9346 return REDIS_ERR;
9347 }
4ef8de8a 9348 }
9349}
9350
/* Swap one object out, synchronously. See vmSwapOneObject() for the return
 * value. */
static int vmSwapOneObjectBlocking() {
    const int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
9354
/* Swap one object out using the I/O threads. See vmSwapOneObject() for the
 * return value. */
static int vmSwapOneObjectThreaded() {
    const int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
9358
7e69548d 9359/* Return true if it's safe to swap out objects in a given moment.
9360 * Basically we don't want to swap objects out while there is a BGSAVE
9361 * or a BGAEOREWRITE running in backgroud. */
9362static int vmCanSwapOut(void) {
9363 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9364}
9365
1b03836c 9366/* Delete a key if swapped. Returns 1 if the key was found, was swapped
9367 * and was deleted. Otherwise 0 is returned. */
9368static int deleteIfSwapped(redisDb *db, robj *key) {
9369 dictEntry *de;
9370 robj *foundkey;
9371
9372 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9373 foundkey = dictGetEntryKey(de);
9374 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9375 deleteKey(db,key);
9376 return 1;
9377}
9378
996cb5f7 9379/* =================== Virtual Memory - Threaded I/O ======================= */
9380
b9bc0eef 9381static void freeIOJob(iojob *j) {
d5d55fc3 9382 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9383 j->type == REDIS_IOJOB_DO_SWAP ||
9384 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 9385 decrRefCount(j->val);
78ebe4c8 9386 /* We don't decrRefCount the j->key field as we did't incremented
9387 * the count creating IO Jobs. This is because the key field here is
9388 * just used as an indentifier and if a key is removed the Job should
9389 * never be touched again. */
b9bc0eef 9390 zfree(j);
9391}
9392
996cb5f7 9393/* Every time a thread finished a Job, it writes a byte into the write side
9394 * of an unix pipe in order to "awake" the main thread, and this function
9395 * is called. */
/* File event handler (registered for the read side of the pipe).
 *
 * One byte is consumed from 'fd' for every completed job taken from the
 * head of server.io_processed. At most a percentage
 * (REDIS_MAX_COMPLETED_JOBS_PROCESSED) of the jobs found in that list at the
 * first iteration is handled per call; canceled jobs are freed without being
 * counted. Each job is post-processed according to its type:
 *
 * - LOAD: the key is switched back to memory and the loaded value attached.
 * - PREPARE_SWAP: pages are reserved and the job is queued again as DO_SWAP,
 *   or the operation is undone if swapping is not possible right now.
 * - DO_SWAP: the key is flagged as swapped and the in-memory value released.
 *
 * 'el', 'privdata' and 'mask' are unused. */
9396static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9397 int mask)
9398{
9399 char buf[1];
/* toprocess == -1 means "not computed yet"; trytoswap is cleared once an
 * attempt to queue further swap jobs fails, to avoid retrying in this call. */
b0d8747d 9400 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 9401 REDIS_NOTUSED(el);
9402 REDIS_NOTUSED(mask);
9403 REDIS_NOTUSED(privdata);
9404
9405 /* For every byte we read in the read side of the pipe, there is one
9406 * I/O job completed to process. */
9407 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 9408 iojob *j;
9409 listNode *ln;
9410 robj *key;
9411 struct dictEntry *de;
9412
996cb5f7 9413 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 9414
9415 /* Get the processed element (the oldest one) */
9416 lockThreadedIO();
1064ef87 9417 assert(listLength(server.io_processed) != 0);
f6c0bba8 9418 if (toprocess == -1) {
9419 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9420 if (toprocess <= 0) toprocess = 1;
9421 }
b9bc0eef 9422 ln = listFirst(server.io_processed);
9423 j = ln->value;
9424 listDelNode(server.io_processed,ln);
9425 unlockThreadedIO();
9426 /* If this job is marked as canceled, just ignore it */
9427 if (j->canceled) {
9428 freeIOJob(j);
9429 continue;
9430 }
9431 /* Post process it in the main thread, as there are things we
9432 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 9433 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 9434 de = dictFind(j->db->dict,j->key);
9435 assert(de != NULL);
9436 key = dictGetEntryKey(de);
9437 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9438 redisDb *db;
9439
b9bc0eef 9440 /* Key loaded, bring it at home */
9441 key->storage = REDIS_VM_MEMORY;
9442 key->vm.atime = server.unixtime;
9443 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9444 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9445 (unsigned char*) key->ptr);
9446 server.vm_stats_swapped_objects--;
9447 server.vm_stats_swapins++;
d5d55fc3 9448 dictGetEntryVal(de) = j->val;
/* freeIOJob() below drops one reference to j->val: take an extra one so
 * the value now stored in the dict entry survives. 'db' is saved because
 * the job is freed before it is used. */
9449 incrRefCount(j->val);
9450 db = j->db;
b9bc0eef 9451 freeIOJob(j);
d5d55fc3 9452 /* Handle clients waiting for this key to be loaded. */
9453 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 9454 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9455 /* Now we know the amount of pages required to swap this object.
9456 * Let's find some space for it, and queue this task again
9457 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 9458 if (!vmCanSwapOut() ||
9459 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9460 {
9461 /* Ooops... no space or we can't swap as there is
9462 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 9463 freeIOJob(j);
054e426d 9464 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 9465 } else {
c7df85a4 9466 /* Note that we need to mark this pages as used now,
9467 * if the job will be canceled, we'll mark them as freed
9468 * again. */
9469 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 9470 j->type = REDIS_IOJOB_DO_SWAP;
9471 lockThreadedIO();
9472 queueIOJob(j);
9473 unlockThreadedIO();
9474 }
9475 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9476 robj *val;
9477
9478 /* Key swapped. We can finally free some memory. */
/* Debugging aid: dump the job state on stdout before the assertion below
 * fails. */
6c96ba7d 9479 if (key->storage != REDIS_VM_SWAPPING) {
9480 printf("key->storage: %d\n",key->storage);
9481 printf("key->name: %s\n",(char*)key->ptr);
9482 printf("key->refcount: %d\n",key->refcount);
9483 printf("val: %p\n",(void*)j->val);
9484 printf("val->type: %d\n",j->val->type);
9485 printf("val->ptr: %s\n",(char*)j->val->ptr);
9486 }
9487 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 9488 val = dictGetEntryVal(de);
9489 key->vm.page = j->page;
9490 key->vm.usedpages = j->pages;
9491 key->storage = REDIS_VM_SWAPPED;
9492 key->vtype = j->val->type;
9493 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 9494 dictGetEntryVal(de) = NULL;
b9bc0eef 9495 redisLog(REDIS_DEBUG,
9496 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9497 (unsigned char*) key->ptr,
9498 (unsigned long long) j->page, (unsigned long long) j->pages);
9499 server.vm_stats_swapped_objects++;
9500 server.vm_stats_swapouts++;
9501 freeIOJob(j);
f11b8647 9502 /* Put a few more swap requests in queue if we are still
9503 * out of memory */
b0d8747d 9504 if (trytoswap && vmCanSwapOut() &&
9505 zmalloc_used_memory() > server.vm_max_memory)
9506 {
f11b8647 9507 int more = 1;
/* 'more' is sampled under the lock before each attempt: the loop stops
 * once io_newjobs holds at least vm_max_threads jobs, or as soon as no
 * object can be swapped. */
9508 while(more) {
9509 lockThreadedIO();
9510 more = listLength(server.io_newjobs) <
9511 (unsigned) server.vm_max_threads;
9512 unlockThreadedIO();
9513 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 9514 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9515 trytoswap = 0;
9516 break;
9517 }
f11b8647 9518 }
9519 }
b9bc0eef 9520 }
c953f24b 9521 processed++;
/* Quota for this call reached: return without reading further bytes
 * from the pipe. */
f6c0bba8 9522 if (processed == toprocess) return;
996cb5f7 9523 }
/* EAGAIN is not reported as an error (the descriptor is presumably in
 * non-blocking mode, so it just means there is nothing left to read). */
9524 if (retval < 0 && errno != EAGAIN) {
9525 redisLog(REDIS_WARNING,
9526 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9527 strerror(errno));
9528 }
9529}
9530
9531static void lockThreadedIO(void) {
9532 pthread_mutex_lock(&server.io_mutex);
9533}
9534
9535static void unlockThreadedIO(void) {
9536 pthread_mutex_unlock(&server.io_mutex);
9537}
9538
9539/* Remove the specified object from the threaded I/O queue if still not
9540 * processed, otherwise make sure to flag it as canceled. */
9541static void vmCancelThreadedIOJob(robj *o) {
9542 list *lists[3] = {
6c96ba7d 9543 server.io_newjobs, /* 0 */
9544 server.io_processing, /* 1 */
9545 server.io_processed /* 2 */
996cb5f7 9546 };
9547 int i;
9548
9549 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 9550again:
996cb5f7 9551 lockThreadedIO();
9552 /* Search for a matching key in one of the queues */
9553 for (i = 0; i < 3; i++) {
9554 listNode *ln;
c7df85a4 9555 listIter li;
996cb5f7 9556
c7df85a4 9557 listRewind(lists[i],&li);
9558 while ((ln = listNext(&li)) != NULL) {
996cb5f7 9559 iojob *job = ln->value;
9560
6c96ba7d 9561 if (job->canceled) continue; /* Skip this, already canceled. */
78ebe4c8 9562 if (job->key == o) {
970e10bb 9563 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9564 (void*)job, (char*)o->ptr, job->type, i);
427a2153 9565 /* Mark the pages as free since the swap didn't happened
9566 * or happened but is now discarded. */
970e10bb 9567 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 9568 vmMarkPagesFree(job->page,job->pages);
9569 /* Cancel the job. It depends on the list the job is
9570 * living in. */
996cb5f7 9571 switch(i) {
9572 case 0: /* io_newjobs */
6c96ba7d 9573 /* If the job was yet not processed the best thing to do
996cb5f7 9574 * is to remove it from the queue at all */
6c96ba7d 9575 freeIOJob(job);
996cb5f7 9576 listDelNode(lists[i],ln);
9577 break;
9578 case 1: /* io_processing */
d5d55fc3 9579 /* Oh Shi- the thread is messing with the Job:
9580 *
9581 * Probably it's accessing the object if this is a
9582 * PREPARE_SWAP or DO_SWAP job.
9583 * If it's a LOAD job it may be reading from disk and
9584 * if we don't wait for the job to terminate before to
9585 * cancel it, maybe in a few microseconds data can be
9586 * corrupted in this pages. So the short story is:
9587 *
9588 * Better to wait for the job to move into the
9589 * next queue (processed)... */
9590
9591 /* We try again and again until the job is completed. */
9592 unlockThreadedIO();
9593 /* But let's wait some time for the I/O thread
9594 * to finish with this job. After all this condition
9595 * should be very rare. */
9596 usleep(1);
9597 goto again;
996cb5f7 9598 case 2: /* io_processed */
2e111efe 9599 /* The job was already processed, that's easy...
9600 * just mark it as canceled so that we'll ignore it
9601 * when processing completed jobs. */
996cb5f7 9602 job->canceled = 1;
9603 break;
9604 }
c7df85a4 9605 /* Finally we have to adjust the storage type of the object
9606 * in order to "UNDO" the operaiton. */
996cb5f7 9607 if (o->storage == REDIS_VM_LOADING)
9608 o->storage = REDIS_VM_SWAPPED;
9609 else if (o->storage == REDIS_VM_SWAPPING)
9610 o->storage = REDIS_VM_MEMORY;
9611 unlockThreadedIO();
9612 return;
9613 }
9614 }
9615 }
9616 unlockThreadedIO();
9617 assert(1 != 1); /* We should never reach this */
9618}
9619
b9bc0eef 9620static void *IOThreadEntryPoint(void *arg) {
9621 iojob *j;
9622 listNode *ln;
9623 REDIS_NOTUSED(arg);
9624
9625 pthread_detach(pthread_self());
9626 while(1) {
9627 /* Get a new job to process */
9628 lockThreadedIO();
9629 if (listLength(server.io_newjobs) == 0) {
9630 /* No new jobs in queue, exit. */
9ebed7cf 9631 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9632 (long) pthread_self());
b9bc0eef 9633 server.io_active_threads--;
9634 unlockThreadedIO();
9635 return NULL;
9636 }
9637 ln = listFirst(server.io_newjobs);
9638 j = ln->value;
9639 listDelNode(server.io_newjobs,ln);
9640 /* Add the job in the processing queue */
9641 j->thread = pthread_self();
9642 listAddNodeTail(server.io_processing,j);
9643 ln = listLast(server.io_processing); /* We use ln later to remove it */
9644 unlockThreadedIO();
9ebed7cf 9645 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9646 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 9647
9648 /* Process the Job */
9649 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9650 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 9651 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9652 FILE *fp = fopen("/dev/null","w+");
9653 j->pages = rdbSavedObjectPages(j->val,fp);
9654 fclose(fp);
9655 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 9656 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9657 j->canceled = 1;
b9bc0eef 9658 }
9659
9660 /* Done: insert the job into the processed queue */
9ebed7cf 9661 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9662 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 9663 lockThreadedIO();
9664 listDelNode(server.io_processing,ln);
9665 listAddNodeTail(server.io_processed,j);
9666 unlockThreadedIO();
e0a62c7f 9667
b9bc0eef 9668 /* Signal the main thread there is new stuff to process */
9669 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9670 }
9671 return NULL; /* never reached */
9672}
9673
9674static void spawnIOThread(void) {
9675 pthread_t thread;
478c2c6f 9676 sigset_t mask, omask;
a97b9060 9677 int err;
b9bc0eef 9678
478c2c6f 9679 sigemptyset(&mask);
9680 sigaddset(&mask,SIGCHLD);
9681 sigaddset(&mask,SIGHUP);
9682 sigaddset(&mask,SIGPIPE);
9683 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 9684 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9685 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9686 strerror(err));
9687 usleep(1000000);
9688 }
478c2c6f 9689 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 9690 server.io_active_threads++;
9691}
9692
4ee9488d 9693/* We need to wait for the last thread to exit before we are able to
9694 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 9695static void waitEmptyIOJobsQueue(void) {
4ee9488d 9696 while(1) {
76b7233a 9697 int io_processed_len;
9698
4ee9488d 9699 lockThreadedIO();
054e426d 9700 if (listLength(server.io_newjobs) == 0 &&
9701 listLength(server.io_processing) == 0 &&
9702 server.io_active_threads == 0)
9703 {
4ee9488d 9704 unlockThreadedIO();
9705 return;
9706 }
76b7233a 9707 /* While waiting for empty jobs queue condition we post-process some
9708 * finshed job, as I/O threads may be hanging trying to write against
9709 * the io_ready_pipe_write FD but there are so much pending jobs that
9710 * it's blocking. */
9711 io_processed_len = listLength(server.io_processed);
4ee9488d 9712 unlockThreadedIO();
76b7233a 9713 if (io_processed_len) {
9714 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9715 usleep(1000); /* 1 millisecond */
9716 } else {
9717 usleep(10000); /* 10 milliseconds */
9718 }
4ee9488d 9719 }
9720}
9721
054e426d 9722static void vmReopenSwapFile(void) {
478c2c6f 9723 /* Note: we don't close the old one as we are in the child process
9724 * and don't want to mess at all with the original file object. */
054e426d 9725 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9726 if (server.vm_fp == NULL) {
9727 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9728 server.vm_swap_file);
478c2c6f 9729 _exit(1);
054e426d 9730 }
9731 server.vm_fd = fileno(server.vm_fp);
9732}
9733
b9bc0eef 9734/* This function must be called while with threaded IO locked */
9735static void queueIOJob(iojob *j) {
6c96ba7d 9736 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9737 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 9738 listAddNodeTail(server.io_newjobs,j);
9739 if (server.io_active_threads < server.vm_max_threads)
9740 spawnIOThread();
9741}
9742
9743static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9744 iojob *j;
e0a62c7f 9745
b9bc0eef 9746 assert(key->storage == REDIS_VM_MEMORY);
9747 assert(key->refcount == 1);
9748
9749 j = zmalloc(sizeof(*j));
9750 j->type = REDIS_IOJOB_PREPARE_SWAP;
9751 j->db = db;
78ebe4c8 9752 j->key = key;
b9bc0eef 9753 j->val = val;
9754 incrRefCount(val);
9755 j->canceled = 0;
9756 j->thread = (pthread_t) -1;
f11b8647 9757 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 9758
9759 lockThreadedIO();
9760 queueIOJob(j);
9761 unlockThreadedIO();
9762 return REDIS_OK;
9763}
9764
b0d8747d 9765/* ============ Virtual Memory - Blocking clients on missing keys =========== */
9766
d5d55fc3 9767/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9768 * If there is not already a job loading the key, it is craeted.
9769 * The key is added to the io_keys list in the client structure, and also
9770 * in the hash table mapping swapped keys to waiting clients, that is,
9771 * server.io_waited_keys. */
9772static int waitForSwappedKey(redisClient *c, robj *key) {
9773 struct dictEntry *de;
9774 robj *o;
9775 list *l;
9776
9777 /* If the key does not exist or is already in RAM we don't need to
9778 * block the client at all. */
9779 de = dictFind(c->db->dict,key);
9780 if (de == NULL) return 0;
9781 o = dictGetEntryKey(de);
9782 if (o->storage == REDIS_VM_MEMORY) {
9783 return 0;
9784 } else if (o->storage == REDIS_VM_SWAPPING) {
9785 /* We were swapping the key, undo it! */
9786 vmCancelThreadedIOJob(o);
9787 return 0;
9788 }
e0a62c7f 9789
d5d55fc3 9790 /* OK: the key is either swapped, or being loaded just now. */
9791
9792 /* Add the key to the list of keys this client is waiting for.
9793 * This maps clients to keys they are waiting for. */
9794 listAddNodeTail(c->io_keys,key);
9795 incrRefCount(key);
9796
9797 /* Add the client to the swapped keys => clients waiting map. */
9798 de = dictFind(c->db->io_keys,key);
9799 if (de == NULL) {
9800 int retval;
9801
9802 /* For every key we take a list of clients blocked for it */
9803 l = listCreate();
9804 retval = dictAdd(c->db->io_keys,key,l);
9805 incrRefCount(key);
9806 assert(retval == DICT_OK);
9807 } else {
9808 l = dictGetEntryVal(de);
9809 }
9810 listAddNodeTail(l,c);
9811
9812 /* Are we already loading the key from disk? If not create a job */
9813 if (o->storage == REDIS_VM_SWAPPED) {
9814 iojob *j;
9815
9816 o->storage = REDIS_VM_LOADING;
9817 j = zmalloc(sizeof(*j));
9818 j->type = REDIS_IOJOB_LOAD;
9819 j->db = c->db;
78ebe4c8 9820 j->key = o;
d5d55fc3 9821 j->key->vtype = o->vtype;
9822 j->page = o->vm.page;
9823 j->val = NULL;
9824 j->canceled = 0;
9825 j->thread = (pthread_t) -1;
9826 lockThreadedIO();
9827 queueIOJob(j);
9828 unlockThreadedIO();
9829 }
9830 return 1;
9831}
9832
6f078746
PN
9833/* Preload keys for any command with first, last and step values for
9834 * the command keys prototype, as defined in the command table. */
9835static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9836 int j, last;
9837 if (cmd->vm_firstkey == 0) return;
9838 last = cmd->vm_lastkey;
9839 if (last < 0) last = argc+last;
9840 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9841 redisAssert(j < argc);
9842 waitForSwappedKey(c,argv[j]);
9843 }
9844}
9845
5d373da9 9846/* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
739ba0d2
PN
9847 * Note that the number of keys to preload is user-defined, so we need to
9848 * apply a sanity check against argc. */
ca1788b5 9849static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
76583ea4 9850 int i, num;
ca1788b5 9851 REDIS_NOTUSED(cmd);
ca1788b5
PN
9852
9853 num = atoi(argv[2]->ptr);
739ba0d2 9854 if (num > (argc-3)) return;
76583ea4 9855 for (i = 0; i < num; i++) {
ca1788b5 9856 waitForSwappedKey(c,argv[3+i]);
76583ea4
PN
9857 }
9858}
9859
3805e04f
PN
9860/* Preload keys needed to execute the entire MULTI/EXEC block.
9861 *
9862 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9863 * and will block the client when any command requires a swapped out value. */
9864static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9865 int i, margc;
9866 struct redisCommand *mcmd;
9867 robj **margv;
9868 REDIS_NOTUSED(cmd);
9869 REDIS_NOTUSED(argc);
9870 REDIS_NOTUSED(argv);
9871
9872 if (!(c->flags & REDIS_MULTI)) return;
9873 for (i = 0; i < c->mstate.count; i++) {
9874 mcmd = c->mstate.commands[i].cmd;
9875 margc = c->mstate.commands[i].argc;
9876 margv = c->mstate.commands[i].argv;
9877
9878 if (mcmd->vm_preload_proc != NULL) {
9879 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9880 } else {
9881 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9882 }
76583ea4
PN
9883 }
9884}
9885
b0d8747d 9886/* Is this client attempting to run a command against swapped keys?
d5d55fc3 9887 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 9888 *
d5d55fc3 9889 * The important idea about this function is that it can fail! If keys will
9890 * still be swapped when the client is resumed, this key lookups will
9891 * just block loading keys from disk. In practical terms this should only
9892 * happen with SORT BY command or if there is a bug in this function.
9893 *
9894 * Return 1 if the client is marked as blocked, 0 if the client can
9895 * continue as the keys it is going to access appear to be in memory. */
0a6f3f0f 9896static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
76583ea4 9897 if (cmd->vm_preload_proc != NULL) {
ca1788b5 9898 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
76583ea4 9899 } else {
6f078746 9900 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
76583ea4
PN
9901 }
9902
d5d55fc3 9903 /* If the client was blocked for at least one key, mark it as blocked. */
9904 if (listLength(c->io_keys)) {
9905 c->flags |= REDIS_IO_WAIT;
9906 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9907 server.vm_blocked_clients++;
9908 return 1;
9909 } else {
9910 return 0;
9911 }
9912}
9913
9914/* Remove the 'key' from the list of blocked keys for a given client.
9915 *
9916 * The function returns 1 when there are no longer blocking keys after
9917 * the current one was removed (and the client can be unblocked). */
9918static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9919 list *l;
9920 listNode *ln;
9921 listIter li;
9922 struct dictEntry *de;
9923
9924 /* Remove the key from the list of keys this client is waiting for. */
9925 listRewind(c->io_keys,&li);
9926 while ((ln = listNext(&li)) != NULL) {
bf028098 9927 if (equalStringObjects(ln->value,key)) {
d5d55fc3 9928 listDelNode(c->io_keys,ln);
9929 break;
9930 }
9931 }
9932 assert(ln != NULL);
9933
9934 /* Remove the client form the key => waiting clients map. */
9935 de = dictFind(c->db->io_keys,key);
9936 assert(de != NULL);
9937 l = dictGetEntryVal(de);
9938 ln = listSearchKey(l,c);
9939 assert(ln != NULL);
9940 listDelNode(l,ln);
9941 if (listLength(l) == 0)
9942 dictDelete(c->db->io_keys,key);
9943
9944 return listLength(c->io_keys) == 0;
9945}
9946
9947static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9948 struct dictEntry *de;
9949 list *l;
9950 listNode *ln;
9951 int len;
9952
9953 de = dictFind(db->io_keys,key);
9954 if (!de) return;
9955
9956 l = dictGetEntryVal(de);
9957 len = listLength(l);
9958 /* Note: we can't use something like while(listLength(l)) as the list
9959 * can be freed by the calling function when we remove the last element. */
9960 while (len--) {
9961 ln = listFirst(l);
9962 redisClient *c = ln->value;
9963
9964 if (dontWaitForSwappedKey(c,key)) {
9965 /* Put the client in the list of clients ready to go as we
9966 * loaded all the keys about it. */
9967 listAddNodeTail(server.io_ready_clients,c);
9968 }
9969 }
b0d8747d 9970}
b0d8747d 9971
500ece7c 9972/* =========================== Remote Configuration ========================= */
9973
9974static void configSetCommand(redisClient *c) {
9975 robj *o = getDecodedObject(c->argv[3]);
2e5eb04e 9976 long long ll;
9977
500ece7c 9978 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9979 zfree(server.dbfilename);
9980 server.dbfilename = zstrdup(o->ptr);
9981 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9982 zfree(server.requirepass);
9983 server.requirepass = zstrdup(o->ptr);
9984 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9985 zfree(server.masterauth);
9986 server.masterauth = zstrdup(o->ptr);
9987 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
2e5eb04e 9988 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9989 ll < 0) goto badfmt;
9990 server.maxmemory = ll;
9991 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
9992 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9993 ll < 0 || ll > LONG_MAX) goto badfmt;
9994 server.maxidletime = ll;
1b677732 9995 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
9996 if (!strcasecmp(o->ptr,"no")) {
9997 server.appendfsync = APPENDFSYNC_NO;
9998 } else if (!strcasecmp(o->ptr,"everysec")) {
9999 server.appendfsync = APPENDFSYNC_EVERYSEC;
10000 } else if (!strcasecmp(o->ptr,"always")) {
10001 server.appendfsync = APPENDFSYNC_ALWAYS;
10002 } else {
10003 goto badfmt;
10004 }
38db9171 10005 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
10006 int yn = yesnotoi(o->ptr);
10007
10008 if (yn == -1) goto badfmt;
10009 server.no_appendfsync_on_rewrite = yn;
2e5eb04e 10010 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10011 int old = server.appendonly;
10012 int new = yesnotoi(o->ptr);
10013
10014 if (new == -1) goto badfmt;
10015 if (old != new) {
10016 if (new == 0) {
10017 stopAppendOnly();
10018 } else {
10019 if (startAppendOnly() == REDIS_ERR) {
10020 addReplySds(c,sdscatprintf(sdsempty(),
10021 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10022 decrRefCount(o);
10023 return;
10024 }
10025 }
10026 }
a34e0a25 10027 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10028 int vlen, j;
10029 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10030
10031 /* Perform sanity check before setting the new config:
10032 * - Even number of args
10033 * - Seconds >= 1, changes >= 0 */
10034 if (vlen & 1) {
10035 sdsfreesplitres(v,vlen);
10036 goto badfmt;
10037 }
10038 for (j = 0; j < vlen; j++) {
10039 char *eptr;
10040 long val;
10041
10042 val = strtoll(v[j], &eptr, 10);
10043 if (eptr[0] != '\0' ||
10044 ((j & 1) == 0 && val < 1) ||
10045 ((j & 1) == 1 && val < 0)) {
10046 sdsfreesplitres(v,vlen);
10047 goto badfmt;
10048 }
10049 }
10050 /* Finally set the new config */
10051 resetServerSaveParams();
10052 for (j = 0; j < vlen; j += 2) {
10053 time_t seconds;
10054 int changes;
10055
10056 seconds = strtoll(v[j],NULL,10);
10057 changes = strtoll(v[j+1],NULL,10);
10058 appendServerSaveParams(seconds, changes);
10059 }
10060 sdsfreesplitres(v,vlen);
500ece7c 10061 } else {
10062 addReplySds(c,sdscatprintf(sdsempty(),
10063 "-ERR not supported CONFIG parameter %s\r\n",
10064 (char*)c->argv[2]->ptr));
10065 decrRefCount(o);
10066 return;
10067 }
10068 decrRefCount(o);
10069 addReply(c,shared.ok);
a34e0a25 10070 return;
10071
10072badfmt: /* Bad format errors */
10073 addReplySds(c,sdscatprintf(sdsempty(),
10074 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10075 (char*)o->ptr,
10076 (char*)c->argv[2]->ptr));
10077 decrRefCount(o);
500ece7c 10078}
10079
10080static void configGetCommand(redisClient *c) {
10081 robj *o = getDecodedObject(c->argv[2]);
10082 robj *lenobj = createObject(REDIS_STRING,NULL);
10083 char *pattern = o->ptr;
10084 int matches = 0;
10085
10086 addReply(c,lenobj);
10087 decrRefCount(lenobj);
10088
10089 if (stringmatch(pattern,"dbfilename",0)) {
10090 addReplyBulkCString(c,"dbfilename");
10091 addReplyBulkCString(c,server.dbfilename);
10092 matches++;
10093 }
10094 if (stringmatch(pattern,"requirepass",0)) {
10095 addReplyBulkCString(c,"requirepass");
10096 addReplyBulkCString(c,server.requirepass);
10097 matches++;
10098 }
10099 if (stringmatch(pattern,"masterauth",0)) {
10100 addReplyBulkCString(c,"masterauth");
10101 addReplyBulkCString(c,server.masterauth);
10102 matches++;
10103 }
10104 if (stringmatch(pattern,"maxmemory",0)) {
10105 char buf[128];
10106
2e5eb04e 10107 ll2string(buf,128,server.maxmemory);
500ece7c 10108 addReplyBulkCString(c,"maxmemory");
10109 addReplyBulkCString(c,buf);
10110 matches++;
10111 }
2e5eb04e 10112 if (stringmatch(pattern,"timeout",0)) {
10113 char buf[128];
10114
10115 ll2string(buf,128,server.maxidletime);
10116 addReplyBulkCString(c,"timeout");
10117 addReplyBulkCString(c,buf);
10118 matches++;
10119 }
10120 if (stringmatch(pattern,"appendonly",0)) {
10121 addReplyBulkCString(c,"appendonly");
10122 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10123 matches++;
10124 }
38db9171 10125 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10126 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10127 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10128 matches++;
10129 }
1b677732 10130 if (stringmatch(pattern,"appendfsync",0)) {
10131 char *policy;
10132
10133 switch(server.appendfsync) {
10134 case APPENDFSYNC_NO: policy = "no"; break;
10135 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10136 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10137 default: policy = "unknown"; break; /* too harmless to panic */
10138 }
10139 addReplyBulkCString(c,"appendfsync");
10140 addReplyBulkCString(c,policy);
10141 matches++;
10142 }
a34e0a25 10143 if (stringmatch(pattern,"save",0)) {
10144 sds buf = sdsempty();
10145 int j;
10146
10147 for (j = 0; j < server.saveparamslen; j++) {
10148 buf = sdscatprintf(buf,"%ld %d",
10149 server.saveparams[j].seconds,
10150 server.saveparams[j].changes);
10151 if (j != server.saveparamslen-1)
10152 buf = sdscatlen(buf," ",1);
10153 }
10154 addReplyBulkCString(c,"save");
10155 addReplyBulkCString(c,buf);
10156 sdsfree(buf);
10157 matches++;
10158 }
500ece7c 10159 decrRefCount(o);
10160 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10161}
10162
10163static void configCommand(redisClient *c) {
10164 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10165 if (c->argc != 4) goto badarity;
10166 configSetCommand(c);
10167 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10168 if (c->argc != 3) goto badarity;
10169 configGetCommand(c);
10170 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10171 if (c->argc != 2) goto badarity;
10172 server.stat_numcommands = 0;
10173 server.stat_numconnections = 0;
10174 server.stat_expiredkeys = 0;
10175 server.stat_starttime = time(NULL);
10176 addReply(c,shared.ok);
10177 } else {
10178 addReplySds(c,sdscatprintf(sdsempty(),
10179 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10180 }
10181 return;
10182
10183badarity:
10184 addReplySds(c,sdscatprintf(sdsempty(),
10185 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10186 (char*) c->argv[1]->ptr));
10187}
10188
befec3cd 10189/* =========================== Pubsub implementation ======================== */
10190
ffc6b7f8 10191static void freePubsubPattern(void *p) {
10192 pubsubPattern *pat = p;
10193
10194 decrRefCount(pat->pattern);
10195 zfree(pat);
10196}
10197
10198static int listMatchPubsubPattern(void *a, void *b) {
10199 pubsubPattern *pa = a, *pb = b;
10200
10201 return (pa->client == pb->client) &&
bf028098 10202 (equalStringObjects(pa->pattern,pb->pattern));
ffc6b7f8 10203}
10204
10205/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10206 * 0 if the client was already subscribed to that channel. */
10207static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 10208 struct dictEntry *de;
10209 list *clients = NULL;
10210 int retval = 0;
10211
ffc6b7f8 10212 /* Add the channel to the client -> channels hash table */
10213 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 10214 retval = 1;
ffc6b7f8 10215 incrRefCount(channel);
10216 /* Add the client to the channel -> list of clients hash table */
10217 de = dictFind(server.pubsub_channels,channel);
befec3cd 10218 if (de == NULL) {
10219 clients = listCreate();
ffc6b7f8 10220 dictAdd(server.pubsub_channels,channel,clients);
10221 incrRefCount(channel);
befec3cd 10222 } else {
10223 clients = dictGetEntryVal(de);
10224 }
10225 listAddNodeTail(clients,c);
10226 }
10227 /* Notify the client */
10228 addReply(c,shared.mbulk3);
10229 addReply(c,shared.subscribebulk);
ffc6b7f8 10230 addReplyBulk(c,channel);
482b672d 10231 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 10232 return retval;
10233}
10234
ffc6b7f8 10235/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10236 * 0 if the client was not subscribed to the specified channel. */
10237static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 10238 struct dictEntry *de;
10239 list *clients;
10240 listNode *ln;
10241 int retval = 0;
10242
ffc6b7f8 10243 /* Remove the channel from the client -> channels hash table */
10244 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 10245 we have in the hash tables. Protect it... */
ffc6b7f8 10246 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 10247 retval = 1;
ffc6b7f8 10248 /* Remove the client from the channel -> clients list hash table */
10249 de = dictFind(server.pubsub_channels,channel);
befec3cd 10250 assert(de != NULL);
10251 clients = dictGetEntryVal(de);
10252 ln = listSearchKey(clients,c);
10253 assert(ln != NULL);
10254 listDelNode(clients,ln);
ff767a75 10255 if (listLength(clients) == 0) {
10256 /* Free the list and associated hash entry at all if this was
10257 * the latest client, so that it will be possible to abuse
ffc6b7f8 10258 * Redis PUBSUB creating millions of channels. */
10259 dictDelete(server.pubsub_channels,channel);
ff767a75 10260 }
befec3cd 10261 }
10262 /* Notify the client */
10263 if (notify) {
10264 addReply(c,shared.mbulk3);
10265 addReply(c,shared.unsubscribebulk);
ffc6b7f8 10266 addReplyBulk(c,channel);
482b672d 10267 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10268 listLength(c->pubsub_patterns));
10269
10270 }
10271 decrRefCount(channel); /* it is finally safe to release it */
10272 return retval;
10273}
10274
10275/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10276static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10277 int retval = 0;
10278
10279 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10280 retval = 1;
10281 pubsubPattern *pat;
10282 listAddNodeTail(c->pubsub_patterns,pattern);
10283 incrRefCount(pattern);
10284 pat = zmalloc(sizeof(*pat));
10285 pat->pattern = getDecodedObject(pattern);
10286 pat->client = c;
10287 listAddNodeTail(server.pubsub_patterns,pat);
10288 }
10289 /* Notify the client */
10290 addReply(c,shared.mbulk3);
10291 addReply(c,shared.psubscribebulk);
10292 addReplyBulk(c,pattern);
482b672d 10293 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
ffc6b7f8 10294 return retval;
10295}
10296
10297/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10298 * 0 if the client was not subscribed to the specified channel. */
10299static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10300 listNode *ln;
10301 pubsubPattern pat;
10302 int retval = 0;
10303
10304 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10305 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10306 retval = 1;
10307 listDelNode(c->pubsub_patterns,ln);
10308 pat.client = c;
10309 pat.pattern = pattern;
10310 ln = listSearchKey(server.pubsub_patterns,&pat);
10311 listDelNode(server.pubsub_patterns,ln);
10312 }
10313 /* Notify the client */
10314 if (notify) {
10315 addReply(c,shared.mbulk3);
10316 addReply(c,shared.punsubscribebulk);
10317 addReplyBulk(c,pattern);
482b672d 10318 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10319 listLength(c->pubsub_patterns));
befec3cd 10320 }
ffc6b7f8 10321 decrRefCount(pattern);
befec3cd 10322 return retval;
10323}
10324
ffc6b7f8 10325/* Unsubscribe from all the channels. Return the number of channels the
10326 * client was subscribed from. */
10327static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10328 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 10329 dictEntry *de;
10330 int count = 0;
10331
10332 while((de = dictNext(di)) != NULL) {
ffc6b7f8 10333 robj *channel = dictGetEntryKey(de);
befec3cd 10334
ffc6b7f8 10335 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 10336 }
10337 dictReleaseIterator(di);
10338 return count;
10339}
10340
ffc6b7f8 10341/* Unsubscribe from all the patterns. Return the number of patterns the
10342 * client was subscribed from. */
10343static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10344 listNode *ln;
10345 listIter li;
10346 int count = 0;
10347
10348 listRewind(c->pubsub_patterns,&li);
10349 while ((ln = listNext(&li)) != NULL) {
10350 robj *pattern = ln->value;
10351
10352 count += pubsubUnsubscribePattern(c,pattern,notify);
10353 }
10354 return count;
10355}
10356
befec3cd 10357/* Publish a message */
ffc6b7f8 10358static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 10359 int receivers = 0;
10360 struct dictEntry *de;
ffc6b7f8 10361 listNode *ln;
10362 listIter li;
befec3cd 10363
ffc6b7f8 10364 /* Send to clients listening for that channel */
10365 de = dictFind(server.pubsub_channels,channel);
befec3cd 10366 if (de) {
10367 list *list = dictGetEntryVal(de);
10368 listNode *ln;
10369 listIter li;
10370
10371 listRewind(list,&li);
10372 while ((ln = listNext(&li)) != NULL) {
10373 redisClient *c = ln->value;
10374
10375 addReply(c,shared.mbulk3);
10376 addReply(c,shared.messagebulk);
ffc6b7f8 10377 addReplyBulk(c,channel);
befec3cd 10378 addReplyBulk(c,message);
10379 receivers++;
10380 }
10381 }
ffc6b7f8 10382 /* Send to clients listening to matching channels */
10383 if (listLength(server.pubsub_patterns)) {
10384 listRewind(server.pubsub_patterns,&li);
10385 channel = getDecodedObject(channel);
10386 while ((ln = listNext(&li)) != NULL) {
10387 pubsubPattern *pat = ln->value;
10388
10389 if (stringmatchlen((char*)pat->pattern->ptr,
10390 sdslen(pat->pattern->ptr),
10391 (char*)channel->ptr,
10392 sdslen(channel->ptr),0)) {
c8d0ea0e 10393 addReply(pat->client,shared.mbulk4);
10394 addReply(pat->client,shared.pmessagebulk);
10395 addReplyBulk(pat->client,pat->pattern);
ffc6b7f8 10396 addReplyBulk(pat->client,channel);
10397 addReplyBulk(pat->client,message);
10398 receivers++;
10399 }
10400 }
10401 decrRefCount(channel);
10402 }
befec3cd 10403 return receivers;
10404}
10405
10406static void subscribeCommand(redisClient *c) {
10407 int j;
10408
10409 for (j = 1; j < c->argc; j++)
ffc6b7f8 10410 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 10411}
10412
10413static void unsubscribeCommand(redisClient *c) {
10414 if (c->argc == 1) {
ffc6b7f8 10415 pubsubUnsubscribeAllChannels(c,1);
10416 return;
10417 } else {
10418 int j;
10419
10420 for (j = 1; j < c->argc; j++)
10421 pubsubUnsubscribeChannel(c,c->argv[j],1);
10422 }
10423}
10424
10425static void psubscribeCommand(redisClient *c) {
10426 int j;
10427
10428 for (j = 1; j < c->argc; j++)
10429 pubsubSubscribePattern(c,c->argv[j]);
10430}
10431
10432static void punsubscribeCommand(redisClient *c) {
10433 if (c->argc == 1) {
10434 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 10435 return;
10436 } else {
10437 int j;
10438
10439 for (j = 1; j < c->argc; j++)
ffc6b7f8 10440 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 10441 }
10442}
10443
10444static void publishCommand(redisClient *c) {
10445 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
482b672d 10446 addReplyLongLong(c,receivers);
befec3cd 10447}
10448
37ab76c9 10449/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10450 *
10451 * The implementation uses a per-DB hash table mapping keys to list of clients
10452 * WATCHing those keys, so that given a key that is going to be modified
10453 * we can mark all the associated clients as dirty.
10454 *
10455 * Also every client contains a list of WATCHed keys so that's possible to
10456 * un-watch such keys when the client is freed or when UNWATCH is called. */
10457
10458/* In the client->watched_keys list we need to use watchedKey structures
10459 * as in order to identify a key in Redis we need both the key name and the
10460 * DB */
10461typedef struct watchedKey {
10462 robj *key;
10463 redisDb *db;
10464} watchedKey;
10465
10466/* Watch for the specified key */
10467static void watchForKey(redisClient *c, robj *key) {
10468 list *clients = NULL;
10469 listIter li;
10470 listNode *ln;
10471 watchedKey *wk;
10472
10473 /* Check if we are already watching for this key */
10474 listRewind(c->watched_keys,&li);
10475 while((ln = listNext(&li))) {
10476 wk = listNodeValue(ln);
10477 if (wk->db == c->db && equalStringObjects(key,wk->key))
10478 return; /* Key already watched */
10479 }
10480 /* This key is not already watched in this DB. Let's add it */
10481 clients = dictFetchValue(c->db->watched_keys,key);
10482 if (!clients) {
10483 clients = listCreate();
10484 dictAdd(c->db->watched_keys,key,clients);
10485 incrRefCount(key);
10486 }
10487 listAddNodeTail(clients,c);
10488 /* Add the new key to the lits of keys watched by this client */
10489 wk = zmalloc(sizeof(*wk));
10490 wk->key = key;
10491 wk->db = c->db;
10492 incrRefCount(key);
10493 listAddNodeTail(c->watched_keys,wk);
10494}
10495
10496/* Unwatch all the keys watched by this client. To clean the EXEC dirty
10497 * flag is up to the caller. */
10498static void unwatchAllKeys(redisClient *c) {
10499 listIter li;
10500 listNode *ln;
10501
10502 if (listLength(c->watched_keys) == 0) return;
10503 listRewind(c->watched_keys,&li);
10504 while((ln = listNext(&li))) {
10505 list *clients;
10506 watchedKey *wk;
10507
10508 /* Lookup the watched key -> clients list and remove the client
10509 * from the list */
10510 wk = listNodeValue(ln);
10511 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10512 assert(clients != NULL);
10513 listDelNode(clients,listSearchKey(clients,c));
10514 /* Kill the entry at all if this was the only client */
10515 if (listLength(clients) == 0)
10516 dictDelete(wk->db->watched_keys, wk->key);
10517 /* Remove this watched key from the client->watched list */
10518 listDelNode(c->watched_keys,ln);
10519 decrRefCount(wk->key);
10520 zfree(wk);
10521 }
10522}
10523
ca3f830b 10524/* "Touch" a key, so that if this key is being WATCHed by some client the
37ab76c9 10525 * next EXEC will fail. */
10526static void touchWatchedKey(redisDb *db, robj *key) {
10527 list *clients;
10528 listIter li;
10529 listNode *ln;
10530
10531 if (dictSize(db->watched_keys) == 0) return;
10532 clients = dictFetchValue(db->watched_keys, key);
10533 if (!clients) return;
10534
10535 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10536 /* Check if we are already watching for this key */
10537 listRewind(clients,&li);
10538 while((ln = listNext(&li))) {
10539 redisClient *c = listNodeValue(ln);
10540
10541 c->flags |= REDIS_DIRTY_CAS;
10542 }
10543}
10544
9b30e1a2 10545/* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10546 * flush but will be deleted as effect of the flushing operation should
10547 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10548 * a FLUSHALL operation (all the DBs flushed). */
10549static void touchWatchedKeysOnFlush(int dbid) {
10550 listIter li1, li2;
10551 listNode *ln;
10552
10553 /* For every client, check all the waited keys */
10554 listRewind(server.clients,&li1);
10555 while((ln = listNext(&li1))) {
10556 redisClient *c = listNodeValue(ln);
10557 listRewind(c->watched_keys,&li2);
10558 while((ln = listNext(&li2))) {
10559 watchedKey *wk = listNodeValue(ln);
10560
10561 /* For every watched key matching the specified DB, if the
10562 * key exists, mark the client as dirty, as the key will be
10563 * removed. */
10564 if (dbid == -1 || wk->db->id == dbid) {
10565 if (dictFind(wk->db->dict, wk->key) != NULL)
10566 c->flags |= REDIS_DIRTY_CAS;
10567 }
10568 }
10569 }
10570}
10571
37ab76c9 10572static void watchCommand(redisClient *c) {
10573 int j;
10574
6531c94d 10575 if (c->flags & REDIS_MULTI) {
10576 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
10577 return;
10578 }
37ab76c9 10579 for (j = 1; j < c->argc; j++)
10580 watchForKey(c,c->argv[j]);
10581 addReply(c,shared.ok);
10582}
10583
10584static void unwatchCommand(redisClient *c) {
10585 unwatchAllKeys(c);
10586 c->flags &= (~REDIS_DIRTY_CAS);
10587 addReply(c,shared.ok);
10588}
10589
7f957c92 10590/* ================================= Debugging ============================== */
10591
ba798261 10592/* Compute the sha1 of string at 's' with 'len' bytes long.
10593 * The SHA1 is then xored againt the string pointed by digest.
10594 * Since xor is commutative, this operation is used in order to
10595 * "add" digests relative to unordered elements.
10596 *
10597 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10598static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10599 SHA1_CTX ctx;
10600 unsigned char hash[20], *s = ptr;
10601 int j;
10602
10603 SHA1Init(&ctx);
10604 SHA1Update(&ctx,s,len);
10605 SHA1Final(hash,&ctx);
10606
10607 for (j = 0; j < 20; j++)
10608 digest[j] ^= hash[j];
10609}
10610
10611static void xorObjectDigest(unsigned char *digest, robj *o) {
10612 o = getDecodedObject(o);
10613 xorDigest(digest,o->ptr,sdslen(o->ptr));
10614 decrRefCount(o);
10615}
10616
10617/* This function instead of just computing the SHA1 and xoring it
10618 * against diget, also perform the digest of "digest" itself and
10619 * replace the old value with the new one.
10620 *
10621 * So the final digest will be:
10622 *
10623 * digest = SHA1(digest xor SHA1(data))
10624 *
10625 * This function is used every time we want to preserve the order so
10626 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10627 *
10628 * Also note that mixdigest("foo") followed by mixdigest("bar")
10629 * will lead to a different digest compared to "fo", "obar".
10630 */
10631static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10632 SHA1_CTX ctx;
10633 char *s = ptr;
10634
10635 xorDigest(digest,s,len);
10636 SHA1Init(&ctx);
10637 SHA1Update(&ctx,digest,20);
10638 SHA1Final(digest,&ctx);
10639}
10640
10641static void mixObjectDigest(unsigned char *digest, robj *o) {
10642 o = getDecodedObject(o);
10643 mixDigest(digest,o->ptr,sdslen(o->ptr));
10644 decrRefCount(o);
10645}
10646
10647/* Compute the dataset digest. Since keys, sets elements, hashes elements
10648 * are not ordered, we use a trick: every aggregate digest is the xor
10649 * of the digests of their elements. This way the order will not change
10650 * the result. For list instead we use a feedback entering the output digest
10651 * as input in order to ensure that a different ordered list will result in
10652 * a different digest. */
10653static void computeDatasetDigest(unsigned char *final) {
10654 unsigned char digest[20];
10655 char buf[128];
10656 dictIterator *di = NULL;
10657 dictEntry *de;
10658 int j;
10659 uint32_t aux;
10660
10661 memset(final,0,20); /* Start with a clean result */
10662
10663 for (j = 0; j < server.dbnum; j++) {
10664 redisDb *db = server.db+j;
10665
10666 if (dictSize(db->dict) == 0) continue;
10667 di = dictGetIterator(db->dict);
10668
10669 /* hash the DB id, so the same dataset moved in a different
10670 * DB will lead to a different digest */
10671 aux = htonl(j);
10672 mixDigest(final,&aux,sizeof(aux));
10673
10674 /* Iterate this DB writing every entry */
10675 while((de = dictNext(di)) != NULL) {
cbae1d34 10676 robj *key, *o, *kcopy;
ba798261 10677 time_t expiretime;
10678
10679 memset(digest,0,20); /* This key-val digest */
10680 key = dictGetEntryKey(de);
cbae1d34 10681
10682 if (!server.vm_enabled) {
10683 mixObjectDigest(digest,key);
ba798261 10684 o = dictGetEntryVal(de);
ba798261 10685 } else {
cbae1d34 10686 /* Don't work with the key directly as when VM is active
10687 * this is unsafe: TODO: fix decrRefCount to check if the
10688 * count really reached 0 to avoid this mess */
10689 kcopy = dupStringObject(key);
10690 mixObjectDigest(digest,kcopy);
10691 o = lookupKeyRead(db,kcopy);
10692 decrRefCount(kcopy);
ba798261 10693 }
10694 aux = htonl(o->type);
10695 mixDigest(digest,&aux,sizeof(aux));
10696 expiretime = getExpire(db,key);
10697
10698 /* Save the key and associated value */
10699 if (o->type == REDIS_STRING) {
10700 mixObjectDigest(digest,o);
10701 } else if (o->type == REDIS_LIST) {
10702 list *list = o->ptr;
10703 listNode *ln;
10704 listIter li;
10705
10706 listRewind(list,&li);
10707 while((ln = listNext(&li))) {
10708 robj *eleobj = listNodeValue(ln);
10709
10710 mixObjectDigest(digest,eleobj);
10711 }
10712 } else if (o->type == REDIS_SET) {
10713 dict *set = o->ptr;
10714 dictIterator *di = dictGetIterator(set);
10715 dictEntry *de;
10716
10717 while((de = dictNext(di)) != NULL) {
10718 robj *eleobj = dictGetEntryKey(de);
10719
10720 xorObjectDigest(digest,eleobj);
10721 }
10722 dictReleaseIterator(di);
10723 } else if (o->type == REDIS_ZSET) {
10724 zset *zs = o->ptr;
10725 dictIterator *di = dictGetIterator(zs->dict);
10726 dictEntry *de;
10727
10728 while((de = dictNext(di)) != NULL) {
10729 robj *eleobj = dictGetEntryKey(de);
10730 double *score = dictGetEntryVal(de);
10731 unsigned char eledigest[20];
10732
10733 snprintf(buf,sizeof(buf),"%.17g",*score);
10734 memset(eledigest,0,20);
10735 mixObjectDigest(eledigest,eleobj);
10736 mixDigest(eledigest,buf,strlen(buf));
10737 xorDigest(digest,eledigest,20);
10738 }
10739 dictReleaseIterator(di);
10740 } else if (o->type == REDIS_HASH) {
10741 hashIterator *hi;
10742 robj *obj;
10743
10744 hi = hashInitIterator(o);
10745 while (hashNext(hi) != REDIS_ERR) {
10746 unsigned char eledigest[20];
10747
10748 memset(eledigest,0,20);
10749 obj = hashCurrent(hi,REDIS_HASH_KEY);
10750 mixObjectDigest(eledigest,obj);
10751 decrRefCount(obj);
10752 obj = hashCurrent(hi,REDIS_HASH_VALUE);
10753 mixObjectDigest(eledigest,obj);
10754 decrRefCount(obj);
10755 xorDigest(digest,eledigest,20);
10756 }
10757 hashReleaseIterator(hi);
10758 } else {
10759 redisPanic("Unknown object type");
10760 }
ba798261 10761 /* If the key has an expire, add it to the mix */
10762 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
10763 /* We can finally xor the key-val digest to the final digest */
10764 xorDigest(final,digest,20);
10765 }
10766 dictReleaseIterator(di);
10767 }
10768}
10769
7f957c92 10770static void debugCommand(redisClient *c) {
10771 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10772 *((char*)-1) = 'x';
210e29f7 10773 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10774 if (rdbSave(server.dbfilename) != REDIS_OK) {
10775 addReply(c,shared.err);
10776 return;
10777 }
10778 emptyDb();
10779 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10780 addReply(c,shared.err);
10781 return;
10782 }
10783 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10784 addReply(c,shared.ok);
71c2b467 10785 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10786 emptyDb();
10787 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10788 addReply(c,shared.err);
10789 return;
10790 }
10791 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10792 addReply(c,shared.ok);
333298da 10793 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10794 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10795 robj *key, *val;
10796
10797 if (!de) {
10798 addReply(c,shared.nokeyerr);
10799 return;
10800 }
10801 key = dictGetEntryKey(de);
10802 val = dictGetEntryVal(de);
59146ef3 10803 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10804 key->storage == REDIS_VM_SWAPPING)) {
07efaf74 10805 char *strenc;
10806 char buf[128];
10807
10808 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10809 strenc = strencoding[val->encoding];
10810 } else {
10811 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10812 strenc = buf;
10813 }
ace06542 10814 addReplySds(c,sdscatprintf(sdsempty(),
10815 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 10816 "encoding:%s serializedlength:%lld\r\n",
682ac724 10817 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 10818 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 10819 } else {
10820 addReplySds(c,sdscatprintf(sdsempty(),
10821 "+Key at:%p refcount:%d, value swapped at: page %llu "
10822 "using %llu pages\r\n",
10823 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10824 (unsigned long long) key->vm.usedpages));
10825 }
78ebe4c8 10826 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10827 lookupKeyRead(c->db,c->argv[2]);
10828 addReply(c,shared.ok);
7d30035d 10829 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10830 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10831 robj *key, *val;
10832
10833 if (!server.vm_enabled) {
10834 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10835 return;
10836 }
10837 if (!de) {
10838 addReply(c,shared.nokeyerr);
10839 return;
10840 }
10841 key = dictGetEntryKey(de);
10842 val = dictGetEntryVal(de);
4ef8de8a 10843 /* If the key is shared we want to create a copy */
10844 if (key->refcount > 1) {
10845 robj *newkey = dupStringObject(key);
10846 decrRefCount(key);
10847 key = dictGetEntryKey(de) = newkey;
10848 }
10849 /* Swap it */
7d30035d 10850 if (key->storage != REDIS_VM_MEMORY) {
10851 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 10852 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 10853 dictGetEntryVal(de) = NULL;
10854 addReply(c,shared.ok);
10855 } else {
10856 addReply(c,shared.err);
10857 }
59305dc7 10858 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10859 long keys, j;
10860 robj *key, *val;
10861 char buf[128];
10862
10863 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10864 return;
10865 for (j = 0; j < keys; j++) {
10866 snprintf(buf,sizeof(buf),"key:%lu",j);
10867 key = createStringObject(buf,strlen(buf));
10868 if (lookupKeyRead(c->db,key) != NULL) {
10869 decrRefCount(key);
10870 continue;
10871 }
10872 snprintf(buf,sizeof(buf),"value:%lu",j);
10873 val = createStringObject(buf,strlen(buf));
10874 dictAdd(c->db->dict,key,val);
10875 }
10876 addReply(c,shared.ok);
ba798261 10877 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
10878 unsigned char digest[20];
10879 sds d = sdsnew("+");
10880 int j;
10881
10882 computeDatasetDigest(digest);
10883 for (j = 0; j < 20; j++)
10884 d = sdscatprintf(d, "%02x",digest[j]);
10885
10886 d = sdscatlen(d,"\r\n",2);
10887 addReplySds(c,d);
7f957c92 10888 } else {
333298da 10889 addReplySds(c,sdsnew(
bdcb92f2 10890 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 10891 }
10892}
56906eef 10893
6c96ba7d 10894static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 10895 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
fdfb02e7 10896 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
dfc5e96c 10897#ifdef HAVE_BACKTRACE
10898 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10899 *((char*)-1) = 'x';
10900#endif
10901}
10902
c651fd9e 10903static void _redisPanic(char *msg, char *file, int line) {
10904 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
17772754 10905 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
c651fd9e 10906#ifdef HAVE_BACKTRACE
10907 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10908 *((char*)-1) = 'x';
10909#endif
10910}
10911
bcfc686d 10912/* =================================== Main! ================================ */
56906eef 10913
bcfc686d 10914#ifdef __linux__
/* Return the integer found in /proc/sys/vm/overcommit_memory, or -1 if the
 * file cannot be opened or no line can be read from it. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    got = fgets(line,64,fp);
    fclose(fp);
    if (got == NULL) return -1;

    return atoi(line);
}
10928
10929void linuxOvercommitMemoryWarning(void) {
10930 if (linuxOvercommitMemoryValue() == 0) {
7ccd2d0a 10931 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
bcfc686d 10932 }
10933}
10934#endif /* __linux__ */
10935
10936static void daemonize(void) {
10937 int fd;
10938 FILE *fp;
10939
10940 if (fork() != 0) exit(0); /* parent exits */
10941 setsid(); /* create a new session */
10942
10943 /* Every output goes to /dev/null. If Redis is daemonized but
10944 * the 'logfile' is set to 'stdout' in the configuration file
10945 * it will not log at all. */
10946 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10947 dup2(fd, STDIN_FILENO);
10948 dup2(fd, STDOUT_FILENO);
10949 dup2(fd, STDERR_FILENO);
10950 if (fd > STDERR_FILENO) close(fd);
10951 }
10952 /* Try to write the pid file */
10953 fp = fopen(server.pidfile,"w");
10954 if (fp) {
10955 fprintf(fp,"%d\n",getpid());
10956 fclose(fp);
56906eef 10957 }
56906eef 10958}
10959
42ab0172 10960static void version() {
8a3b0d2d 10961 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
10962 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
42ab0172
AO
10963 exit(0);
10964}
10965
723fb69b
AO
/* Print the command line synopsis on stderr, then exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs("       ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
10971
bcfc686d 10972int main(int argc, char **argv) {
9651a787 10973 time_t start;
10974
bcfc686d 10975 initServerConfig();
1a132bbc 10976 sortCommandTable();
bcfc686d 10977 if (argc == 2) {
44efe66e 10978 if (strcmp(argv[1], "-v") == 0 ||
10979 strcmp(argv[1], "--version") == 0) version();
10980 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 10981 resetServerSaveParams();
10982 loadServerConfig(argv[1]);
723fb69b
AO
10983 } else if ((argc > 2)) {
10984 usage();
bcfc686d 10985 } else {
10986 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10987 }
bcfc686d 10988 if (server.daemonize) daemonize();
71c54b21 10989 initServer();
bcfc686d 10990 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10991#ifdef __linux__
10992 linuxOvercommitMemoryWarning();
10993#endif
9651a787 10994 start = time(NULL);
bcfc686d 10995 if (server.appendonly) {
10996 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 10997 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 10998 } else {
10999 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 11000 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 11001 }
bcfc686d 11002 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 11003 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 11004 aeMain(server.el);
11005 aeDeleteEventLoop(server.el);
11006 return 0;
11007}
11008
11009/* ============================= Backtrace support ========================= */
11010
11011#ifdef HAVE_BACKTRACE
11012static char *findFuncName(void *pointer, unsigned long *offset);
11013
56906eef 11014static void *getMcontextEip(ucontext_t *uc) {
11015#if defined(__FreeBSD__)
11016 return (void*) uc->uc_mcontext.mc_eip;
11017#elif defined(__dietlibc__)
11018 return (void*) uc->uc_mcontext.eip;
06db1f50 11019#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 11020 #if __x86_64__
11021 return (void*) uc->uc_mcontext->__ss.__rip;
11022 #else
56906eef 11023 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 11024 #endif
06db1f50 11025#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 11026 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 11027 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 11028 #else
11029 return (void*) uc->uc_mcontext->__ss.__eip;
e0a62c7f 11030 #endif
54bac49d 11031#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 11032 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 11033#elif defined(__ia64__) /* Linux IA64 */
11034 return (void*) uc->uc_mcontext.sc_ip;
11035#else
11036 return NULL;
56906eef 11037#endif
11038}
11039
11040static void segvHandler(int sig, siginfo_t *info, void *secret) {
11041 void *trace[100];
11042 char **messages = NULL;
11043 int i, trace_size = 0;
11044 unsigned long offset=0;
56906eef 11045 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 11046 sds infostring;
56906eef 11047 REDIS_NOTUSED(info);
11048
11049 redisLog(REDIS_WARNING,
11050 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 11051 infostring = genRedisInfoString();
11052 redisLog(REDIS_WARNING, "%s",infostring);
11053 /* It's not safe to sdsfree() the returned string under memory
11054 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 11055
56906eef 11056 trace_size = backtrace(trace, 100);
de96dbfe 11057 /* overwrite sigaction with caller's address */
b91cf5ef 11058 if (getMcontextEip(uc) != NULL) {
11059 trace[1] = getMcontextEip(uc);
11060 }
56906eef 11061 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 11062
d76412d1 11063 for (i=1; i<trace_size; ++i) {
56906eef 11064 char *fn = findFuncName(trace[i], &offset), *p;
11065
11066 p = strchr(messages[i],'+');
11067 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11068 redisLog(REDIS_WARNING,"%s", messages[i]);
11069 } else {
11070 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11071 }
11072 }
b177fd30 11073 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 11074 _exit(0);
fe3bbfbe 11075}
56906eef 11076
fab43727 11077static void sigtermHandler(int sig) {
11078 REDIS_NOTUSED(sig);
b58ba105 11079
fab43727 11080 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11081 server.shutdown_asap = 1;
b58ba105
AM
11082}
11083
56906eef 11084static void setupSigSegvAction(void) {
11085 struct sigaction act;
11086
11087 sigemptyset (&act.sa_mask);
11088 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11089 * is used. Otherwise, sa_handler is used */
11090 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11091 act.sa_sigaction = segvHandler;
11092 sigaction (SIGSEGV, &act, NULL);
11093 sigaction (SIGBUS, &act, NULL);
12fea928 11094 sigaction (SIGFPE, &act, NULL);
11095 sigaction (SIGILL, &act, NULL);
11096 sigaction (SIGBUS, &act, NULL);
b58ba105
AM
11097
11098 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
fab43727 11099 act.sa_handler = sigtermHandler;
b58ba105 11100 sigaction (SIGTERM, &act, NULL);
e65fdc78 11101 return;
56906eef 11102}
e65fdc78 11103
bcfc686d 11104#include "staticsymbols.h"
11105/* This function try to convert a pointer into a function name. It's used in
11106 * oreder to provide a backtrace under segmentation fault that's able to
11107 * display functions declared as static (otherwise the backtrace is useless). */
11108static char *findFuncName(void *pointer, unsigned long *offset){
11109 int i, ret = -1;
11110 unsigned long off, minoff = 0;
ed9b544e 11111
bcfc686d 11112 /* Try to match against the Symbol with the smallest offset */
11113 for (i=0; symsTable[i].pointer; i++) {
11114 unsigned long lp = (unsigned long) pointer;
0bc03378 11115
bcfc686d 11116 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11117 off=lp-symsTable[i].pointer;
11118 if (ret < 0 || off < minoff) {
11119 minoff=off;
11120 ret=i;
11121 }
11122 }
0bc03378 11123 }
bcfc686d 11124 if (ret == -1) return NULL;
11125 *offset = minoff;
11126 return symsTable[ret].name;
0bc03378 11127}
bcfc686d 11128#else /* HAVE_BACKTRACE */
/* Variant used when HAVE_BACKTRACE is not defined: no handler is
 * installed, the function is intentionally empty. */
static void setupSigSegvAction(void) {
    /* nothing to do */
}
bcfc686d 11131#endif /* HAVE_BACKTRACE */
0bc03378 11132
ed9b544e 11133
ed9b544e 11134
bcfc686d 11135/* The End */
11136
11137
ed9b544e 11138