]> git.saurik.com Git - redis.git/blame - redis.c
changed how server.tcl accepts options to support more directives without requiring...
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
9005896c 30#define REDIS_VERSION "2.1.1"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
40#include <signal.h>
fbf9bcdb 41
42#ifdef HAVE_BACKTRACE
c9468bcf 43#include <execinfo.h>
44#include <ucontext.h>
fbf9bcdb 45#endif /* HAVE_BACKTRACE */
46
ed9b544e 47#include <sys/wait.h>
48#include <errno.h>
49#include <assert.h>
50#include <ctype.h>
51#include <stdarg.h>
52#include <inttypes.h>
53#include <arpa/inet.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#include <sys/time.h>
57#include <sys/resource.h>
2895e862 58#include <sys/uio.h>
f78fd11b 59#include <limits.h>
fb82e75c 60#include <float.h>
a7866db6 61#include <math.h>
92f8e882 62#include <pthread.h>
0bc1b2f6 63
64#if defined(__sun)
5043dff3 65#include "solarisfixes.h"
66#endif
ed9b544e 67
c9468bcf 68#include "redis.h"
ed9b544e 69#include "ae.h" /* Event driven programming library */
70#include "sds.h" /* Dynamic safe strings */
71#include "anet.h" /* Networking the easy way */
72#include "dict.h" /* Hash tables */
73#include "adlist.h" /* Linked lists */
74#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 75#include "lzf.h" /* LZF compression library */
76#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
ba798261 77#include "zipmap.h" /* Compact dictionary-alike data structure */
78#include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
5436146c 79#include "release.h" /* Release and/or git repository information */
ed9b544e 80
/* Error codes */
#define REDIS_OK                0
#define REDIS_ERR               -1

/* Static server configuration */

/* TCP port */
#define REDIS_SERVERPORT        6379
/* default client timeout */
#define REDIS_MAXIDLETIME       (60*5)
#define REDIS_IOBUF_LEN         1024
#define REDIS_LOADBUF_LEN       1024
#define REDIS_STATIC_ARGS       8
#define REDIS_DEFAULT_DBNUM     16
#define REDIS_CONFIGLINE_MAX    1024
/* Max number of objects to cache */
#define REDIS_OBJFREELIST_MAX   1000000
/* Slave can't take more to sync */
#define REDIS_MAX_SYNC_TIME     60
/* lookup 10 expires per loop */
#define REDIS_EXPIRELOOKUPS_PER_CRON 10
#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
/* max bytes in inline command */
#define REDIS_REQUEST_MAX_SIZE  (1024*1024*256)

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
#define REDIS_WRITEV_THRESHOLD  3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT 256

/* Hash table parameters */

/* Minimal hash table fill 10% */
#define REDIS_HT_MINFILL        10
ed9b544e 106
/* Command flags */

/* Bulk write command */
#define REDIS_CMD_BULK 1
/* Inline command */
#define REDIS_CMD_INLINE 2
/* REDIS_CMD_DENYOOM deserves a longer comment: every command marked with
 * this flag returns an error when the 'maxmemory' option is set in the
 * config file and the server is using more than maxmemory bytes of memory.
 * In short, these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM 4
/* Force replication even if dirty is 0 */
#define REDIS_CMD_FORCE_REPLICATION 8
ed9b544e 116
/* Object types */
#define REDIS_STRING    0
#define REDIS_LIST      1
#define REDIS_SET       2
#define REDIS_ZSET      3
#define REDIS_HASH      4

/* Object encodings. Some kinds of objects, like Strings and Hashes, can be
 * internally represented in multiple ways. The 'encoding' field of the
 * object is set to one of these values. */

/* Raw representation */
#define REDIS_ENCODING_RAW      0
/* Encoded as integer */
#define REDIS_ENCODING_INT      1
/* Encoded as zipmap */
#define REDIS_ENCODING_ZIPMAP   2
/* Encoded as a hash table */
#define REDIS_ENCODING_HT       3

/* Printable encoding names, listed in the same order as the
 * REDIS_ENCODING_* values above. */
static char* strencoding[] = {
    "raw",
    "int",
    "zipmap",
    "hashtable"
};
135
/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME        253
#define REDIS_SELECTDB          254
#define REDIS_EOF               255

/* Defines related to the dump file format. Storing a 32 bit length for
 * short keys requires a lot of space, so the two most significant bits of
 * the first byte tell how to interpret the length:
 *
 * 00|000000 => the len is the 6 bits of this byte
 * 01|000000 00000000 => the len is 14 bits: 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => a full 32 bit len will follow
 * 11|000000 => a specially encoded object will follow. The six bit
 *              number specifies the kind of object that follows.
 *              See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte; most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN       0
#define REDIS_RDB_14BITLEN      1
#define REDIS_RDB_32BITLEN      2
#define REDIS_RDB_ENCVAL        3
#define REDIS_RDB_LENERR        UINT_MAX

/* When the length of a string object stored on disk has the first two bits
 * set, the remaining bits specify a special encoding for the object
 * according to the following defines: */

/* 8 bit signed integer */
#define REDIS_RDB_ENC_INT8      0
/* 16 bit signed integer */
#define REDIS_RDB_ENC_INT16     1
/* 32 bit signed integer */
#define REDIS_RDB_ENC_INT32     2
/* LZF-compressed string */
#define REDIS_RDB_ENC_LZF       3
a4d1ba9a 167
/* Virtual memory: where an object currently is. */

/* The object is on memory */
#define REDIS_VM_MEMORY         0
/* The object is on disk */
#define REDIS_VM_SWAPPED        1
/* Redis is swapping this object on disk */
#define REDIS_VM_SWAPPING       2
/* Redis is loading this object from disk */
#define REDIS_VM_LOADING        3

/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES         65536
#define REDIS_VM_MAX_RANDOM_JUMP        4096
#define REDIS_VM_MAX_THREADS            32
#define REDIS_THREAD_STACK_SIZE         (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when
 * the handler is called. While Virtual Memory I/O operations are performed
 * by threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 185
/* Client flags */

/* This client is a slave server */
#define REDIS_SLAVE     1
/* This client is a master server */
#define REDIS_MASTER    2
/* This client is a slave monitor, see MONITOR */
#define REDIS_MONITOR   4
/* This client is in a MULTI context */
#define REDIS_MULTI     8
/* The client is waiting in a blocking operation */
#define REDIS_BLOCKED   16
/* The client is waiting for Virtual Memory I/O */
#define REDIS_IO_WAIT   32
/* Watched keys modified. EXEC will fail. */
#define REDIS_DIRTY_CAS 64

/* Slave replication state - slave side */

/* No active replication */
#define REDIS_REPL_NONE         0
/* Must connect to master */
#define REDIS_REPL_CONNECT      1
/* Connected to master */
#define REDIS_REPL_CONNECTED    2

/* Slave replication state - from the point of view of the master.
 * Note that in the SEND_BULK and ONLINE states the slave receives new
 * updates in its output queue. In the WAIT_BGSAVE state instead the server
 * is waiting to start the next background saving in order to send updates
 * to it. */

/* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_START    3
/* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_WAIT_BGSAVE_END      4
/* master is sending the bulk DB */
#define REDIS_REPL_SEND_BULK            5
/* bulk DB already transmitted, receive updates */
#define REDIS_REPL_ONLINE               6
208
/* List related stuff */
#define REDIS_HEAD      0
#define REDIS_TAIL      1

/* Sort operations */
#define REDIS_SORT_GET          0
#define REDIS_SORT_ASC          1
#define REDIS_SORT_DESC         2
#define REDIS_SORTKEY_MAX       1024

/* Log levels */
#define REDIS_DEBUG     0
#define REDIS_VERBOSE   1
#define REDIS_NOTICE    2
#define REDIS_WARNING   3
ed9b544e 224
/* Anti-warning macro: evaluates V and discards the result so the compiler
 * does not complain about an unused variable or parameter. The argument is
 * parenthesized so that any expression, not just a plain identifier, can be
 * passed. */
#define REDIS_NOTUSED(V) ((void) (V))
227
/* Skiplist parameters */

/* Should be enough for 2^32 elements */
#define ZSKIPLIST_MAXLEVEL      32
/* Skiplist P = 1/4 */
#define ZSKIPLIST_P             0.25

/* Append only defines */
#define APPENDFSYNC_NO          0
#define APPENDFSYNC_ALWAYS      1
#define APPENDFSYNC_EVERYSEC    2

/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES   64
#define REDIS_HASH_MAX_ZIPMAP_VALUE     512
239
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): when the expression _e is false, calls _redisAssert()
 * with the stringified expression, the file name and the line number, then
 * terminates the process with _exit(1).
 *
 * redisPanic(_e): calls _redisPanic() with the stringified argument, the
 * file name and the line number, then terminates the process with _exit(1).
 * Because the argument is stringified with '#', a string literal argument
 * is reported together with its surrounding quotes.
 *
 * Both expansions are fully parenthesized so that the macros behave as a
 * single expression wherever they are used (e.g. as the last operand of a
 * conditional expression). */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
dfc5e96c 245
ed9b544e 246/*================================= Data types ============================== */
247
/* A redis object, that is a type able to hold a string / list / set */

/* The VM object structure.
 * NOTE(review): the trailing 'vm' declarator below also declares a
 * file-scope object of this type; it is kept as-is. */
struct redisObjectVM {
    /* the page at which the object is stored on disk */
    off_t page;
    /* number of pages used on disk */
    off_t usedpages;
    /* Last access time */
    time_t atime;
} vm;
256
257/* The actual Redis Object */
ed9b544e 258typedef struct redisObject {
ed9b544e 259 void *ptr;
942a3961 260 unsigned char type;
261 unsigned char encoding;
d894161b 262 unsigned char storage; /* If this object is a key, where is the value?
263 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
264 unsigned char vtype; /* If this object is a key, and value is swapped out,
265 * this is the type of the swapped out object. */
ed9b544e 266 int refcount;
75680a3c 267 /* VM fields, this are only allocated if VM is active, otherwise the
268 * object allocation function will just allocate
269 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
270 * Redis without VM active will not have any overhead. */
271 struct redisObjectVM vm;
ed9b544e 272} robj;
273
/* Macro used to initialize a Redis string object allocated on the stack.
 *
 * _var is the object itself (not a pointer to it) and _ptr is the value
 * stored in its 'ptr' field. The object gets refcount 1, type REDIS_STRING
 * and encoding REDIS_ENCODING_RAW; 'storage' is set to REDIS_VM_MEMORY only
 * when server.vm_enabled is true. _var is expanded several times, so it
 * must not have side effects.
 *
 * The expansion is a do { ... } while(0) without a trailing semicolon:
 * invoke it like a function call followed by ';', which also makes it safe
 * as the body of an if/else.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
285
3305306f 286typedef struct redisDb {
4409877e 287 dict *dict; /* The keyspace for this DB */
288 dict *expires; /* Timeout of keys with a timeout set */
37ab76c9 289 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 290 dict *io_keys; /* Keys with clients waiting for VM I/O */
37ab76c9 291 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
3305306f 292 int id;
293} redisDb;
294
6e469882 295/* Client MULTI/EXEC state */
296typedef struct multiCmd {
297 robj **argv;
298 int argc;
299 struct redisCommand *cmd;
300} multiCmd;
301
302typedef struct multiState {
303 multiCmd *commands; /* Array of MULTI commands */
304 int count; /* Total number of MULTI commands */
305} multiState;
306
ed9b544e 307/* With multiplexing we need to take per-clinet state.
308 * Clients are taken in a liked list. */
309typedef struct redisClient {
310 int fd;
3305306f 311 redisDb *db;
ed9b544e 312 int dictid;
313 sds querybuf;
e8a74421 314 robj **argv, **mbargv;
315 int argc, mbargc;
40d224a9 316 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 317 int multibulk; /* multi bulk command format active */
ed9b544e 318 list *reply;
319 int sentlen;
320 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 321 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 322 int slaveseldb; /* slave selected db, if this client is a slave */
323 int authenticated; /* when requirepass is non-NULL */
324 int replstate; /* replication state if this is a slave */
325 int repldbfd; /* replication DB file descriptor */
6e469882 326 long repldboff; /* replication DB file offset */
40d224a9 327 off_t repldbsize; /* replication DB file size */
6e469882 328 multiState mstate; /* MULTI/EXEC state */
37ab76c9 329 robj **blocking_keys; /* The key we are waiting to terminate a blocking
4409877e 330 * operation such as BLPOP. Otherwise NULL. */
37ab76c9 331 int blocking_keys_num; /* Number of blocking keys */
4409877e 332 time_t blockingto; /* Blocking operation timeout. If UNIX current time
333 * is >= blockingto then the operation timed out. */
92f8e882 334 list *io_keys; /* Keys this client is waiting to be loaded from the
335 * swap file in order to continue. */
37ab76c9 336 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
ffc6b7f8 337 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
338 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 339} redisClient;
340
/* One save parameter: a number of seconds paired with a number of changes. */
struct saveparam {
    time_t seconds;
    int changes;
};
345
346/* Global server state structure */
347struct redisServer {
348 int port;
349 int fd;
3305306f 350 redisDb *db;
ed9b544e 351 long long dirty; /* changes to DB from the last save */
352 list *clients;
87eca727 353 list *slaves, *monitors;
ed9b544e 354 char neterr[ANET_ERR_LEN];
355 aeEventLoop *el;
356 int cronloops; /* number of times the cron function run */
357 list *objfreelist; /* A list of freed objects to avoid malloc() */
358 time_t lastsave; /* Unix time of last save succeeede */
ed9b544e 359 /* Fields used only for stats */
360 time_t stat_starttime; /* server start time */
361 long long stat_numcommands; /* number of processed commands */
362 long long stat_numconnections; /* number of connections received */
2a6a2ed1 363 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 364 /* Configuration */
365 int verbosity;
366 int glueoutputbuf;
367 int maxidletime;
368 int dbnum;
369 int daemonize;
44b38ef4 370 int appendonly;
48f0308a 371 int appendfsync;
fab43727 372 int shutdown_asap;
48f0308a 373 time_t lastfsync;
44b38ef4 374 int appendfd;
375 int appendseldb;
ed329fcf 376 char *pidfile;
9f3c422c 377 pid_t bgsavechildpid;
9d65a1bb 378 pid_t bgrewritechildpid;
379 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
28ed1f33 380 sds aofbuf; /* AOF buffer, written before entering the event loop */
ed9b544e 381 struct saveparam *saveparams;
382 int saveparamslen;
383 char *logfile;
384 char *bindaddr;
385 char *dbfilename;
44b38ef4 386 char *appendfilename;
abcb223e 387 char *requirepass;
121f70cf 388 int rdbcompression;
8ca3e9d1 389 int activerehashing;
ed9b544e 390 /* Replication related */
391 int isslave;
d0ccebcf 392 char *masterauth;
ed9b544e 393 char *masterhost;
394 int masterport;
40d224a9 395 redisClient *master; /* client that is master for this slave */
ed9b544e 396 int replstate;
285add55 397 unsigned int maxclients;
4ef8de8a 398 unsigned long long maxmemory;
d5d55fc3 399 unsigned int blpop_blocked_clients;
400 unsigned int vm_blocked_clients;
ed9b544e 401 /* Sort parameters - qsort_r() is only available under BSD so we
402 * have to take this state global, in order to pass it to sortCompare() */
403 int sort_desc;
404 int sort_alpha;
405 int sort_bypattern;
75680a3c 406 /* Virtual memory configuration */
407 int vm_enabled;
054e426d 408 char *vm_swap_file;
75680a3c 409 off_t vm_page_size;
410 off_t vm_pages;
4ef8de8a 411 unsigned long long vm_max_memory;
cbba7dd7 412 /* Hashes config */
413 size_t hash_max_zipmap_entries;
414 size_t hash_max_zipmap_value;
75680a3c 415 /* Virtual memory state */
416 FILE *vm_fp;
417 int vm_fd;
418 off_t vm_next_page; /* Next probably empty page */
419 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 420 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 421 time_t unixtime; /* Unix time sampled every second. */
92f8e882 422 /* Virtual memory I/O threads stuff */
92f8e882 423 /* An I/O thread process an element taken from the io_jobs queue and
996cb5f7 424 * put the result of the operation in the io_done list. While the
425 * job is being processed, it's put on io_processing queue. */
426 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
427 list *io_processing; /* List of VM I/O jobs being processed */
428 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 429 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 430 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 431 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
432 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 433 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 434 int io_active_threads; /* Number of running I/O threads */
435 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 436 /* Our main thread is blocked on the event loop, locking for sockets ready
437 * to be read or written, so when a threaded I/O operation is ready to be
438 * processed by the main thread, the I/O thread will use a unix pipe to
439 * awake the main thread. The followings are the two pipe FDs. */
440 int io_ready_pipe_read;
441 int io_ready_pipe_write;
7d98e08c 442 /* Virtual memory stats */
443 unsigned long long vm_stats_used_pages;
444 unsigned long long vm_stats_swapped_objects;
445 unsigned long long vm_stats_swapouts;
446 unsigned long long vm_stats_swapins;
befec3cd 447 /* Pubsub */
ffc6b7f8 448 dict *pubsub_channels; /* Map channels to list of subscribed clients */
449 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 450 /* Misc */
b9bc0eef 451 FILE *devnull;
ed9b544e 452};
453
ffc6b7f8 454typedef struct pubsubPattern {
455 redisClient *client;
456 robj *pattern;
457} pubsubPattern;
458
ed9b544e 459typedef void redisCommandProc(redisClient *c);
ca1788b5 460typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
ed9b544e 461struct redisCommand {
462 char *name;
463 redisCommandProc *proc;
464 int arity;
465 int flags;
76583ea4
PN
466 /* Use a function to determine which keys need to be loaded
467 * in the background prior to executing this command. Takes precedence
468 * over vm_firstkey and others, ignored when NULL */
ca1788b5 469 redisVmPreloadProc *vm_preload_proc;
7c775e09 470 /* What keys should be loaded in background when calling this command? */
471 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
472 int vm_lastkey; /* THe last argument that's a key */
473 int vm_keystep; /* The step between first and last key */
ed9b544e 474};
475
/* A name paired with an unsigned long 'pointer' value. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
480
ed9b544e 481typedef struct _redisSortObject {
482 robj *obj;
483 union {
484 double score;
485 robj *cmpobj;
486 } u;
487} redisSortObject;
488
489typedef struct _redisSortOperation {
490 int type;
491 robj *pattern;
492} redisSortOperation;
493
6b47e12e 494/* ZSETs use a specialized version of Skiplists */
495
496typedef struct zskiplistNode {
497 struct zskiplistNode **forward;
e3870fab 498 struct zskiplistNode *backward;
912b9165 499 unsigned int *span;
6b47e12e 500 double score;
501 robj *obj;
502} zskiplistNode;
503
/* The skiplist itself: header and tail nodes, length and level. */
typedef struct zskiplist {
    struct zskiplistNode *header;
    struct zskiplistNode *tail;
    unsigned long length;
    int level;
} zskiplist;
509
1812e024 510typedef struct zset {
511 dict *dict;
6b47e12e 512 zskiplist *zsl;
1812e024 513} zset;
514
6b47e12e 515/* Our shared "common" objects */
516
05df7621 517#define REDIS_SHARED_INTEGERS 10000
ed9b544e 518struct sharedObjectsStruct {
c937aa89 519 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 520 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 521 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
522 *outofrangeerr, *plus,
ed9b544e 523 *select0, *select1, *select2, *select3, *select4,
befec3cd 524 *select5, *select6, *select7, *select8, *select9,
c8d0ea0e 525 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
526 *mbulk4, *psubscribebulk, *punsubscribebulk,
527 *integers[REDIS_SHARED_INTEGERS];
ed9b544e 528} shared;
529
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
535
92f8e882 536/* VM threaded I/O request message */
b9bc0eef 537#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
538#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
539#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 540typedef struct iojob {
996cb5f7 541 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 542 redisDb *db;/* Redis database */
92f8e882 543 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 544 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
92f8e882 545 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
546 off_t page; /* Swap page where to read/write the object */
248ea310 547 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 548 int canceled; /* True if this command was canceled by blocking side of VM */
549 pthread_t thread; /* ID of the thread processing this entry */
550} iojob;
92f8e882 551
ed9b544e 552/*================================ Prototypes =============================== */
553
554static void freeStringObject(robj *o);
555static void freeListObject(robj *o);
556static void freeSetObject(robj *o);
557static void decrRefCount(void *o);
558static robj *createObject(int type, void *ptr);
559static void freeClient(redisClient *c);
f78fd11b 560static int rdbLoad(char *filename);
ed9b544e 561static void addReply(redisClient *c, robj *obj);
562static void addReplySds(redisClient *c, sds s);
563static void incrRefCount(robj *o);
f78fd11b 564static int rdbSaveBackground(char *filename);
ed9b544e 565static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 566static robj *dupStringObject(robj *o);
248ea310 567static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
dd142b9c 568static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
28ed1f33 569static void flushAppendOnlyFile(void);
44b38ef4 570static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 571static int syncWithMaster(void);
05df7621 572static robj *tryObjectEncoding(robj *o);
9d65a1bb 573static robj *getDecodedObject(robj *o);
3305306f 574static int removeExpire(redisDb *db, robj *key);
575static int expireIfNeeded(redisDb *db, robj *key);
576static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 577static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 578static int deleteKey(redisDb *db, robj *key);
bb32ede5 579static time_t getExpire(redisDb *db, robj *key);
580static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 581static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 582static void freeMemoryIfNeeded(void);
de96dbfe 583static int processCommand(redisClient *c);
56906eef 584static void setupSigSegvAction(void);
a3b21203 585static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 586static void aofRemoveTempFile(pid_t childpid);
0ea663ea 587static size_t stringObjectLen(robj *o);
638e42ac 588static void processInputBuffer(redisClient *c);
6b47e12e 589static zskiplist *zslCreate(void);
fd8ccf44 590static void zslFree(zskiplist *zsl);
2b59cfdf 591static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 592static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 593static void initClientMultiState(redisClient *c);
594static void freeClientMultiState(redisClient *c);
595static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 596static void unblockClientWaitingData(redisClient *c);
4409877e 597static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 598static void vmInit(void);
a35ddf12 599static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 600static robj *vmLoadObject(robj *key);
7e69548d 601static robj *vmPreviewObject(robj *key);
a69a0c9c 602static int vmSwapOneObjectBlocking(void);
603static int vmSwapOneObjectThreaded(void);
7e69548d 604static int vmCanSwapOut(void);
a5819310 605static int tryFreeOneObjectFromFreelist(void);
996cb5f7 606static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
607static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
608static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 609static void lockThreadedIO(void);
610static void unlockThreadedIO(void);
611static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
612static void freeIOJob(iojob *j);
613static void queueIOJob(iojob *j);
a5819310 614static int vmWriteObjectOnSwap(robj *o, off_t page);
615static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 616static void waitEmptyIOJobsQueue(void);
617static void vmReopenSwapFile(void);
970e10bb 618static int vmFreePage(off_t page);
ca1788b5 619static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
3805e04f 620static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
0a6f3f0f 621static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
d5d55fc3 622static int dontWaitForSwappedKey(redisClient *c, robj *key);
623static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
624static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
625static struct redisCommand *lookupCommand(char *name);
626static void call(redisClient *c, struct redisCommand *cmd);
627static void resetClient(redisClient *c);
ada386b2 628static void convertToRealHash(robj *o);
ffc6b7f8 629static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
630static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
631static void freePubsubPattern(void *p);
632static int listMatchPubsubPattern(void *a, void *b);
633static int compareStringObjects(robj *a, robj *b);
bf028098 634static int equalStringObjects(robj *a, robj *b);
befec3cd 635static void usage();
8f63ddca 636static int rewriteAppendOnlyFileBackground(void);
242a64f3 637static int vmSwapObjectBlocking(robj *key, robj *val);
fab43727 638static int prepareForShutdown();
37ab76c9 639static void touchWatchedKey(redisDb *db, robj *key);
9b30e1a2 640static void touchWatchedKeysOnFlush(int dbid);
37ab76c9 641static void unwatchAllKeys(redisClient *c);
ed9b544e 642
abcb223e 643static void authCommand(redisClient *c);
ed9b544e 644static void pingCommand(redisClient *c);
645static void echoCommand(redisClient *c);
646static void setCommand(redisClient *c);
647static void setnxCommand(redisClient *c);
526d00a5 648static void setexCommand(redisClient *c);
ed9b544e 649static void getCommand(redisClient *c);
650static void delCommand(redisClient *c);
651static void existsCommand(redisClient *c);
652static void incrCommand(redisClient *c);
653static void decrCommand(redisClient *c);
654static void incrbyCommand(redisClient *c);
655static void decrbyCommand(redisClient *c);
656static void selectCommand(redisClient *c);
657static void randomkeyCommand(redisClient *c);
658static void keysCommand(redisClient *c);
659static void dbsizeCommand(redisClient *c);
660static void lastsaveCommand(redisClient *c);
661static void saveCommand(redisClient *c);
662static void bgsaveCommand(redisClient *c);
9d65a1bb 663static void bgrewriteaofCommand(redisClient *c);
ed9b544e 664static void shutdownCommand(redisClient *c);
665static void moveCommand(redisClient *c);
666static void renameCommand(redisClient *c);
667static void renamenxCommand(redisClient *c);
668static void lpushCommand(redisClient *c);
669static void rpushCommand(redisClient *c);
670static void lpopCommand(redisClient *c);
671static void rpopCommand(redisClient *c);
672static void llenCommand(redisClient *c);
673static void lindexCommand(redisClient *c);
674static void lrangeCommand(redisClient *c);
675static void ltrimCommand(redisClient *c);
676static void typeCommand(redisClient *c);
677static void lsetCommand(redisClient *c);
678static void saddCommand(redisClient *c);
679static void sremCommand(redisClient *c);
a4460ef4 680static void smoveCommand(redisClient *c);
ed9b544e 681static void sismemberCommand(redisClient *c);
682static void scardCommand(redisClient *c);
12fea928 683static void spopCommand(redisClient *c);
2abb95a9 684static void srandmemberCommand(redisClient *c);
ed9b544e 685static void sinterCommand(redisClient *c);
686static void sinterstoreCommand(redisClient *c);
40d224a9 687static void sunionCommand(redisClient *c);
688static void sunionstoreCommand(redisClient *c);
f4f56e1d 689static void sdiffCommand(redisClient *c);
690static void sdiffstoreCommand(redisClient *c);
ed9b544e 691static void syncCommand(redisClient *c);
692static void flushdbCommand(redisClient *c);
693static void flushallCommand(redisClient *c);
694static void sortCommand(redisClient *c);
695static void lremCommand(redisClient *c);
0f5f7e9a 696static void rpoplpushcommand(redisClient *c);
ed9b544e 697static void infoCommand(redisClient *c);
70003d28 698static void mgetCommand(redisClient *c);
87eca727 699static void monitorCommand(redisClient *c);
3305306f 700static void expireCommand(redisClient *c);
802e8373 701static void expireatCommand(redisClient *c);
f6b141c5 702static void getsetCommand(redisClient *c);
fd88489a 703static void ttlCommand(redisClient *c);
321b0e13 704static void slaveofCommand(redisClient *c);
7f957c92 705static void debugCommand(redisClient *c);
f6b141c5 706static void msetCommand(redisClient *c);
707static void msetnxCommand(redisClient *c);
fd8ccf44 708static void zaddCommand(redisClient *c);
7db723ad 709static void zincrbyCommand(redisClient *c);
cc812361 710static void zrangeCommand(redisClient *c);
50c55df5 711static void zrangebyscoreCommand(redisClient *c);
f44dd428 712static void zcountCommand(redisClient *c);
e3870fab 713static void zrevrangeCommand(redisClient *c);
3c41331e 714static void zcardCommand(redisClient *c);
1b7106e7 715static void zremCommand(redisClient *c);
6e333bbe 716static void zscoreCommand(redisClient *c);
1807985b 717static void zremrangebyscoreCommand(redisClient *c);
6e469882 718static void multiCommand(redisClient *c);
719static void execCommand(redisClient *c);
18b6cb76 720static void discardCommand(redisClient *c);
4409877e 721static void blpopCommand(redisClient *c);
722static void brpopCommand(redisClient *c);
4b00bebd 723static void appendCommand(redisClient *c);
39191553 724static void substrCommand(redisClient *c);
69d95c3e 725static void zrankCommand(redisClient *c);
798d9e55 726static void zrevrankCommand(redisClient *c);
978c2c94 727static void hsetCommand(redisClient *c);
1f1c7695 728static void hsetnxCommand(redisClient *c);
978c2c94 729static void hgetCommand(redisClient *c);
09aeb579
PN
730static void hmsetCommand(redisClient *c);
731static void hmgetCommand(redisClient *c);
07efaf74 732static void hdelCommand(redisClient *c);
92b27fe9 733static void hlenCommand(redisClient *c);
9212eafd 734static void zremrangebyrankCommand(redisClient *c);
5d373da9 735static void zunionstoreCommand(redisClient *c);
736static void zinterstoreCommand(redisClient *c);
78409a0f 737static void hkeysCommand(redisClient *c);
738static void hvalsCommand(redisClient *c);
739static void hgetallCommand(redisClient *c);
a86f14b1 740static void hexistsCommand(redisClient *c);
500ece7c 741static void configCommand(redisClient *c);
01426b05 742static void hincrbyCommand(redisClient *c);
befec3cd 743static void subscribeCommand(redisClient *c);
744static void unsubscribeCommand(redisClient *c);
ffc6b7f8 745static void psubscribeCommand(redisClient *c);
746static void punsubscribeCommand(redisClient *c);
befec3cd 747static void publishCommand(redisClient *c);
37ab76c9 748static void watchCommand(redisClient *c);
749static void unwatchCommand(redisClient *c);
f6b141c5 750
ed9b544e 751/*================================= Globals ================================= */
752
753/* Global vars */
754static struct redisServer server; /* server global state */
1a132bbc 755static struct redisCommand *commandTable;
/* Static description of every command known to the server, one initializer
 * per command. Reading the entries positionally, each one holds:
 *
 *   - the command name as a lowercase string
 *   - the function implementing the command
 *   - an integer that is negative for the commands accepting a variable
 *     number of arguments (presumably the arity, where -N means "at least
 *     N" -- TODO confirm against struct redisCommand)
 *   - an OR of REDIS_CMD_* flags (INLINE, BULK, DENYOOM, FORCE_REPLICATION)
 *   - an optional hook, non-NULL only for zunionstore, zinterstore and exec
 *     (the *BlockClientOnSwappedKeys names suggest it is related to keys
 *     swapped out by the virtual memory -- TODO confirm)
 *   - three integers that presumably describe where the key arguments are
 *     (first, last, step) -- TODO confirm against struct redisCommand
 *
 * NOTE(review): commandTable above is only declared here; how it relates to
 * this table is not visible in this part of the file. */
1a132bbc 756static struct redisCommand readonlyCommandTable[] = {
76583ea4
PN
757 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
758 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
759 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
526d00a5 760 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
761 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
762 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
763 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
764 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
765 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
766 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
767 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
768 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
769 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
770 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
771 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
772 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
777 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
780 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
781 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
782 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
783 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
784 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
785 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
786 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
789 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
790 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
791 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
792 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
793 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
794 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
795 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
798 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
5d373da9 800 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
801 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
802 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
806 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
807 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
808 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
809 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
810 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
1f1c7695 811 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 812 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
d33278d1 813 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 814 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
01426b05 815 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
816 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
817 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
818 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
819 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
820 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 821 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
822 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
823 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
824 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
825 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
826 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
827 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
830 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
831 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
832 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
837 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
838 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
839 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
840 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
842 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
843 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
844 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
845 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
3805e04f 846 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
847 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
848 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
849 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
850 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
851 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
852 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
853 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
854 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
855 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
856 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 857 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 858 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
859 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 860 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
861 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 862 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
37ab76c9 863 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
d55d5c5d 864 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
ed9b544e 865};
bcfc686d 866
ed9b544e 867/*============================ Utility functions ============================ */
868
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, the empty one included
 *   ?       exactly one byte
 *   [abc]   one byte out of a set: [^abc] negates the set, a-c is a range
 *           and a backslash quotes the next byte of the set
 *   \x      the byte x, taken literally
 *
 * If 'nocase' is non zero, literal bytes and sets are compared after
 * tolower(). Returns 1 if the whole string matches the whole pattern,
 * 0 otherwise.
 *
 * Both buffers are length delimited: no byte of the string is consumed once
 * stringLen reached zero, and the pattern is never read past patternLen. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*' without peeking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int negate, match;

            /* A set always consumes one byte: nothing to match against
             * an exhausted string (also for negated sets). */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            negate = patternLen && pattern[0] == '^';
            if (negate) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated set: step back so that the increment
                     * after the switch lands exactly on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (patternLen >= 2 && pattern[0] == '\\') {
                    /* Quoted byte. A backslash that is the last byte of
                     * the pattern is handled as a literal below. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (negate)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs a byte to be compared with. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing '*' can still match. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
991
/* Same as stringmatchlen() but for nul terminated C strings: the length of
 * both the pattern and the string is obtained with strlen(). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
995
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The accepted form is an optional '-' sign, decimal digits, and an
 * optional case insensitive unit:
 *
 *   b  -> 1
 *   k  -> 1000            kb -> 1024
 *   m  -> 1000*1000       mb -> 1024*1024
 *   g  -> 1000*1000*1000  gb -> 1024*1024*1024
 *
 * If 'err' is not NULL, *err is set to 1 on parsing error and to 0
 * otherwise. With an unknown unit the error is flagged and the numeric
 * prefix is returned as it is (multiplier of 1). When the digits do not fit
 * the internal buffer LLONG_MAX is returned; when the number or the final
 * result is out of the long long range the error is flagged and the value
 * returned is LLONG_MAX or LLONG_MIN according to the sign. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long long mul; /* unit multiplier */
    long long val;
    size_t digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* The cast keeps isdigit() defined for bytes with the high bit set. */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000LL*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024LL*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = (size_t)(u-p);
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    errno = 0;
    val = strtoll(buf,NULL,10);
    if (errno == ERANGE) {
        /* strtoll() already saturated the value to LLONG_MIN/LLONG_MAX. */
        if (err) *err = 1;
        return val;
    }
    /* Detect the overflow before multiplying: mul is always positive. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1042
/* Convert a long long into a string.
 *
 * At most len-1 characters plus the nul term are stored into 's': if the
 * buffer is too small the representation is truncated on the right.
 * Returns the number of characters actually stored, not counting the nul
 * term. If 'len' is zero nothing is written and 0 is returned. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Take the absolute value in the unsigned domain: -value is a signed
     * overflow when value is LLONG_MIN. */
    if (value < 0)
        v = 0ULL-(unsigned long long)value;
    else
        v = (unsigned long long)value;
    p = buf+31; /* point to the last character */
    /* Emit the digits right to left, at least one even for zero. */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return (int)l;
}
1066
56906eef 1067static void redisLog(int level, const char *fmt, ...) {
ed9b544e 1068 va_list ap;
1069 FILE *fp;
1070
1071 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1072 if (!fp) return;
1073
1074 va_start(ap, fmt);
1075 if (level >= server.verbosity) {
6766f45e 1076 char *c = ".-*#";
1904ecc1 1077 char buf[64];
1078 time_t now;
1079
1080 now = time(NULL);
6c9385e0 1081 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 1082 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 1083 vfprintf(fp, fmt, ap);
1084 fprintf(fp,"\n");
1085 fflush(fp);
1086 }
1087 va_end(ap);
1088
1089 if (server.logfile) fclose(fp);
1090}
1091
1092/*====================== Hash table type implementation ==================== */
1093
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1097
/* Destructor for plain allocations: the value is released with zfree().
 * The dict private data is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
1103
4409877e 1104static void dictListDestructor(void *privdata, void *val)
1105{
1106 DICT_NOTUSED(privdata);
1107 listRelease((list*)val);
1108}
1109
ed9b544e 1110static int sdsDictKeyCompare(void *privdata, const void *key1,
1111 const void *key2)
1112{
1113 int l1,l2;
1114 DICT_NOTUSED(privdata);
1115
1116 l1 = sdslen((sds)key1);
1117 l2 = sdslen((sds)key2);
1118 if (l1 != l2) return 0;
1119 return memcmp(key1, key2, l1) == 0;
1120}
1121
/* Destructor for redis objects: drops one reference with decrRefCount().
 * NULL is accepted and ignored, as the values of swapped out keys are set
 * to NULL. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1129
942a3961 1130static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1131 const void *key2)
1132{
1133 const robj *o1 = key1, *o2 = key2;
1134 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1135}
1136
942a3961 1137static unsigned int dictObjHash(const void *key) {
ed9b544e 1138 const robj *o = key;
1139 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1140}
1141
942a3961 1142static int dictEncObjKeyCompare(void *privdata, const void *key1,
1143 const void *key2)
1144{
9d65a1bb 1145 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1146 int cmp;
942a3961 1147
2a1198b4 1148 if (o1->encoding == REDIS_ENCODING_INT &&
dc05abde 1149 o2->encoding == REDIS_ENCODING_INT)
1150 return o1->ptr == o2->ptr;
2a1198b4 1151
9d65a1bb 1152 o1 = getDecodedObject(o1);
1153 o2 = getDecodedObject(o2);
1154 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1155 decrRefCount(o1);
1156 decrRefCount(o2);
1157 return cmp;
942a3961 1158}
1159
1160static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1161 robj *o = (robj*) key;
942a3961 1162
ed9e4966 1163 if (o->encoding == REDIS_ENCODING_RAW) {
1164 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1165 } else {
1166 if (o->encoding == REDIS_ENCODING_INT) {
1167 char buf[32];
1168 int len;
1169
ee14da56 1170 len = ll2string(buf,32,(long)o->ptr);
ed9e4966 1171 return dictGenHashFunction((unsigned char*)buf, len);
1172 } else {
1173 unsigned int hash;
1174
1175 o = getDecodedObject(o);
1176 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1177 decrRefCount(o);
1178 return hash;
1179 }
1180 }
942a3961 1181}
1182
f2d9f50f 1183/* Sets type and expires.
 *
 * Keys are possibly encoded redis objects (hashed and compared with the
 * dictEncObj* functions) released with dictRedisObjectDestructor(). There
 * is no value destructor.
 *
 * NOTE(review): "and expires" may be stale, as keyptrDictType is the type
 * labelled Db->expires in this file -- confirm which one is actually used. */
ed9b544e 1184static dictType setDictType = {
942a3961 1185 dictEncObjHash, /* hash function */
ed9b544e 1186 NULL, /* key dup */
1187 NULL, /* val dup */
942a3961 1188 dictEncObjKeyCompare, /* key compare */
ed9b544e 1189 dictRedisObjectDestructor, /* key destructor */
1190 NULL /* val destructor */
1191};
1192
f2d9f50f 1193/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 *
 * Keys are possibly encoded redis objects released with
 * dictRedisObjectDestructor(), values are released with dictVanillaFree(),
 * that is, a plain zfree() of the value pointer. */
1812e024 1194static dictType zsetDictType = {
1195 dictEncObjHash, /* hash function */
1196 NULL, /* key dup */
1197 NULL, /* val dup */
1198 dictEncObjKeyCompare, /* key compare */
1199 dictRedisObjectDestructor, /* key destructor */
da0a1620 1200 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1201};
1202
f2d9f50f 1203/* Db->dict: the main hash table of a database.
 *
 * Keys are hashed and compared as SDS strings (dictObjHash and
 * dictObjKeyCompare do not handle encoded objects). Both keys and values
 * are redis objects released with dictRedisObjectDestructor(), which accepts
 * NULL values. */
5234952b 1204static dictType dbDictType = {
942a3961 1205 dictObjHash, /* hash function */
ed9b544e 1206 NULL, /* key dup */
1207 NULL, /* val dup */
942a3961 1208 dictObjKeyCompare, /* key compare */
ed9b544e 1209 dictRedisObjectDestructor, /* key destructor */
1210 dictRedisObjectDestructor /* val destructor */
1211};
1212
f2d9f50f 1213/* Db->expires: keys with a timeout.
 *
 * Same hashing and key handling as dbDictType, but without a value
 * destructor: serverCron() reads the value of an entry by casting it to
 * time_t, so the expire time is stored directly in the entry and there is
 * nothing to free. */
1214static dictType keyptrDictType = {
1215 dictObjHash, /* hash function */
1216 NULL, /* key dup */
1217 NULL, /* val dup */
1218 dictObjKeyCompare, /* key compare */
1219 dictRedisObjectDestructor, /* key destructor */
1220 NULL /* val destructor */
1221};
1222
5234952b 1223/* Hash type hash table (note that small hashes are represented with zipmaps).
 *
 * Both fields and values are redis objects released with
 * dictRedisObjectDestructor(); fields are hashed and compared with the
 * dictEncObj* functions, so they can be encoded objects. */
1224static dictType hashDictType = {
1225 dictEncObjHash, /* hash function */
1226 NULL, /* key dup */
1227 NULL, /* val dup */
1228 dictEncObjKeyCompare, /* key compare */
1229 dictRedisObjectDestructor, /* key destructor */
1230 dictRedisObjectDestructor /* val destructor */
1231};
1232
4409877e 1233/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1234 * lists as values. It's used for blocking operations (BLPOP) and to
1235 * map swapped keys to a list of clients waiting for these keys to be loaded.
 *
 * Keys are released with dictRedisObjectDestructor(), the lists used as
 * values are released with dictListDestructor(). */
4409877e 1236static dictType keylistDictType = {
1237 dictObjHash, /* hash function */
1238 NULL, /* key dup */
1239 NULL, /* val dup */
1240 dictObjKeyCompare, /* key compare */
1241 dictRedisObjectDestructor, /* key destructor */
1242 dictListDestructor /* val destructor */
1243};
1244
42ab0172
AO
1245static void version();
1246
ed9b544e 1247/* ========================= Random utility functions ======================= */
1248
1249/* Redis generally does not try to recover from out of memory conditions
1250 * when allocating objects or strings, it is not clear if it will be possible
1251 * to report this condition to the client since the networking layer itself
1252 * is based on heap allocation for send buffers, so we simply abort.
1253 * At least the code will be simpler to read... */
1254static void oom(const char *msg) {
71c54b21 1255 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1256 sleep(1);
1257 abort();
1258}
1259
1260/* ====================== Redis server networking stuff ===================== */
56906eef 1261static void closeTimedoutClients(void) {
ed9b544e 1262 redisClient *c;
ed9b544e 1263 listNode *ln;
1264 time_t now = time(NULL);
c7df85a4 1265 listIter li;
ed9b544e 1266
c7df85a4 1267 listRewind(server.clients,&li);
1268 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1269 c = listNodeValue(ln);
f86a74e9 1270 if (server.maxidletime &&
1271 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1272 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1273 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1274 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1275 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1276 {
f870935d 1277 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1278 freeClient(c);
f86a74e9 1279 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1280 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1281 addReply(c,shared.nullmultibulk);
b0d8747d 1282 unblockClientWaitingData(c);
f86a74e9 1283 }
ed9b544e 1284 }
1285 }
ed9b544e 1286}
1287
12fea928 1288static int htNeedsResize(dict *dict) {
1289 long long size, used;
1290
1291 size = dictSlots(dict);
1292 used = dictSize(dict);
1293 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1294 (used*100/size < REDIS_HT_MINFILL));
1295}
1296
0bc03378 1297/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1298 * we resize the hash table to save memory */
56906eef 1299static void tryResizeHashTables(void) {
0bc03378 1300 int j;
1301
1302 for (j = 0; j < server.dbnum; j++) {
5413c40d 1303 if (htNeedsResize(server.db[j].dict))
0bc03378 1304 dictResize(server.db[j].dict);
12fea928 1305 if (htNeedsResize(server.db[j].expires))
1306 dictResize(server.db[j].expires);
0bc03378 1307 }
1308}
1309
8ca3e9d1 1310/* Our hash table implementation performs rehashing incrementally while
1311 * we write/read from the hash table. Still if the server is idle, the hash
1312 * table will use two tables for a long time. So we try to use 1 millisecond
1313 * of CPU time at every serverCron() loop in order to rehash some key. */
1314static void incrementallyRehash(void) {
1315 int j;
1316
1317 for (j = 0; j < server.dbnum; j++) {
1318 if (dictIsRehashing(server.db[j].dict)) {
1319 dictRehashMilliseconds(server.db[j].dict,1);
1320 break; /* already used our millisecond for this loop... */
1321 }
1322 }
1323}
1324
9d65a1bb 1325/* A background saving child (BGSAVE) terminated its work. Handle this. */
1326void backgroundSaveDoneHandler(int statloc) {
1327 int exitcode = WEXITSTATUS(statloc);
1328 int bysignal = WIFSIGNALED(statloc);
1329
1330 if (!bysignal && exitcode == 0) {
1331 redisLog(REDIS_NOTICE,
1332 "Background saving terminated with success");
1333 server.dirty = 0;
1334 server.lastsave = time(NULL);
1335 } else if (!bysignal && exitcode != 0) {
1336 redisLog(REDIS_WARNING, "Background saving error");
1337 } else {
1338 redisLog(REDIS_WARNING,
454eea7c 1339 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1340 rdbRemoveTempFile(server.bgsavechildpid);
1341 }
1342 server.bgsavechildpid = -1;
1343 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1344 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1345 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1346}
1347
1348/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1349 * Handle this. */
1350void backgroundRewriteDoneHandler(int statloc) {
1351 int exitcode = WEXITSTATUS(statloc);
1352 int bysignal = WIFSIGNALED(statloc);
1353
1354 if (!bysignal && exitcode == 0) {
1355 int fd;
1356 char tmpfile[256];
1357
1358 redisLog(REDIS_NOTICE,
1359 "Background append only file rewriting terminated with success");
1360 /* Now it's time to flush the differences accumulated by the parent */
1361 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1362 fd = open(tmpfile,O_WRONLY|O_APPEND);
1363 if (fd == -1) {
1364 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1365 goto cleanup;
1366 }
1367 /* Flush our data... */
1368 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1369 (signed) sdslen(server.bgrewritebuf)) {
1370 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1371 close(fd);
1372 goto cleanup;
1373 }
b32627cd 1374 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1375 /* Now our work is to rename the temp file into the stable file. And
1376 * switch the file descriptor used by the server for append only. */
1377 if (rename(tmpfile,server.appendfilename) == -1) {
1378 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1379 close(fd);
1380 goto cleanup;
1381 }
1382 /* Mission completed... almost */
1383 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1384 if (server.appendfd != -1) {
1385 /* If append only is actually enabled... */
1386 close(server.appendfd);
1387 server.appendfd = fd;
1388 fsync(fd);
85a83172 1389 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1390 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1391 } else {
1392 /* If append only is disabled we just generate a dump in this
1393 * format. Why not? */
1394 close(fd);
1395 }
1396 } else if (!bysignal && exitcode != 0) {
1397 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1398 } else {
1399 redisLog(REDIS_WARNING,
454eea7c 1400 "Background append only file rewriting terminated by signal %d",
1401 WTERMSIG(statloc));
9d65a1bb 1402 }
1403cleanup:
1404 sdsfree(server.bgrewritebuf);
1405 server.bgrewritebuf = sdsempty();
1406 aofRemoveTempFile(server.bgrewritechildpid);
1407 server.bgrewritechildpid = -1;
1408}
1409
884d4b39 1410/* This function is called once a background process of some kind terminates,
1411 * as we want to avoid resizing the hash tables when there is a child in order
1412 * to play well with copy-on-write (otherwise when a resize happens lots of
1413 * memory pages are copied). The goal of this function is to update the ability
1414 * for dict.c to resize the hash tables accordingly to the fact we have o not
1415 * running childs. */
1416static void updateDictResizePolicy(void) {
1417 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1418 dictEnableResize();
1419 else
1420 dictDisableResize();
1421}
1422
56906eef 1423static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1424 int j, loops = server.cronloops++;
ed9b544e 1425 REDIS_NOTUSED(eventLoop);
1426 REDIS_NOTUSED(id);
1427 REDIS_NOTUSED(clientData);
1428
3a66edc7 1429 /* We take a cached value of the unix time in the global state because
1430 * with virtual memory and aging there is to store the current time
1431 * in objects at every object access, and accuracy is not needed.
1432 * To access a global var is faster than calling time(NULL) */
1433 server.unixtime = time(NULL);
1434
fab43727 1435 /* We received a SIGTERM, shutting down here in a safe way, as it is
1436 * not ok doing so inside the signal handler. */
1437 if (server.shutdown_asap) {
1438 if (prepareForShutdown() == REDIS_OK) exit(0);
1439 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1440 }
1441
0bc03378 1442 /* Show some info about non-empty databases */
ed9b544e 1443 for (j = 0; j < server.dbnum; j++) {
dec423d9 1444 long long size, used, vkeys;
94754ccc 1445
3305306f 1446 size = dictSlots(server.db[j].dict);
1447 used = dictSize(server.db[j].dict);
94754ccc 1448 vkeys = dictSize(server.db[j].expires);
1763929f 1449 if (!(loops % 50) && (used || vkeys)) {
f870935d 1450 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1451 /* dictPrintStats(server.dict); */
ed9b544e 1452 }
ed9b544e 1453 }
1454
0bc03378 1455 /* We don't want to resize the hash tables while a bacground saving
1456 * is in progress: the saving child is created using fork() that is
1457 * implemented with a copy-on-write semantic in most modern systems, so
1458 * if we resize the HT while there is the saving child at work actually
1459 * a lot of memory movements in the parent will cause a lot of pages
1460 * copied. */
8ca3e9d1 1461 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1462 if (!(loops % 10)) tryResizeHashTables();
1463 if (server.activerehashing) incrementallyRehash();
884d4b39 1464 }
0bc03378 1465
ed9b544e 1466 /* Show information about connected clients */
1763929f 1467 if (!(loops % 50)) {
bdcb92f2 1468 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1469 listLength(server.clients)-listLength(server.slaves),
1470 listLength(server.slaves),
bdcb92f2 1471 zmalloc_used_memory());
ed9b544e 1472 }
1473
1474 /* Close connections of timedout clients */
1763929f 1475 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1476 closeTimedoutClients();
1477
9d65a1bb 1478 /* Check if a background saving or AOF rewrite in progress terminated */
1479 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1480 int statloc;
9d65a1bb 1481 pid_t pid;
1482
1483 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1484 if (pid == server.bgsavechildpid) {
1485 backgroundSaveDoneHandler(statloc);
ed9b544e 1486 } else {
9d65a1bb 1487 backgroundRewriteDoneHandler(statloc);
ed9b544e 1488 }
884d4b39 1489 updateDictResizePolicy();
ed9b544e 1490 }
1491 } else {
1492 /* If there is not a background saving in progress check if
1493 * we have to save now */
1494 time_t now = time(NULL);
1495 for (j = 0; j < server.saveparamslen; j++) {
1496 struct saveparam *sp = server.saveparams+j;
1497
1498 if (server.dirty >= sp->changes &&
1499 now-server.lastsave > sp->seconds) {
1500 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1501 sp->changes, sp->seconds);
f78fd11b 1502 rdbSaveBackground(server.dbfilename);
ed9b544e 1503 break;
1504 }
1505 }
1506 }
94754ccc 1507
f2324293 1508 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1509 * will use few CPU cycles if there are few expiring keys, otherwise
1510 * it will get more aggressive to avoid that too much memory is used by
1511 * keys that can be removed from the keyspace. */
94754ccc 1512 for (j = 0; j < server.dbnum; j++) {
f2324293 1513 int expired;
94754ccc 1514 redisDb *db = server.db+j;
94754ccc 1515
f2324293 1516 /* Continue to expire if at the end of the cycle more than 25%
1517 * of the keys were expired. */
1518 do {
4ef8de8a 1519 long num = dictSize(db->expires);
94754ccc 1520 time_t now = time(NULL);
1521
f2324293 1522 expired = 0;
94754ccc 1523 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1524 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1525 while (num--) {
1526 dictEntry *de;
1527 time_t t;
1528
1529 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1530 t = (time_t) dictGetEntryVal(de);
1531 if (now > t) {
1532 deleteKey(db,dictGetEntryKey(de));
f2324293 1533 expired++;
2a6a2ed1 1534 server.stat_expiredkeys++;
94754ccc 1535 }
1536 }
f2324293 1537 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1538 }
1539
4ef8de8a 1540 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1541 * is enabled. Try to free objects from the free list first. */
7e69548d 1542 if (vmCanSwapOut()) {
1543 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1544 server.vm_max_memory)
1545 {
72e9fd40 1546 int retval;
1547
a5819310 1548 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1549 retval = (server.vm_max_threads == 0) ?
1550 vmSwapOneObjectBlocking() :
1551 vmSwapOneObjectThreaded();
1763929f 1552 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1553 zmalloc_used_memory() >
1554 (server.vm_max_memory+server.vm_max_memory/10))
1555 {
1556 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1557 }
72e9fd40 1558 /* Note that when using threaded I/O we free just one object,
1559 * because anyway when the I/O thread in charge to swap this
1560 * object out will finish, the handler of completed jobs
1561 * will try to swap more objects if we are still out of memory. */
1562 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1563 }
1564 }
1565
ed9b544e 1566 /* Check if we should connect to a MASTER */
1763929f 1567 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1568 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1569 if (syncWithMaster() == REDIS_OK) {
1570 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
8f63ddca 1571 if (server.appendonly) rewriteAppendOnlyFileBackground();
ed9b544e 1572 }
1573 }
1763929f 1574 return 100;
ed9b544e 1575}
1576
d5d55fc3 1577/* This function gets called every time Redis is entering the
1578 * main loop of the event driven library, that is, before to sleep
1579 * for ready file descriptors. */
1580static void beforeSleep(struct aeEventLoop *eventLoop) {
1581 REDIS_NOTUSED(eventLoop);
1582
28ed1f33 1583 /* Awake clients that got all the swapped keys they requested */
d5d55fc3 1584 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1585 listIter li;
1586 listNode *ln;
1587
1588 listRewind(server.io_ready_clients,&li);
1589 while((ln = listNext(&li))) {
1590 redisClient *c = ln->value;
1591 struct redisCommand *cmd;
1592
1593 /* Resume the client. */
1594 listDelNode(server.io_ready_clients,ln);
1595 c->flags &= (~REDIS_IO_WAIT);
1596 server.vm_blocked_clients--;
1597 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1598 readQueryFromClient, c);
1599 cmd = lookupCommand(c->argv[0]->ptr);
1600 assert(cmd != NULL);
1601 call(c,cmd);
1602 resetClient(c);
1603 /* There may be more data to process in the input buffer. */
1604 if (c->querybuf && sdslen(c->querybuf) > 0)
1605 processInputBuffer(c);
1606 }
1607 }
28ed1f33 1608 /* Write the AOF buffer on disk */
1609 flushAppendOnlyFile();
d5d55fc3 1610}
1611
ed9b544e 1612static void createSharedObjects(void) {
05df7621 1613 int j;
1614
ed9b544e 1615 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1616 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1617 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1618 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1619 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1620 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1621 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1622 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1623 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1624 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1625 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1626 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1627 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1628 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1629 "-ERR no such key\r\n"));
ed9b544e 1630 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1631 "-ERR syntax error\r\n"));
c937aa89 1632 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1633 "-ERR source and destination objects are the same\r\n"));
1634 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1635 "-ERR index out of range\r\n"));
ed9b544e 1636 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1637 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1638 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1639 shared.select0 = createStringObject("select 0\r\n",10);
1640 shared.select1 = createStringObject("select 1\r\n",10);
1641 shared.select2 = createStringObject("select 2\r\n",10);
1642 shared.select3 = createStringObject("select 3\r\n",10);
1643 shared.select4 = createStringObject("select 4\r\n",10);
1644 shared.select5 = createStringObject("select 5\r\n",10);
1645 shared.select6 = createStringObject("select 6\r\n",10);
1646 shared.select7 = createStringObject("select 7\r\n",10);
1647 shared.select8 = createStringObject("select 8\r\n",10);
1648 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1649 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
c8d0ea0e 1650 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
befec3cd 1651 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1652 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1653 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1654 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1655 shared.mbulk3 = createStringObject("*3\r\n",4);
c8d0ea0e 1656 shared.mbulk4 = createStringObject("*4\r\n",4);
05df7621 1657 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1658 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1659 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1660 }
ed9b544e 1661}
1662
1663static void appendServerSaveParams(time_t seconds, int changes) {
1664 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1665 server.saveparams[server.saveparamslen].seconds = seconds;
1666 server.saveparams[server.saveparamslen].changes = changes;
1667 server.saveparamslen++;
1668}
1669
bcfc686d 1670static void resetServerSaveParams() {
ed9b544e 1671 zfree(server.saveparams);
1672 server.saveparams = NULL;
1673 server.saveparamslen = 0;
1674}
1675
1676static void initServerConfig() {
1677 server.dbnum = REDIS_DEFAULT_DBNUM;
1678 server.port = REDIS_SERVERPORT;
f870935d 1679 server.verbosity = REDIS_VERBOSE;
ed9b544e 1680 server.maxidletime = REDIS_MAXIDLETIME;
1681 server.saveparams = NULL;
1682 server.logfile = NULL; /* NULL = log on standard output */
1683 server.bindaddr = NULL;
1684 server.glueoutputbuf = 1;
1685 server.daemonize = 0;
44b38ef4 1686 server.appendonly = 0;
1b677732 1687 server.appendfsync = APPENDFSYNC_EVERYSEC;
48f0308a 1688 server.lastfsync = time(NULL);
44b38ef4 1689 server.appendfd = -1;
1690 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1691 server.pidfile = zstrdup("/var/run/redis.pid");
1692 server.dbfilename = zstrdup("dump.rdb");
1693 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1694 server.requirepass = NULL;
b0553789 1695 server.rdbcompression = 1;
8ca3e9d1 1696 server.activerehashing = 1;
285add55 1697 server.maxclients = 0;
d5d55fc3 1698 server.blpop_blocked_clients = 0;
3fd78bcd 1699 server.maxmemory = 0;
75680a3c 1700 server.vm_enabled = 0;
054e426d 1701 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1702 server.vm_page_size = 256; /* 256 bytes per page */
1703 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1704 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1705 server.vm_max_threads = 4;
d5d55fc3 1706 server.vm_blocked_clients = 0;
cbba7dd7 1707 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1708 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
fab43727 1709 server.shutdown_asap = 0;
75680a3c 1710
bcfc686d 1711 resetServerSaveParams();
ed9b544e 1712
1713 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1714 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1715 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1716 /* Replication related */
1717 server.isslave = 0;
d0ccebcf 1718 server.masterauth = NULL;
ed9b544e 1719 server.masterhost = NULL;
1720 server.masterport = 6379;
1721 server.master = NULL;
1722 server.replstate = REDIS_REPL_NONE;
a7866db6 1723
1724 /* Double constants initialization */
1725 R_Zero = 0.0;
1726 R_PosInf = 1.0/R_Zero;
1727 R_NegInf = -1.0/R_Zero;
1728 R_Nan = R_Zero/R_Zero;
ed9b544e 1729}
1730
1731static void initServer() {
1732 int j;
1733
1734 signal(SIGHUP, SIG_IGN);
1735 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1736 setupSigSegvAction();
ed9b544e 1737
b9bc0eef 1738 server.devnull = fopen("/dev/null","w");
1739 if (server.devnull == NULL) {
1740 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1741 exit(1);
1742 }
ed9b544e 1743 server.clients = listCreate();
1744 server.slaves = listCreate();
87eca727 1745 server.monitors = listCreate();
ed9b544e 1746 server.objfreelist = listCreate();
1747 createSharedObjects();
1748 server.el = aeCreateEventLoop();
3305306f 1749 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1750 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1751 if (server.fd == -1) {
1752 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1753 exit(1);
1754 }
3305306f 1755 for (j = 0; j < server.dbnum; j++) {
5234952b 1756 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1757 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
37ab76c9 1758 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1759 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1760 if (server.vm_enabled)
1761 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1762 server.db[j].id = j;
1763 }
ffc6b7f8 1764 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1765 server.pubsub_patterns = listCreate();
1766 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1767 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1768 server.cronloops = 0;
9f3c422c 1769 server.bgsavechildpid = -1;
9d65a1bb 1770 server.bgrewritechildpid = -1;
1771 server.bgrewritebuf = sdsempty();
28ed1f33 1772 server.aofbuf = sdsempty();
ed9b544e 1773 server.lastsave = time(NULL);
1774 server.dirty = 0;
ed9b544e 1775 server.stat_numcommands = 0;
1776 server.stat_numconnections = 0;
2a6a2ed1 1777 server.stat_expiredkeys = 0;
ed9b544e 1778 server.stat_starttime = time(NULL);
3a66edc7 1779 server.unixtime = time(NULL);
d8f8b666 1780 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1781 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1782 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1783
1784 if (server.appendonly) {
3bb225d6 1785 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1786 if (server.appendfd == -1) {
1787 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1788 strerror(errno));
1789 exit(1);
1790 }
1791 }
75680a3c 1792
1793 if (server.vm_enabled) vmInit();
ed9b544e 1794}
1795
1796/* Empty the whole database */
ca37e9cd 1797static long long emptyDb() {
ed9b544e 1798 int j;
ca37e9cd 1799 long long removed = 0;
ed9b544e 1800
3305306f 1801 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1802 removed += dictSize(server.db[j].dict);
3305306f 1803 dictEmpty(server.db[j].dict);
1804 dictEmpty(server.db[j].expires);
1805 }
ca37e9cd 1806 return removed;
ed9b544e 1807}
1808
/* Convert a "yes" / "no" string (compared case insensitively) into an
 * integer: 1 for "yes", 0 for "no", -1 for anything else. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1814
ed9b544e 1815/* I agree, this is a very rudimental way to load a configuration...
1816 will improve later if the config gets more complex */
1817static void loadServerConfig(char *filename) {
c9a111ac 1818 FILE *fp;
ed9b544e 1819 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1820 int linenum = 0;
1821 sds line = NULL;
c9a111ac 1822
1823 if (filename[0] == '-' && filename[1] == '\0')
1824 fp = stdin;
1825 else {
1826 if ((fp = fopen(filename,"r")) == NULL) {
9a22de82 1827 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
c9a111ac 1828 exit(1);
1829 }
ed9b544e 1830 }
c9a111ac 1831
ed9b544e 1832 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1833 sds *argv;
1834 int argc, j;
1835
1836 linenum++;
1837 line = sdsnew(buf);
1838 line = sdstrim(line," \t\r\n");
1839
1840 /* Skip comments and blank lines*/
1841 if (line[0] == '#' || line[0] == '\0') {
1842 sdsfree(line);
1843 continue;
1844 }
1845
1846 /* Split into arguments */
1847 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1848 sdstolower(argv[0]);
1849
1850 /* Execute config directives */
bb0b03a3 1851 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1852 server.maxidletime = atoi(argv[1]);
0150db36 1853 if (server.maxidletime < 0) {
ed9b544e 1854 err = "Invalid timeout value"; goto loaderr;
1855 }
bb0b03a3 1856 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1857 server.port = atoi(argv[1]);
1858 if (server.port < 1 || server.port > 65535) {
1859 err = "Invalid port"; goto loaderr;
1860 }
bb0b03a3 1861 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1862 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1863 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1864 int seconds = atoi(argv[1]);
1865 int changes = atoi(argv[2]);
1866 if (seconds < 1 || changes < 0) {
1867 err = "Invalid save parameters"; goto loaderr;
1868 }
1869 appendServerSaveParams(seconds,changes);
bb0b03a3 1870 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1871 if (chdir(argv[1]) == -1) {
1872 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1873 argv[1], strerror(errno));
1874 exit(1);
1875 }
bb0b03a3 1876 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1877 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1878 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1879 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1880 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1881 else {
1882 err = "Invalid log level. Must be one of debug, notice, warning";
1883 goto loaderr;
1884 }
bb0b03a3 1885 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1886 FILE *logfp;
ed9b544e 1887
1888 server.logfile = zstrdup(argv[1]);
bb0b03a3 1889 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1890 zfree(server.logfile);
1891 server.logfile = NULL;
1892 }
1893 if (server.logfile) {
1894 /* Test if we are able to open the file. The server will not
1895 * be able to abort just for this problem later... */
c9a111ac 1896 logfp = fopen(server.logfile,"a");
1897 if (logfp == NULL) {
ed9b544e 1898 err = sdscatprintf(sdsempty(),
1899 "Can't open the log file: %s", strerror(errno));
1900 goto loaderr;
1901 }
c9a111ac 1902 fclose(logfp);
ed9b544e 1903 }
bb0b03a3 1904 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1905 server.dbnum = atoi(argv[1]);
1906 if (server.dbnum < 1) {
1907 err = "Invalid number of databases"; goto loaderr;
1908 }
b3f83f12
JZ
1909 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1910 loadServerConfig(argv[1]);
285add55 1911 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1912 server.maxclients = atoi(argv[1]);
3fd78bcd 1913 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
2b619329 1914 server.maxmemory = memtoll(argv[1],NULL);
bb0b03a3 1915 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1916 server.masterhost = sdsnew(argv[1]);
1917 server.masterport = atoi(argv[2]);
1918 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1919 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1920 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1921 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1922 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1923 err = "argument must be 'yes' or 'no'"; goto loaderr;
1924 }
121f70cf 1925 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1926 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
8ca3e9d1 1927 err = "argument must be 'yes' or 'no'"; goto loaderr;
1928 }
1929 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1930 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
121f70cf 1931 err = "argument must be 'yes' or 'no'"; goto loaderr;
1932 }
bb0b03a3 1933 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1934 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1935 err = "argument must be 'yes' or 'no'"; goto loaderr;
1936 }
44b38ef4 1937 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1938 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1939 err = "argument must be 'yes' or 'no'"; goto loaderr;
1940 }
f3b52411
PN
1941 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1942 zfree(server.appendfilename);
1943 server.appendfilename = zstrdup(argv[1]);
48f0308a 1944 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1945 if (!strcasecmp(argv[1],"no")) {
48f0308a 1946 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1947 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1948 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1949 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1950 server.appendfsync = APPENDFSYNC_EVERYSEC;
1951 } else {
1952 err = "argument must be 'no', 'always' or 'everysec'";
1953 goto loaderr;
1954 }
bb0b03a3 1955 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1956 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1957 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 1958 zfree(server.pidfile);
054e426d 1959 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1960 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 1961 zfree(server.dbfilename);
054e426d 1962 server.dbfilename = zstrdup(argv[1]);
75680a3c 1963 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1964 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1965 err = "argument must be 'yes' or 'no'"; goto loaderr;
1966 }
054e426d 1967 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1968 zfree(server.vm_swap_file);
054e426d 1969 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1970 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2b619329 1971 server.vm_max_memory = memtoll(argv[1],NULL);
4ef8de8a 1972 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2b619329 1973 server.vm_page_size = memtoll(argv[1], NULL);
4ef8de8a 1974 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2b619329 1975 server.vm_pages = memtoll(argv[1], NULL);
92f8e882 1976 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1977 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 1978 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2b619329 1979 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
cbba7dd7 1980 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2b619329 1981 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
ed9b544e 1982 } else {
1983 err = "Bad directive or wrong number of arguments"; goto loaderr;
1984 }
1985 for (j = 0; j < argc; j++)
1986 sdsfree(argv[j]);
1987 zfree(argv);
1988 sdsfree(line);
1989 }
c9a111ac 1990 if (fp != stdin) fclose(fp);
ed9b544e 1991 return;
1992
1993loaderr:
1994 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1995 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1996 fprintf(stderr, ">>> '%s'\n", line);
1997 fprintf(stderr, "%s\n", err);
1998 exit(1);
1999}
2000
2001static void freeClientArgv(redisClient *c) {
2002 int j;
2003
2004 for (j = 0; j < c->argc; j++)
2005 decrRefCount(c->argv[j]);
e8a74421 2006 for (j = 0; j < c->mbargc; j++)
2007 decrRefCount(c->mbargv[j]);
ed9b544e 2008 c->argc = 0;
e8a74421 2009 c->mbargc = 0;
ed9b544e 2010}
2011
2012static void freeClient(redisClient *c) {
2013 listNode *ln;
2014
4409877e 2015 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 2016 * call, we have to set querybuf to NULL *before* to call
2017 * unblockClientWaitingData() to avoid processInputBuffer() will get
2018 * called. Also it is important to remove the file events after
2019 * this, because this call adds the READABLE event. */
4409877e 2020 sdsfree(c->querybuf);
2021 c->querybuf = NULL;
2022 if (c->flags & REDIS_BLOCKED)
b0d8747d 2023 unblockClientWaitingData(c);
4409877e 2024
37ab76c9 2025 /* UNWATCH all the keys */
2026 unwatchAllKeys(c);
2027 listRelease(c->watched_keys);
ffc6b7f8 2028 /* Unsubscribe from all the pubsub channels */
2029 pubsubUnsubscribeAllChannels(c,0);
2030 pubsubUnsubscribeAllPatterns(c,0);
2031 dictRelease(c->pubsub_channels);
2032 listRelease(c->pubsub_patterns);
befec3cd 2033 /* Obvious cleanup */
ed9b544e 2034 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2035 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 2036 listRelease(c->reply);
2037 freeClientArgv(c);
2038 close(c->fd);
92f8e882 2039 /* Remove from the list of clients */
ed9b544e 2040 ln = listSearchKey(server.clients,c);
dfc5e96c 2041 redisAssert(ln != NULL);
ed9b544e 2042 listDelNode(server.clients,ln);
37ab76c9 2043 /* Remove from the list of clients that are now ready to be restarted
2044 * after waiting for swapped keys */
d5d55fc3 2045 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2046 ln = listSearchKey(server.io_ready_clients,c);
2047 if (ln) {
2048 listDelNode(server.io_ready_clients,ln);
2049 server.vm_blocked_clients--;
2050 }
2051 }
37ab76c9 2052 /* Remove from the list of clients waiting for swapped keys */
d5d55fc3 2053 while (server.vm_enabled && listLength(c->io_keys)) {
2054 ln = listFirst(c->io_keys);
2055 dontWaitForSwappedKey(c,ln->value);
92f8e882 2056 }
b3e3d0d7 2057 listRelease(c->io_keys);
befec3cd 2058 /* Master/slave cleanup */
ed9b544e 2059 if (c->flags & REDIS_SLAVE) {
6208b3a7 2060 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2061 close(c->repldbfd);
87eca727 2062 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2063 ln = listSearchKey(l,c);
dfc5e96c 2064 redisAssert(ln != NULL);
87eca727 2065 listDelNode(l,ln);
ed9b544e 2066 }
2067 if (c->flags & REDIS_MASTER) {
2068 server.master = NULL;
2069 server.replstate = REDIS_REPL_CONNECT;
2070 }
befec3cd 2071 /* Release memory */
93ea3759 2072 zfree(c->argv);
e8a74421 2073 zfree(c->mbargv);
6e469882 2074 freeClientMultiState(c);
ed9b544e 2075 zfree(c);
2076}
2077
cc30e368 2078#define GLUEREPLY_UP_TO (1024)
ed9b544e 2079static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 2080 int copylen = 0;
2081 char buf[GLUEREPLY_UP_TO];
6208b3a7 2082 listNode *ln;
c7df85a4 2083 listIter li;
ed9b544e 2084 robj *o;
2085
c7df85a4 2086 listRewind(c->reply,&li);
2087 while((ln = listNext(&li))) {
c28b42ac 2088 int objlen;
2089
ed9b544e 2090 o = ln->value;
c28b42ac 2091 objlen = sdslen(o->ptr);
2092 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2093 memcpy(buf+copylen,o->ptr,objlen);
2094 copylen += objlen;
ed9b544e 2095 listDelNode(c->reply,ln);
c28b42ac 2096 } else {
2097 if (copylen == 0) return;
2098 break;
ed9b544e 2099 }
ed9b544e 2100 }
c28b42ac 2101 /* Now the output buffer is empty, add the new single element */
2102 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2103 listAddNodeHead(c->reply,o);
ed9b544e 2104}
2105
2106static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2107 redisClient *c = privdata;
2108 int nwritten = 0, totwritten = 0, objlen;
2109 robj *o;
2110 REDIS_NOTUSED(el);
2111 REDIS_NOTUSED(mask);
2112
2895e862 2113 /* Use writev() if we have enough buffers to send */
7ea870c0 2114 if (!server.glueoutputbuf &&
e0a62c7f 2115 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 2116 !(c->flags & REDIS_MASTER))
2895e862 2117 {
2118 sendReplyToClientWritev(el, fd, privdata, mask);
2119 return;
2120 }
2895e862 2121
ed9b544e 2122 while(listLength(c->reply)) {
c28b42ac 2123 if (server.glueoutputbuf && listLength(c->reply) > 1)
2124 glueReplyBuffersIfNeeded(c);
2125
ed9b544e 2126 o = listNodeValue(listFirst(c->reply));
2127 objlen = sdslen(o->ptr);
2128
2129 if (objlen == 0) {
2130 listDelNode(c->reply,listFirst(c->reply));
2131 continue;
2132 }
2133
2134 if (c->flags & REDIS_MASTER) {
6f376729 2135 /* Don't reply to a master */
ed9b544e 2136 nwritten = objlen - c->sentlen;
2137 } else {
a4d1ba9a 2138 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2139 if (nwritten <= 0) break;
2140 }
2141 c->sentlen += nwritten;
2142 totwritten += nwritten;
2143 /* If we fully sent the object on head go to the next one */
2144 if (c->sentlen == objlen) {
2145 listDelNode(c->reply,listFirst(c->reply));
2146 c->sentlen = 0;
2147 }
6f376729 2148 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2149 * bytes, in a single threaded server it's a good idea to serve
6f376729 2150 * other clients as well, even if a very large request comes from
2151 * super fast link that is always able to accept data (in real world
12f9d551 2152 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2153 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2154 }
2155 if (nwritten == -1) {
2156 if (errno == EAGAIN) {
2157 nwritten = 0;
2158 } else {
f870935d 2159 redisLog(REDIS_VERBOSE,
ed9b544e 2160 "Error writing to client: %s", strerror(errno));
2161 freeClient(c);
2162 return;
2163 }
2164 }
2165 if (totwritten > 0) c->lastinteraction = time(NULL);
2166 if (listLength(c->reply) == 0) {
2167 c->sentlen = 0;
2168 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2169 }
2170}
2171
2895e862 2172static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2173{
2174 redisClient *c = privdata;
2175 int nwritten = 0, totwritten = 0, objlen, willwrite;
2176 robj *o;
2177 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2178 int offset, ion = 0;
2179 REDIS_NOTUSED(el);
2180 REDIS_NOTUSED(mask);
2181
2182 listNode *node;
2183 while (listLength(c->reply)) {
2184 offset = c->sentlen;
2185 ion = 0;
2186 willwrite = 0;
2187
2188 /* fill-in the iov[] array */
2189 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2190 o = listNodeValue(node);
2191 objlen = sdslen(o->ptr);
2192
e0a62c7f 2193 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2194 break;
2195
2196 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2197 break; /* no more iovecs */
2198
2199 iov[ion].iov_base = ((char*)o->ptr) + offset;
2200 iov[ion].iov_len = objlen - offset;
2201 willwrite += objlen - offset;
2202 offset = 0; /* just for the first item */
2203 ion++;
2204 }
2205
2206 if(willwrite == 0)
2207 break;
2208
2209 /* write all collected blocks at once */
2210 if((nwritten = writev(fd, iov, ion)) < 0) {
2211 if (errno != EAGAIN) {
f870935d 2212 redisLog(REDIS_VERBOSE,
2895e862 2213 "Error writing to client: %s", strerror(errno));
2214 freeClient(c);
2215 return;
2216 }
2217 break;
2218 }
2219
2220 totwritten += nwritten;
2221 offset = c->sentlen;
2222
2223 /* remove written robjs from c->reply */
2224 while (nwritten && listLength(c->reply)) {
2225 o = listNodeValue(listFirst(c->reply));
2226 objlen = sdslen(o->ptr);
2227
2228 if(nwritten >= objlen - offset) {
2229 listDelNode(c->reply, listFirst(c->reply));
2230 nwritten -= objlen - offset;
2231 c->sentlen = 0;
2232 } else {
2233 /* partial write */
2234 c->sentlen += nwritten;
2235 break;
2236 }
2237 offset = 0;
2238 }
2239 }
2240
e0a62c7f 2241 if (totwritten > 0)
2895e862 2242 c->lastinteraction = time(NULL);
2243
2244 if (listLength(c->reply) == 0) {
2245 c->sentlen = 0;
2246 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2247 }
2248}
2249
1a132bbc
PN
2250static int qsortRedisCommands(const void *r1, const void *r2) {
2251 return strcasecmp(
2252 ((struct redisCommand*)r1)->name,
2253 ((struct redisCommand*)r2)->name);
2254}
2255
2256static void sortCommandTable() {
1a132bbc
PN
2257 /* Copy and sort the read-only version of the command table */
2258 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2259 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
d55d5c5d 2260 qsort(commandTable,
2261 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2262 sizeof(struct redisCommand),qsortRedisCommands);
1a132bbc
PN
2263}
2264
ed9b544e 2265static struct redisCommand *lookupCommand(char *name) {
1a132bbc
PN
2266 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2267 return bsearch(
2268 &tmp,
2269 commandTable,
d55d5c5d 2270 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
1a132bbc
PN
2271 sizeof(struct redisCommand),
2272 qsortRedisCommands);
ed9b544e 2273}
2274
2275/* resetClient prepare the client to process the next command */
2276static void resetClient(redisClient *c) {
2277 freeClientArgv(c);
2278 c->bulklen = -1;
e8a74421 2279 c->multibulk = 0;
ed9b544e 2280}
2281
6e469882 2282/* Call() is the core of Redis execution of a command */
2283static void call(redisClient *c, struct redisCommand *cmd) {
2284 long long dirty;
2285
2286 dirty = server.dirty;
2287 cmd->proc(c);
4005fef1 2288 dirty = server.dirty-dirty;
2289
2290 if (server.appendonly && dirty)
6e469882 2291 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2292 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2293 listLength(server.slaves))
248ea310 2294 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2295 if (listLength(server.monitors))
dd142b9c 2296 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2297 server.stat_numcommands++;
2298}
2299
ed9b544e 2300/* If this function gets called we already read a whole
2301 * command, arguments are in the client argv/argc fields.
2302 * processCommand() execute the command or prepare the
2303 * server for a bulk read from the client.
2304 *
2305 * If 1 is returned the client is still alive and valid and
2306 * and other operations can be performed by the caller. Otherwise
2307 * if 0 is returned the client was destroied (i.e. after QUIT). */
2308static int processCommand(redisClient *c) {
2309 struct redisCommand *cmd;
ed9b544e 2310
3fd78bcd 2311 /* Free some memory if needed (maxmemory setting) */
2312 if (server.maxmemory) freeMemoryIfNeeded();
2313
e8a74421 2314 /* Handle the multi bulk command type. This is an alternative protocol
2315 * supported by Redis in order to receive commands that are composed of
2316 * multiple binary-safe "bulk" arguments. The latency of processing is
2317 * a bit higher but this allows things like multi-sets, so if this
2318 * protocol is used only for MSET and similar commands this is a big win. */
2319 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2320 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2321 if (c->multibulk <= 0) {
2322 resetClient(c);
2323 return 1;
2324 } else {
2325 decrRefCount(c->argv[c->argc-1]);
2326 c->argc--;
2327 return 1;
2328 }
2329 } else if (c->multibulk) {
2330 if (c->bulklen == -1) {
2331 if (((char*)c->argv[0]->ptr)[0] != '$') {
2332 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2333 resetClient(c);
2334 return 1;
2335 } else {
2336 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2337 decrRefCount(c->argv[0]);
2338 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2339 c->argc--;
2340 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2341 resetClient(c);
2342 return 1;
2343 }
2344 c->argc--;
2345 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2346 return 1;
2347 }
2348 } else {
2349 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2350 c->mbargv[c->mbargc] = c->argv[0];
2351 c->mbargc++;
2352 c->argc--;
2353 c->multibulk--;
2354 if (c->multibulk == 0) {
2355 robj **auxargv;
2356 int auxargc;
2357
2358 /* Here we need to swap the multi-bulk argc/argv with the
2359 * normal argc/argv of the client structure. */
2360 auxargv = c->argv;
2361 c->argv = c->mbargv;
2362 c->mbargv = auxargv;
2363
2364 auxargc = c->argc;
2365 c->argc = c->mbargc;
2366 c->mbargc = auxargc;
2367
2368 /* We need to set bulklen to something different than -1
2369 * in order for the code below to process the command without
2370 * to try to read the last argument of a bulk command as
2371 * a special argument. */
2372 c->bulklen = 0;
2373 /* continue below and process the command */
2374 } else {
2375 c->bulklen = -1;
2376 return 1;
2377 }
2378 }
2379 }
2380 /* -- end of multi bulk commands processing -- */
2381
ed9b544e 2382 /* The QUIT command is handled as a special case. Normal command
2383 * procs are unable to close the client connection safely */
bb0b03a3 2384 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2385 freeClient(c);
2386 return 0;
2387 }
d5d55fc3 2388
2389 /* Now lookup the command and check ASAP about trivial error conditions
2390 * such wrong arity, bad command name and so forth. */
ed9b544e 2391 cmd = lookupCommand(c->argv[0]->ptr);
2392 if (!cmd) {
2c14807b 2393 addReplySds(c,
2394 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2395 (char*)c->argv[0]->ptr));
ed9b544e 2396 resetClient(c);
2397 return 1;
2398 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2399 (c->argc < -cmd->arity)) {
454d4e43 2400 addReplySds(c,
2401 sdscatprintf(sdsempty(),
2402 "-ERR wrong number of arguments for '%s' command\r\n",
2403 cmd->name));
ed9b544e 2404 resetClient(c);
2405 return 1;
2406 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2407 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2408 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2409
2410 decrRefCount(c->argv[c->argc-1]);
2411 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2412 c->argc--;
2413 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2414 resetClient(c);
2415 return 1;
2416 }
2417 c->argc--;
2418 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2419 /* It is possible that the bulk read is already in the
8d0490e7 2420 * buffer. Check this condition and handle it accordingly.
2421 * This is just a fast path, alternative to call processInputBuffer().
2422 * It's a good idea since the code is small and this condition
2423 * happens most of the times. */
ed9b544e 2424 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2425 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2426 c->argc++;
2427 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2428 } else {
d5d55fc3 2429 /* Otherwise return... there is to read the last argument
2430 * from the socket. */
ed9b544e 2431 return 1;
2432 }
2433 }
942a3961 2434 /* Let's try to encode the bulk object to save space. */
2435 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2436 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2437
e63943a4 2438 /* Check if the user is authenticated */
2439 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2440 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2441 resetClient(c);
2442 return 1;
2443 }
2444
b61a28fe 2445 /* Handle the maxmemory directive */
2446 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2447 zmalloc_used_memory() > server.maxmemory)
2448 {
2449 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2450 resetClient(c);
2451 return 1;
2452 }
2453
d6cc8867 2454 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
e6cca5db 2455 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2456 &&
ffc6b7f8 2457 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2458 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2459 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2460 resetClient(c);
2461 return 1;
2462 }
2463
ed9b544e 2464 /* Exec the command */
6531c94d 2465 if (c->flags & REDIS_MULTI &&
2466 cmd->proc != execCommand && cmd->proc != discardCommand &&
2467 cmd->proc != multiCommand && cmd->proc != watchCommand)
2468 {
6e469882 2469 queueMultiCommand(c,cmd);
2470 addReply(c,shared.queued);
2471 } else {
d5d55fc3 2472 if (server.vm_enabled && server.vm_max_threads > 0 &&
0a6f3f0f 2473 blockClientOnSwappedKeys(c,cmd)) return 1;
6e469882 2474 call(c,cmd);
2475 }
ed9b544e 2476
2477 /* Prepare the client for the next command */
ed9b544e 2478 resetClient(c);
2479 return 1;
2480}
2481
248ea310 2482static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2483 listNode *ln;
c7df85a4 2484 listIter li;
ed9b544e 2485 int outc = 0, j;
93ea3759 2486 robj **outv;
248ea310 2487 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2488 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2489 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2490 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2491 robj *lenobj;
93ea3759 2492
2493 if (argc <= REDIS_STATIC_ARGS) {
2494 outv = static_outv;
2495 } else {
248ea310 2496 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2497 }
248ea310 2498
2499 lenobj = createObject(REDIS_STRING,
2500 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2501 lenobj->refcount = 0;
2502 outv[outc++] = lenobj;
ed9b544e 2503 for (j = 0; j < argc; j++) {
248ea310 2504 lenobj = createObject(REDIS_STRING,
2505 sdscatprintf(sdsempty(),"$%lu\r\n",
2506 (unsigned long) stringObjectLen(argv[j])));
2507 lenobj->refcount = 0;
2508 outv[outc++] = lenobj;
ed9b544e 2509 outv[outc++] = argv[j];
248ea310 2510 outv[outc++] = shared.crlf;
ed9b544e 2511 }
ed9b544e 2512
40d224a9 2513 /* Increment all the refcounts at start and decrement at end in order to
2514 * be sure to free objects if there is no slave in a replication state
2515 * able to be feed with commands */
2516 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2517 listRewind(slaves,&li);
2518 while((ln = listNext(&li))) {
ed9b544e 2519 redisClient *slave = ln->value;
40d224a9 2520
2521 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2522 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2523
2524 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2525 if (slave->slaveseldb != dictid) {
2526 robj *selectcmd;
2527
2528 switch(dictid) {
2529 case 0: selectcmd = shared.select0; break;
2530 case 1: selectcmd = shared.select1; break;
2531 case 2: selectcmd = shared.select2; break;
2532 case 3: selectcmd = shared.select3; break;
2533 case 4: selectcmd = shared.select4; break;
2534 case 5: selectcmd = shared.select5; break;
2535 case 6: selectcmd = shared.select6; break;
2536 case 7: selectcmd = shared.select7; break;
2537 case 8: selectcmd = shared.select8; break;
2538 case 9: selectcmd = shared.select9; break;
2539 default:
2540 selectcmd = createObject(REDIS_STRING,
2541 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2542 selectcmd->refcount = 0;
2543 break;
2544 }
2545 addReply(slave,selectcmd);
2546 slave->slaveseldb = dictid;
2547 }
2548 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2549 }
40d224a9 2550 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2551 if (outv != static_outv) zfree(outv);
ed9b544e 2552}
2553
dd142b9c 2554static sds sdscatrepr(sds s, char *p, size_t len) {
2555 s = sdscatlen(s,"\"",1);
2556 while(len--) {
2557 switch(*p) {
2558 case '\\':
2559 case '"':
2560 s = sdscatprintf(s,"\\%c",*p);
2561 break;
2562 case '\n': s = sdscatlen(s,"\\n",1); break;
2563 case '\r': s = sdscatlen(s,"\\r",1); break;
2564 case '\t': s = sdscatlen(s,"\\t",1); break;
2565 case '\a': s = sdscatlen(s,"\\a",1); break;
2566 case '\b': s = sdscatlen(s,"\\b",1); break;
2567 default:
2568 if (isprint(*p))
2569 s = sdscatprintf(s,"%c",*p);
2570 else
2571 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2572 break;
2573 }
2574 p++;
2575 }
2576 return sdscatlen(s,"\"",1);
2577}
2578
2579static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2580 listNode *ln;
2581 listIter li;
2582 int j;
2583 sds cmdrepr = sdsnew("+");
2584 robj *cmdobj;
2585 struct timeval tv;
2586
2587 gettimeofday(&tv,NULL);
2588 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2589 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2590
2591 for (j = 0; j < argc; j++) {
2592 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2593 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2594 } else {
2595 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2596 sdslen(argv[j]->ptr));
2597 }
2598 if (j != argc-1)
2599 cmdrepr = sdscatlen(cmdrepr," ",1);
2600 }
2601 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2602 cmdobj = createObject(REDIS_STRING,cmdrepr);
2603
2604 listRewind(monitors,&li);
2605 while((ln = listNext(&li))) {
2606 redisClient *monitor = ln->value;
2607 addReply(monitor,cmdobj);
2608 }
2609 decrRefCount(cmdobj);
2610}
2611
638e42ac 2612static void processInputBuffer(redisClient *c) {
ed9b544e 2613again:
4409877e 2614 /* Before to process the input buffer, make sure the client is not
2615 * waitig for a blocking operation such as BLPOP. Note that the first
2616 * iteration the client is never blocked, otherwise the processInputBuffer
2617 * would not be called at all, but after the execution of the first commands
2618 * in the input buffer the client may be blocked, and the "goto again"
2619 * will try to reiterate. The following line will make it return asap. */
92f8e882 2620 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2621 if (c->bulklen == -1) {
2622 /* Read the first line of the query */
2623 char *p = strchr(c->querybuf,'\n');
2624 size_t querylen;
644fafa3 2625
ed9b544e 2626 if (p) {
2627 sds query, *argv;
2628 int argc, j;
e0a62c7f 2629
ed9b544e 2630 query = c->querybuf;
2631 c->querybuf = sdsempty();
2632 querylen = 1+(p-(query));
2633 if (sdslen(query) > querylen) {
2634 /* leave data after the first line of the query in the buffer */
2635 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2636 }
2637 *p = '\0'; /* remove "\n" */
2638 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2639 sdsupdatelen(query);
2640
2641 /* Now we can split the query in arguments */
ed9b544e 2642 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2643 sdsfree(query);
2644
2645 if (c->argv) zfree(c->argv);
2646 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2647
2648 for (j = 0; j < argc; j++) {
ed9b544e 2649 if (sdslen(argv[j])) {
2650 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2651 c->argc++;
2652 } else {
2653 sdsfree(argv[j]);
2654 }
2655 }
2656 zfree(argv);
7c49733c 2657 if (c->argc) {
2658 /* Execute the command. If the client is still valid
2659 * after processCommand() return and there is something
2660 * on the query buffer try to process the next command. */
2661 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2662 } else {
2663 /* Nothing to process, argc == 0. Just process the query
2664 * buffer if it's not empty or return to the caller */
2665 if (sdslen(c->querybuf)) goto again;
2666 }
ed9b544e 2667 return;
644fafa3 2668 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2669 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2670 freeClient(c);
2671 return;
2672 }
2673 } else {
2674 /* Bulk read handling. Note that if we are at this point
2675 the client already sent a command terminated with a newline,
2676 we are reading the bulk data that is actually the last
2677 argument of the command. */
2678 int qbl = sdslen(c->querybuf);
2679
2680 if (c->bulklen <= qbl) {
2681 /* Copy everything but the final CRLF as final argument */
2682 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2683 c->argc++;
2684 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2685 /* Process the command. If the client is still valid after
2686 * the processing and there is more data in the buffer
2687 * try to parse it. */
2688 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2689 return;
2690 }
2691 }
2692}
2693
638e42ac 2694static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2695 redisClient *c = (redisClient*) privdata;
2696 char buf[REDIS_IOBUF_LEN];
2697 int nread;
2698 REDIS_NOTUSED(el);
2699 REDIS_NOTUSED(mask);
2700
2701 nread = read(fd, buf, REDIS_IOBUF_LEN);
2702 if (nread == -1) {
2703 if (errno == EAGAIN) {
2704 nread = 0;
2705 } else {
f870935d 2706 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2707 freeClient(c);
2708 return;
2709 }
2710 } else if (nread == 0) {
f870935d 2711 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2712 freeClient(c);
2713 return;
2714 }
2715 if (nread) {
2716 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2717 c->lastinteraction = time(NULL);
2718 } else {
2719 return;
2720 }
168ac5c6 2721 processInputBuffer(c);
638e42ac 2722}
2723
ed9b544e 2724static int selectDb(redisClient *c, int id) {
2725 if (id < 0 || id >= server.dbnum)
2726 return REDIS_ERR;
3305306f 2727 c->db = &server.db[id];
ed9b544e 2728 return REDIS_OK;
2729}
2730
40d224a9 2731static void *dupClientReplyValue(void *o) {
2732 incrRefCount((robj*)o);
12d090d2 2733 return o;
40d224a9 2734}
2735
/* Match method installed on the client pubsub_patterns list: forwards the
 * two list values to equalStringObjects() and returns its result. */
static int listMatchObjects(void *a, void *b) {
    int matched = equalStringObjects(a,b);

    return matched;
}
2739
ed9b544e 2740static redisClient *createClient(int fd) {
2741 redisClient *c = zmalloc(sizeof(*c));
2742
2743 anetNonBlock(NULL,fd);
2744 anetTcpNoDelay(NULL,fd);
2745 if (!c) return NULL;
2746 selectDb(c,0);
2747 c->fd = fd;
2748 c->querybuf = sdsempty();
2749 c->argc = 0;
93ea3759 2750 c->argv = NULL;
ed9b544e 2751 c->bulklen = -1;
e8a74421 2752 c->multibulk = 0;
2753 c->mbargc = 0;
2754 c->mbargv = NULL;
ed9b544e 2755 c->sentlen = 0;
2756 c->flags = 0;
2757 c->lastinteraction = time(NULL);
abcb223e 2758 c->authenticated = 0;
40d224a9 2759 c->replstate = REDIS_REPL_NONE;
6b47e12e 2760 c->reply = listCreate();
ed9b544e 2761 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2762 listSetDupMethod(c->reply,dupClientReplyValue);
37ab76c9 2763 c->blocking_keys = NULL;
2764 c->blocking_keys_num = 0;
92f8e882 2765 c->io_keys = listCreate();
87c68815 2766 c->watched_keys = listCreate();
92f8e882 2767 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2768 c->pubsub_channels = dictCreate(&setDictType,NULL);
2769 c->pubsub_patterns = listCreate();
2770 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2771 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2772 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2773 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2774 freeClient(c);
2775 return NULL;
2776 }
6b47e12e 2777 listAddNodeTail(server.clients,c);
6e469882 2778 initClientMultiState(c);
ed9b544e 2779 return c;
2780}
2781
2782static void addReply(redisClient *c, robj *obj) {
2783 if (listLength(c->reply) == 0 &&
6208b3a7 2784 (c->replstate == REDIS_REPL_NONE ||
2785 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2786 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2787 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2788
2789 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2790 obj = dupStringObject(obj);
2791 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2792 }
9d65a1bb 2793 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2794}
2795
2796static void addReplySds(redisClient *c, sds s) {
2797 robj *o = createObject(REDIS_STRING,s);
2798 addReply(c,o);
2799 decrRefCount(o);
2800}
2801
e2665397 2802static void addReplyDouble(redisClient *c, double d) {
2803 char buf[128];
2804
2805 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2806 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2807 (unsigned long) strlen(buf),buf));
e2665397 2808}
2809
aa7c2934
PN
2810static void addReplyLongLong(redisClient *c, long long ll) {
2811 char buf[128];
2812 size_t len;
2813
2814 if (ll == 0) {
2815 addReply(c,shared.czero);
2816 return;
2817 } else if (ll == 1) {
2818 addReply(c,shared.cone);
2819 return;
2820 }
482b672d 2821 buf[0] = ':';
2822 len = ll2string(buf+1,sizeof(buf)-1,ll);
2823 buf[len+1] = '\r';
2824 buf[len+2] = '\n';
2825 addReplySds(c,sdsnewlen(buf,len+3));
aa7c2934
PN
2826}
2827
92b27fe9 2828static void addReplyUlong(redisClient *c, unsigned long ul) {
2829 char buf[128];
2830 size_t len;
2831
dd88747b 2832 if (ul == 0) {
2833 addReply(c,shared.czero);
2834 return;
2835 } else if (ul == 1) {
2836 addReply(c,shared.cone);
2837 return;
2838 }
92b27fe9 2839 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2840 addReplySds(c,sdsnewlen(buf,len));
2841}
2842
942a3961 2843static void addReplyBulkLen(redisClient *c, robj *obj) {
482b672d 2844 size_t len, intlen;
2845 char buf[128];
942a3961 2846
2847 if (obj->encoding == REDIS_ENCODING_RAW) {
2848 len = sdslen(obj->ptr);
2849 } else {
2850 long n = (long)obj->ptr;
2851
e054afda 2852 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2853 len = 1;
2854 if (n < 0) {
2855 len++;
2856 n = -n;
2857 }
2858 while((n = n/10) != 0) {
2859 len++;
2860 }
2861 }
482b672d 2862 buf[0] = '$';
2863 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2864 buf[intlen+1] = '\r';
2865 buf[intlen+2] = '\n';
2866 addReplySds(c,sdsnewlen(buf,intlen+3));
942a3961 2867}
2868
dd88747b 2869static void addReplyBulk(redisClient *c, robj *obj) {
2870 addReplyBulkLen(c,obj);
2871 addReply(c,obj);
2872 addReply(c,shared.crlf);
2873}
2874
500ece7c 2875/* In the CONFIG command we need to add vanilla C string as bulk replies */
2876static void addReplyBulkCString(redisClient *c, char *s) {
2877 if (s == NULL) {
2878 addReply(c,shared.nullbulk);
2879 } else {
2880 robj *o = createStringObject(s,strlen(s));
2881 addReplyBulk(c,o);
2882 decrRefCount(o);
2883 }
2884}
2885
ed9b544e 2886static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2887 int cport, cfd;
2888 char cip[128];
285add55 2889 redisClient *c;
ed9b544e 2890 REDIS_NOTUSED(el);
2891 REDIS_NOTUSED(mask);
2892 REDIS_NOTUSED(privdata);
2893
2894 cfd = anetAccept(server.neterr, fd, cip, &cport);
2895 if (cfd == AE_ERR) {
f870935d 2896 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2897 return;
2898 }
f870935d 2899 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2900 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2901 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2902 close(cfd); /* May be already closed, just ingore errors */
2903 return;
2904 }
285add55 2905 /* If maxclient directive is set and this is one client more... close the
2906 * connection. Note that we create the client instead to check before
2907 * for this condition, since now the socket is already set in nonblocking
2908 * mode and we can send an error for free using the Kernel I/O */
2909 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2910 char *err = "-ERR max number of clients reached\r\n";
2911
2912 /* That's a best effort error message, don't check write errors */
fee803ba 2913 if (write(c->fd,err,strlen(err)) == -1) {
2914 /* Nothing to do, Just to avoid the warning... */
2915 }
285add55 2916 freeClient(c);
2917 return;
2918 }
ed9b544e 2919 server.stat_numconnections++;
2920}
2921
2922/* ======================= Redis objects implementation ===================== */
2923
2924static robj *createObject(int type, void *ptr) {
2925 robj *o;
2926
a5819310 2927 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2928 if (listLength(server.objfreelist)) {
2929 listNode *head = listFirst(server.objfreelist);
2930 o = listNodeValue(head);
2931 listDelNode(server.objfreelist,head);
a5819310 2932 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2933 } else {
75680a3c 2934 if (server.vm_enabled) {
a5819310 2935 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2936 o = zmalloc(sizeof(*o));
2937 } else {
2938 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2939 }
ed9b544e 2940 }
ed9b544e 2941 o->type = type;
942a3961 2942 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2943 o->ptr = ptr;
2944 o->refcount = 1;
3a66edc7 2945 if (server.vm_enabled) {
1064ef87 2946 /* Note that this code may run in the context of an I/O thread
2947 * and accessing to server.unixtime in theory is an error
2948 * (no locks). But in practice this is safe, and even if we read
2949 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2950 o->vm.atime = server.unixtime;
2951 o->storage = REDIS_VM_MEMORY;
2952 }
ed9b544e 2953 return o;
2954}
2955
2956static robj *createStringObject(char *ptr, size_t len) {
2957 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2958}
2959
3f973463
PN
2960static robj *createStringObjectFromLongLong(long long value) {
2961 robj *o;
2962 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2963 incrRefCount(shared.integers[value]);
2964 o = shared.integers[value];
2965 } else {
3f973463 2966 if (value >= LONG_MIN && value <= LONG_MAX) {
10dea8dc 2967 o = createObject(REDIS_STRING, NULL);
3f973463
PN
2968 o->encoding = REDIS_ENCODING_INT;
2969 o->ptr = (void*)((long)value);
2970 } else {
ee14da56 2971 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3f973463
PN
2972 }
2973 }
2974 return o;
2975}
2976
4ef8de8a 2977static robj *dupStringObject(robj *o) {
b9bc0eef 2978 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2979 return createStringObject(o->ptr,sdslen(o->ptr));
2980}
2981
ed9b544e 2982static robj *createListObject(void) {
2983 list *l = listCreate();
2984
ed9b544e 2985 listSetFreeMethod(l,decrRefCount);
2986 return createObject(REDIS_LIST,l);
2987}
2988
2989static robj *createSetObject(void) {
2990 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2991 return createObject(REDIS_SET,d);
2992}
2993
5234952b 2994static robj *createHashObject(void) {
2995 /* All the Hashes start as zipmaps. Will be automatically converted
2996 * into hash tables if there are enough elements or big elements
2997 * inside. */
2998 unsigned char *zm = zipmapNew();
2999 robj *o = createObject(REDIS_HASH,zm);
3000 o->encoding = REDIS_ENCODING_ZIPMAP;
3001 return o;
3002}
3003
1812e024 3004static robj *createZsetObject(void) {
6b47e12e 3005 zset *zs = zmalloc(sizeof(*zs));
3006
3007 zs->dict = dictCreate(&zsetDictType,NULL);
3008 zs->zsl = zslCreate();
3009 return createObject(REDIS_ZSET,zs);
1812e024 3010}
3011
ed9b544e 3012static void freeStringObject(robj *o) {
942a3961 3013 if (o->encoding == REDIS_ENCODING_RAW) {
3014 sdsfree(o->ptr);
3015 }
ed9b544e 3016}
3017
3018static void freeListObject(robj *o) {
3019 listRelease((list*) o->ptr);
3020}
3021
3022static void freeSetObject(robj *o) {
3023 dictRelease((dict*) o->ptr);
3024}
3025
fd8ccf44 3026static void freeZsetObject(robj *o) {
3027 zset *zs = o->ptr;
3028
3029 dictRelease(zs->dict);
3030 zslFree(zs->zsl);
3031 zfree(zs);
3032}
3033
ed9b544e 3034static void freeHashObject(robj *o) {
cbba7dd7 3035 switch (o->encoding) {
3036 case REDIS_ENCODING_HT:
3037 dictRelease((dict*) o->ptr);
3038 break;
3039 case REDIS_ENCODING_ZIPMAP:
3040 zfree(o->ptr);
3041 break;
3042 default:
f83c6cb5 3043 redisPanic("Unknown hash encoding type");
cbba7dd7 3044 break;
3045 }
ed9b544e 3046}
3047
3048static void incrRefCount(robj *o) {
3049 o->refcount++;
3050}
3051
3052static void decrRefCount(void *obj) {
3053 robj *o = obj;
94754ccc 3054
c651fd9e 3055 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
970e10bb 3056 /* Object is a key of a swapped out value, or in the process of being
3057 * loaded. */
996cb5f7 3058 if (server.vm_enabled &&
3059 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3060 {
996cb5f7 3061 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 3062 redisAssert(o->type == REDIS_STRING);
a35ddf12 3063 freeStringObject(o);
3064 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 3065 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 3066 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3067 !listAddNodeHead(server.objfreelist,o))
3068 zfree(o);
a5819310 3069 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 3070 server.vm_stats_swapped_objects--;
a35ddf12 3071 return;
3072 }
996cb5f7 3073 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 3074 if (--(o->refcount) == 0) {
996cb5f7 3075 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3076 vmCancelThreadedIOJob(obj);
ed9b544e 3077 switch(o->type) {
3078 case REDIS_STRING: freeStringObject(o); break;
3079 case REDIS_LIST: freeListObject(o); break;
3080 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 3081 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 3082 case REDIS_HASH: freeHashObject(o); break;
f83c6cb5 3083 default: redisPanic("Unknown object type"); break;
ed9b544e 3084 }
a5819310 3085 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 3086 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3087 !listAddNodeHead(server.objfreelist,o))
3088 zfree(o);
a5819310 3089 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3090 }
3091}
3092
942a3961 3093static robj *lookupKey(redisDb *db, robj *key) {
3094 dictEntry *de = dictFind(db->dict,key);
3a66edc7 3095 if (de) {
55cf8433 3096 robj *key = dictGetEntryKey(de);
3097 robj *val = dictGetEntryVal(de);
3a66edc7 3098
55cf8433 3099 if (server.vm_enabled) {
996cb5f7 3100 if (key->storage == REDIS_VM_MEMORY ||
3101 key->storage == REDIS_VM_SWAPPING)
3102 {
3103 /* If we were swapping the object out, stop it, this key
3104 * was requested. */
3105 if (key->storage == REDIS_VM_SWAPPING)
3106 vmCancelThreadedIOJob(key);
55cf8433 3107 /* Update the access time of the key for the aging algorithm. */
3108 key->vm.atime = server.unixtime;
3109 } else {
d5d55fc3 3110 int notify = (key->storage == REDIS_VM_LOADING);
3111
55cf8433 3112 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 3113 redisAssert(val == NULL);
55cf8433 3114 val = vmLoadObject(key);
3115 dictGetEntryVal(de) = val;
d5d55fc3 3116
3117 /* Clients blocked by the VM subsystem may be waiting for
3118 * this key... */
3119 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 3120 }
3121 }
3122 return val;
3a66edc7 3123 } else {
3124 return NULL;
3125 }
942a3961 3126}
3127
3128static robj *lookupKeyRead(redisDb *db, robj *key) {
3129 expireIfNeeded(db,key);
3130 return lookupKey(db,key);
3131}
3132
3133static robj *lookupKeyWrite(redisDb *db, robj *key) {
3134 deleteIfVolatile(db,key);
37ab76c9 3135 touchWatchedKey(db,key);
942a3961 3136 return lookupKey(db,key);
3137}
3138
92b27fe9 3139static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3140 robj *o = lookupKeyRead(c->db, key);
3141 if (!o) addReply(c,reply);
3142 return o;
3143}
3144
3145static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3146 robj *o = lookupKeyWrite(c->db, key);
3147 if (!o) addReply(c,reply);
3148 return o;
3149}
3150
3151static int checkType(redisClient *c, robj *o, int type) {
3152 if (o->type != type) {
3153 addReply(c,shared.wrongtypeerr);
3154 return 1;
3155 }
3156 return 0;
3157}
3158
942a3961 3159static int deleteKey(redisDb *db, robj *key) {
3160 int retval;
3161
3162 /* We need to protect key from destruction: after the first dictDelete()
3163 * it may happen that 'key' is no longer valid if we don't increment
3164 * it's count. This may happen when we get the object reference directly
3165 * from the hash table with dictRandomKey() or dict iterators */
3166 incrRefCount(key);
3167 if (dictSize(db->expires)) dictDelete(db->expires,key);
3168 retval = dictDelete(db->dict,key);
3169 decrRefCount(key);
3170
3171 return retval == DICT_OK;
3172}
3173
724a51b1 3174/* Check if the nul-terminated string 's' can be represented by a long
3175 * (that is, is a number that fits into long without any other space or
3176 * character before or after the digits).
3177 *
3178 * If so, the function returns REDIS_OK and *longval is set to the value
3179 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 3180static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 3181 char buf[32], *endptr;
3182 long value;
3183 int slen;
e0a62c7f 3184
724a51b1 3185 value = strtol(s, &endptr, 10);
3186 if (endptr[0] != '\0') return REDIS_ERR;
ee14da56 3187 slen = ll2string(buf,32,value);
724a51b1 3188
3189 /* If the number converted back into a string is not identical
3190 * then it's not possible to encode the string as integer */
f69f2cba 3191 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 3192 if (longval) *longval = value;
3193 return REDIS_OK;
3194}
3195
942a3961 3196/* Try to encode a string object in order to save space */
05df7621 3197static robj *tryObjectEncoding(robj *o) {
942a3961 3198 long value;
942a3961 3199 sds s = o->ptr;
3305306f 3200
942a3961 3201 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 3202 return o; /* Already encoded */
3305306f 3203
05df7621 3204 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 3205 * everywhere in the "object space" of Redis. Encoded objects can only
3206 * appear as "values" (and not, for instance, as keys) */
05df7621 3207 if (o->refcount > 1) return o;
3305306f 3208
942a3961 3209 /* Currently we try to encode only strings */
dfc5e96c 3210 redisAssert(o->type == REDIS_STRING);
94754ccc 3211
724a51b1 3212 /* Check if we can represent this string as a long integer */
05df7621 3213 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 3214
3215 /* Ok, this object can be encoded */
05df7621 3216 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3217 decrRefCount(o);
3218 incrRefCount(shared.integers[value]);
3219 return shared.integers[value];
3220 } else {
3221 o->encoding = REDIS_ENCODING_INT;
3222 sdsfree(o->ptr);
3223 o->ptr = (void*) value;
3224 return o;
3225 }
942a3961 3226}
3227
9d65a1bb 3228/* Get a decoded version of an encoded object (returned as a new object).
3229 * If the object is already raw-encoded just increment the ref count. */
3230static robj *getDecodedObject(robj *o) {
942a3961 3231 robj *dec;
e0a62c7f 3232
9d65a1bb 3233 if (o->encoding == REDIS_ENCODING_RAW) {
3234 incrRefCount(o);
3235 return o;
3236 }
942a3961 3237 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3238 char buf[32];
3239
ee14da56 3240 ll2string(buf,32,(long)o->ptr);
942a3961 3241 dec = createStringObject(buf,strlen(buf));
3242 return dec;
3243 } else {
08ee9b57 3244 redisPanic("Unknown encoding type");
942a3961 3245 }
3305306f 3246}
3247
d7f43c08 3248/* Compare two string objects via strcmp() or alike.
3249 * Note that the objects may be integer-encoded. In such a case we
ee14da56 3250 * use ll2string() to get a string representation of the numbers on the stack
1fd9bc8a 3251 * and compare the strings, it's much faster than calling getDecodedObject().
3252 *
3253 * Important note: if objects are not integer encoded, but binary-safe strings,
3254 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3255 * binary safe. */
724a51b1 3256static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3257 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3258 char bufa[128], bufb[128], *astr, *bstr;
3259 int bothsds = 1;
724a51b1 3260
e197b441 3261 if (a == b) return 0;
d7f43c08 3262 if (a->encoding != REDIS_ENCODING_RAW) {
ee14da56 3263 ll2string(bufa,sizeof(bufa),(long) a->ptr);
d7f43c08 3264 astr = bufa;
3265 bothsds = 0;
724a51b1 3266 } else {
d7f43c08 3267 astr = a->ptr;
724a51b1 3268 }
d7f43c08 3269 if (b->encoding != REDIS_ENCODING_RAW) {
ee14da56 3270 ll2string(bufb,sizeof(bufb),(long) b->ptr);
d7f43c08 3271 bstr = bufb;
3272 bothsds = 0;
3273 } else {
3274 bstr = b->ptr;
3275 }
3276 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3277}
3278
bf028098 3279/* Equal string objects return 1 if the two objects are the same from the
3280 * point of view of a string comparison, otherwise 0 is returned. Note that
3281 * this function is faster then checking for (compareStringObject(a,b) == 0)
3282 * because it can perform some more optimization. */
3283static int equalStringObjects(robj *a, robj *b) {
3284 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3285 return a->ptr == b->ptr;
3286 } else {
3287 return compareStringObjects(a,b) == 0;
3288 }
3289}
3290
0ea663ea 3291static size_t stringObjectLen(robj *o) {
dfc5e96c 3292 redisAssert(o->type == REDIS_STRING);
0ea663ea 3293 if (o->encoding == REDIS_ENCODING_RAW) {
3294 return sdslen(o->ptr);
3295 } else {
3296 char buf[32];
3297
ee14da56 3298 return ll2string(buf,32,(long)o->ptr);
0ea663ea 3299 }
3300}
3301
bd79a6bd
PN
3302static int getDoubleFromObject(robj *o, double *target) {
3303 double value;
682c73e8 3304 char *eptr;
bbe025e0 3305
bd79a6bd
PN
3306 if (o == NULL) {
3307 value = 0;
3308 } else {
3309 redisAssert(o->type == REDIS_STRING);
3310 if (o->encoding == REDIS_ENCODING_RAW) {
3311 value = strtod(o->ptr, &eptr);
682c73e8 3312 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3313 } else if (o->encoding == REDIS_ENCODING_INT) {
3314 value = (long)o->ptr;
3315 } else {
946342c1 3316 redisPanic("Unknown string encoding");
bd79a6bd
PN
3317 }
3318 }
3319
bd79a6bd
PN
3320 *target = value;
3321 return REDIS_OK;
3322}
bbe025e0 3323
bd79a6bd
PN
3324static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3325 double value;
3326 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3327 if (msg != NULL) {
3328 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3329 } else {
3330 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3331 }
bbe025e0
AM
3332 return REDIS_ERR;
3333 }
3334
bd79a6bd 3335 *target = value;
bbe025e0
AM
3336 return REDIS_OK;
3337}
3338
bd79a6bd
PN
3339static int getLongLongFromObject(robj *o, long long *target) {
3340 long long value;
682c73e8 3341 char *eptr;
bbe025e0 3342
bd79a6bd
PN
3343 if (o == NULL) {
3344 value = 0;
3345 } else {
3346 redisAssert(o->type == REDIS_STRING);
3347 if (o->encoding == REDIS_ENCODING_RAW) {
3348 value = strtoll(o->ptr, &eptr, 10);
682c73e8 3349 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3350 } else if (o->encoding == REDIS_ENCODING_INT) {
3351 value = (long)o->ptr;
3352 } else {
946342c1 3353 redisPanic("Unknown string encoding");
bd79a6bd
PN
3354 }
3355 }
3356
bd79a6bd
PN
3357 *target = value;
3358 return REDIS_OK;
3359}
bbe025e0 3360
bd79a6bd
PN
3361static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3362 long long value;
3363 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3364 if (msg != NULL) {
3365 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3366 } else {
3367 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3368 }
bbe025e0
AM
3369 return REDIS_ERR;
3370 }
3371
bd79a6bd 3372 *target = value;
bbe025e0
AM
3373 return REDIS_OK;
3374}
3375
bd79a6bd
PN
3376static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3377 long long value;
bbe025e0 3378
bd79a6bd
PN
3379 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3380 if (value < LONG_MIN || value > LONG_MAX) {
3381 if (msg != NULL) {
3382 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3383 } else {
3384 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3385 }
bbe025e0
AM
3386 return REDIS_ERR;
3387 }
3388
bd79a6bd 3389 *target = value;
bbe025e0
AM
3390 return REDIS_OK;
3391}
3392
06233c45 3393/*============================ RDB saving/loading =========================== */
ed9b544e 3394
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 if the byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3399
/* Write the time 't' to 'fp' as a 4 byte signed integer in the host byte
 * order (the value is converted to int32_t first).
 * Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t seconds = (int32_t) t;

    if (fwrite(&seconds,4,1,fp) != 1) return -1;
    return 0;
}
3405
e3566d4b 3406/* check rdbLoadLen() comments for more info */
f78fd11b 3407static int rdbSaveLen(FILE *fp, uint32_t len) {
3408 unsigned char buf[2];
3409
3410 if (len < (1<<6)) {
3411 /* Save a 6 bit len */
10c43610 3412 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3413 if (fwrite(buf,1,1,fp) == 0) return -1;
3414 } else if (len < (1<<14)) {
3415 /* Save a 14 bit len */
10c43610 3416 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3417 buf[1] = len&0xFF;
17be1a4a 3418 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3419 } else {
3420 /* Save a 32 bit len */
10c43610 3421 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3422 if (fwrite(buf,1,1,fp) == 0) return -1;
3423 len = htonl(len);
3424 if (fwrite(&len,4,1,fp) == 0) return -1;
3425 }
3426 return 0;
3427}
3428
32a66513 3429/* Encode 'value' as an integer if possible (if integer will fit the
3430 * supported range). If the function sucessful encoded the integer
3431 * then the (up to 5 bytes) encoded representation is written in the
3432 * string pointed by 'enc' and the length is returned. Otherwise
3433 * 0 is returned. */
3434static int rdbEncodeInteger(long long value, unsigned char *enc) {
e3566d4b 3435 /* Finally check if it fits in our ranges */
3436 if (value >= -(1<<7) && value <= (1<<7)-1) {
3437 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3438 enc[1] = value&0xFF;
3439 return 2;
3440 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3441 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3442 enc[1] = value&0xFF;
3443 enc[2] = (value>>8)&0xFF;
3444 return 3;
3445 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3446 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3447 enc[1] = value&0xFF;
3448 enc[2] = (value>>8)&0xFF;
3449 enc[3] = (value>>16)&0xFF;
3450 enc[4] = (value>>24)&0xFF;
3451 return 5;
3452 } else {
3453 return 0;
3454 }
3455}
3456
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes that are NOT required to be nul terminated
 * (callers such as the zipmap branch of rdbSaveObject() pass
 * length-delimited fields). On success the encoded form is written into
 * 'enc' and its length is returned, otherwise 0 is returned. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, buf[32], tmp[32];

    /* An empty string is not a number, and no long long needs 31 or more
     * characters, so such inputs can never survive the round trip below. */
    if (len == 0 || len >= sizeof(tmp)) return 0;

    /* Work on a private nul terminated copy so that strtoll() can never
     * read past the 'len' bytes owned by the caller. */
    memcpy(tmp,s,len);
    tmp[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(tmp, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,tmp,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3475
b1befe6a 3476static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3477 size_t comprlen, outlen;
774e3047 3478 unsigned char byte;
3479 void *out;
3480
3481 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3482 if (len <= 4) return 0;
3483 outlen = len-4;
3a2694c4 3484 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3485 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3486 if (comprlen == 0) {
88e85998 3487 zfree(out);
774e3047 3488 return 0;
3489 }
3490 /* Data compressed! Let's save it on disk */
3491 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3492 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3493 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3494 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3495 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3496 zfree(out);
774e3047 3497 return comprlen;
3498
3499writeerr:
88e85998 3500 zfree(out);
774e3047 3501 return -1;
3502}
3503
e3566d4b 3504/* Save a string objet as [len][data] on disk. If the object is a string
3505 * representation of an integer value we try to safe it in a special form */
b1befe6a 3506static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3507 int enclen;
10c43610 3508
774e3047 3509 /* Try integer encoding */
e3566d4b 3510 if (len <= 11) {
3511 unsigned char buf[5];
b1befe6a 3512 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3513 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3514 return 0;
3515 }
3516 }
774e3047 3517
3518 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3519 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3520 if (server.rdbcompression && len > 20) {
774e3047 3521 int retval;
3522
b1befe6a 3523 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3524 if (retval == -1) return -1;
3525 if (retval > 0) return 0;
3526 /* retval == 0 means data can't be compressed, save the old way */
3527 }
3528
3529 /* Store verbatim */
10c43610 3530 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3531 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3532 return 0;
3533}
3534
942a3961 3535/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3536static int rdbSaveStringObject(FILE *fp, robj *obj) {
3537 int retval;
942a3961 3538
32a66513 3539 /* Avoid to decode the object, then encode it again, if the
3540 * object is alrady integer encoded. */
3541 if (obj->encoding == REDIS_ENCODING_INT) {
3542 long val = (long) obj->ptr;
3543 unsigned char buf[5];
3544 int enclen;
3545
3546 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3547 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3548 return 0;
3549 }
3550 /* otherwise... fall throught and continue with the usual
3551 * code path. */
3552 }
3553
f2d9f50f 3554 /* Avoid incr/decr ref count business when possible.
3555 * This plays well with copy-on-write given that we are probably
3556 * in a child process (BGSAVE). Also this makes sure key objects
3557 * of swapped objects are not incRefCount-ed (an assert does not allow
3558 * this in order to avoid bugs) */
3559 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3560 obj = getDecodedObject(obj);
b1befe6a 3561 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3562 decrRefCount(obj);
3563 } else {
b1befe6a 3564 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3565 }
9d65a1bb 3566 return retval;
942a3961 3567}
3568
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -(2^52)+1 */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1 (buf[0] holds the length), so only
             * sizeof(buf)-1 bytes are available to ll2string(). */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3611
06233c45 3612/* Save a Redis object. */
3613static int rdbSaveObject(FILE *fp, robj *o) {
3614 if (o->type == REDIS_STRING) {
3615 /* Save a string value */
3616 if (rdbSaveStringObject(fp,o) == -1) return -1;
3617 } else if (o->type == REDIS_LIST) {
3618 /* Save a list value */
3619 list *list = o->ptr;
c7df85a4 3620 listIter li;
06233c45 3621 listNode *ln;
3622
06233c45 3623 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3624 listRewind(list,&li);
3625 while((ln = listNext(&li))) {
06233c45 3626 robj *eleobj = listNodeValue(ln);
3627
3628 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3629 }
3630 } else if (o->type == REDIS_SET) {
3631 /* Save a set value */
3632 dict *set = o->ptr;
3633 dictIterator *di = dictGetIterator(set);
3634 dictEntry *de;
3635
3636 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3637 while((de = dictNext(di)) != NULL) {
3638 robj *eleobj = dictGetEntryKey(de);
3639
3640 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3641 }
3642 dictReleaseIterator(di);
3643 } else if (o->type == REDIS_ZSET) {
3644 /* Save a set value */
3645 zset *zs = o->ptr;
3646 dictIterator *di = dictGetIterator(zs->dict);
3647 dictEntry *de;
3648
3649 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3650 while((de = dictNext(di)) != NULL) {
3651 robj *eleobj = dictGetEntryKey(de);
3652 double *score = dictGetEntryVal(de);
3653
3654 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3655 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3656 }
3657 dictReleaseIterator(di);
b1befe6a 3658 } else if (o->type == REDIS_HASH) {
3659 /* Save a hash value */
3660 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3661 unsigned char *p = zipmapRewind(o->ptr);
3662 unsigned int count = zipmapLen(o->ptr);
3663 unsigned char *key, *val;
3664 unsigned int klen, vlen;
3665
3666 if (rdbSaveLen(fp,count) == -1) return -1;
3667 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3668 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3669 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3670 }
3671 } else {
3672 dictIterator *di = dictGetIterator(o->ptr);
3673 dictEntry *de;
3674
3675 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3676 while((de = dictNext(di)) != NULL) {
3677 robj *key = dictGetEntryKey(de);
3678 robj *val = dictGetEntryVal(de);
3679
3680 if (rdbSaveStringObject(fp,key) == -1) return -1;
3681 if (rdbSaveStringObject(fp,val) == -1) return -1;
3682 }
3683 dictReleaseIterator(di);
3684 }
06233c45 3685 } else {
f83c6cb5 3686 redisPanic("Unknown object type");
06233c45 3687 }
3688 return 0;
3689}
3690
3691/* Return the length the object will have on disk if saved with
3692 * the rdbSaveObject() function. Currently we use a trick to get
3693 * this length with very little changes to the code. In the future
3694 * we could switch to a faster solution. */
b9bc0eef 3695static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3696 if (fp == NULL) fp = server.devnull;
06233c45 3697 rewind(fp);
3698 assert(rdbSaveObject(fp,o) != 1);
3699 return ftello(fp);
3700}
3701
06224fec 3702/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3703static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3704 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3705
06224fec 3706 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3707}
3708
ed9b544e 3709/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3710static int rdbSave(char *filename) {
ed9b544e 3711 dictIterator *di = NULL;
3712 dictEntry *de;
ed9b544e 3713 FILE *fp;
3714 char tmpfile[256];
3715 int j;
bb32ede5 3716 time_t now = time(NULL);
ed9b544e 3717
2316bb3b 3718 /* Wait for I/O therads to terminate, just in case this is a
3719 * foreground-saving, to avoid seeking the swap file descriptor at the
3720 * same time. */
3721 if (server.vm_enabled)
3722 waitEmptyIOJobsQueue();
3723
a3b21203 3724 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3725 fp = fopen(tmpfile,"w");
3726 if (!fp) {
3727 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3728 return REDIS_ERR;
3729 }
f78fd11b 3730 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3731 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3732 redisDb *db = server.db+j;
3733 dict *d = db->dict;
3305306f 3734 if (dictSize(d) == 0) continue;
ed9b544e 3735 di = dictGetIterator(d);
3736 if (!di) {
3737 fclose(fp);
3738 return REDIS_ERR;
3739 }
3740
3741 /* Write the SELECT DB opcode */
f78fd11b 3742 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3743 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3744
3745 /* Iterate this DB writing every entry */
3746 while((de = dictNext(di)) != NULL) {
3747 robj *key = dictGetEntryKey(de);
3748 robj *o = dictGetEntryVal(de);
bb32ede5 3749 time_t expiretime = getExpire(db,key);
3750
3751 /* Save the expire time */
3752 if (expiretime != -1) {
3753 /* If this key is already expired skip it */
3754 if (expiretime < now) continue;
3755 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3756 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3757 }
7e69548d 3758 /* Save the key and associated value. This requires special
3759 * handling if the value is swapped out. */
996cb5f7 3760 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3761 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3762 /* Save type, key, value */
3763 if (rdbSaveType(fp,o->type) == -1) goto werr;
3764 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3765 if (rdbSaveObject(fp,o) == -1) goto werr;
3766 } else {
996cb5f7 3767 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3768 robj *po;
7e69548d 3769 /* Get a preview of the object in memory */
3770 po = vmPreviewObject(key);
7e69548d 3771 /* Save type, key, value */
3772 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3773 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3774 if (rdbSaveObject(fp,po) == -1) goto werr;
3775 /* Remove the loaded object from memory */
3776 decrRefCount(po);
7e69548d 3777 }
ed9b544e 3778 }
3779 dictReleaseIterator(di);
3780 }
3781 /* EOF opcode */
f78fd11b 3782 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3783
3784 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3785 fflush(fp);
3786 fsync(fileno(fp));
3787 fclose(fp);
e0a62c7f 3788
ed9b544e 3789 /* Use RENAME to make sure the DB file is changed atomically only
3790 * if the generate DB file is ok. */
3791 if (rename(tmpfile,filename) == -1) {
325d1eb4 3792 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3793 unlink(tmpfile);
3794 return REDIS_ERR;
3795 }
3796 redisLog(REDIS_NOTICE,"DB saved on disk");
3797 server.dirty = 0;
3798 server.lastsave = time(NULL);
3799 return REDIS_OK;
3800
3801werr:
3802 fclose(fp);
3803 unlink(tmpfile);
3804 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3805 if (di) dictReleaseIterator(di);
3806 return REDIS_ERR;
3807}
3808
f78fd11b 3809static int rdbSaveBackground(char *filename) {
ed9b544e 3810 pid_t childpid;
3811
9d65a1bb 3812 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3813 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3814 if ((childpid = fork()) == 0) {
3815 /* Child */
054e426d 3816 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3817 close(server.fd);
f78fd11b 3818 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3819 _exit(0);
ed9b544e 3820 } else {
478c2c6f 3821 _exit(1);
ed9b544e 3822 }
3823 } else {
3824 /* Parent */
5a7c647e 3825 if (childpid == -1) {
3826 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3827 strerror(errno));
3828 return REDIS_ERR;
3829 }
ed9b544e 3830 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3831 server.bgsavechildpid = childpid;
884d4b39 3832 updateDictResizePolicy();
ed9b544e 3833 return REDIS_OK;
3834 }
3835 return REDIS_OK; /* unreached */
3836}
3837
/* Remove the temporary dump file named after the process id 'childpid'
 * ("temp-<pid>.rdb", the same naming scheme used by rdbSave()).
 * The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3844
/* Read a single byte from 'fp' and return it as a value in the range
 * 0-255, or -1 if the byte can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    if (fread(&byte,1,1,fp) != 1) return -1;
    return byte;
}
3850
/* Read a time stored as a 4 byte signed integer in the host byte order
 * (see rdbSaveTime()). Returns the time, or -1 if it can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t seconds;

    if (fread(&seconds,4,1,fp) != 1) return -1;
    return (time_t) seconds;
}
3856
e3566d4b 3857/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3858 * of this file for a description of how this are stored on disk.
3859 *
3860 * isencoded is set to 1 if the readed length is not actually a length but
3861 * an "encoding type", check the above comments for more info */
c78a8ccc 3862static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3863 unsigned char buf[2];
3864 uint32_t len;
c78a8ccc 3865 int type;
f78fd11b 3866
e3566d4b 3867 if (isencoded) *isencoded = 0;
c78a8ccc 3868 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3869 type = (buf[0]&0xC0)>>6;
3870 if (type == REDIS_RDB_6BITLEN) {
3871 /* Read a 6 bit len */
3872 return buf[0]&0x3F;
3873 } else if (type == REDIS_RDB_ENCVAL) {
3874 /* Read a 6 bit len encoding type */
3875 if (isencoded) *isencoded = 1;
3876 return buf[0]&0x3F;
3877 } else if (type == REDIS_RDB_14BITLEN) {
3878 /* Read a 14 bit len */
3879 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3880 return ((buf[0]&0x3F)<<8)|buf[1];
3881 } else {
3882 /* Read a 32 bit len */
f78fd11b 3883 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3884 return ntohl(len);
f78fd11b 3885 }
f78fd11b 3886}
3887
ad30aa60 3888/* Load an integer-encoded object from file 'fp', with the specified
3889 * encoding type 'enctype'. If encode is true the function may return
3890 * an integer-encoded object as reply, otherwise the returned object
3891 * will always be encoded as a raw string. */
3892static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
e3566d4b 3893 unsigned char enc[4];
3894 long long val;
3895
3896 if (enctype == REDIS_RDB_ENC_INT8) {
3897 if (fread(enc,1,1,fp) == 0) return NULL;
3898 val = (signed char)enc[0];
3899 } else if (enctype == REDIS_RDB_ENC_INT16) {
3900 uint16_t v;
3901 if (fread(enc,2,1,fp) == 0) return NULL;
3902 v = enc[0]|(enc[1]<<8);
3903 val = (int16_t)v;
3904 } else if (enctype == REDIS_RDB_ENC_INT32) {
3905 uint32_t v;
3906 if (fread(enc,4,1,fp) == 0) return NULL;
3907 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3908 val = (int32_t)v;
3909 } else {
3910 val = 0; /* anti-warning */
f83c6cb5 3911 redisPanic("Unknown RDB integer encoding type");
e3566d4b 3912 }
ad30aa60 3913 if (encode)
3914 return createStringObjectFromLongLong(val);
3915 else
3916 return createObject(REDIS_STRING,sdsfromlonglong(val));
e3566d4b 3917}
3918
c78a8ccc 3919static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3920 unsigned int len, clen;
3921 unsigned char *c = NULL;
3922 sds val = NULL;
3923
c78a8ccc 3924 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3925 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3926 if ((c = zmalloc(clen)) == NULL) goto err;
3927 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3928 if (fread(c,clen,1,fp) == 0) goto err;
3929 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3930 zfree(c);
88e85998 3931 return createObject(REDIS_STRING,val);
3932err:
3933 zfree(c);
3934 sdsfree(val);
3935 return NULL;
3936}
3937
ad30aa60 3938static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
e3566d4b 3939 int isencoded;
3940 uint32_t len;
f78fd11b 3941 sds val;
3942
c78a8ccc 3943 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3944 if (isencoded) {
3945 switch(len) {
3946 case REDIS_RDB_ENC_INT8:
3947 case REDIS_RDB_ENC_INT16:
3948 case REDIS_RDB_ENC_INT32:
ad30aa60 3949 return rdbLoadIntegerObject(fp,len,encode);
88e85998 3950 case REDIS_RDB_ENC_LZF:
bdcb92f2 3951 return rdbLoadLzfStringObject(fp);
e3566d4b 3952 default:
f83c6cb5 3953 redisPanic("Unknown RDB encoding type");
e3566d4b 3954 }
3955 }
3956
f78fd11b 3957 if (len == REDIS_RDB_LENERR) return NULL;
3958 val = sdsnewlen(NULL,len);
3959 if (len && fread(val,len,1,fp) == 0) {
3960 sdsfree(val);
3961 return NULL;
3962 }
bdcb92f2 3963 return createObject(REDIS_STRING,val);
f78fd11b 3964}
3965
ad30aa60 3966static robj *rdbLoadStringObject(FILE *fp) {
3967 return rdbGenericLoadStringObject(fp,0);
3968}
3969
3970static robj *rdbLoadEncodedStringObject(FILE *fp) {
3971 return rdbGenericLoadStringObject(fp,1);
3972}
3973
a7866db6 3974/* For information about double serialization check rdbSaveDoubleValue() */
3975static int rdbLoadDoubleValue(FILE *fp, double *val) {
3976 char buf[128];
3977 unsigned char len;
3978
3979 if (fread(&len,1,1,fp) == 0) return -1;
3980 switch(len) {
3981 case 255: *val = R_NegInf; return 0;
3982 case 254: *val = R_PosInf; return 0;
3983 case 253: *val = R_Nan; return 0;
3984 default:
3985 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3986 buf[len] = '\0';
a7866db6 3987 sscanf(buf, "%lg", val);
3988 return 0;
3989 }
3990}
3991
c78a8ccc 3992/* Load a Redis object of the specified type from the specified file.
3993 * On success a newly allocated object is returned, otherwise NULL. */
3994static robj *rdbLoadObject(int type, FILE *fp) {
3995 robj *o;
3996
bcd11906 3997 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 3998 if (type == REDIS_STRING) {
3999 /* Read string value */
ad30aa60 4000 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4001 o = tryObjectEncoding(o);
c78a8ccc 4002 } else if (type == REDIS_LIST || type == REDIS_SET) {
4003 /* Read list/set value */
4004 uint32_t listlen;
4005
4006 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4007 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 4008 /* It's faster to expand the dict to the right size asap in order
4009 * to avoid rehashing */
4010 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
4011 dictExpand(o->ptr,listlen);
c78a8ccc 4012 /* Load every single element of the list/set */
4013 while(listlen--) {
4014 robj *ele;
4015
ad30aa60 4016 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4017 ele = tryObjectEncoding(ele);
c78a8ccc 4018 if (type == REDIS_LIST) {
4019 listAddNodeTail((list*)o->ptr,ele);
4020 } else {
4021 dictAdd((dict*)o->ptr,ele,NULL);
4022 }
4023 }
4024 } else if (type == REDIS_ZSET) {
4025 /* Read list/set value */
ada386b2 4026 size_t zsetlen;
c78a8ccc 4027 zset *zs;
4028
4029 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4030 o = createZsetObject();
4031 zs = o->ptr;
4032 /* Load every single element of the list/set */
4033 while(zsetlen--) {
4034 robj *ele;
4035 double *score = zmalloc(sizeof(double));
4036
ad30aa60 4037 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4038 ele = tryObjectEncoding(ele);
c78a8ccc 4039 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4040 dictAdd(zs->dict,ele,score);
4041 zslInsert(zs->zsl,*score,ele);
4042 incrRefCount(ele); /* added to skiplist */
4043 }
ada386b2 4044 } else if (type == REDIS_HASH) {
4045 size_t hashlen;
4046
4047 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4048 o = createHashObject();
4049 /* Too many entries? Use an hash table. */
4050 if (hashlen > server.hash_max_zipmap_entries)
4051 convertToRealHash(o);
4052 /* Load every key/value, then set it into the zipmap or hash
4053 * table, as needed. */
4054 while(hashlen--) {
4055 robj *key, *val;
4056
4057 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4058 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4059 /* If we are using a zipmap and there are too big values
4060 * the object is converted to real hash table encoding. */
4061 if (o->encoding != REDIS_ENCODING_HT &&
4062 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4063 sdslen(val->ptr) > server.hash_max_zipmap_value))
4064 {
4065 convertToRealHash(o);
4066 }
4067
4068 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4069 unsigned char *zm = o->ptr;
4070
4071 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4072 val->ptr,sdslen(val->ptr),NULL);
4073 o->ptr = zm;
4074 decrRefCount(key);
4075 decrRefCount(val);
4076 } else {
05df7621 4077 key = tryObjectEncoding(key);
4078 val = tryObjectEncoding(val);
ada386b2 4079 dictAdd((dict*)o->ptr,key,val);
ada386b2 4080 }
4081 }
c78a8ccc 4082 } else {
f83c6cb5 4083 redisPanic("Unknown object type");
c78a8ccc 4084 }
4085 return o;
4086}
4087
f78fd11b 4088static int rdbLoad(char *filename) {
ed9b544e 4089 FILE *fp;
f78fd11b 4090 uint32_t dbid;
bb32ede5 4091 int type, retval, rdbver;
585af7e2 4092 int swap_all_values = 0;
3305306f 4093 dict *d = server.db[0].dict;
bb32ede5 4094 redisDb *db = server.db+0;
f78fd11b 4095 char buf[1024];
242a64f3 4096 time_t expiretime, now = time(NULL);
b492cf00 4097 long long loadedkeys = 0;
bb32ede5 4098
ed9b544e 4099 fp = fopen(filename,"r");
4100 if (!fp) return REDIS_ERR;
4101 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 4102 buf[9] = '\0';
4103 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 4104 fclose(fp);
4105 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4106 return REDIS_ERR;
4107 }
f78fd11b 4108 rdbver = atoi(buf+5);
c78a8ccc 4109 if (rdbver != 1) {
f78fd11b 4110 fclose(fp);
4111 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4112 return REDIS_ERR;
4113 }
ed9b544e 4114 while(1) {
585af7e2 4115 robj *key, *val;
ed9b544e 4116
585af7e2 4117 expiretime = -1;
ed9b544e 4118 /* Read type. */
f78fd11b 4119 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 4120 if (type == REDIS_EXPIRETIME) {
4121 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4122 /* We read the time so we need to read the object type again */
4123 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4124 }
ed9b544e 4125 if (type == REDIS_EOF) break;
4126 /* Handle SELECT DB opcode as a special case */
4127 if (type == REDIS_SELECTDB) {
c78a8ccc 4128 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 4129 goto eoferr;
ed9b544e 4130 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 4131 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 4132 exit(1);
4133 }
bb32ede5 4134 db = server.db+dbid;
4135 d = db->dict;
ed9b544e 4136 continue;
4137 }
4138 /* Read key */
585af7e2 4139 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
c78a8ccc 4140 /* Read value */
585af7e2 4141 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
89e689c5 4142 /* Check if the key already expired */
4143 if (expiretime != -1 && expiretime < now) {
4144 decrRefCount(key);
4145 decrRefCount(val);
4146 continue;
4147 }
ed9b544e 4148 /* Add the new object in the hash table */
585af7e2 4149 retval = dictAdd(d,key,val);
ed9b544e 4150 if (retval == DICT_ERR) {
585af7e2 4151 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
ed9b544e 4152 exit(1);
4153 }
242a64f3 4154 loadedkeys++;
bb32ede5 4155 /* Set the expire time if needed */
89e689c5 4156 if (expiretime != -1) setExpire(db,key,expiretime);
242a64f3 4157
b492cf00 4158 /* Handle swapping while loading big datasets when VM is on */
242a64f3 4159
4160 /* If we detecter we are hopeless about fitting something in memory
4161 * we just swap every new key on disk. Directly...
4162 * Note that's important to check for this condition before resorting
4163 * to random sampling, otherwise we may try to swap already
4164 * swapped keys. */
585af7e2 4165 if (swap_all_values) {
4166 dictEntry *de = dictFind(d,key);
242a64f3 4167
4168 /* de may be NULL since the key already expired */
4169 if (de) {
585af7e2 4170 key = dictGetEntryKey(de);
4171 val = dictGetEntryVal(de);
242a64f3 4172
585af7e2 4173 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
242a64f3 4174 dictGetEntryVal(de) = NULL;
4175 }
4176 }
4177 continue;
4178 }
4179
4180 /* If we have still some hope of having some value fitting memory
4181 * then we try random sampling. */
585af7e2 4182 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
b492cf00 4183 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 4184 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 4185 }
242a64f3 4186 if (zmalloc_used_memory() > server.vm_max_memory)
585af7e2 4187 swap_all_values = 1; /* We are already using too much mem */
b492cf00 4188 }
ed9b544e 4189 }
4190 fclose(fp);
4191 return REDIS_OK;
4192
4193eoferr: /* unexpected end of file is handled here with a fatal exit */
f80dff62 4194 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 4195 exit(1);
4196 return REDIS_ERR; /* Just to avoid warning */
4197}
4198
b58ba105 4199/*================================== Shutdown =============================== */
fab43727 4200static int prepareForShutdown() {
b58ba105
AM
4201 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4202 /* Kill the saving child if there is a background saving in progress.
4203 We want to avoid race conditions, for instance our saving child may
4204 overwrite the synchronous saving did by SHUTDOWN. */
4205 if (server.bgsavechildpid != -1) {
4206 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4207 kill(server.bgsavechildpid,SIGKILL);
4208 rdbRemoveTempFile(server.bgsavechildpid);
4209 }
4210 if (server.appendonly) {
4211 /* Append only file: fsync() the AOF and exit */
4212 fsync(server.appendfd);
4213 if (server.vm_enabled) unlink(server.vm_swap_file);
b58ba105
AM
4214 } else {
4215 /* Snapshotting. Perform a SYNC SAVE and exit */
4216 if (rdbSave(server.dbfilename) == REDIS_OK) {
4217 if (server.daemonize)
4218 unlink(server.pidfile);
4219 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
b58ba105
AM
4220 } else {
4221 /* Ooops.. error saving! The best we can do is to continue
4222 * operating. Note that if there was a background saving process,
4223 * in the next cron() Redis will be notified that the background
4224 * saving aborted, handling special stuff like slaves pending for
4225 * synchronization... */
4226 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
fab43727 4227 return REDIS_ERR;
b58ba105
AM
4228 }
4229 }
8513a757 4230 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
fab43727 4231 return REDIS_OK;
b58ba105
AM
4232}
4233
ed9b544e 4234/*================================== Commands =============================== */
4235
abcb223e 4236static void authCommand(redisClient *c) {
2e77c2ee 4237 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
4238 c->authenticated = 1;
4239 addReply(c,shared.ok);
4240 } else {
4241 c->authenticated = 0;
fa4c0aba 4242 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
4243 }
4244}
4245
ed9b544e 4246static void pingCommand(redisClient *c) {
4247 addReply(c,shared.pong);
4248}
4249
4250static void echoCommand(redisClient *c) {
dd88747b 4251 addReplyBulk(c,c->argv[1]);
ed9b544e 4252}
4253
4254/*=================================== Strings =============================== */
4255
526d00a5 4256static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
ed9b544e 4257 int retval;
10ce1276 4258 long seconds = 0; /* initialized to avoid an harmness warning */
ed9b544e 4259
526d00a5 4260 if (expire) {
4261 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4262 return;
4263 if (seconds <= 0) {
4264 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4265 return;
4266 }
4267 }
4268
37ab76c9 4269 touchWatchedKey(c->db,key);
526d00a5 4270 if (nx) deleteIfVolatile(c->db,key);
4271 retval = dictAdd(c->db->dict,key,val);
ed9b544e 4272 if (retval == DICT_ERR) {
4273 if (!nx) {
1b03836c 4274 /* If the key is about a swapped value, we want a new key object
4275 * to overwrite the old. So we delete the old key in the database.
4276 * This will also make sure that swap pages about the old object
4277 * will be marked as free. */
526d00a5 4278 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4279 incrRefCount(key);
4280 dictReplace(c->db->dict,key,val);
4281 incrRefCount(val);
ed9b544e 4282 } else {
c937aa89 4283 addReply(c,shared.czero);
ed9b544e 4284 return;
4285 }
4286 } else {
526d00a5 4287 incrRefCount(key);
4288 incrRefCount(val);
ed9b544e 4289 }
4290 server.dirty++;
526d00a5 4291 removeExpire(c->db,key);
4292 if (expire) setExpire(c->db,key,time(NULL)+seconds);
c937aa89 4293 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 4294}
4295
4296static void setCommand(redisClient *c) {
526d00a5 4297 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
ed9b544e 4298}
4299
4300static void setnxCommand(redisClient *c) {
526d00a5 4301 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4302}
4303
4304static void setexCommand(redisClient *c) {
4305 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
ed9b544e 4306}
4307
322fc7d8 4308static int getGenericCommand(redisClient *c) {
dd88747b 4309 robj *o;
e0a62c7f 4310
dd88747b 4311 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 4312 return REDIS_OK;
dd88747b 4313
4314 if (o->type != REDIS_STRING) {
4315 addReply(c,shared.wrongtypeerr);
4316 return REDIS_ERR;
ed9b544e 4317 } else {
dd88747b 4318 addReplyBulk(c,o);
4319 return REDIS_OK;
ed9b544e 4320 }
4321}
4322
322fc7d8 4323static void getCommand(redisClient *c) {
4324 getGenericCommand(c);
4325}
4326
f6b141c5 4327static void getsetCommand(redisClient *c) {
322fc7d8 4328 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 4329 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4330 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4331 } else {
4332 incrRefCount(c->argv[1]);
4333 }
4334 incrRefCount(c->argv[2]);
4335 server.dirty++;
4336 removeExpire(c->db,c->argv[1]);
4337}
4338
70003d28 4339static void mgetCommand(redisClient *c) {
70003d28 4340 int j;
e0a62c7f 4341
c937aa89 4342 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 4343 for (j = 1; j < c->argc; j++) {
3305306f 4344 robj *o = lookupKeyRead(c->db,c->argv[j]);
4345 if (o == NULL) {
c937aa89 4346 addReply(c,shared.nullbulk);
70003d28 4347 } else {
70003d28 4348 if (o->type != REDIS_STRING) {
c937aa89 4349 addReply(c,shared.nullbulk);
70003d28 4350 } else {
dd88747b 4351 addReplyBulk(c,o);
70003d28 4352 }
4353 }
4354 }
4355}
4356
6c446631 4357static void msetGenericCommand(redisClient *c, int nx) {
906573e7 4358 int j, busykeys = 0;
6c446631 4359
4360 if ((c->argc % 2) == 0) {
454d4e43 4361 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 4362 return;
4363 }
4364 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4365 * set nothing at all if at least one already key exists. */
4366 if (nx) {
4367 for (j = 1; j < c->argc; j += 2) {
906573e7 4368 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4369 busykeys++;
6c446631 4370 }
4371 }
4372 }
906573e7 4373 if (busykeys) {
4374 addReply(c, shared.czero);
4375 return;
4376 }
6c446631 4377
4378 for (j = 1; j < c->argc; j += 2) {
4379 int retval;
4380
05df7621 4381 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
6c446631 4382 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4383 if (retval == DICT_ERR) {
4384 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4385 incrRefCount(c->argv[j+1]);
4386 } else {
4387 incrRefCount(c->argv[j]);
4388 incrRefCount(c->argv[j+1]);
4389 }
4390 removeExpire(c->db,c->argv[j]);
4391 }
4392 server.dirty += (c->argc-1)/2;
4393 addReply(c, nx ? shared.cone : shared.ok);
4394}
4395
4396static void msetCommand(redisClient *c) {
4397 msetGenericCommand(c,0);
4398}
4399
4400static void msetnxCommand(redisClient *c) {
4401 msetGenericCommand(c,1);
4402}
4403
d68ed120 4404static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4405 long long value;
4406 int retval;
4407 robj *o;
e0a62c7f 4408
3305306f 4409 o = lookupKeyWrite(c->db,c->argv[1]);
6485f293
PN
4410 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4411 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
ed9b544e 4412
4413 value += incr;
d6f4c262 4414 o = createStringObjectFromLongLong(value);
3305306f 4415 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 4416 if (retval == DICT_ERR) {
3305306f 4417 dictReplace(c->db->dict,c->argv[1],o);
4418 removeExpire(c->db,c->argv[1]);
ed9b544e 4419 } else {
4420 incrRefCount(c->argv[1]);
4421 }
4422 server.dirty++;
c937aa89 4423 addReply(c,shared.colon);
ed9b544e 4424 addReply(c,o);
4425 addReply(c,shared.crlf);
4426}
4427
4428static void incrCommand(redisClient *c) {
a4d1ba9a 4429 incrDecrCommand(c,1);
ed9b544e 4430}
4431
4432static void decrCommand(redisClient *c) {
a4d1ba9a 4433 incrDecrCommand(c,-1);
ed9b544e 4434}
4435
4436static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4437 long long incr;
4438
bd79a6bd 4439 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4440 incrDecrCommand(c,incr);
ed9b544e 4441}
4442
4443static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4444 long long incr;
4445
bd79a6bd 4446 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4447 incrDecrCommand(c,-incr);
ed9b544e 4448}
4449
4b00bebd 4450static void appendCommand(redisClient *c) {
4451 int retval;
4452 size_t totlen;
4453 robj *o;
4454
4455 o = lookupKeyWrite(c->db,c->argv[1]);
4456 if (o == NULL) {
4457 /* Create the key */
4458 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4459 incrRefCount(c->argv[1]);
4460 incrRefCount(c->argv[2]);
4461 totlen = stringObjectLen(c->argv[2]);
4462 } else {
4463 dictEntry *de;
e0a62c7f 4464
4b00bebd 4465 de = dictFind(c->db->dict,c->argv[1]);
4466 assert(de != NULL);
4467
4468 o = dictGetEntryVal(de);
4469 if (o->type != REDIS_STRING) {
4470 addReply(c,shared.wrongtypeerr);
4471 return;
4472 }
4473 /* If the object is specially encoded or shared we have to make
4474 * a copy */
4475 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4476 robj *decoded = getDecodedObject(o);
4477
4478 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4479 decrRefCount(decoded);
4480 dictReplace(c->db->dict,c->argv[1],o);
4481 }
4482 /* APPEND! */
4483 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4484 o->ptr = sdscatlen(o->ptr,
4485 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4486 } else {
4487 o->ptr = sdscatprintf(o->ptr, "%ld",
4488 (unsigned long) c->argv[2]->ptr);
4489 }
4490 totlen = sdslen(o->ptr);
4491 }
4492 server.dirty++;
4493 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4494}
4495
39191553 4496static void substrCommand(redisClient *c) {
4497 robj *o;
4498 long start = atoi(c->argv[2]->ptr);
4499 long end = atoi(c->argv[3]->ptr);
dd88747b 4500 size_t rangelen, strlen;
4501 sds range;
39191553 4502
dd88747b 4503 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4504 checkType(c,o,REDIS_STRING)) return;
39191553 4505
dd88747b 4506 o = getDecodedObject(o);
4507 strlen = sdslen(o->ptr);
8fe7fad7 4508
dd88747b 4509 /* convert negative indexes */
4510 if (start < 0) start = strlen+start;
4511 if (end < 0) end = strlen+end;
4512 if (start < 0) start = 0;
4513 if (end < 0) end = 0;
39191553 4514
dd88747b 4515 /* indexes sanity checks */
4516 if (start > end || (size_t)start >= strlen) {
4517 /* Out of range start or start > end result in null reply */
4518 addReply(c,shared.nullbulk);
4519 decrRefCount(o);
4520 return;
39191553 4521 }
dd88747b 4522 if ((size_t)end >= strlen) end = strlen-1;
4523 rangelen = (end-start)+1;
4524
4525 /* Return the result */
4526 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4527 range = sdsnewlen((char*)o->ptr+start,rangelen);
4528 addReplySds(c,range);
4529 addReply(c,shared.crlf);
4530 decrRefCount(o);
39191553 4531}
4532
ed9b544e 4533/* ========================= Type agnostic commands ========================= */
4534
4535static void delCommand(redisClient *c) {
5109cdff 4536 int deleted = 0, j;
4537
4538 for (j = 1; j < c->argc; j++) {
4539 if (deleteKey(c->db,c->argv[j])) {
37ab76c9 4540 touchWatchedKey(c->db,c->argv[j]);
5109cdff 4541 server.dirty++;
4542 deleted++;
4543 }
4544 }
482b672d 4545 addReplyLongLong(c,deleted);
ed9b544e 4546}
4547
4548static void existsCommand(redisClient *c) {
f4f06efc
PN
4549 expireIfNeeded(c->db,c->argv[1]);
4550 if (dictFind(c->db->dict,c->argv[1])) {
4551 addReply(c, shared.cone);
4552 } else {
4553 addReply(c, shared.czero);
4554 }
ed9b544e 4555}
4556
4557static void selectCommand(redisClient *c) {
4558 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4559
ed9b544e 4560 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4561 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4562 } else {
4563 addReply(c,shared.ok);
4564 }
4565}
4566
4567static void randomkeyCommand(redisClient *c) {
4568 dictEntry *de;
dc4be23e 4569 robj *key;
e0a62c7f 4570
3305306f 4571 while(1) {
4572 de = dictGetRandomKey(c->db->dict);
ce7bef07 4573 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4574 }
2b619329 4575
ed9b544e 4576 if (de == NULL) {
dc4be23e 4577 addReply(c,shared.nullbulk);
4578 return;
4579 }
4580
4581 key = dictGetEntryKey(de);
4582 if (server.vm_enabled) {
4583 key = dupStringObject(key);
4584 addReplyBulk(c,key);
4585 decrRefCount(key);
ed9b544e 4586 } else {
dc4be23e 4587 addReplyBulk(c,key);
ed9b544e 4588 }
4589}
4590
4591static void keysCommand(redisClient *c) {
4592 dictIterator *di;
4593 dictEntry *de;
4594 sds pattern = c->argv[1]->ptr;
4595 int plen = sdslen(pattern);
a3f9eec2 4596 unsigned long numkeys = 0;
ed9b544e 4597 robj *lenobj = createObject(REDIS_STRING,NULL);
4598
3305306f 4599 di = dictGetIterator(c->db->dict);
ed9b544e 4600 addReply(c,lenobj);
4601 decrRefCount(lenobj);
4602 while((de = dictNext(di)) != NULL) {
4603 robj *keyobj = dictGetEntryKey(de);
3305306f 4604
ed9b544e 4605 sds key = keyobj->ptr;
4606 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4607 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4608 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4609 addReplyBulk(c,keyobj);
3305306f 4610 numkeys++;
3305306f 4611 }
ed9b544e 4612 }
4613 }
4614 dictReleaseIterator(di);
a3f9eec2 4615 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4616}
4617
4618static void dbsizeCommand(redisClient *c) {
4619 addReplySds(c,
3305306f 4620 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4621}
4622
4623static void lastsaveCommand(redisClient *c) {
4624 addReplySds(c,
c937aa89 4625 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4626}
4627
4628static void typeCommand(redisClient *c) {
3305306f 4629 robj *o;
ed9b544e 4630 char *type;
3305306f 4631
4632 o = lookupKeyRead(c->db,c->argv[1]);
4633 if (o == NULL) {
c937aa89 4634 type = "+none";
ed9b544e 4635 } else {
ed9b544e 4636 switch(o->type) {
c937aa89 4637 case REDIS_STRING: type = "+string"; break;
4638 case REDIS_LIST: type = "+list"; break;
4639 case REDIS_SET: type = "+set"; break;
412a8bce 4640 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4641 case REDIS_HASH: type = "+hash"; break;
4642 default: type = "+unknown"; break;
ed9b544e 4643 }
4644 }
4645 addReplySds(c,sdsnew(type));
4646 addReply(c,shared.crlf);
4647}
4648
4649static void saveCommand(redisClient *c) {
9d65a1bb 4650 if (server.bgsavechildpid != -1) {
05557f6d 4651 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4652 return;
4653 }
f78fd11b 4654 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4655 addReply(c,shared.ok);
4656 } else {
4657 addReply(c,shared.err);
4658 }
4659}
4660
4661static void bgsaveCommand(redisClient *c) {
9d65a1bb 4662 if (server.bgsavechildpid != -1) {
ed9b544e 4663 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4664 return;
4665 }
f78fd11b 4666 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4667 char *status = "+Background saving started\r\n";
4668 addReplySds(c,sdsnew(status));
ed9b544e 4669 } else {
4670 addReply(c,shared.err);
4671 }
4672}
4673
4674static void shutdownCommand(redisClient *c) {
fab43727 4675 if (prepareForShutdown() == REDIS_OK)
4676 exit(0);
4677 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
ed9b544e 4678}
4679
4680static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4681 robj *o;
4682
4683 /* To use the same key as src and dst is probably an error */
4684 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4685 addReply(c,shared.sameobjecterr);
ed9b544e 4686 return;
4687 }
4688
dd88747b 4689 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4690 return;
dd88747b 4691
ed9b544e 4692 incrRefCount(o);
3305306f 4693 deleteIfVolatile(c->db,c->argv[2]);
4694 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4695 if (nx) {
4696 decrRefCount(o);
c937aa89 4697 addReply(c,shared.czero);
ed9b544e 4698 return;
4699 }
3305306f 4700 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4701 } else {
4702 incrRefCount(c->argv[2]);
4703 }
3305306f 4704 deleteKey(c->db,c->argv[1]);
b167f877 4705 touchWatchedKey(c->db,c->argv[2]);
ed9b544e 4706 server.dirty++;
c937aa89 4707 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4708}
4709
4710static void renameCommand(redisClient *c) {
4711 renameGenericCommand(c,0);
4712}
4713
4714static void renamenxCommand(redisClient *c) {
4715 renameGenericCommand(c,1);
4716}
4717
4718static void moveCommand(redisClient *c) {
3305306f 4719 robj *o;
4720 redisDb *src, *dst;
ed9b544e 4721 int srcid;
4722
4723 /* Obtain source and target DB pointers */
3305306f 4724 src = c->db;
4725 srcid = c->db->id;
ed9b544e 4726 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4727 addReply(c,shared.outofrangeerr);
ed9b544e 4728 return;
4729 }
3305306f 4730 dst = c->db;
4731 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4732
4733 /* If the user is moving using as target the same
4734 * DB as the source DB it is probably an error. */
4735 if (src == dst) {
c937aa89 4736 addReply(c,shared.sameobjecterr);
ed9b544e 4737 return;
4738 }
4739
4740 /* Check if the element exists and get a reference */
3305306f 4741 o = lookupKeyWrite(c->db,c->argv[1]);
4742 if (!o) {
c937aa89 4743 addReply(c,shared.czero);
ed9b544e 4744 return;
4745 }
4746
4747 /* Try to add the element to the target DB */
3305306f 4748 deleteIfVolatile(dst,c->argv[1]);
4749 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4750 addReply(c,shared.czero);
ed9b544e 4751 return;
4752 }
3305306f 4753 incrRefCount(c->argv[1]);
ed9b544e 4754 incrRefCount(o);
4755
4756 /* OK! key moved, free the entry in the source DB */
3305306f 4757 deleteKey(src,c->argv[1]);
ed9b544e 4758 server.dirty++;
c937aa89 4759 addReply(c,shared.cone);
ed9b544e 4760}
4761
4762/* =================================== Lists ================================ */
4763static void pushGenericCommand(redisClient *c, int where) {
4764 robj *lobj;
ed9b544e 4765 list *list;
3305306f 4766
4767 lobj = lookupKeyWrite(c->db,c->argv[1]);
4768 if (lobj == NULL) {
95242ab5 4769 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4770 addReply(c,shared.cone);
95242ab5 4771 return;
4772 }
ed9b544e 4773 lobj = createListObject();
4774 list = lobj->ptr;
4775 if (where == REDIS_HEAD) {
6b47e12e 4776 listAddNodeHead(list,c->argv[2]);
ed9b544e 4777 } else {
6b47e12e 4778 listAddNodeTail(list,c->argv[2]);
ed9b544e 4779 }
3305306f 4780 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4781 incrRefCount(c->argv[1]);
4782 incrRefCount(c->argv[2]);
4783 } else {
ed9b544e 4784 if (lobj->type != REDIS_LIST) {
4785 addReply(c,shared.wrongtypeerr);
4786 return;
4787 }
95242ab5 4788 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4789 addReply(c,shared.cone);
95242ab5 4790 return;
4791 }
ed9b544e 4792 list = lobj->ptr;
4793 if (where == REDIS_HEAD) {
6b47e12e 4794 listAddNodeHead(list,c->argv[2]);
ed9b544e 4795 } else {
6b47e12e 4796 listAddNodeTail(list,c->argv[2]);
ed9b544e 4797 }
4798 incrRefCount(c->argv[2]);
4799 }
4800 server.dirty++;
482b672d 4801 addReplyLongLong(c,listLength(list));
ed9b544e 4802}
4803
4804static void lpushCommand(redisClient *c) {
4805 pushGenericCommand(c,REDIS_HEAD);
4806}
4807
4808static void rpushCommand(redisClient *c) {
4809 pushGenericCommand(c,REDIS_TAIL);
4810}
4811
4812static void llenCommand(redisClient *c) {
3305306f 4813 robj *o;
ed9b544e 4814 list *l;
dd88747b 4815
4816 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4817 checkType(c,o,REDIS_LIST)) return;
e0a62c7f 4818
dd88747b 4819 l = o->ptr;
4820 addReplyUlong(c,listLength(l));
ed9b544e 4821}
4822
4823static void lindexCommand(redisClient *c) {
3305306f 4824 robj *o;
ed9b544e 4825 int index = atoi(c->argv[2]->ptr);
dd88747b 4826 list *list;
4827 listNode *ln;
4828
4829 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4830 checkType(c,o,REDIS_LIST)) return;
4831 list = o->ptr;
4832
4833 ln = listIndex(list, index);
4834 if (ln == NULL) {
c937aa89 4835 addReply(c,shared.nullbulk);
ed9b544e 4836 } else {
dd88747b 4837 robj *ele = listNodeValue(ln);
4838 addReplyBulk(c,ele);
ed9b544e 4839 }
4840}
4841
4842static void lsetCommand(redisClient *c) {
3305306f 4843 robj *o;
ed9b544e 4844 int index = atoi(c->argv[2]->ptr);
dd88747b 4845 list *list;
4846 listNode *ln;
4847
4848 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4849 checkType(c,o,REDIS_LIST)) return;
4850 list = o->ptr;
4851
4852 ln = listIndex(list, index);
4853 if (ln == NULL) {
4854 addReply(c,shared.outofrangeerr);
ed9b544e 4855 } else {
dd88747b 4856 robj *ele = listNodeValue(ln);
ed9b544e 4857
dd88747b 4858 decrRefCount(ele);
4859 listNodeValue(ln) = c->argv[3];
4860 incrRefCount(c->argv[3]);
4861 addReply(c,shared.ok);
4862 server.dirty++;
ed9b544e 4863 }
4864}
4865
4866static void popGenericCommand(redisClient *c, int where) {
3305306f 4867 robj *o;
dd88747b 4868 list *list;
4869 listNode *ln;
3305306f 4870
dd88747b 4871 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4872 checkType(c,o,REDIS_LIST)) return;
4873 list = o->ptr;
ed9b544e 4874
dd88747b 4875 if (where == REDIS_HEAD)
4876 ln = listFirst(list);
4877 else
4878 ln = listLast(list);
ed9b544e 4879
dd88747b 4880 if (ln == NULL) {
4881 addReply(c,shared.nullbulk);
4882 } else {
4883 robj *ele = listNodeValue(ln);
4884 addReplyBulk(c,ele);
4885 listDelNode(list,ln);
3ea27d37 4886 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4887 server.dirty++;
ed9b544e 4888 }
4889}
4890
4891static void lpopCommand(redisClient *c) {
4892 popGenericCommand(c,REDIS_HEAD);
4893}
4894
4895static void rpopCommand(redisClient *c) {
4896 popGenericCommand(c,REDIS_TAIL);
4897}
4898
4899static void lrangeCommand(redisClient *c) {
3305306f 4900 robj *o;
ed9b544e 4901 int start = atoi(c->argv[2]->ptr);
4902 int end = atoi(c->argv[3]->ptr);
dd88747b 4903 int llen;
4904 int rangelen, j;
4905 list *list;
4906 listNode *ln;
4907 robj *ele;
4908
4e27f268 4909 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4910 || checkType(c,o,REDIS_LIST)) return;
dd88747b 4911 list = o->ptr;
4912 llen = listLength(list);
4913
4914 /* convert negative indexes */
4915 if (start < 0) start = llen+start;
4916 if (end < 0) end = llen+end;
4917 if (start < 0) start = 0;
4918 if (end < 0) end = 0;
4919
4920 /* indexes sanity checks */
4921 if (start > end || start >= llen) {
4922 /* Out of range start or start > end result in empty list */
4923 addReply(c,shared.emptymultibulk);
4924 return;
4925 }
4926 if (end >= llen) end = llen-1;
4927 rangelen = (end-start)+1;
3305306f 4928
dd88747b 4929 /* Return the result in form of a multi-bulk reply */
4930 ln = listIndex(list, start);
4931 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4932 for (j = 0; j < rangelen; j++) {
4933 ele = listNodeValue(ln);
4934 addReplyBulk(c,ele);
4935 ln = ln->next;
ed9b544e 4936 }
4937}
4938
4939static void ltrimCommand(redisClient *c) {
3305306f 4940 robj *o;
ed9b544e 4941 int start = atoi(c->argv[2]->ptr);
4942 int end = atoi(c->argv[3]->ptr);
dd88747b 4943 int llen;
4944 int j, ltrim, rtrim;
4945 list *list;
4946 listNode *ln;
4947
4948 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4949 checkType(c,o,REDIS_LIST)) return;
4950 list = o->ptr;
4951 llen = listLength(list);
4952
4953 /* convert negative indexes */
4954 if (start < 0) start = llen+start;
4955 if (end < 0) end = llen+end;
4956 if (start < 0) start = 0;
4957 if (end < 0) end = 0;
4958
4959 /* indexes sanity checks */
4960 if (start > end || start >= llen) {
4961 /* Out of range start or start > end result in empty list */
4962 ltrim = llen;
4963 rtrim = 0;
ed9b544e 4964 } else {
dd88747b 4965 if (end >= llen) end = llen-1;
4966 ltrim = start;
4967 rtrim = llen-end-1;
4968 }
ed9b544e 4969
dd88747b 4970 /* Remove list elements to perform the trim */
4971 for (j = 0; j < ltrim; j++) {
4972 ln = listFirst(list);
4973 listDelNode(list,ln);
4974 }
4975 for (j = 0; j < rtrim; j++) {
4976 ln = listLast(list);
4977 listDelNode(list,ln);
ed9b544e 4978 }
3ea27d37 4979 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4980 server.dirty++;
4981 addReply(c,shared.ok);
ed9b544e 4982}
4983
4984static void lremCommand(redisClient *c) {
3305306f 4985 robj *o;
dd88747b 4986 list *list;
4987 listNode *ln, *next;
4988 int toremove = atoi(c->argv[2]->ptr);
4989 int removed = 0;
4990 int fromtail = 0;
a4d1ba9a 4991
dd88747b 4992 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4993 checkType(c,o,REDIS_LIST)) return;
4994 list = o->ptr;
4995
4996 if (toremove < 0) {
4997 toremove = -toremove;
4998 fromtail = 1;
4999 }
5000 ln = fromtail ? list->tail : list->head;
5001 while (ln) {
5002 robj *ele = listNodeValue(ln);
5003
5004 next = fromtail ? ln->prev : ln->next;
bf028098 5005 if (equalStringObjects(ele,c->argv[3])) {
dd88747b 5006 listDelNode(list,ln);
5007 server.dirty++;
5008 removed++;
5009 if (toremove && removed == toremove) break;
ed9b544e 5010 }
dd88747b 5011 ln = next;
ed9b544e 5012 }
3ea27d37 5013 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5014 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 5015}
5016
12f9d551 5017/* This is the semantic of this command:
0f5f7e9a 5018 * RPOPLPUSH srclist dstlist:
12f9d551 5019 * IF LLEN(srclist) > 0
5020 * element = RPOP srclist
5021 * LPUSH dstlist element
5022 * RETURN element
5023 * ELSE
5024 * RETURN nil
5025 * END
5026 * END
5027 *
5028 * The idea is to be able to get an element from a list in a reliable way
5029 * since the element is not just returned but pushed against another list
5030 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5031 */
0f5f7e9a 5032static void rpoplpushcommand(redisClient *c) {
12f9d551 5033 robj *sobj;
dd88747b 5034 list *srclist;
5035 listNode *ln;
5036
5037 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5038 checkType(c,sobj,REDIS_LIST)) return;
5039 srclist = sobj->ptr;
5040 ln = listLast(srclist);
12f9d551 5041
dd88747b 5042 if (ln == NULL) {
12f9d551 5043 addReply(c,shared.nullbulk);
5044 } else {
dd88747b 5045 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5046 robj *ele = listNodeValue(ln);
5047 list *dstlist;
e20fb74f 5048
dd88747b 5049 if (dobj && dobj->type != REDIS_LIST) {
5050 addReply(c,shared.wrongtypeerr);
5051 return;
5052 }
12f9d551 5053
dd88747b 5054 /* Add the element to the target list (unless it's directly
5055 * passed to some BLPOP-ing client */
5056 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
5057 if (dobj == NULL) {
5058 /* Create the list if the key does not exist */
5059 dobj = createListObject();
5060 dictAdd(c->db->dict,c->argv[2],dobj);
5061 incrRefCount(c->argv[2]);
12f9d551 5062 }
dd88747b 5063 dstlist = dobj->ptr;
5064 listAddNodeHead(dstlist,ele);
5065 incrRefCount(ele);
12f9d551 5066 }
dd88747b 5067
5068 /* Send the element to the client as reply as well */
5069 addReplyBulk(c,ele);
5070
5071 /* Finally remove the element from the source list */
5072 listDelNode(srclist,ln);
3ea27d37 5073 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5074 server.dirty++;
12f9d551 5075 }
5076}
5077
ed9b544e 5078/* ==================================== Sets ================================ */
5079
5080static void saddCommand(redisClient *c) {
ed9b544e 5081 robj *set;
5082
3305306f 5083 set = lookupKeyWrite(c->db,c->argv[1]);
5084 if (set == NULL) {
ed9b544e 5085 set = createSetObject();
3305306f 5086 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 5087 incrRefCount(c->argv[1]);
5088 } else {
ed9b544e 5089 if (set->type != REDIS_SET) {
c937aa89 5090 addReply(c,shared.wrongtypeerr);
ed9b544e 5091 return;
5092 }
5093 }
5094 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5095 incrRefCount(c->argv[2]);
5096 server.dirty++;
c937aa89 5097 addReply(c,shared.cone);
ed9b544e 5098 } else {
c937aa89 5099 addReply(c,shared.czero);
ed9b544e 5100 }
5101}
5102
5103static void sremCommand(redisClient *c) {
3305306f 5104 robj *set;
ed9b544e 5105
dd88747b 5106 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5107 checkType(c,set,REDIS_SET)) return;
5108
5109 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5110 server.dirty++;
5111 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5112 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5113 addReply(c,shared.cone);
ed9b544e 5114 } else {
dd88747b 5115 addReply(c,shared.czero);
ed9b544e 5116 }
5117}
5118
a4460ef4 5119static void smoveCommand(redisClient *c) {
5120 robj *srcset, *dstset;
5121
5122 srcset = lookupKeyWrite(c->db,c->argv[1]);
5123 dstset = lookupKeyWrite(c->db,c->argv[2]);
5124
5125 /* If the source key does not exist return 0, if it's of the wrong type
5126 * raise an error */
5127 if (srcset == NULL || srcset->type != REDIS_SET) {
5128 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5129 return;
5130 }
5131 /* Error if the destination key is not a set as well */
5132 if (dstset && dstset->type != REDIS_SET) {
5133 addReply(c,shared.wrongtypeerr);
5134 return;
5135 }
5136 /* Remove the element from the source set */
5137 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5138 /* Key not found in the src set! return zero */
5139 addReply(c,shared.czero);
5140 return;
5141 }
3ea27d37 5142 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5143 deleteKey(c->db,c->argv[1]);
a4460ef4 5144 server.dirty++;
5145 /* Add the element to the destination set */
5146 if (!dstset) {
5147 dstset = createSetObject();
5148 dictAdd(c->db->dict,c->argv[2],dstset);
5149 incrRefCount(c->argv[2]);
5150 }
5151 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5152 incrRefCount(c->argv[3]);
5153 addReply(c,shared.cone);
5154}
5155
ed9b544e 5156static void sismemberCommand(redisClient *c) {
3305306f 5157 robj *set;
ed9b544e 5158
dd88747b 5159 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5160 checkType(c,set,REDIS_SET)) return;
5161
5162 if (dictFind(set->ptr,c->argv[2]))
5163 addReply(c,shared.cone);
5164 else
c937aa89 5165 addReply(c,shared.czero);
ed9b544e 5166}
5167
5168static void scardCommand(redisClient *c) {
3305306f 5169 robj *o;
ed9b544e 5170 dict *s;
dd88747b 5171
5172 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5173 checkType(c,o,REDIS_SET)) return;
e0a62c7f 5174
dd88747b 5175 s = o->ptr;
5176 addReplyUlong(c,dictSize(s));
ed9b544e 5177}
5178
12fea928 5179static void spopCommand(redisClient *c) {
5180 robj *set;
5181 dictEntry *de;
5182
dd88747b 5183 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5184 checkType(c,set,REDIS_SET)) return;
5185
5186 de = dictGetRandomKey(set->ptr);
5187 if (de == NULL) {
12fea928 5188 addReply(c,shared.nullbulk);
5189 } else {
dd88747b 5190 robj *ele = dictGetEntryKey(de);
12fea928 5191
dd88747b 5192 addReplyBulk(c,ele);
5193 dictDelete(set->ptr,ele);
5194 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5195 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5196 server.dirty++;
12fea928 5197 }
5198}
5199
2abb95a9 5200static void srandmemberCommand(redisClient *c) {
5201 robj *set;
5202 dictEntry *de;
5203
dd88747b 5204 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5205 checkType(c,set,REDIS_SET)) return;
5206
5207 de = dictGetRandomKey(set->ptr);
5208 if (de == NULL) {
2abb95a9 5209 addReply(c,shared.nullbulk);
5210 } else {
dd88747b 5211 robj *ele = dictGetEntryKey(de);
2abb95a9 5212
dd88747b 5213 addReplyBulk(c,ele);
2abb95a9 5214 }
5215}
5216
ed9b544e 5217static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5218 dict **d1 = (void*) s1, **d2 = (void*) s2;
5219
3305306f 5220 return dictSize(*d1)-dictSize(*d2);
ed9b544e 5221}
5222
682ac724 5223static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 5224 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5225 dictIterator *di;
5226 dictEntry *de;
5227 robj *lenobj = NULL, *dstset = NULL;
682ac724 5228 unsigned long j, cardinality = 0;
ed9b544e 5229
ed9b544e 5230 for (j = 0; j < setsnum; j++) {
5231 robj *setobj;
3305306f 5232
5233 setobj = dstkey ?
5234 lookupKeyWrite(c->db,setskeys[j]) :
5235 lookupKeyRead(c->db,setskeys[j]);
5236 if (!setobj) {
ed9b544e 5237 zfree(dv);
5faa6025 5238 if (dstkey) {
fdcaae84 5239 if (deleteKey(c->db,dstkey))
5240 server.dirty++;
0d36ded0 5241 addReply(c,shared.czero);
5faa6025 5242 } else {
4e27f268 5243 addReply(c,shared.emptymultibulk);
5faa6025 5244 }
ed9b544e 5245 return;
5246 }
ed9b544e 5247 if (setobj->type != REDIS_SET) {
5248 zfree(dv);
c937aa89 5249 addReply(c,shared.wrongtypeerr);
ed9b544e 5250 return;
5251 }
5252 dv[j] = setobj->ptr;
5253 }
5254 /* Sort sets from the smallest to largest, this will improve our
5255 * algorithm's performace */
5256 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5257
5258 /* The first thing we should output is the total number of elements...
5259 * since this is a multi-bulk write, but at this stage we don't know
5260 * the intersection set size, so we use a trick, append an empty object
5261 * to the output list and save the pointer to later modify it with the
5262 * right length */
5263 if (!dstkey) {
5264 lenobj = createObject(REDIS_STRING,NULL);
5265 addReply(c,lenobj);
5266 decrRefCount(lenobj);
5267 } else {
5268 /* If we have a target key where to store the resulting set
5269 * create this key with an empty set inside */
5270 dstset = createSetObject();
ed9b544e 5271 }
5272
5273 /* Iterate all the elements of the first (smallest) set, and test
5274 * the element against all the other sets, if at least one set does
5275 * not include the element it is discarded */
5276 di = dictGetIterator(dv[0]);
ed9b544e 5277
5278 while((de = dictNext(di)) != NULL) {
5279 robj *ele;
5280
5281 for (j = 1; j < setsnum; j++)
5282 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5283 if (j != setsnum)
5284 continue; /* at least one set does not contain the member */
5285 ele = dictGetEntryKey(de);
5286 if (!dstkey) {
dd88747b 5287 addReplyBulk(c,ele);
ed9b544e 5288 cardinality++;
5289 } else {
5290 dictAdd(dstset->ptr,ele,NULL);
5291 incrRefCount(ele);
5292 }
5293 }
5294 dictReleaseIterator(di);
5295
83cdfe18 5296 if (dstkey) {
3ea27d37 5297 /* Store the resulting set into the target, if the intersection
5298 * is not an empty set. */
83cdfe18 5299 deleteKey(c->db,dstkey);
3ea27d37 5300 if (dictSize((dict*)dstset->ptr) > 0) {
5301 dictAdd(c->db->dict,dstkey,dstset);
5302 incrRefCount(dstkey);
482b672d 5303 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5304 } else {
5305 decrRefCount(dstset);
d36c4e97 5306 addReply(c,shared.czero);
3ea27d37 5307 }
40d224a9 5308 server.dirty++;
d36c4e97 5309 } else {
5310 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 5311 }
ed9b544e 5312 zfree(dv);
5313}
5314
5315static void sinterCommand(redisClient *c) {
5316 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5317}
5318
5319static void sinterstoreCommand(redisClient *c) {
5320 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5321}
5322
/* Operation selectors passed as the 'op' argument of the generic
 * set / sorted set commands */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
f4f56e1d 5326
5327static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 5328 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5329 dictIterator *di;
5330 dictEntry *de;
f4f56e1d 5331 robj *dstset = NULL;
40d224a9 5332 int j, cardinality = 0;
5333
40d224a9 5334 for (j = 0; j < setsnum; j++) {
5335 robj *setobj;
5336
5337 setobj = dstkey ?
5338 lookupKeyWrite(c->db,setskeys[j]) :
5339 lookupKeyRead(c->db,setskeys[j]);
5340 if (!setobj) {
5341 dv[j] = NULL;
5342 continue;
5343 }
5344 if (setobj->type != REDIS_SET) {
5345 zfree(dv);
5346 addReply(c,shared.wrongtypeerr);
5347 return;
5348 }
5349 dv[j] = setobj->ptr;
5350 }
5351
5352 /* We need a temp set object to store our union. If the dstkey
5353 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5354 * this set object will be the resulting object to set into the target key*/
5355 dstset = createSetObject();
5356
40d224a9 5357 /* Iterate all the elements of all the sets, add every element a single
5358 * time to the result set */
5359 for (j = 0; j < setsnum; j++) {
51829ed3 5360 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 5361 if (!dv[j]) continue; /* non existing keys are like empty sets */
5362
5363 di = dictGetIterator(dv[j]);
40d224a9 5364
5365 while((de = dictNext(di)) != NULL) {
5366 robj *ele;
5367
5368 /* dictAdd will not add the same element multiple times */
5369 ele = dictGetEntryKey(de);
f4f56e1d 5370 if (op == REDIS_OP_UNION || j == 0) {
5371 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5372 incrRefCount(ele);
40d224a9 5373 cardinality++;
5374 }
f4f56e1d 5375 } else if (op == REDIS_OP_DIFF) {
5376 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5377 cardinality--;
5378 }
40d224a9 5379 }
5380 }
5381 dictReleaseIterator(di);
51829ed3 5382
d36c4e97 5383 /* result set is empty? Exit asap. */
5384 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5385 }
5386
f4f56e1d 5387 /* Output the content of the resulting set, if not in STORE mode */
5388 if (!dstkey) {
5389 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5390 di = dictGetIterator(dstset->ptr);
f4f56e1d 5391 while((de = dictNext(di)) != NULL) {
5392 robj *ele;
5393
5394 ele = dictGetEntryKey(de);
dd88747b 5395 addReplyBulk(c,ele);
f4f56e1d 5396 }
5397 dictReleaseIterator(di);
d36c4e97 5398 decrRefCount(dstset);
83cdfe18
AG
5399 } else {
5400 /* If we have a target key where to store the resulting set
5401 * create this key with the result set inside */
5402 deleteKey(c->db,dstkey);
3ea27d37 5403 if (dictSize((dict*)dstset->ptr) > 0) {
5404 dictAdd(c->db->dict,dstkey,dstset);
5405 incrRefCount(dstkey);
482b672d 5406 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5407 } else {
5408 decrRefCount(dstset);
d36c4e97 5409 addReply(c,shared.czero);
3ea27d37 5410 }
40d224a9 5411 server.dirty++;
5412 }
5413 zfree(dv);
5414}
5415
5416static void sunionCommand(redisClient *c) {
f4f56e1d 5417 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 5418}
5419
5420static void sunionstoreCommand(redisClient *c) {
f4f56e1d 5421 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5422}
5423
5424static void sdiffCommand(redisClient *c) {
5425 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5426}
5427
5428static void sdiffstoreCommand(redisClient *c) {
5429 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 5430}
5431
6b47e12e 5432/* ==================================== ZSets =============================== */
5433
5434/* ZSETs are ordered sets using two data structures to hold the same elements
5435 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5436 * data structure.
5437 *
5438 * The elements are added to an hash table mapping Redis objects to scores.
5439 * At the same time the elements are added to a skip list mapping scores
5440 * to Redis objects (so objects are sorted by scores in this "view"). */
5441
5442/* This skiplist implementation is almost a C translation of the original
5443 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5444 * Alternative to Balanced Trees", modified in three ways:
5445 * a) this implementation allows for repeated values.
5446 * b) the comparison is not just by key (our 'score') but by satellite data.
5447 * c) there is a back pointer, so it's a doubly linked list with the back
5448 * pointers being only at "level 1". This allows to traverse the list
5449 * from tail to head, useful for ZREVRANGE. */
5450
5451static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5452 zskiplistNode *zn = zmalloc(sizeof(*zn));
5453
5454 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2f4dd7e0 5455 if (level > 1)
2b37892e 5456 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
2f4dd7e0 5457 else
5458 zn->span = NULL;
6b47e12e 5459 zn->score = score;
5460 zn->obj = obj;
5461 return zn;
5462}
5463
5464static zskiplist *zslCreate(void) {
5465 int j;
5466 zskiplist *zsl;
e0a62c7f 5467
6b47e12e 5468 zsl = zmalloc(sizeof(*zsl));
5469 zsl->level = 1;
cc812361 5470 zsl->length = 0;
6b47e12e 5471 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5472 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5473 zsl->header->forward[j] = NULL;
94e543b5 5474
5475 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5476 if (j < ZSKIPLIST_MAXLEVEL-1)
5477 zsl->header->span[j] = 0;
69d95c3e 5478 }
e3870fab 5479 zsl->header->backward = NULL;
5480 zsl->tail = NULL;
6b47e12e 5481 return zsl;
5482}
5483
fd8ccf44 5484static void zslFreeNode(zskiplistNode *node) {
5485 decrRefCount(node->obj);
ad807e6f 5486 zfree(node->forward);
69d95c3e 5487 zfree(node->span);
fd8ccf44 5488 zfree(node);
5489}
5490
5491static void zslFree(zskiplist *zsl) {
ad807e6f 5492 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5493
ad807e6f 5494 zfree(zsl->header->forward);
69d95c3e 5495 zfree(zsl->header->span);
ad807e6f 5496 zfree(zsl->header);
fd8ccf44 5497 while(node) {
599379dd 5498 next = node->forward[0];
fd8ccf44 5499 zslFreeNode(node);
5500 node = next;
5501 }
ad807e6f 5502 zfree(zsl);
fd8ccf44 5503}
5504
6b47e12e 5505static int zslRandomLevel(void) {
5506 int level = 1;
5507 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5508 level += 1;
10c2baa5 5509 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5510}
5511
5512static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5513 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5514 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5515 int i, level;
5516
5517 x = zsl->header;
5518 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5519 /* store rank that is crossed to reach the insert position */
5520 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5521
9d60e6e4 5522 while (x->forward[i] &&
5523 (x->forward[i]->score < score ||
5524 (x->forward[i]->score == score &&
69d95c3e 5525 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5526 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5527 x = x->forward[i];
69d95c3e 5528 }
6b47e12e 5529 update[i] = x;
5530 }
6b47e12e 5531 /* we assume the key is not already inside, since we allow duplicated
5532 * scores, and the re-insertion of score and redis object should never
5533 * happpen since the caller of zslInsert() should test in the hash table
5534 * if the element is already inside or not. */
5535 level = zslRandomLevel();
5536 if (level > zsl->level) {
69d95c3e 5537 for (i = zsl->level; i < level; i++) {
2b37892e 5538 rank[i] = 0;
6b47e12e 5539 update[i] = zsl->header;
2b37892e 5540 update[i]->span[i-1] = zsl->length;
69d95c3e 5541 }
6b47e12e 5542 zsl->level = level;
5543 }
5544 x = zslCreateNode(level,score,obj);
5545 for (i = 0; i < level; i++) {
5546 x->forward[i] = update[i]->forward[i];
5547 update[i]->forward[i] = x;
69d95c3e
PN
5548
5549 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5550 if (i > 0) {
5551 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5552 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5553 }
6b47e12e 5554 }
69d95c3e
PN
5555
5556 /* increment span for untouched levels */
5557 for (i = level; i < zsl->level; i++) {
2b37892e 5558 update[i]->span[i-1]++;
69d95c3e
PN
5559 }
5560
bb975144 5561 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5562 if (x->forward[0])
5563 x->forward[0]->backward = x;
5564 else
5565 zsl->tail = x;
cc812361 5566 zsl->length++;
6b47e12e 5567}
5568
84105336
PN
5569/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5570void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5571 int i;
5572 for (i = 0; i < zsl->level; i++) {
5573 if (update[i]->forward[i] == x) {
5574 if (i > 0) {
5575 update[i]->span[i-1] += x->span[i-1] - 1;
5576 }
5577 update[i]->forward[i] = x->forward[i];
5578 } else {
5579 /* invariant: i > 0, because update[0]->forward[0]
5580 * is always equal to x */
5581 update[i]->span[i-1] -= 1;
5582 }
5583 }
5584 if (x->forward[0]) {
5585 x->forward[0]->backward = x->backward;
5586 } else {
5587 zsl->tail = x->backward;
5588 }
5589 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5590 zsl->level--;
5591 zsl->length--;
5592}
5593
50c55df5 5594/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5595static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5596 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5597 int i;
5598
5599 x = zsl->header;
5600 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5601 while (x->forward[i] &&
5602 (x->forward[i]->score < score ||
5603 (x->forward[i]->score == score &&
5604 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5605 x = x->forward[i];
5606 update[i] = x;
5607 }
5608 /* We may have multiple elements with the same score, what we need
5609 * is to find the element with both the right score and object. */
5610 x = x->forward[0];
bf028098 5611 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
84105336 5612 zslDeleteNode(zsl, x, update);
9d60e6e4 5613 zslFreeNode(x);
9d60e6e4 5614 return 1;
5615 } else {
5616 return 0; /* not found */
e197b441 5617 }
5618 return 0; /* not found */
fd8ccf44 5619}
5620
1807985b 5621/* Delete all the elements with score between min and max from the skiplist.
5622 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5623 * Note that this function takes the reference to the hash table view of the
5624 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5625static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5626 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5627 unsigned long removed = 0;
5628 int i;
5629
5630 x = zsl->header;
5631 for (i = zsl->level-1; i >= 0; i--) {
5632 while (x->forward[i] && x->forward[i]->score < min)
5633 x = x->forward[i];
5634 update[i] = x;
5635 }
5636 /* We may have multiple elements with the same score, what we need
5637 * is to find the element with both the right score and object. */
5638 x = x->forward[0];
5639 while (x && x->score <= max) {
84105336
PN
5640 zskiplistNode *next = x->forward[0];
5641 zslDeleteNode(zsl, x, update);
1807985b 5642 dictDelete(dict,x->obj);
5643 zslFreeNode(x);
1807985b 5644 removed++;
5645 x = next;
5646 }
5647 return removed; /* not found */
5648}
1807985b 5649
9212eafd 5650/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5651 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5652static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5653 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5654 unsigned long traversed = 0, removed = 0;
5655 int i;
5656
9212eafd
PN
5657 x = zsl->header;
5658 for (i = zsl->level-1; i >= 0; i--) {
5659 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5660 traversed += i > 0 ? x->span[i-1] : 1;
5661 x = x->forward[i];
1807985b 5662 }
9212eafd
PN
5663 update[i] = x;
5664 }
5665
5666 traversed++;
5667 x = x->forward[0];
5668 while (x && traversed <= end) {
84105336
PN
5669 zskiplistNode *next = x->forward[0];
5670 zslDeleteNode(zsl, x, update);
1807985b 5671 dictDelete(dict,x->obj);
5672 zslFreeNode(x);
1807985b 5673 removed++;
9212eafd 5674 traversed++;
1807985b 5675 x = next;
5676 }
9212eafd 5677 return removed;
1807985b 5678}
5679
50c55df5 5680/* Find the first node having a score equal or greater than the specified one.
5681 * Returns NULL if there is no match. */
5682static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5683 zskiplistNode *x;
5684 int i;
5685
5686 x = zsl->header;
5687 for (i = zsl->level-1; i >= 0; i--) {
5688 while (x->forward[i] && x->forward[i]->score < score)
5689 x = x->forward[i];
5690 }
5691 /* We may have multiple elements with the same score, what we need
5692 * is to find the element with both the right score and object. */
5693 return x->forward[0];
5694}
5695
27b0ccca
PN
5696/* Find the rank for an element by both score and key.
5697 * Returns 0 when the element cannot be found, rank otherwise.
5698 * Note that the rank is 1-based due to the span of zsl->header to the
5699 * first element. */
5700static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5701 zskiplistNode *x;
5702 unsigned long rank = 0;
5703 int i;
5704
5705 x = zsl->header;
5706 for (i = zsl->level-1; i >= 0; i--) {
5707 while (x->forward[i] &&
5708 (x->forward[i]->score < score ||
5709 (x->forward[i]->score == score &&
5710 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5711 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5712 x = x->forward[i];
5713 }
5714
5715 /* x might be equal to zsl->header, so test if obj is non-NULL */
bf028098 5716 if (x->obj && equalStringObjects(x->obj,o)) {
27b0ccca
PN
5717 return rank;
5718 }
5719 }
5720 return 0;
5721}
5722
e74825c2
PN
5723/* Finds an element by its rank. The rank argument needs to be 1-based. */
5724zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5725 zskiplistNode *x;
5726 unsigned long traversed = 0;
5727 int i;
5728
5729 x = zsl->header;
5730 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 5731 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5732 {
a50ea45c 5733 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5734 x = x->forward[i];
5735 }
e74825c2
PN
5736 if (traversed == rank) {
5737 return x;
5738 }
5739 }
5740 return NULL;
5741}
5742
fd8ccf44 5743/* The actual Z-commands implementations */
5744
7db723ad 5745/* This generic command implements both ZADD and ZINCRBY.
e2665397 5746 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5747 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5748static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5749 robj *zsetobj;
5750 zset *zs;
5751 double *score;
5752
5fc9229c 5753 if (isnan(scoreval)) {
5754 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
5755 return;
5756 }
5757
e2665397 5758 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5759 if (zsetobj == NULL) {
5760 zsetobj = createZsetObject();
e2665397 5761 dictAdd(c->db->dict,key,zsetobj);
5762 incrRefCount(key);
fd8ccf44 5763 } else {
5764 if (zsetobj->type != REDIS_ZSET) {
5765 addReply(c,shared.wrongtypeerr);
5766 return;
5767 }
5768 }
fd8ccf44 5769 zs = zsetobj->ptr;
e2665397 5770
7db723ad 5771 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5772 * needs to handle the two different conditions. It's all about setting
5773 * '*score', that is, the new score to set, to the right value. */
5774 score = zmalloc(sizeof(double));
5775 if (doincrement) {
5776 dictEntry *de;
5777
5778 /* Read the old score. If the element was not present starts from 0 */
5779 de = dictFind(zs->dict,ele);
5780 if (de) {
5781 double *oldscore = dictGetEntryVal(de);
5782 *score = *oldscore + scoreval;
5783 } else {
5784 *score = scoreval;
5785 }
5fc9229c 5786 if (isnan(*score)) {
5787 addReplySds(c,
5788 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
5789 zfree(score);
5790 /* Note that we don't need to check if the zset may be empty and
5791 * should be removed here, as we can only obtain Nan as score if
5792 * there was already an element in the sorted set. */
5793 return;
5794 }
e2665397 5795 } else {
5796 *score = scoreval;
5797 }
5798
5799 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5800 * to both ZADD and ZINCRBY... */
e2665397 5801 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5802 /* case 1: New element */
e2665397 5803 incrRefCount(ele); /* added to hash */
5804 zslInsert(zs->zsl,*score,ele);
5805 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5806 server.dirty++;
e2665397 5807 if (doincrement)
e2665397 5808 addReplyDouble(c,*score);
91d71bfc 5809 else
5810 addReply(c,shared.cone);
fd8ccf44 5811 } else {
5812 dictEntry *de;
5813 double *oldscore;
e0a62c7f 5814
fd8ccf44 5815 /* case 2: Score update operation */
e2665397 5816 de = dictFind(zs->dict,ele);
dfc5e96c 5817 redisAssert(de != NULL);
fd8ccf44 5818 oldscore = dictGetEntryVal(de);
5819 if (*score != *oldscore) {
5820 int deleted;
5821
e2665397 5822 /* Remove and insert the element in the skip list with new score */
5823 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5824 redisAssert(deleted != 0);
e2665397 5825 zslInsert(zs->zsl,*score,ele);
5826 incrRefCount(ele);
5827 /* Update the score in the hash table */
5828 dictReplace(zs->dict,ele,score);
fd8ccf44 5829 server.dirty++;
2161a965 5830 } else {
5831 zfree(score);
fd8ccf44 5832 }
e2665397 5833 if (doincrement)
5834 addReplyDouble(c,*score);
5835 else
5836 addReply(c,shared.czero);
fd8ccf44 5837 }
5838}
5839
e2665397 5840static void zaddCommand(redisClient *c) {
5841 double scoreval;
5842
bd79a6bd 5843 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5844 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5845}
5846
7db723ad 5847static void zincrbyCommand(redisClient *c) {
e2665397 5848 double scoreval;
5849
bd79a6bd 5850 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5851 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5852}
5853
1b7106e7 5854static void zremCommand(redisClient *c) {
5855 robj *zsetobj;
5856 zset *zs;
dd88747b 5857 dictEntry *de;
5858 double *oldscore;
5859 int deleted;
1b7106e7 5860
dd88747b 5861 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5862 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 5863
dd88747b 5864 zs = zsetobj->ptr;
5865 de = dictFind(zs->dict,c->argv[2]);
5866 if (de == NULL) {
5867 addReply(c,shared.czero);
5868 return;
1b7106e7 5869 }
dd88747b 5870 /* Delete from the skiplist */
5871 oldscore = dictGetEntryVal(de);
5872 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5873 redisAssert(deleted != 0);
5874
5875 /* Delete from the hash table */
5876 dictDelete(zs->dict,c->argv[2]);
5877 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5878 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5879 server.dirty++;
5880 addReply(c,shared.cone);
1b7106e7 5881}
5882
1807985b 5883static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
5884 double min;
5885 double max;
dd88747b 5886 long deleted;
1807985b 5887 robj *zsetobj;
5888 zset *zs;
5889
bd79a6bd
PN
5890 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5891 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
bbe025e0 5892
dd88747b 5893 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5894 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 5895
dd88747b 5896 zs = zsetobj->ptr;
5897 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5898 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5899 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5900 server.dirty += deleted;
482b672d 5901 addReplyLongLong(c,deleted);
1807985b 5902}
5903
9212eafd 5904static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
5905 long start;
5906 long end;
dd88747b 5907 int llen;
5908 long deleted;
9212eafd
PN
5909 robj *zsetobj;
5910 zset *zs;
5911
bd79a6bd
PN
5912 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5913 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 5914
dd88747b 5915 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5916 checkType(c,zsetobj,REDIS_ZSET)) return;
5917 zs = zsetobj->ptr;
5918 llen = zs->zsl->length;
9212eafd 5919
dd88747b 5920 /* convert negative indexes */
5921 if (start < 0) start = llen+start;
5922 if (end < 0) end = llen+end;
5923 if (start < 0) start = 0;
5924 if (end < 0) end = 0;
9212eafd 5925
dd88747b 5926 /* indexes sanity checks */
5927 if (start > end || start >= llen) {
5928 addReply(c,shared.czero);
5929 return;
9212eafd 5930 }
dd88747b 5931 if (end >= llen) end = llen-1;
5932
5933 /* increment start and end because zsl*Rank functions
5934 * use 1-based rank */
5935 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5936 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5937 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5938 server.dirty += deleted;
482b672d 5939 addReplyLongLong(c, deleted);
9212eafd
PN
5940}
5941
8f92e768
PN
5942typedef struct {
5943 dict *dict;
5944 double weight;
5945} zsetopsrc;
5946
5947static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5948 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5949 unsigned long size1, size2;
5950 size1 = d1->dict ? dictSize(d1->dict) : 0;
5951 size2 = d2->dict ? dictSize(d2->dict) : 0;
5952 return size1 - size2;
5953}
5954
d2764cd6
PN
/* Ways to combine the scores of an element found in several inputs */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score of a dict entry: entries with a NULL value count as 1.0 */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))
d2764cd6
PN
5959
5960inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5961 if (aggregate == REDIS_AGGR_SUM) {
5962 *target = *target + val;
5963 } else if (aggregate == REDIS_AGGR_MIN) {
5964 *target = val < *target ? val : *target;
5965 } else if (aggregate == REDIS_AGGR_MAX) {
5966 *target = val > *target ? val : *target;
5967 } else {
5968 /* safety net */
f83c6cb5 5969 redisPanic("Unknown ZUNION/INTER aggregate type");
d2764cd6
PN
5970 }
5971}
5972
2830ca53 5973static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
bc000c1d 5974 int i, j, setnum;
d2764cd6 5975 int aggregate = REDIS_AGGR_SUM;
8f92e768 5976 zsetopsrc *src;
2830ca53
PN
5977 robj *dstobj;
5978 zset *dstzset;
b287c9bb
PN
5979 dictIterator *di;
5980 dictEntry *de;
5981
bc000c1d
JC
5982 /* expect setnum input keys to be given */
5983 setnum = atoi(c->argv[2]->ptr);
5984 if (setnum < 1) {
5d373da9 5985 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
2830ca53 5986 return;
b287c9bb 5987 }
2830ca53
PN
5988
5989 /* test if the expected number of keys would overflow */
bc000c1d 5990 if (3+setnum > c->argc) {
b287c9bb
PN
5991 addReply(c,shared.syntaxerr);
5992 return;
5993 }
5994
2830ca53 5995 /* read keys to be used for input */
bc000c1d
JC
5996 src = zmalloc(sizeof(zsetopsrc) * setnum);
5997 for (i = 0, j = 3; i < setnum; i++, j++) {
5998 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
5999 if (!obj) {
8f92e768 6000 src[i].dict = NULL;
b287c9bb 6001 } else {
bc000c1d
JC
6002 if (obj->type == REDIS_ZSET) {
6003 src[i].dict = ((zset*)obj->ptr)->dict;
6004 } else if (obj->type == REDIS_SET) {
6005 src[i].dict = (obj->ptr);
6006 } else {
8f92e768 6007 zfree(src);
b287c9bb
PN
6008 addReply(c,shared.wrongtypeerr);
6009 return;
6010 }
b287c9bb 6011 }
2830ca53
PN
6012
6013 /* default all weights to 1 */
8f92e768 6014 src[i].weight = 1.0;
b287c9bb
PN
6015 }
6016
2830ca53
PN
6017 /* parse optional extra arguments */
6018 if (j < c->argc) {
d2764cd6 6019 int remaining = c->argc - j;
b287c9bb 6020
2830ca53 6021 while (remaining) {
bc000c1d 6022 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 6023 j++; remaining--;
bc000c1d 6024 for (i = 0; i < setnum; i++, j++, remaining--) {
bd79a6bd 6025 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
bbe025e0 6026 return;
2830ca53 6027 }
d2764cd6
PN
6028 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6029 j++; remaining--;
6030 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6031 aggregate = REDIS_AGGR_SUM;
6032 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6033 aggregate = REDIS_AGGR_MIN;
6034 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6035 aggregate = REDIS_AGGR_MAX;
6036 } else {
6037 zfree(src);
6038 addReply(c,shared.syntaxerr);
6039 return;
6040 }
6041 j++; remaining--;
2830ca53 6042 } else {
8f92e768 6043 zfree(src);
2830ca53
PN
6044 addReply(c,shared.syntaxerr);
6045 return;
6046 }
6047 }
6048 }
b287c9bb 6049
d2764cd6
PN
6050 /* sort sets from the smallest to largest, this will improve our
6051 * algorithm's performance */
bc000c1d 6052 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
d2764cd6 6053
2830ca53
PN
6054 dstobj = createZsetObject();
6055 dstzset = dstobj->ptr;
6056
6057 if (op == REDIS_OP_INTER) {
8f92e768
PN
6058 /* skip going over all entries if the smallest zset is NULL or empty */
6059 if (src[0].dict && dictSize(src[0].dict) > 0) {
6060 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6061 * from small to large, all src[i > 0].dict are non-empty too */
6062 di = dictGetIterator(src[0].dict);
2830ca53 6063 while((de = dictNext(di)) != NULL) {
d2764cd6 6064 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6065 *score = src[0].weight * zunionInterDictValue(de);
2830ca53 6066
bc000c1d 6067 for (j = 1; j < setnum; j++) {
d2764cd6 6068 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6069 if (other) {
bc000c1d 6070 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6071 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6072 } else {
6073 break;
6074 }
6075 }
b287c9bb 6076
2830ca53 6077 /* skip entry when not present in every source dict */
bc000c1d 6078 if (j != setnum) {
2830ca53
PN
6079 zfree(score);
6080 } else {
6081 robj *o = dictGetEntryKey(de);
6082 dictAdd(dstzset->dict,o,score);
6083 incrRefCount(o); /* added to dictionary */
6084 zslInsert(dstzset->zsl,*score,o);
6085 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
6086 }
6087 }
2830ca53
PN
6088 dictReleaseIterator(di);
6089 }
6090 } else if (op == REDIS_OP_UNION) {
bc000c1d 6091 for (i = 0; i < setnum; i++) {
8f92e768 6092 if (!src[i].dict) continue;
2830ca53 6093
8f92e768 6094 di = dictGetIterator(src[i].dict);
2830ca53
PN
6095 while((de = dictNext(di)) != NULL) {
6096 /* skip key when already processed */
6097 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6098
d2764cd6 6099 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6100 *score = src[i].weight * zunionInterDictValue(de);
2830ca53 6101
d2764cd6
PN
6102 /* because the zsets are sorted by size, its only possible
6103 * for sets at larger indices to hold this entry */
bc000c1d 6104 for (j = (i+1); j < setnum; j++) {
d2764cd6 6105 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6106 if (other) {
bc000c1d 6107 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6108 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6109 }
6110 }
b287c9bb 6111
2830ca53
PN
6112 robj *o = dictGetEntryKey(de);
6113 dictAdd(dstzset->dict,o,score);
6114 incrRefCount(o); /* added to dictionary */
6115 zslInsert(dstzset->zsl,*score,o);
6116 incrRefCount(o); /* added to skiplist */
6117 }
6118 dictReleaseIterator(di);
b287c9bb 6119 }
2830ca53
PN
6120 } else {
6121 /* unknown operator */
6122 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
6123 }
6124
6125 deleteKey(c->db,dstkey);
3ea27d37 6126 if (dstzset->zsl->length) {
6127 dictAdd(c->db->dict,dstkey,dstobj);
6128 incrRefCount(dstkey);
482b672d 6129 addReplyLongLong(c, dstzset->zsl->length);
3ea27d37 6130 server.dirty++;
6131 } else {
8bca8773 6132 decrRefCount(dstobj);
3ea27d37 6133 addReply(c, shared.czero);
6134 }
8f92e768 6135 zfree(src);
b287c9bb
PN
6136}
6137
5d373da9 6138static void zunionstoreCommand(redisClient *c) {
2830ca53 6139 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
6140}
6141
5d373da9 6142static void zinterstoreCommand(redisClient *c) {
2830ca53 6143 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
6144}
6145
e3870fab 6146static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 6147 robj *o;
bbe025e0
AM
6148 long start;
6149 long end;
752da584 6150 int withscores = 0;
dd88747b 6151 int llen;
6152 int rangelen, j;
6153 zset *zsetobj;
6154 zskiplist *zsl;
6155 zskiplistNode *ln;
6156 robj *ele;
752da584 6157
bd79a6bd
PN
6158 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6159 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6160
752da584 6161 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6162 withscores = 1;
6163 } else if (c->argc >= 5) {
6164 addReply(c,shared.syntaxerr);
6165 return;
6166 }
cc812361 6167
4e27f268 6168 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6169 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 6170 zsetobj = o->ptr;
6171 zsl = zsetobj->zsl;
6172 llen = zsl->length;
cc812361 6173
dd88747b 6174 /* convert negative indexes */
6175 if (start < 0) start = llen+start;
6176 if (end < 0) end = llen+end;
6177 if (start < 0) start = 0;
6178 if (end < 0) end = 0;
cc812361 6179
dd88747b 6180 /* indexes sanity checks */
6181 if (start > end || start >= llen) {
6182 /* Out of range start or start > end result in empty list */
6183 addReply(c,shared.emptymultibulk);
6184 return;
6185 }
6186 if (end >= llen) end = llen-1;
6187 rangelen = (end-start)+1;
cc812361 6188
dd88747b 6189 /* check if starting point is trivial, before searching
6190 * the element in log(N) time */
6191 if (reverse) {
6192 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6193 } else {
6194 ln = start == 0 ?
6195 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6196 }
cc812361 6197
dd88747b 6198 /* Return the result in form of a multi-bulk reply */
6199 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6200 withscores ? (rangelen*2) : rangelen));
6201 for (j = 0; j < rangelen; j++) {
6202 ele = ln->obj;
6203 addReplyBulk(c,ele);
6204 if (withscores)
6205 addReplyDouble(c,ln->score);
6206 ln = reverse ? ln->backward : ln->forward[0];
cc812361 6207 }
6208}
6209
e3870fab 6210static void zrangeCommand(redisClient *c) {
6211 zrangeGenericCommand(c,0);
6212}
6213
6214static void zrevrangeCommand(redisClient *c) {
6215 zrangeGenericCommand(c,1);
6216}
6217
f44dd428 6218/* This command implements both ZRANGEBYSCORE and ZCOUNT.
6219 * If justcount is non-zero, just the count is returned. */
6220static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 6221 robj *o;
f44dd428 6222 double min, max;
6223 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 6224 int offset = 0, limit = -1;
0500ef27
SH
6225 int withscores = 0;
6226 int badsyntax = 0;
6227
f44dd428 6228 /* Parse the min-max interval. If one of the values is prefixed
6229 * by the "(" character, it's considered "open". For instance
6230 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6231 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6232 if (((char*)c->argv[2]->ptr)[0] == '(') {
6233 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6234 minex = 1;
6235 } else {
6236 min = strtod(c->argv[2]->ptr,NULL);
6237 }
6238 if (((char*)c->argv[3]->ptr)[0] == '(') {
6239 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6240 maxex = 1;
6241 } else {
6242 max = strtod(c->argv[3]->ptr,NULL);
6243 }
6244
6245 /* Parse "WITHSCORES": note that if the command was called with
6246 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6247 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 6248 if (c->argc == 5 || c->argc == 8) {
3a3978b1 6249 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6250 withscores = 1;
6251 else
6252 badsyntax = 1;
0500ef27 6253 }
3a3978b1 6254 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 6255 badsyntax = 1;
0500ef27 6256 if (badsyntax) {
454d4e43 6257 addReplySds(c,
6258 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 6259 return;
0500ef27
SH
6260 }
6261
f44dd428 6262 /* Parse "LIMIT" */
0500ef27 6263 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 6264 addReply(c,shared.syntaxerr);
6265 return;
0500ef27 6266 } else if (c->argc == (7 + withscores)) {
80181f78 6267 offset = atoi(c->argv[5]->ptr);
6268 limit = atoi(c->argv[6]->ptr);
0b13687c 6269 if (offset < 0) offset = 0;
80181f78 6270 }
50c55df5 6271
f44dd428 6272 /* Ok, lookup the key and get the range */
50c55df5 6273 o = lookupKeyRead(c->db,c->argv[1]);
6274 if (o == NULL) {
4e27f268 6275 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6276 } else {
6277 if (o->type != REDIS_ZSET) {
6278 addReply(c,shared.wrongtypeerr);
6279 } else {
6280 zset *zsetobj = o->ptr;
6281 zskiplist *zsl = zsetobj->zsl;
6282 zskiplistNode *ln;
f44dd428 6283 robj *ele, *lenobj = NULL;
6284 unsigned long rangelen = 0;
50c55df5 6285
f44dd428 6286 /* Get the first node with the score >= min, or with
6287 * score > min if 'minex' is true. */
50c55df5 6288 ln = zslFirstWithScore(zsl,min);
f44dd428 6289 while (minex && ln && ln->score == min) ln = ln->forward[0];
6290
50c55df5 6291 if (ln == NULL) {
6292 /* No element matching the speciifed interval */
f44dd428 6293 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6294 return;
6295 }
6296
6297 /* We don't know in advance how many matching elements there
6298 * are in the list, so we push this object that will represent
6299 * the multi-bulk length in the output buffer, and will "fix"
6300 * it later */
f44dd428 6301 if (!justcount) {
6302 lenobj = createObject(REDIS_STRING,NULL);
6303 addReply(c,lenobj);
6304 decrRefCount(lenobj);
6305 }
50c55df5 6306
f44dd428 6307 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 6308 if (offset) {
6309 offset--;
6310 ln = ln->forward[0];
6311 continue;
6312 }
6313 if (limit == 0) break;
f44dd428 6314 if (!justcount) {
6315 ele = ln->obj;
dd88747b 6316 addReplyBulk(c,ele);
f44dd428 6317 if (withscores)
6318 addReplyDouble(c,ln->score);
6319 }
50c55df5 6320 ln = ln->forward[0];
6321 rangelen++;
80181f78 6322 if (limit > 0) limit--;
50c55df5 6323 }
f44dd428 6324 if (justcount) {
482b672d 6325 addReplyLongLong(c,(long)rangelen);
f44dd428 6326 } else {
6327 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6328 withscores ? (rangelen*2) : rangelen);
6329 }
50c55df5 6330 }
6331 }
6332}
6333
f44dd428 6334static void zrangebyscoreCommand(redisClient *c) {
6335 genericZrangebyscoreCommand(c,0);
6336}
6337
6338static void zcountCommand(redisClient *c) {
6339 genericZrangebyscoreCommand(c,1);
6340}
6341
3c41331e 6342static void zcardCommand(redisClient *c) {
e197b441 6343 robj *o;
6344 zset *zs;
dd88747b 6345
6346 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6347 checkType(c,o,REDIS_ZSET)) return;
6348
6349 zs = o->ptr;
6350 addReplyUlong(c,zs->zsl->length);
e197b441 6351}
6352
6e333bbe 6353static void zscoreCommand(redisClient *c) {
6354 robj *o;
6355 zset *zs;
dd88747b 6356 dictEntry *de;
6357
6358 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6359 checkType(c,o,REDIS_ZSET)) return;
6360
6361 zs = o->ptr;
6362 de = dictFind(zs->dict,c->argv[2]);
6363 if (!de) {
96d8b4ee 6364 addReply(c,shared.nullbulk);
6e333bbe 6365 } else {
dd88747b 6366 double *score = dictGetEntryVal(de);
6e333bbe 6367
dd88747b 6368 addReplyDouble(c,*score);
6e333bbe 6369 }
6370}
6371
798d9e55 6372static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 6373 robj *o;
dd88747b 6374 zset *zs;
6375 zskiplist *zsl;
6376 dictEntry *de;
6377 unsigned long rank;
6378 double *score;
6379
6380 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6381 checkType(c,o,REDIS_ZSET)) return;
6382
6383 zs = o->ptr;
6384 zsl = zs->zsl;
6385 de = dictFind(zs->dict,c->argv[2]);
6386 if (!de) {
69d95c3e
PN
6387 addReply(c,shared.nullbulk);
6388 return;
6389 }
69d95c3e 6390
dd88747b 6391 score = dictGetEntryVal(de);
6392 rank = zslGetRank(zsl, *score, c->argv[2]);
6393 if (rank) {
6394 if (reverse) {
482b672d 6395 addReplyLongLong(c, zsl->length - rank);
27b0ccca 6396 } else {
482b672d 6397 addReplyLongLong(c, rank-1);
69d95c3e 6398 }
dd88747b 6399 } else {
6400 addReply(c,shared.nullbulk);
978c2c94 6401 }
6402}
6403
798d9e55
PN
6404static void zrankCommand(redisClient *c) {
6405 zrankGenericCommand(c, 0);
6406}
6407
6408static void zrevrankCommand(redisClient *c) {
6409 zrankGenericCommand(c, 1);
6410}
6411
/* ========================= Hashes utility functions ======================= */

/* Selectors for hashCurrent() and genericHgetallCommand(). They are tested
 * with '&' and can be OR-ed together to request both fields and values. */
#define REDIS_HASH_KEY 1
#define REDIS_HASH_VALUE 2
978c2c94 6415
7fb16bac
PN
6416/* Check the length of a number of objects to see if we need to convert a
6417 * zipmap to a real hash. Note that we only check string encoded objects
6418 * as their string length can be queried in constant time. */
6419static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6420 int i;
6421 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
978c2c94 6422
7fb16bac
PN
6423 for (i = start; i <= end; i++) {
6424 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6425 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6426 {
6427 convertToRealHash(subject);
978c2c94 6428 return;
6429 }
6430 }
7fb16bac 6431}
bae2c7ec 6432
97224de7
PN
6433/* Encode given objects in-place when the hash uses a dict. */
6434static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6435 if (subject->encoding == REDIS_ENCODING_HT) {
3f973463
PN
6436 if (o1) *o1 = tryObjectEncoding(*o1);
6437 if (o2) *o2 = tryObjectEncoding(*o2);
97224de7
PN
6438 }
6439}
6440
7fb16bac 6441/* Get the value from a hash identified by key. Returns either a string
a3f3af86
PN
6442 * object or NULL if the value cannot be found. The refcount of the object
6443 * is always increased by 1 when the value was found. */
7fb16bac
PN
6444static robj *hashGet(robj *o, robj *key) {
6445 robj *value = NULL;
978c2c94 6446 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7fb16bac
PN
6447 unsigned char *v;
6448 unsigned int vlen;
6449 key = getDecodedObject(key);
6450 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6451 value = createStringObject((char*)v,vlen);
6452 }
6453 decrRefCount(key);
6454 } else {
6455 dictEntry *de = dictFind(o->ptr,key);
6456 if (de != NULL) {
6457 value = dictGetEntryVal(de);
a3f3af86 6458 incrRefCount(value);
7fb16bac
PN
6459 }
6460 }
6461 return value;
6462}
978c2c94 6463
7fb16bac
PN
6464/* Test if the key exists in the given hash. Returns 1 if the key
6465 * exists and 0 when it doesn't. */
6466static int hashExists(robj *o, robj *key) {
6467 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6468 key = getDecodedObject(key);
6469 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6470 decrRefCount(key);
6471 return 1;
6472 }
6473 decrRefCount(key);
6474 } else {
6475 if (dictFind(o->ptr,key) != NULL) {
6476 return 1;
6477 }
6478 }
6479 return 0;
6480}
bae2c7ec 6481
7fb16bac
PN
6482/* Add an element, discard the old if the key already exists.
6483 * Return 0 on insert and 1 on update. */
feb8d7e6 6484static int hashSet(robj *o, robj *key, robj *value) {
7fb16bac
PN
6485 int update = 0;
6486 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6487 key = getDecodedObject(key);
6488 value = getDecodedObject(value);
6489 o->ptr = zipmapSet(o->ptr,
6490 key->ptr,sdslen(key->ptr),
6491 value->ptr,sdslen(value->ptr), &update);
6492 decrRefCount(key);
6493 decrRefCount(value);
6494
6495 /* Check if the zipmap needs to be upgraded to a real hash table */
6496 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
bae2c7ec 6497 convertToRealHash(o);
978c2c94 6498 } else {
7fb16bac
PN
6499 if (dictReplace(o->ptr,key,value)) {
6500 /* Insert */
6501 incrRefCount(key);
978c2c94 6502 } else {
7fb16bac 6503 /* Update */
978c2c94 6504 update = 1;
6505 }
7fb16bac 6506 incrRefCount(value);
978c2c94 6507 }
7fb16bac 6508 return update;
978c2c94 6509}
6510
7fb16bac
PN
6511/* Delete an element from a hash.
6512 * Return 1 on deleted and 0 on not found. */
6513static int hashDelete(robj *o, robj *key) {
6514 int deleted = 0;
6515 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6516 key = getDecodedObject(key);
6517 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6518 decrRefCount(key);
6519 } else {
6520 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6521 /* Always check if the dictionary needs a resize after a delete. */
6522 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
d33278d1 6523 }
7fb16bac
PN
6524 return deleted;
6525}
d33278d1 6526
7fb16bac 6527/* Return the number of elements in a hash. */
c811bb38 6528static unsigned long hashLength(robj *o) {
7fb16bac
PN
6529 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6530 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6531}
6532
6533/* Structure to hold hash iteration abstration. Note that iteration over
6534 * hashes involves both fields and values. Because it is possible that
6535 * not both are required, store pointers in the iterator to avoid
6536 * unnecessary memory allocation for fields/values. */
6537typedef struct {
6538 int encoding;
6539 unsigned char *zi;
6540 unsigned char *zk, *zv;
6541 unsigned int zklen, zvlen;
6542
6543 dictIterator *di;
6544 dictEntry *de;
6545} hashIterator;
6546
c44d3b56
PN
6547static hashIterator *hashInitIterator(robj *subject) {
6548 hashIterator *hi = zmalloc(sizeof(hashIterator));
7fb16bac
PN
6549 hi->encoding = subject->encoding;
6550 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6551 hi->zi = zipmapRewind(subject->ptr);
6552 } else if (hi->encoding == REDIS_ENCODING_HT) {
6553 hi->di = dictGetIterator(subject->ptr);
d33278d1 6554 } else {
7fb16bac 6555 redisAssert(NULL);
d33278d1 6556 }
c44d3b56 6557 return hi;
7fb16bac 6558}
d33278d1 6559
7fb16bac
PN
6560static void hashReleaseIterator(hashIterator *hi) {
6561 if (hi->encoding == REDIS_ENCODING_HT) {
6562 dictReleaseIterator(hi->di);
d33278d1 6563 }
c44d3b56 6564 zfree(hi);
7fb16bac 6565}
d33278d1 6566
7fb16bac
PN
6567/* Move to the next entry in the hash. Return REDIS_OK when the next entry
6568 * could be found and REDIS_ERR when the iterator reaches the end. */
c811bb38 6569static int hashNext(hashIterator *hi) {
7fb16bac
PN
6570 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6571 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6572 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6573 } else {
6574 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6575 }
6576 return REDIS_OK;
6577}
d33278d1 6578
0c390abc 6579/* Get key or value object at current iteration position.
a3f3af86 6580 * This increases the refcount of the field object by 1. */
c811bb38 6581static robj *hashCurrent(hashIterator *hi, int what) {
7fb16bac
PN
6582 robj *o;
6583 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6584 if (what & REDIS_HASH_KEY) {
6585 o = createStringObject((char*)hi->zk,hi->zklen);
6586 } else {
6587 o = createStringObject((char*)hi->zv,hi->zvlen);
d33278d1 6588 }
d33278d1 6589 } else {
7fb16bac
PN
6590 if (what & REDIS_HASH_KEY) {
6591 o = dictGetEntryKey(hi->de);
6592 } else {
6593 o = dictGetEntryVal(hi->de);
d33278d1 6594 }
a3f3af86 6595 incrRefCount(o);
d33278d1 6596 }
7fb16bac 6597 return o;
d33278d1
PN
6598}
6599
7fb16bac
PN
6600static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6601 robj *o = lookupKeyWrite(c->db,key);
01426b05
PN
6602 if (o == NULL) {
6603 o = createHashObject();
7fb16bac
PN
6604 dictAdd(c->db->dict,key,o);
6605 incrRefCount(key);
01426b05
PN
6606 } else {
6607 if (o->type != REDIS_HASH) {
6608 addReply(c,shared.wrongtypeerr);
7fb16bac 6609 return NULL;
01426b05
PN
6610 }
6611 }
7fb16bac
PN
6612 return o;
6613}
01426b05 6614
7fb16bac
PN
6615/* ============================= Hash commands ============================== */
6616static void hsetCommand(redisClient *c) {
6e9e463f 6617 int update;
7fb16bac 6618 robj *o;
bbe025e0 6619
7fb16bac
PN
6620 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6621 hashTryConversion(o,c->argv,2,3);
97224de7 6622 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6623 update = hashSet(o,c->argv[2],c->argv[3]);
6e9e463f 6624 addReply(c, update ? shared.czero : shared.cone);
7fb16bac
PN
6625 server.dirty++;
6626}
01426b05 6627
1f1c7695
PN
6628static void hsetnxCommand(redisClient *c) {
6629 robj *o;
6630 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6631 hashTryConversion(o,c->argv,2,3);
6632
6633 if (hashExists(o, c->argv[2])) {
6634 addReply(c, shared.czero);
01426b05 6635 } else {
97224de7 6636 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6637 hashSet(o,c->argv[2],c->argv[3]);
1f1c7695
PN
6638 addReply(c, shared.cone);
6639 server.dirty++;
6640 }
6641}
01426b05 6642
7fb16bac
PN
6643static void hmsetCommand(redisClient *c) {
6644 int i;
6645 robj *o;
01426b05 6646
7fb16bac
PN
6647 if ((c->argc % 2) == 1) {
6648 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6649 return;
6650 }
01426b05 6651
7fb16bac
PN
6652 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6653 hashTryConversion(o,c->argv,2,c->argc-1);
6654 for (i = 2; i < c->argc; i += 2) {
97224de7 6655 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
feb8d7e6 6656 hashSet(o,c->argv[i],c->argv[i+1]);
7fb16bac
PN
6657 }
6658 addReply(c, shared.ok);
edc2f63a 6659 server.dirty++;
7fb16bac
PN
6660}
6661
6662static void hincrbyCommand(redisClient *c) {
6663 long long value, incr;
6664 robj *o, *current, *new;
6665
bd79a6bd 6666 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7fb16bac
PN
6667 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6668 if ((current = hashGet(o,c->argv[2])) != NULL) {
946342c1
PN
6669 if (getLongLongFromObjectOrReply(c,current,&value,
6670 "hash value is not an integer") != REDIS_OK) {
6671 decrRefCount(current);
6672 return;
6673 }
a3f3af86 6674 decrRefCount(current);
7fb16bac
PN
6675 } else {
6676 value = 0;
01426b05
PN
6677 }
6678
7fb16bac 6679 value += incr;
3f973463
PN
6680 new = createStringObjectFromLongLong(value);
6681 hashTryObjectEncoding(o,&c->argv[2],NULL);
feb8d7e6 6682 hashSet(o,c->argv[2],new);
7fb16bac
PN
6683 decrRefCount(new);
6684 addReplyLongLong(c,value);
01426b05 6685 server.dirty++;
01426b05
PN
6686}
6687
978c2c94 6688static void hgetCommand(redisClient *c) {
7fb16bac 6689 robj *o, *value;
dd88747b 6690 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6691 checkType(c,o,REDIS_HASH)) return;
6692
7fb16bac
PN
6693 if ((value = hashGet(o,c->argv[2])) != NULL) {
6694 addReplyBulk(c,value);
a3f3af86 6695 decrRefCount(value);
dd88747b 6696 } else {
7fb16bac 6697 addReply(c,shared.nullbulk);
69d95c3e 6698 }
69d95c3e
PN
6699}
6700
09aeb579
PN
6701static void hmgetCommand(redisClient *c) {
6702 int i;
7fb16bac
PN
6703 robj *o, *value;
6704 o = lookupKeyRead(c->db,c->argv[1]);
6705 if (o != NULL && o->type != REDIS_HASH) {
6706 addReply(c,shared.wrongtypeerr);
09aeb579
PN
6707 }
6708
7fb16bac
PN
6709 /* Note the check for o != NULL happens inside the loop. This is
6710 * done because objects that cannot be found are considered to be
6711 * an empty hash. The reply should then be a series of NULLs. */
09aeb579 6712 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7fb16bac
PN
6713 for (i = 2; i < c->argc; i++) {
6714 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6715 addReplyBulk(c,value);
a3f3af86 6716 decrRefCount(value);
7fb16bac
PN
6717 } else {
6718 addReply(c,shared.nullbulk);
09aeb579
PN
6719 }
6720 }
6721}
6722
07efaf74 6723static void hdelCommand(redisClient *c) {
dd88747b 6724 robj *o;
dd88747b 6725 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6726 checkType(c,o,REDIS_HASH)) return;
07efaf74 6727
7fb16bac
PN
6728 if (hashDelete(o,c->argv[2])) {
6729 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6730 addReply(c,shared.cone);
6731 server.dirty++;
dd88747b 6732 } else {
7fb16bac 6733 addReply(c,shared.czero);
07efaf74 6734 }
6735}
6736
92b27fe9 6737static void hlenCommand(redisClient *c) {
6738 robj *o;
dd88747b 6739 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 6740 checkType(c,o,REDIS_HASH)) return;
6741
7fb16bac 6742 addReplyUlong(c,hashLength(o));
92b27fe9 6743}
6744
78409a0f 6745static void genericHgetallCommand(redisClient *c, int flags) {
7fb16bac 6746 robj *o, *lenobj, *obj;
78409a0f 6747 unsigned long count = 0;
c44d3b56 6748 hashIterator *hi;
78409a0f 6749
4e27f268 6750 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 6751 || checkType(c,o,REDIS_HASH)) return;
6752
6753 lenobj = createObject(REDIS_STRING,NULL);
6754 addReply(c,lenobj);
6755 decrRefCount(lenobj);
6756
c44d3b56
PN
6757 hi = hashInitIterator(o);
6758 while (hashNext(hi) != REDIS_ERR) {
7fb16bac 6759 if (flags & REDIS_HASH_KEY) {
c44d3b56 6760 obj = hashCurrent(hi,REDIS_HASH_KEY);
7fb16bac 6761 addReplyBulk(c,obj);
a3f3af86 6762 decrRefCount(obj);
7fb16bac 6763 count++;
78409a0f 6764 }
7fb16bac 6765 if (flags & REDIS_HASH_VALUE) {
c44d3b56 6766 obj = hashCurrent(hi,REDIS_HASH_VALUE);
7fb16bac 6767 addReplyBulk(c,obj);
a3f3af86 6768 decrRefCount(obj);
7fb16bac 6769 count++;
78409a0f 6770 }
78409a0f 6771 }
c44d3b56 6772 hashReleaseIterator(hi);
7fb16bac 6773
78409a0f 6774 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6775}
6776
6777static void hkeysCommand(redisClient *c) {
7fb16bac 6778 genericHgetallCommand(c,REDIS_HASH_KEY);
78409a0f 6779}
6780
6781static void hvalsCommand(redisClient *c) {
7fb16bac 6782 genericHgetallCommand(c,REDIS_HASH_VALUE);
78409a0f 6783}
6784
6785static void hgetallCommand(redisClient *c) {
7fb16bac 6786 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
78409a0f 6787}
6788
a86f14b1 6789static void hexistsCommand(redisClient *c) {
6790 robj *o;
a86f14b1 6791 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6792 checkType(c,o,REDIS_HASH)) return;
6793
7fb16bac 6794 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
a86f14b1 6795}
6796
ada386b2 6797static void convertToRealHash(robj *o) {
6798 unsigned char *key, *val, *p, *zm = o->ptr;
6799 unsigned int klen, vlen;
6800 dict *dict = dictCreate(&hashDictType,NULL);
6801
6802 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6803 p = zipmapRewind(zm);
6804 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6805 robj *keyobj, *valobj;
6806
6807 keyobj = createStringObject((char*)key,klen);
6808 valobj = createStringObject((char*)val,vlen);
05df7621 6809 keyobj = tryObjectEncoding(keyobj);
6810 valobj = tryObjectEncoding(valobj);
ada386b2 6811 dictAdd(dict,keyobj,valobj);
6812 }
6813 o->encoding = REDIS_ENCODING_HT;
6814 o->ptr = dict;
6815 zfree(zm);
6816}
6817
6b47e12e 6818/* ========================= Non type-specific commands ==================== */
6819
ed9b544e 6820static void flushdbCommand(redisClient *c) {
ca37e9cd 6821 server.dirty += dictSize(c->db->dict);
9b30e1a2 6822 touchWatchedKeysOnFlush(c->db->id);
3305306f 6823 dictEmpty(c->db->dict);
6824 dictEmpty(c->db->expires);
ed9b544e 6825 addReply(c,shared.ok);
ed9b544e 6826}
6827
6828static void flushallCommand(redisClient *c) {
9b30e1a2 6829 touchWatchedKeysOnFlush(-1);
ca37e9cd 6830 server.dirty += emptyDb();
ed9b544e 6831 addReply(c,shared.ok);
500ece7c 6832 if (server.bgsavechildpid != -1) {
6833 kill(server.bgsavechildpid,SIGKILL);
6834 rdbRemoveTempFile(server.bgsavechildpid);
6835 }
f78fd11b 6836 rdbSave(server.dbfilename);
ca37e9cd 6837 server.dirty++;
ed9b544e 6838}
6839
56906eef 6840static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 6841 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 6842 so->type = type;
6843 so->pattern = pattern;
6844 return so;
6845}
6846
6847/* Return the value associated to the key with a name obtained
55017f9d
PN
6848 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6849 * The returned object will always have its refcount increased by 1
6850 * when it is non-NULL. */
56906eef 6851static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6d7d1370 6852 char *p, *f;
ed9b544e 6853 sds spat, ssub;
6d7d1370
PN
6854 robj keyobj, fieldobj, *o;
6855 int prefixlen, sublen, postfixlen, fieldlen;
ed9b544e 6856 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6857 struct {
f1017b3f 6858 long len;
6859 long free;
ed9b544e 6860 char buf[REDIS_SORTKEY_MAX+1];
6d7d1370 6861 } keyname, fieldname;
ed9b544e 6862
28173a49 6863 /* If the pattern is "#" return the substitution object itself in order
6864 * to implement the "SORT ... GET #" feature. */
6865 spat = pattern->ptr;
6866 if (spat[0] == '#' && spat[1] == '\0') {
55017f9d 6867 incrRefCount(subst);
28173a49 6868 return subst;
6869 }
6870
6871 /* The substitution object may be specially encoded. If so we create
9d65a1bb 6872 * a decoded object on the fly. Otherwise getDecodedObject will just
6873 * increment the ref count, that we'll decrement later. */
6874 subst = getDecodedObject(subst);
942a3961 6875
ed9b544e 6876 ssub = subst->ptr;
6877 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6878 p = strchr(spat,'*');
ed5a857a 6879 if (!p) {
6880 decrRefCount(subst);
6881 return NULL;
6882 }
ed9b544e 6883
6d7d1370
PN
6884 /* Find out if we're dealing with a hash dereference. */
6885 if ((f = strstr(p+1, "->")) != NULL) {
6886 fieldlen = sdslen(spat)-(f-spat);
6887 /* this also copies \0 character */
6888 memcpy(fieldname.buf,f+2,fieldlen-1);
6889 fieldname.len = fieldlen-2;
6890 } else {
6891 fieldlen = 0;
6892 }
6893
ed9b544e 6894 prefixlen = p-spat;
6895 sublen = sdslen(ssub);
6d7d1370 6896 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
ed9b544e 6897 memcpy(keyname.buf,spat,prefixlen);
6898 memcpy(keyname.buf+prefixlen,ssub,sublen);
6899 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6900 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6901 keyname.len = prefixlen+sublen+postfixlen;
942a3961 6902 decrRefCount(subst);
6903
6d7d1370
PN
6904 /* Lookup substituted key */
6905 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6906 o = lookupKeyRead(db,&keyobj);
55017f9d
PN
6907 if (o == NULL) return NULL;
6908
6909 if (fieldlen > 0) {
6910 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6d7d1370 6911
705dad38
PN
6912 /* Retrieve value from hash by the field name. This operation
6913 * already increases the refcount of the returned object. */
6d7d1370
PN
6914 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6915 o = hashGet(o, &fieldobj);
705dad38 6916 } else {
55017f9d 6917 if (o->type != REDIS_STRING) return NULL;
b6f07345 6918
705dad38
PN
6919 /* Every object that this function returns needs to have its refcount
6920 * increased. sortCommand decreases it again. */
6921 incrRefCount(o);
6d7d1370
PN
6922 }
6923
6924 return o;
ed9b544e 6925}
6926
6927/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6928 * the additional parameter is not standard but a BSD-specific we have to
6929 * pass sorting parameters via the global 'server' structure */
6930static int sortCompare(const void *s1, const void *s2) {
6931 const redisSortObject *so1 = s1, *so2 = s2;
6932 int cmp;
6933
6934 if (!server.sort_alpha) {
6935 /* Numeric sorting. Here it's trivial as we precomputed scores */
6936 if (so1->u.score > so2->u.score) {
6937 cmp = 1;
6938 } else if (so1->u.score < so2->u.score) {
6939 cmp = -1;
6940 } else {
6941 cmp = 0;
6942 }
6943 } else {
6944 /* Alphanumeric sorting */
6945 if (server.sort_bypattern) {
6946 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6947 /* At least one compare object is NULL */
6948 if (so1->u.cmpobj == so2->u.cmpobj)
6949 cmp = 0;
6950 else if (so1->u.cmpobj == NULL)
6951 cmp = -1;
6952 else
6953 cmp = 1;
6954 } else {
6955 /* We have both the objects, use strcoll */
6956 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6957 }
6958 } else {
08ee9b57 6959 /* Compare elements directly. */
6960 cmp = compareStringObjects(so1->obj,so2->obj);
ed9b544e 6961 }
6962 }
6963 return server.sort_desc ? -cmp : cmp;
6964}
6965
6966/* The SORT command is the most complex command in Redis. Warning: this code
6967 * is optimized for speed and a bit less for readability */
6968static void sortCommand(redisClient *c) {
ed9b544e 6969 list *operations;
6970 int outputlen = 0;
6971 int desc = 0, alpha = 0;
6972 int limit_start = 0, limit_count = -1, start, end;
6973 int j, dontsort = 0, vectorlen;
6974 int getop = 0; /* GET operation counter */
443c6409 6975 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 6976 redisSortObject *vector; /* Resulting vector to sort */
6977
6978 /* Lookup the key to sort. It must be of the right types */
3305306f 6979 sortval = lookupKeyRead(c->db,c->argv[1]);
6980 if (sortval == NULL) {
4e27f268 6981 addReply(c,shared.emptymultibulk);
ed9b544e 6982 return;
6983 }
a5eb649b 6984 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6985 sortval->type != REDIS_ZSET)
6986 {
c937aa89 6987 addReply(c,shared.wrongtypeerr);
ed9b544e 6988 return;
6989 }
6990
6991 /* Create a list of operations to perform for every sorted element.
6992 * Operations can be GET/DEL/INCR/DECR */
6993 operations = listCreate();
092dac2a 6994 listSetFreeMethod(operations,zfree);
ed9b544e 6995 j = 2;
6996
6997 /* Now we need to protect sortval incrementing its count, in the future
6998 * SORT may have options able to overwrite/delete keys during the sorting
6999 * and the sorted key itself may get destroied */
7000 incrRefCount(sortval);
7001
7002 /* The SORT command has an SQL-alike syntax, parse it */
7003 while(j < c->argc) {
7004 int leftargs = c->argc-j-1;
7005 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7006 desc = 0;
7007 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7008 desc = 1;
7009 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7010 alpha = 1;
7011 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7012 limit_start = atoi(c->argv[j+1]->ptr);
7013 limit_count = atoi(c->argv[j+2]->ptr);
7014 j+=2;
443c6409 7015 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7016 storekey = c->argv[j+1];
7017 j++;
ed9b544e 7018 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7019 sortby = c->argv[j+1];
7020 /* If the BY pattern does not contain '*', i.e. it is constant,
7021 * we don't need to sort nor to lookup the weight keys. */
7022 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7023 j++;
7024 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7025 listAddNodeTail(operations,createSortOperation(
7026 REDIS_SORT_GET,c->argv[j+1]));
7027 getop++;
7028 j++;
ed9b544e 7029 } else {
7030 decrRefCount(sortval);
7031 listRelease(operations);
c937aa89 7032 addReply(c,shared.syntaxerr);
ed9b544e 7033 return;
7034 }
7035 j++;
7036 }
7037
7038 /* Load the sorting vector with all the objects to sort */
a5eb649b 7039 switch(sortval->type) {
7040 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
7041 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7042 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
f83c6cb5 7043 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
a5eb649b 7044 }
ed9b544e 7045 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 7046 j = 0;
a5eb649b 7047
ed9b544e 7048 if (sortval->type == REDIS_LIST) {
7049 list *list = sortval->ptr;
6208b3a7 7050 listNode *ln;
c7df85a4 7051 listIter li;
6208b3a7 7052
c7df85a4 7053 listRewind(list,&li);
7054 while((ln = listNext(&li))) {
ed9b544e 7055 robj *ele = ln->value;
7056 vector[j].obj = ele;
7057 vector[j].u.score = 0;
7058 vector[j].u.cmpobj = NULL;
ed9b544e 7059 j++;
7060 }
7061 } else {
a5eb649b 7062 dict *set;
ed9b544e 7063 dictIterator *di;
7064 dictEntry *setele;
7065
a5eb649b 7066 if (sortval->type == REDIS_SET) {
7067 set = sortval->ptr;
7068 } else {
7069 zset *zs = sortval->ptr;
7070 set = zs->dict;
7071 }
7072
ed9b544e 7073 di = dictGetIterator(set);
ed9b544e 7074 while((setele = dictNext(di)) != NULL) {
7075 vector[j].obj = dictGetEntryKey(setele);
7076 vector[j].u.score = 0;
7077 vector[j].u.cmpobj = NULL;
7078 j++;
7079 }
7080 dictReleaseIterator(di);
7081 }
dfc5e96c 7082 redisAssert(j == vectorlen);
ed9b544e 7083
7084 /* Now it's time to load the right scores in the sorting vector */
7085 if (dontsort == 0) {
7086 for (j = 0; j < vectorlen; j++) {
6d7d1370 7087 robj *byval;
ed9b544e 7088 if (sortby) {
6d7d1370 7089 /* lookup value to sort by */
3305306f 7090 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
705dad38 7091 if (!byval) continue;
ed9b544e 7092 } else {
6d7d1370
PN
7093 /* use object itself to sort by */
7094 byval = vector[j].obj;
7095 }
7096
7097 if (alpha) {
08ee9b57 7098 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6d7d1370
PN
7099 } else {
7100 if (byval->encoding == REDIS_ENCODING_RAW) {
7101 vector[j].u.score = strtod(byval->ptr,NULL);
16fa22f1 7102 } else if (byval->encoding == REDIS_ENCODING_INT) {
6d7d1370
PN
7103 /* Don't need to decode the object if it's
7104 * integer-encoded (the only encoding supported) so
7105 * far. We can just cast it */
16fa22f1
PN
7106 vector[j].u.score = (long)byval->ptr;
7107 } else {
7108 redisAssert(1 != 1);
942a3961 7109 }
ed9b544e 7110 }
6d7d1370 7111
705dad38
PN
7112 /* when the object was retrieved using lookupKeyByPattern,
7113 * its refcount needs to be decreased. */
7114 if (sortby) {
7115 decrRefCount(byval);
ed9b544e 7116 }
7117 }
7118 }
7119
7120 /* We are ready to sort the vector... perform a bit of sanity check
7121 * on the LIMIT option too. We'll use a partial version of quicksort. */
7122 start = (limit_start < 0) ? 0 : limit_start;
7123 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7124 if (start >= vectorlen) {
7125 start = vectorlen-1;
7126 end = vectorlen-2;
7127 }
7128 if (end >= vectorlen) end = vectorlen-1;
7129
7130 if (dontsort == 0) {
7131 server.sort_desc = desc;
7132 server.sort_alpha = alpha;
7133 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 7134 if (sortby && (start != 0 || end != vectorlen-1))
7135 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7136 else
7137 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 7138 }
7139
7140 /* Send command output to the output buffer, performing the specified
7141 * GET/DEL/INCR/DECR operations if any. */
7142 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 7143 if (storekey == NULL) {
7144 /* STORE option not specified, sent the sorting result to client */
7145 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7146 for (j = start; j <= end; j++) {
7147 listNode *ln;
c7df85a4 7148 listIter li;
7149
dd88747b 7150 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 7151 listRewind(operations,&li);
7152 while((ln = listNext(&li))) {
443c6409 7153 redisSortOperation *sop = ln->value;
7154 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7155 vector[j].obj);
7156
7157 if (sop->type == REDIS_SORT_GET) {
55017f9d 7158 if (!val) {
443c6409 7159 addReply(c,shared.nullbulk);
7160 } else {
dd88747b 7161 addReplyBulk(c,val);
55017f9d 7162 decrRefCount(val);
443c6409 7163 }
7164 } else {
dfc5e96c 7165 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 7166 }
7167 }
ed9b544e 7168 }
443c6409 7169 } else {
7170 robj *listObject = createListObject();
7171 list *listPtr = (list*) listObject->ptr;
7172
7173 /* STORE option specified, set the sorting result as a List object */
7174 for (j = start; j <= end; j++) {
7175 listNode *ln;
c7df85a4 7176 listIter li;
7177
443c6409 7178 if (!getop) {
7179 listAddNodeTail(listPtr,vector[j].obj);
7180 incrRefCount(vector[j].obj);
7181 }
c7df85a4 7182 listRewind(operations,&li);
7183 while((ln = listNext(&li))) {
443c6409 7184 redisSortOperation *sop = ln->value;
7185 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7186 vector[j].obj);
7187
7188 if (sop->type == REDIS_SORT_GET) {
55017f9d 7189 if (!val) {
443c6409 7190 listAddNodeTail(listPtr,createStringObject("",0));
7191 } else {
55017f9d
PN
7192 /* We should do a incrRefCount on val because it is
7193 * added to the list, but also a decrRefCount because
7194 * it is returned by lookupKeyByPattern. This results
7195 * in doing nothing at all. */
443c6409 7196 listAddNodeTail(listPtr,val);
443c6409 7197 }
ed9b544e 7198 } else {
dfc5e96c 7199 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 7200 }
ed9b544e 7201 }
ed9b544e 7202 }
121796f7 7203 if (dictReplace(c->db->dict,storekey,listObject)) {
7204 incrRefCount(storekey);
7205 }
443c6409 7206 /* Note: we add 1 because the DB is dirty anyway since even if the
7207 * SORT result is empty a new key is set and maybe the old content
7208 * replaced. */
7209 server.dirty += 1+outputlen;
7210 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 7211 }
7212
7213 /* Cleanup */
7214 decrRefCount(sortval);
7215 listRelease(operations);
7216 for (j = 0; j < vectorlen; j++) {
16fa22f1 7217 if (alpha && vector[j].u.cmpobj)
ed9b544e 7218 decrRefCount(vector[j].u.cmpobj);
7219 }
7220 zfree(vector);
7221}
7222
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the output buffer: it always receives a NUL-terminated string and
 * must be large enough for the longest result, the 20 digits of an
 * unsigned long long plus the unit suffix and the terminator.
 * 'n' is the amount of bytes to format.
 *
 * Values below 1024 are printed as an exact integer, larger ones with two
 * decimals in K, M, G, T or P units. Anything too big even for the P tier
 * falls back to the exact byte count, so the buffer is never left
 * unwritten. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Beyond the P tier: print the exact number of bytes. */
        sprintf(s,"%lluB",n);
    }
}
7243
1c85b79f 7244/* Create the string returned by the INFO command. This is decoupled
7245 * by the INFO command itself as we need to report the same information
7246 * on memory corruption problems. */
7247static sds genRedisInfoString(void) {
ed9b544e 7248 sds info;
7249 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 7250 int j;
ec6c7a1d 7251 char hmem[64];
55a8298f 7252
b72f6a4b 7253 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 7254 info = sdscatprintf(sdsempty(),
7255 "redis_version:%s\r\n"
5436146c
PN
7256 "redis_git_sha1:%s\r\n"
7257 "redis_git_dirty:%d\r\n"
f1017b3f 7258 "arch_bits:%s\r\n"
7a932b74 7259 "multiplexing_api:%s\r\n"
0d7170a4 7260 "process_id:%ld\r\n"
682ac724 7261 "uptime_in_seconds:%ld\r\n"
7262 "uptime_in_days:%ld\r\n"
ed9b544e 7263 "connected_clients:%d\r\n"
7264 "connected_slaves:%d\r\n"
f86a74e9 7265 "blocked_clients:%d\r\n"
5fba9f71 7266 "used_memory:%zu\r\n"
ec6c7a1d 7267 "used_memory_human:%s\r\n"
ed9b544e 7268 "changes_since_last_save:%lld\r\n"
be2bb6b0 7269 "bgsave_in_progress:%d\r\n"
682ac724 7270 "last_save_time:%ld\r\n"
b3fad521 7271 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 7272 "total_connections_received:%lld\r\n"
7273 "total_commands_processed:%lld\r\n"
2a6a2ed1 7274 "expired_keys:%lld\r\n"
3be2c9d7 7275 "hash_max_zipmap_entries:%zu\r\n"
7276 "hash_max_zipmap_value:%zu\r\n"
ffc6b7f8 7277 "pubsub_channels:%ld\r\n"
7278 "pubsub_patterns:%u\r\n"
7d98e08c 7279 "vm_enabled:%d\r\n"
a0f643ea 7280 "role:%s\r\n"
ed9b544e 7281 ,REDIS_VERSION,
5436146c 7282 REDIS_GIT_SHA1,
274e45e3 7283 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
f1017b3f 7284 (sizeof(long) == 8) ? "64" : "32",
7a932b74 7285 aeGetApiName(),
0d7170a4 7286 (long) getpid(),
a0f643ea 7287 uptime,
7288 uptime/(3600*24),
ed9b544e 7289 listLength(server.clients)-listLength(server.slaves),
7290 listLength(server.slaves),
d5d55fc3 7291 server.blpop_blocked_clients,
b72f6a4b 7292 zmalloc_used_memory(),
ec6c7a1d 7293 hmem,
ed9b544e 7294 server.dirty,
9d65a1bb 7295 server.bgsavechildpid != -1,
ed9b544e 7296 server.lastsave,
b3fad521 7297 server.bgrewritechildpid != -1,
ed9b544e 7298 server.stat_numconnections,
7299 server.stat_numcommands,
2a6a2ed1 7300 server.stat_expiredkeys,
55a8298f 7301 server.hash_max_zipmap_entries,
7302 server.hash_max_zipmap_value,
ffc6b7f8 7303 dictSize(server.pubsub_channels),
7304 listLength(server.pubsub_patterns),
7d98e08c 7305 server.vm_enabled != 0,
a0f643ea 7306 server.masterhost == NULL ? "master" : "slave"
ed9b544e 7307 );
a0f643ea 7308 if (server.masterhost) {
7309 info = sdscatprintf(info,
7310 "master_host:%s\r\n"
7311 "master_port:%d\r\n"
7312 "master_link_status:%s\r\n"
7313 "master_last_io_seconds_ago:%d\r\n"
7314 ,server.masterhost,
7315 server.masterport,
7316 (server.replstate == REDIS_REPL_CONNECTED) ?
7317 "up" : "down",
f72b934d 7318 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 7319 );
7320 }
7d98e08c 7321 if (server.vm_enabled) {
1064ef87 7322 lockThreadedIO();
7d98e08c 7323 info = sdscatprintf(info,
7324 "vm_conf_max_memory:%llu\r\n"
7325 "vm_conf_page_size:%llu\r\n"
7326 "vm_conf_pages:%llu\r\n"
7327 "vm_stats_used_pages:%llu\r\n"
7328 "vm_stats_swapped_objects:%llu\r\n"
7329 "vm_stats_swappin_count:%llu\r\n"
7330 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 7331 "vm_stats_io_newjobs_len:%lu\r\n"
7332 "vm_stats_io_processing_len:%lu\r\n"
7333 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 7334 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 7335 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 7336 ,(unsigned long long) server.vm_max_memory,
7337 (unsigned long long) server.vm_page_size,
7338 (unsigned long long) server.vm_pages,
7339 (unsigned long long) server.vm_stats_used_pages,
7340 (unsigned long long) server.vm_stats_swapped_objects,
7341 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 7342 (unsigned long long) server.vm_stats_swapouts,
7343 (unsigned long) listLength(server.io_newjobs),
7344 (unsigned long) listLength(server.io_processing),
7345 (unsigned long) listLength(server.io_processed),
d5d55fc3 7346 (unsigned long) server.io_active_threads,
7347 (unsigned long) server.vm_blocked_clients
7d98e08c 7348 );
1064ef87 7349 unlockThreadedIO();
7d98e08c 7350 }
c3cb078d 7351 for (j = 0; j < server.dbnum; j++) {
7352 long long keys, vkeys;
7353
7354 keys = dictSize(server.db[j].dict);
7355 vkeys = dictSize(server.db[j].expires);
7356 if (keys || vkeys) {
9d65a1bb 7357 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 7358 j, keys, vkeys);
7359 }
7360 }
1c85b79f 7361 return info;
7362}
7363
7364static void infoCommand(redisClient *c) {
7365 sds info = genRedisInfoString();
83c6a618 7366 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7367 (unsigned long)sdslen(info)));
ed9b544e 7368 addReplySds(c,info);
70003d28 7369 addReply(c,shared.crlf);
ed9b544e 7370}
7371
3305306f 7372static void monitorCommand(redisClient *c) {
7373 /* ignore MONITOR if aleady slave or in monitor mode */
7374 if (c->flags & REDIS_SLAVE) return;
7375
7376 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7377 c->slaveseldb = 0;
6b47e12e 7378 listAddNodeTail(server.monitors,c);
3305306f 7379 addReply(c,shared.ok);
7380}
7381
7382/* ================================= Expire ================================= */
7383static int removeExpire(redisDb *db, robj *key) {
7384 if (dictDelete(db->expires,key) == DICT_OK) {
7385 return 1;
7386 } else {
7387 return 0;
7388 }
7389}
7390
7391static int setExpire(redisDb *db, robj *key, time_t when) {
7392 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7393 return 0;
7394 } else {
7395 incrRefCount(key);
7396 return 1;
7397 }
7398}
7399
bb32ede5 7400/* Return the expire time of the specified key, or -1 if no expire
7401 * is associated with this key (i.e. the key is non volatile) */
7402static time_t getExpire(redisDb *db, robj *key) {
7403 dictEntry *de;
7404
7405 /* No expire? return ASAP */
7406 if (dictSize(db->expires) == 0 ||
7407 (de = dictFind(db->expires,key)) == NULL) return -1;
7408
7409 return (time_t) dictGetEntryVal(de);
7410}
7411
3305306f 7412static int expireIfNeeded(redisDb *db, robj *key) {
7413 time_t when;
7414 dictEntry *de;
7415
7416 /* No expire? return ASAP */
7417 if (dictSize(db->expires) == 0 ||
7418 (de = dictFind(db->expires,key)) == NULL) return 0;
7419
7420 /* Lookup the expire */
7421 when = (time_t) dictGetEntryVal(de);
7422 if (time(NULL) <= when) return 0;
7423
7424 /* Delete the key */
7425 dictDelete(db->expires,key);
2a6a2ed1 7426 server.stat_expiredkeys++;
3305306f 7427 return dictDelete(db->dict,key) == DICT_OK;
7428}
7429
7430static int deleteIfVolatile(redisDb *db, robj *key) {
7431 dictEntry *de;
7432
7433 /* No expire? return ASAP */
7434 if (dictSize(db->expires) == 0 ||
7435 (de = dictFind(db->expires,key)) == NULL) return 0;
7436
7437 /* Delete the key */
0c66a471 7438 server.dirty++;
2a6a2ed1 7439 server.stat_expiredkeys++;
3305306f 7440 dictDelete(db->expires,key);
7441 return dictDelete(db->dict,key) == DICT_OK;
7442}
7443
bbe025e0 7444static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 7445 dictEntry *de;
bbe025e0
AM
7446 time_t seconds;
7447
bd79a6bd 7448 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
bbe025e0
AM
7449
7450 seconds -= offset;
3305306f 7451
802e8373 7452 de = dictFind(c->db->dict,key);
3305306f 7453 if (de == NULL) {
7454 addReply(c,shared.czero);
7455 return;
7456 }
d4dd6556 7457 if (seconds <= 0) {
43e5ccdf 7458 if (deleteKey(c->db,key)) server.dirty++;
7459 addReply(c, shared.cone);
3305306f 7460 return;
7461 } else {
7462 time_t when = time(NULL)+seconds;
802e8373 7463 if (setExpire(c->db,key,when)) {
3305306f 7464 addReply(c,shared.cone);
77423026 7465 server.dirty++;
7466 } else {
3305306f 7467 addReply(c,shared.czero);
77423026 7468 }
3305306f 7469 return;
7470 }
7471}
7472
802e8373 7473static void expireCommand(redisClient *c) {
bbe025e0 7474 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 7475}
7476
7477static void expireatCommand(redisClient *c) {
bbe025e0 7478 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 7479}
7480
fd88489a 7481static void ttlCommand(redisClient *c) {
7482 time_t expire;
7483 int ttl = -1;
7484
7485 expire = getExpire(c->db,c->argv[1]);
7486 if (expire != -1) {
7487 ttl = (int) (expire-time(NULL));
7488 if (ttl < 0) ttl = -1;
7489 }
7490 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7491}
7492
6e469882 7493/* ================================ MULTI/EXEC ============================== */
7494
7495/* Client state initialization for MULTI/EXEC */
7496static void initClientMultiState(redisClient *c) {
7497 c->mstate.commands = NULL;
7498 c->mstate.count = 0;
7499}
7500
7501/* Release all the resources associated with MULTI/EXEC state */
7502static void freeClientMultiState(redisClient *c) {
7503 int j;
7504
7505 for (j = 0; j < c->mstate.count; j++) {
7506 int i;
7507 multiCmd *mc = c->mstate.commands+j;
7508
7509 for (i = 0; i < mc->argc; i++)
7510 decrRefCount(mc->argv[i]);
7511 zfree(mc->argv);
7512 }
7513 zfree(c->mstate.commands);
7514}
7515
7516/* Add a new command into the MULTI commands queue */
7517static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7518 multiCmd *mc;
7519 int j;
7520
7521 c->mstate.commands = zrealloc(c->mstate.commands,
7522 sizeof(multiCmd)*(c->mstate.count+1));
7523 mc = c->mstate.commands+c->mstate.count;
7524 mc->cmd = cmd;
7525 mc->argc = c->argc;
7526 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7527 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7528 for (j = 0; j < c->argc; j++)
7529 incrRefCount(mc->argv[j]);
7530 c->mstate.count++;
7531}
7532
7533static void multiCommand(redisClient *c) {
6531c94d 7534 if (c->flags & REDIS_MULTI) {
7535 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7536 return;
7537 }
6e469882 7538 c->flags |= REDIS_MULTI;
36c548f0 7539 addReply(c,shared.ok);
6e469882 7540}
7541
18b6cb76
DJ
7542static void discardCommand(redisClient *c) {
7543 if (!(c->flags & REDIS_MULTI)) {
7544 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7545 return;
7546 }
7547
7548 freeClientMultiState(c);
7549 initClientMultiState(c);
7550 c->flags &= (~REDIS_MULTI);
7551 addReply(c,shared.ok);
7552}
7553
66c8853f 7554/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7555 * implememntation for more information. */
7556static void execCommandReplicateMulti(redisClient *c) {
7557 struct redisCommand *cmd;
7558 robj *multistring = createStringObject("MULTI",5);
7559
7560 cmd = lookupCommand("multi");
7561 if (server.appendonly)
7562 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7563 if (listLength(server.slaves))
7564 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7565 decrRefCount(multistring);
7566}
7567
6e469882 7568static void execCommand(redisClient *c) {
7569 int j;
7570 robj **orig_argv;
7571 int orig_argc;
7572
7573 if (!(c->flags & REDIS_MULTI)) {
7574 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7575 return;
7576 }
7577
37ab76c9 7578 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7579 * A failed EXEC will return a multi bulk nil object. */
7580 if (c->flags & REDIS_DIRTY_CAS) {
7581 freeClientMultiState(c);
7582 initClientMultiState(c);
7583 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7584 unwatchAllKeys(c);
7585 addReply(c,shared.nullmultibulk);
7586 return;
7587 }
7588
66c8853f 7589 /* Replicate a MULTI request now that we are sure the block is executed.
7590 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7591 * both the AOF and the replication link will have the same consistency
7592 * and atomicity guarantees. */
7593 execCommandReplicateMulti(c);
7594
7595 /* Exec all the queued commands */
1ad4d316 7596 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
6e469882 7597 orig_argv = c->argv;
7598 orig_argc = c->argc;
7599 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7600 for (j = 0; j < c->mstate.count; j++) {
7601 c->argc = c->mstate.commands[j].argc;
7602 c->argv = c->mstate.commands[j].argv;
7603 call(c,c->mstate.commands[j].cmd);
7604 }
7605 c->argv = orig_argv;
7606 c->argc = orig_argc;
7607 freeClientMultiState(c);
7608 initClientMultiState(c);
1ad4d316 7609 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
66c8853f 7610 /* Make sure the EXEC command is always replicated / AOF, since we
7611 * always send the MULTI command (we can't know beforehand if the
7612 * next operations will contain at least a modification to the DB). */
7613 server.dirty++;
6e469882 7614}
7615
4409877e 7616/* =========================== Blocking Operations ========================= */
7617
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already
 * be used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blocking_keys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
7646
7647/* Set a client in blocking mode for the specified key, with the specified
7648 * timeout */
b177fd30 7649static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 7650 dictEntry *de;
7651 list *l;
b177fd30 7652 int j;
4409877e 7653
37ab76c9 7654 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7655 c->blocking_keys_num = numkeys;
4409877e 7656 c->blockingto = timeout;
b177fd30 7657 for (j = 0; j < numkeys; j++) {
7658 /* Add the key in the client structure, to map clients -> keys */
37ab76c9 7659 c->blocking_keys[j] = keys[j];
b177fd30 7660 incrRefCount(keys[j]);
4409877e 7661
b177fd30 7662 /* And in the other "side", to map keys -> clients */
37ab76c9 7663 de = dictFind(c->db->blocking_keys,keys[j]);
b177fd30 7664 if (de == NULL) {
7665 int retval;
7666
7667 /* For every key we take a list of clients blocked for it */
7668 l = listCreate();
37ab76c9 7669 retval = dictAdd(c->db->blocking_keys,keys[j],l);
b177fd30 7670 incrRefCount(keys[j]);
7671 assert(retval == DICT_OK);
7672 } else {
7673 l = dictGetEntryVal(de);
7674 }
7675 listAddNodeTail(l,c);
4409877e 7676 }
b177fd30 7677 /* Mark the client as a blocked client */
4409877e 7678 c->flags |= REDIS_BLOCKED;
d5d55fc3 7679 server.blpop_blocked_clients++;
4409877e 7680}
7681
7682/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 7683static void unblockClientWaitingData(redisClient *c) {
4409877e 7684 dictEntry *de;
7685 list *l;
b177fd30 7686 int j;
4409877e 7687
37ab76c9 7688 assert(c->blocking_keys != NULL);
b177fd30 7689 /* The client may wait for multiple keys, so unblock it for every key. */
37ab76c9 7690 for (j = 0; j < c->blocking_keys_num; j++) {
b177fd30 7691 /* Remove this client from the list of clients waiting for this key. */
37ab76c9 7692 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
b177fd30 7693 assert(de != NULL);
7694 l = dictGetEntryVal(de);
7695 listDelNode(l,listSearchKey(l,c));
7696 /* If the list is empty we need to remove it to avoid wasting memory */
7697 if (listLength(l) == 0)
37ab76c9 7698 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
7699 decrRefCount(c->blocking_keys[j]);
b177fd30 7700 }
7701 /* Cleanup the client structure */
37ab76c9 7702 zfree(c->blocking_keys);
7703 c->blocking_keys = NULL;
4409877e 7704 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 7705 server.blpop_blocked_clients--;
5921aa36 7706 /* We want to process data if there is some command waiting
b0d8747d 7707 * in the input buffer. Note that this is safe even if
7708 * unblockClientWaitingData() gets called from freeClient() because
7709 * freeClient() will be smart enough to call this function
7710 * *after* c->querybuf was set to NULL. */
4409877e 7711 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7712}
7713
7714/* This should be called from any function PUSHing into lists.
7715 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7716 * 'ele' is the element pushed.
7717 *
7718 * If the function returns 0 there was no client waiting for a list push
7719 * against this key.
7720 *
7721 * If the function returns 1 there was a client waiting for a list push
7722 * against this key, the element was passed to this client thus it's not
7723 * needed to actually add it to the list and the caller should return asap. */
7724static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7725 struct dictEntry *de;
7726 redisClient *receiver;
7727 list *l;
7728 listNode *ln;
7729
37ab76c9 7730 de = dictFind(c->db->blocking_keys,key);
4409877e 7731 if (de == NULL) return 0;
7732 l = dictGetEntryVal(de);
7733 ln = listFirst(l);
7734 assert(ln != NULL);
7735 receiver = ln->value;
4409877e 7736
b177fd30 7737 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 7738 addReplyBulk(receiver,key);
7739 addReplyBulk(receiver,ele);
b0d8747d 7740 unblockClientWaitingData(receiver);
4409877e 7741 return 1;
7742}
7743
7744/* Blocking RPOP/LPOP */
7745static void blockingPopGenericCommand(redisClient *c, int where) {
7746 robj *o;
7747 time_t timeout;
b177fd30 7748 int j;
4409877e 7749
b177fd30 7750 for (j = 1; j < c->argc-1; j++) {
7751 o = lookupKeyWrite(c->db,c->argv[j]);
7752 if (o != NULL) {
7753 if (o->type != REDIS_LIST) {
7754 addReply(c,shared.wrongtypeerr);
4409877e 7755 return;
b177fd30 7756 } else {
7757 list *list = o->ptr;
7758 if (listLength(list) != 0) {
7759 /* If the list contains elements fall back to the usual
7760 * non-blocking POP operation */
7761 robj *argv[2], **orig_argv;
7762 int orig_argc;
e0a62c7f 7763
b177fd30 7764 /* We need to alter the command arguments before to call
7765 * popGenericCommand() as the command takes a single key. */
7766 orig_argv = c->argv;
7767 orig_argc = c->argc;
7768 argv[1] = c->argv[j];
7769 c->argv = argv;
7770 c->argc = 2;
7771
7772 /* Also the return value is different, we need to output
7773 * the multi bulk reply header and the key name. The
7774 * "real" command will add the last element (the value)
7775 * for us. If this souds like an hack to you it's just
7776 * because it is... */
7777 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 7778 addReplyBulk(c,argv[1]);
b177fd30 7779 popGenericCommand(c,where);
7780
7781 /* Fix the client structure with the original stuff */
7782 c->argv = orig_argv;
7783 c->argc = orig_argc;
7784 return;
7785 }
4409877e 7786 }
7787 }
7788 }
7789 /* If the list is empty or the key does not exists we must block */
b177fd30 7790 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 7791 if (timeout > 0) timeout += time(NULL);
b177fd30 7792 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 7793}
7794
7795static void blpopCommand(redisClient *c) {
7796 blockingPopGenericCommand(c,REDIS_HEAD);
7797}
7798
7799static void brpopCommand(redisClient *c) {
7800 blockingPopGenericCommand(c,REDIS_TAIL);
7801}
7802
ed9b544e 7803/* =============================== Replication ============================= */
7804
a4d1ba9a 7805static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7806 ssize_t nwritten, ret = size;
7807 time_t start = time(NULL);
7808
7809 timeout++;
7810 while(size) {
7811 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7812 nwritten = write(fd,ptr,size);
7813 if (nwritten == -1) return -1;
7814 ptr += nwritten;
7815 size -= nwritten;
7816 }
7817 if ((time(NULL)-start) > timeout) {
7818 errno = ETIMEDOUT;
7819 return -1;
7820 }
7821 }
7822 return ret;
7823}
7824
a4d1ba9a 7825static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7826 ssize_t nread, totread = 0;
7827 time_t start = time(NULL);
7828
7829 timeout++;
7830 while(size) {
7831 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7832 nread = read(fd,ptr,size);
7833 if (nread == -1) return -1;
7834 ptr += nread;
7835 size -= nread;
7836 totread += nread;
7837 }
7838 if ((time(NULL)-start) > timeout) {
7839 errno = ETIMEDOUT;
7840 return -1;
7841 }
7842 }
7843 return totread;
7844}
7845
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one byte at
 * a time via syncRead(), with 'timeout' applied to every single byte.
 *
 * Reading stops at the first '\n', which is not stored; a '\r' right
 * before it is replaced by the terminator. At most size-1 bytes are
 * stored and the buffer is always NUL-terminated, so an over-long line is
 * returned truncated instead of overflowing the buffer.
 *
 * Returns the number of bytes stored before the newline (a stripped '\r'
 * is still counted), or -1 if syncRead() fails or 'size' is not positive
 * (errno set to EINVAL in the latter case). */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) {
        errno = EINVAL;
        return -1;
    }
    *ptr = '\0';
    size--; /* Reserve room for the terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        /* One more byte of the buffer is used: without this the loop
         * would never stop at the end of the buffer. */
        size--;
    }
    return nread;
}
7866
7867static void syncCommand(redisClient *c) {
40d224a9 7868 /* ignore SYNC if aleady slave or in monitor mode */
7869 if (c->flags & REDIS_SLAVE) return;
7870
7871 /* SYNC can't be issued when the server has pending data to send to
7872 * the client about already issued commands. We need a fresh reply
7873 * buffer registering the differences between the BGSAVE and the current
7874 * dataset, so that we can copy to other slaves if needed. */
7875 if (listLength(c->reply) != 0) {
7876 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7877 return;
7878 }
7879
7880 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7881 /* Here we need to check if there is a background saving operation
7882 * in progress, or if it is required to start one */
9d65a1bb 7883 if (server.bgsavechildpid != -1) {
40d224a9 7884 /* Ok a background save is in progress. Let's check if it is a good
7885 * one for replication, i.e. if there is another slave that is
7886 * registering differences since the server forked to save */
7887 redisClient *slave;
7888 listNode *ln;
c7df85a4 7889 listIter li;
40d224a9 7890
c7df85a4 7891 listRewind(server.slaves,&li);
7892 while((ln = listNext(&li))) {
40d224a9 7893 slave = ln->value;
7894 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 7895 }
7896 if (ln) {
7897 /* Perfect, the server is already registering differences for
7898 * another slave. Set the right state, and copy the buffer. */
7899 listRelease(c->reply);
7900 c->reply = listDup(slave->reply);
40d224a9 7901 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7902 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7903 } else {
7904 /* No way, we need to wait for the next BGSAVE in order to
7905 * register differences */
7906 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7907 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7908 }
7909 } else {
7910 /* Ok we don't have a BGSAVE in progress, let's start one */
7911 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7912 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7913 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7914 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7915 return;
7916 }
7917 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7918 }
6208b3a7 7919 c->repldbfd = -1;
40d224a9 7920 c->flags |= REDIS_SLAVE;
7921 c->slaveseldb = 0;
6b47e12e 7922 listAddNodeTail(server.slaves,c);
40d224a9 7923 return;
7924}
7925
6208b3a7 7926static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7927 redisClient *slave = privdata;
7928 REDIS_NOTUSED(el);
7929 REDIS_NOTUSED(mask);
7930 char buf[REDIS_IOBUF_LEN];
7931 ssize_t nwritten, buflen;
7932
7933 if (slave->repldboff == 0) {
7934 /* Write the bulk write count before to transfer the DB. In theory here
7935 * we don't know how much room there is in the output buffer of the
7936 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7937 * operations) will never be smaller than the few bytes we need. */
7938 sds bulkcount;
7939
7940 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7941 slave->repldbsize);
7942 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7943 {
7944 sdsfree(bulkcount);
7945 freeClient(slave);
7946 return;
7947 }
7948 sdsfree(bulkcount);
7949 }
7950 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7951 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7952 if (buflen <= 0) {
7953 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7954 (buflen == 0) ? "premature EOF" : strerror(errno));
7955 freeClient(slave);
7956 return;
7957 }
7958 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 7959 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 7960 strerror(errno));
7961 freeClient(slave);
7962 return;
7963 }
7964 slave->repldboff += nwritten;
7965 if (slave->repldboff == slave->repldbsize) {
7966 close(slave->repldbfd);
7967 slave->repldbfd = -1;
7968 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7969 slave->replstate = REDIS_REPL_ONLINE;
7970 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 7971 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 7972 freeClient(slave);
7973 return;
7974 }
7975 addReplySds(slave,sdsempty());
7976 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7977 }
7978}
ed9b544e 7979
a3b21203 7980/* This function is called at the end of every backgrond saving.
7981 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7982 * otherwise REDIS_ERR is passed to the function.
7983 *
7984 * The goal of this function is to handle slaves waiting for a successful
7985 * background saving in order to perform non-blocking synchronization. */
7986static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 7987 listNode *ln;
7988 int startbgsave = 0;
c7df85a4 7989 listIter li;
ed9b544e 7990
c7df85a4 7991 listRewind(server.slaves,&li);
7992 while((ln = listNext(&li))) {
6208b3a7 7993 redisClient *slave = ln->value;
ed9b544e 7994
6208b3a7 7995 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7996 startbgsave = 1;
7997 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7998 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 7999 struct redis_stat buf;
e0a62c7f 8000
6208b3a7 8001 if (bgsaveerr != REDIS_OK) {
8002 freeClient(slave);
8003 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8004 continue;
8005 }
8006 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 8007 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 8008 freeClient(slave);
8009 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8010 continue;
8011 }
8012 slave->repldboff = 0;
8013 slave->repldbsize = buf.st_size;
8014 slave->replstate = REDIS_REPL_SEND_BULK;
8015 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 8016 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 8017 freeClient(slave);
8018 continue;
8019 }
8020 }
ed9b544e 8021 }
6208b3a7 8022 if (startbgsave) {
8023 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 8024 listIter li;
8025
8026 listRewind(server.slaves,&li);
6208b3a7 8027 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 8028 while((ln = listNext(&li))) {
6208b3a7 8029 redisClient *slave = ln->value;
ed9b544e 8030
6208b3a7 8031 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8032 freeClient(slave);
8033 }
8034 }
8035 }
ed9b544e 8036}
8037
8038static int syncWithMaster(void) {
d0ccebcf 8039 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 8040 long dumpsize;
ed9b544e 8041 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 8042 int dfd, maxtries = 5;
ed9b544e 8043
8044 if (fd == -1) {
8045 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8046 strerror(errno));
8047 return REDIS_ERR;
8048 }
d0ccebcf 8049
8050 /* AUTH with the master if required. */
8051 if(server.masterauth) {
8052 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8053 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8054 close(fd);
8055 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8056 strerror(errno));
8057 return REDIS_ERR;
8058 }
8059 /* Read the AUTH result. */
8060 if (syncReadLine(fd,buf,1024,3600) == -1) {
8061 close(fd);
8062 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8063 strerror(errno));
8064 return REDIS_ERR;
8065 }
8066 if (buf[0] != '+') {
8067 close(fd);
8068 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8069 return REDIS_ERR;
8070 }
8071 }
8072
ed9b544e 8073 /* Issue the SYNC command */
8074 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8075 close(fd);
8076 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8077 strerror(errno));
8078 return REDIS_ERR;
8079 }
8080 /* Read the bulk write count */
8c4d91fc 8081 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 8082 close(fd);
8083 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8084 strerror(errno));
8085 return REDIS_ERR;
8086 }
4aa701c1 8087 if (buf[0] != '$') {
8088 close(fd);
8089 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8090 return REDIS_ERR;
8091 }
18e61fa2 8092 dumpsize = strtol(buf+1,NULL,10);
8093 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 8094 /* Read the bulk write data on a temp file */
8c5abee8 8095 while(maxtries--) {
8096 snprintf(tmpfile,256,
8097 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8098 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8099 if (dfd != -1) break;
5de9ad7c 8100 sleep(1);
8c5abee8 8101 }
ed9b544e 8102 if (dfd == -1) {
8103 close(fd);
8104 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8105 return REDIS_ERR;
8106 }
8107 while(dumpsize) {
8108 int nread, nwritten;
8109
8110 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8111 if (nread == -1) {
8112 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8113 strerror(errno));
8114 close(fd);
8115 close(dfd);
8116 return REDIS_ERR;
8117 }
8118 nwritten = write(dfd,buf,nread);
8119 if (nwritten == -1) {
8120 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8121 close(fd);
8122 close(dfd);
8123 return REDIS_ERR;
8124 }
8125 dumpsize -= nread;
8126 }
8127 close(dfd);
8128 if (rename(tmpfile,server.dbfilename) == -1) {
8129 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8130 unlink(tmpfile);
8131 close(fd);
8132 return REDIS_ERR;
8133 }
8134 emptyDb();
f78fd11b 8135 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 8136 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8137 close(fd);
8138 return REDIS_ERR;
8139 }
8140 server.master = createClient(fd);
8141 server.master->flags |= REDIS_MASTER;
179b3952 8142 server.master->authenticated = 1;
ed9b544e 8143 server.replstate = REDIS_REPL_CONNECTED;
8144 return REDIS_OK;
8145}
8146
321b0e13 8147static void slaveofCommand(redisClient *c) {
8148 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8149 !strcasecmp(c->argv[2]->ptr,"one")) {
8150 if (server.masterhost) {
8151 sdsfree(server.masterhost);
8152 server.masterhost = NULL;
8153 if (server.master) freeClient(server.master);
8154 server.replstate = REDIS_REPL_NONE;
8155 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8156 }
8157 } else {
8158 sdsfree(server.masterhost);
8159 server.masterhost = sdsdup(c->argv[1]->ptr);
8160 server.masterport = atoi(c->argv[2]->ptr);
8161 if (server.master) freeClient(server.master);
8162 server.replstate = REDIS_REPL_CONNECT;
8163 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8164 server.masterhost, server.masterport);
8165 }
8166 addReply(c,shared.ok);
8167}
8168
3fd78bcd 8169/* ============================ Maxmemory directive ======================== */
8170
a5819310 8171/* Try to free one object form the pre-allocated objects free list.
8172 * This is useful under low mem conditions as by default we take 1 million
8173 * free objects allocated. On success REDIS_OK is returned, otherwise
8174 * REDIS_ERR. */
8175static int tryFreeOneObjectFromFreelist(void) {
f870935d 8176 robj *o;
8177
a5819310 8178 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8179 if (listLength(server.objfreelist)) {
8180 listNode *head = listFirst(server.objfreelist);
8181 o = listNodeValue(head);
8182 listDelNode(server.objfreelist,head);
8183 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8184 zfree(o);
8185 return REDIS_OK;
8186 } else {
8187 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8188 return REDIS_ERR;
8189 }
f870935d 8190}
8191
3fd78bcd 8192/* This function gets called when 'maxmemory' is set on the config file to limit
8193 * the max memory used by the server, and we are out of memory.
8194 * This function will try to, in order:
8195 *
8196 * - Free objects from the free list
8197 * - Try to remove keys with an EXPIRE set
8198 *
8199 * It is not possible to free enough memory to reach used-memory < maxmemory
8200 * the server will start refusing commands that will enlarge even more the
8201 * memory usage.
8202 */
8203static void freeMemoryIfNeeded(void) {
8204 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 8205 int j, k, freed = 0;
8206
8207 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8208 for (j = 0; j < server.dbnum; j++) {
8209 int minttl = -1;
8210 robj *minkey = NULL;
8211 struct dictEntry *de;
8212
8213 if (dictSize(server.db[j].expires)) {
8214 freed = 1;
8215 /* From a sample of three keys drop the one nearest to
8216 * the natural expire */
8217 for (k = 0; k < 3; k++) {
8218 time_t t;
8219
8220 de = dictGetRandomKey(server.db[j].expires);
8221 t = (time_t) dictGetEntryVal(de);
8222 if (minttl == -1 || t < minttl) {
8223 minkey = dictGetEntryKey(de);
8224 minttl = t;
3fd78bcd 8225 }
3fd78bcd 8226 }
a5819310 8227 deleteKey(server.db+j,minkey);
3fd78bcd 8228 }
3fd78bcd 8229 }
a5819310 8230 if (!freed) return; /* nothing to free... */
3fd78bcd 8231 }
8232}
8233
f80dff62 8234/* ============================== Append Only file ========================== */
8235
28ed1f33 8236/* Write the append only file buffer on disk.
8237 *
8238 * Since we are required to write the AOF before replying to the client,
8239 * and the only way the client socket can get a write is entering when the
8240 * the event loop, we accumulate all the AOF writes in a memory
8241 * buffer and write it on disk using this function just before entering
8242 * the event loop again. */
8243static void flushAppendOnlyFile(void) {
8244 time_t now;
8245 ssize_t nwritten;
8246
8247 if (sdslen(server.aofbuf) == 0) return;
8248
8249 /* We want to perform a single write. This should be guaranteed atomic
8250 * at least if the filesystem we are writing is a real physical one.
8251 * While this will save us against the server being killed I don't think
8252 * there is much to do about the whole server stopping for power problems
8253 * or alike */
8254 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8255 if (nwritten != (signed)sdslen(server.aofbuf)) {
8256 /* Ooops, we are in troubles. The best thing to do for now is
8257 * aborting instead of giving the illusion that everything is
8258 * working as expected. */
8259 if (nwritten == -1) {
8260 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8261 } else {
8262 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8263 }
8264 exit(1);
8265 }
8266 sdsfree(server.aofbuf);
8267 server.aofbuf = sdsempty();
8268
8269 /* Fsync if needed */
8270 now = time(NULL);
8271 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8272 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8273 now-server.lastfsync > 1))
8274 {
8275 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8276 * flushing metadata. */
8277 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8278 server.lastfsync = now;
8279 }
8280}
8281
9376e434
PN
8282static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8283 int j;
8284 buf = sdscatprintf(buf,"*%d\r\n",argc);
8285 for (j = 0; j < argc; j++) {
8286 robj *o = getDecodedObject(argv[j]);
8287 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8288 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8289 buf = sdscatlen(buf,"\r\n",2);
8290 decrRefCount(o);
8291 }
8292 return buf;
8293}
8294
8295static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8296 int argc = 3;
8297 long when;
8298 robj *argv[3];
8299
8300 /* Make sure we can use strtol */
8301 seconds = getDecodedObject(seconds);
8302 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8303 decrRefCount(seconds);
8304
8305 argv[0] = createStringObject("EXPIREAT",8);
8306 argv[1] = key;
8307 argv[2] = createObject(REDIS_STRING,
8308 sdscatprintf(sdsempty(),"%ld",when));
8309 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8310 decrRefCount(argv[0]);
8311 decrRefCount(argv[2]);
8312 return buf;
8313}
8314
f80dff62 8315static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8316 sds buf = sdsempty();
f80dff62 8317 robj *tmpargv[3];
8318
8319 /* The DB this command was targetting is not the same as the last command
8320 * we appendend. To issue a SELECT command is needed. */
8321 if (dictid != server.appendseldb) {
8322 char seldb[64];
8323
8324 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 8325 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 8326 (unsigned long)strlen(seldb),seldb);
f80dff62 8327 server.appendseldb = dictid;
8328 }
8329
f80dff62 8330 if (cmd->proc == expireCommand) {
9376e434
PN
8331 /* Translate EXPIRE into EXPIREAT */
8332 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8333 } else if (cmd->proc == setexCommand) {
8334 /* Translate SETEX to SET and EXPIREAT */
8335 tmpargv[0] = createStringObject("SET",3);
f80dff62 8336 tmpargv[1] = argv[1];
9376e434
PN
8337 tmpargv[2] = argv[3];
8338 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8339 decrRefCount(tmpargv[0]);
8340 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8341 } else {
8342 buf = catAppendOnlyGenericCommand(buf,argc,argv);
f80dff62 8343 }
8344
28ed1f33 8345 /* Append to the AOF buffer. This will be flushed on disk just before
8346 * of re-entering the event loop, so before the client will get a
8347 * positive reply about the operation performed. */
8348 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8349
85a83172 8350 /* If a background append only file rewriting is in progress we want to
8351 * accumulate the differences between the child DB and the current one
8352 * in a buffer, so that when the child process will do its work we
8353 * can append the differences to the new append only file. */
8354 if (server.bgrewritechildpid != -1)
8355 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8356
8357 sdsfree(buf);
f80dff62 8358}
8359
8360/* In Redis commands are always executed in the context of a client, so in
8361 * order to load the append only file we need to create a fake client. */
8362static struct redisClient *createFakeClient(void) {
8363 struct redisClient *c = zmalloc(sizeof(*c));
8364
8365 selectDb(c,0);
8366 c->fd = -1;
8367 c->querybuf = sdsempty();
8368 c->argc = 0;
8369 c->argv = NULL;
8370 c->flags = 0;
9387d17d 8371 /* We set the fake client as a slave waiting for the synchronization
8372 * so that Redis will not try to send replies to this client. */
8373 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 8374 c->reply = listCreate();
8375 listSetFreeMethod(c->reply,decrRefCount);
8376 listSetDupMethod(c->reply,dupClientReplyValue);
4132ad8d 8377 initClientMultiState(c);
f80dff62 8378 return c;
8379}
8380
8381static void freeFakeClient(struct redisClient *c) {
8382 sdsfree(c->querybuf);
8383 listRelease(c->reply);
4132ad8d 8384 freeClientMultiState(c);
f80dff62 8385 zfree(c);
8386}
8387
8388/* Replay the append log file. On error REDIS_OK is returned. On non fatal
8389 * error (the append only file is zero-length) REDIS_ERR is returned. On
8390 * fatal error an error message is logged and the program exists. */
8391int loadAppendOnlyFile(char *filename) {
8392 struct redisClient *fakeClient;
8393 FILE *fp = fopen(filename,"r");
8394 struct redis_stat sb;
b492cf00 8395 unsigned long long loadedkeys = 0;
4132ad8d 8396 int appendonly = server.appendonly;
f80dff62 8397
8398 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8399 return REDIS_ERR;
8400
8401 if (fp == NULL) {
8402 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8403 exit(1);
8404 }
8405
4132ad8d
PN
8406 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8407 * to the same file we're about to read. */
8408 server.appendonly = 0;
8409
f80dff62 8410 fakeClient = createFakeClient();
8411 while(1) {
8412 int argc, j;
8413 unsigned long len;
8414 robj **argv;
8415 char buf[128];
8416 sds argsds;
8417 struct redisCommand *cmd;
8418
8419 if (fgets(buf,sizeof(buf),fp) == NULL) {
8420 if (feof(fp))
8421 break;
8422 else
8423 goto readerr;
8424 }
8425 if (buf[0] != '*') goto fmterr;
8426 argc = atoi(buf+1);
8427 argv = zmalloc(sizeof(robj*)*argc);
8428 for (j = 0; j < argc; j++) {
8429 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8430 if (buf[0] != '$') goto fmterr;
8431 len = strtol(buf+1,NULL,10);
8432 argsds = sdsnewlen(NULL,len);
0f151ef1 8433 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 8434 argv[j] = createObject(REDIS_STRING,argsds);
8435 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8436 }
8437
8438 /* Command lookup */
8439 cmd = lookupCommand(argv[0]->ptr);
8440 if (!cmd) {
8441 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8442 exit(1);
8443 }
bdcb92f2 8444 /* Try object encoding */
f80dff62 8445 if (cmd->flags & REDIS_CMD_BULK)
05df7621 8446 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 8447 /* Run the command in the context of a fake client */
8448 fakeClient->argc = argc;
8449 fakeClient->argv = argv;
8450 cmd->proc(fakeClient);
8451 /* Discard the reply objects list from the fake client */
8452 while(listLength(fakeClient->reply))
8453 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8454 /* Clean up, ready for the next command */
8455 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8456 zfree(argv);
b492cf00 8457 /* Handle swapping while loading big datasets when VM is on */
8458 loadedkeys++;
8459 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8460 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 8461 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 8462 }
8463 }
f80dff62 8464 }
4132ad8d
PN
8465
8466 /* This point can only be reached when EOF is reached without errors.
8467 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8468 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8469
f80dff62 8470 fclose(fp);
8471 freeFakeClient(fakeClient);
4132ad8d 8472 server.appendonly = appendonly;
f80dff62 8473 return REDIS_OK;
8474
8475readerr:
8476 if (feof(fp)) {
8477 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8478 } else {
8479 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8480 }
8481 exit(1);
8482fmterr:
8483 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8484 exit(1);
8485}
8486
9d65a1bb 8487/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
9c8e3cee 8488static int fwriteBulkObject(FILE *fp, robj *obj) {
9d65a1bb 8489 char buf[128];
b9bc0eef 8490 int decrrc = 0;
8491
f2d9f50f 8492 /* Avoid the incr/decr ref count business if possible to help
8493 * copy-on-write (we are often in a child process when this function
8494 * is called).
8495 * Also makes sure that key objects don't get incrRefCount-ed when VM
8496 * is enabled */
8497 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 8498 obj = getDecodedObject(obj);
8499 decrrc = 1;
8500 }
9d65a1bb 8501 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8502 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 8503 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8504 goto err;
9d65a1bb 8505 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 8506 if (decrrc) decrRefCount(obj);
9d65a1bb 8507 return 1;
8508err:
b9bc0eef 8509 if (decrrc) decrRefCount(obj);
9d65a1bb 8510 return 0;
8511}
8512
/* Write binary-safe string into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes that may contain any value, including null
 * bytes. Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: use the matching conversion specifier. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* Nothing to write for empty payloads (fwrite would return 0). */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8524
/* Write a double value in bulk format $<count>\r\n<payload>\r\n
 *
 * The value is formatted with "%.17g". Returns 1 on success, 0 on write
 * error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload already carries its trailing CRLF, that must not be
     * counted in the announced length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8535
/* Write a long value in bulk format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload already carries its trailing CRLF, that must not be
     * counted in the announced length. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8546
8547/* Write a sequence of commands able to fully rebuild the dataset into
8548 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8549static int rewriteAppendOnlyFile(char *filename) {
8550 dictIterator *di = NULL;
8551 dictEntry *de;
8552 FILE *fp;
8553 char tmpfile[256];
8554 int j;
8555 time_t now = time(NULL);
8556
8557 /* Note that we have to use a different temp name here compared to the
8558 * one used by rewriteAppendOnlyFileBackground() function. */
8559 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8560 fp = fopen(tmpfile,"w");
8561 if (!fp) {
8562 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8563 return REDIS_ERR;
8564 }
8565 for (j = 0; j < server.dbnum; j++) {
8566 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8567 redisDb *db = server.db+j;
8568 dict *d = db->dict;
8569 if (dictSize(d) == 0) continue;
8570 di = dictGetIterator(d);
8571 if (!di) {
8572 fclose(fp);
8573 return REDIS_ERR;
8574 }
8575
8576 /* SELECT the new DB */
8577 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 8578 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 8579
8580 /* Iterate this DB writing every entry */
8581 while((de = dictNext(di)) != NULL) {
e7546c63 8582 robj *key, *o;
8583 time_t expiretime;
8584 int swapped;
8585
8586 key = dictGetEntryKey(de);
b9bc0eef 8587 /* If the value for this key is swapped, load a preview in memory.
8588 * We use a "swapped" flag to remember if we need to free the
8589 * value object instead to just increment the ref count anyway
8590 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 8591 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8592 key->storage == REDIS_VM_SWAPPING) {
e7546c63 8593 o = dictGetEntryVal(de);
8594 swapped = 0;
8595 } else {
8596 o = vmPreviewObject(key);
e7546c63 8597 swapped = 1;
8598 }
8599 expiretime = getExpire(db,key);
9d65a1bb 8600
8601 /* Save the key and associated value */
9d65a1bb 8602 if (o->type == REDIS_STRING) {
8603 /* Emit a SET command */
8604 char cmd[]="*3\r\n$3\r\nSET\r\n";
8605 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8606 /* Key and value */
9c8e3cee 8607 if (fwriteBulkObject(fp,key) == 0) goto werr;
8608 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 8609 } else if (o->type == REDIS_LIST) {
8610 /* Emit the RPUSHes needed to rebuild the list */
8611 list *list = o->ptr;
8612 listNode *ln;
c7df85a4 8613 listIter li;
9d65a1bb 8614
c7df85a4 8615 listRewind(list,&li);
8616 while((ln = listNext(&li))) {
9d65a1bb 8617 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8618 robj *eleobj = listNodeValue(ln);
8619
8620 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8621 if (fwriteBulkObject(fp,key) == 0) goto werr;
8622 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8623 }
8624 } else if (o->type == REDIS_SET) {
8625 /* Emit the SADDs needed to rebuild the set */
8626 dict *set = o->ptr;
8627 dictIterator *di = dictGetIterator(set);
8628 dictEntry *de;
8629
8630 while((de = dictNext(di)) != NULL) {
8631 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8632 robj *eleobj = dictGetEntryKey(de);
8633
8634 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8635 if (fwriteBulkObject(fp,key) == 0) goto werr;
8636 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8637 }
8638 dictReleaseIterator(di);
8639 } else if (o->type == REDIS_ZSET) {
8640 /* Emit the ZADDs needed to rebuild the sorted set */
8641 zset *zs = o->ptr;
8642 dictIterator *di = dictGetIterator(zs->dict);
8643 dictEntry *de;
8644
8645 while((de = dictNext(di)) != NULL) {
8646 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8647 robj *eleobj = dictGetEntryKey(de);
8648 double *score = dictGetEntryVal(de);
8649
8650 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8651 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8652 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 8653 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8654 }
8655 dictReleaseIterator(di);
9c8e3cee 8656 } else if (o->type == REDIS_HASH) {
8657 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8658
8659 /* Emit the HSETs needed to rebuild the hash */
8660 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8661 unsigned char *p = zipmapRewind(o->ptr);
8662 unsigned char *field, *val;
8663 unsigned int flen, vlen;
8664
8665 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8666 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8667 if (fwriteBulkObject(fp,key) == 0) goto werr;
8668 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8669 return -1;
8670 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8671 return -1;
8672 }
8673 } else {
8674 dictIterator *di = dictGetIterator(o->ptr);
8675 dictEntry *de;
8676
8677 while((de = dictNext(di)) != NULL) {
8678 robj *field = dictGetEntryKey(de);
8679 robj *val = dictGetEntryVal(de);
8680
8681 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8682 if (fwriteBulkObject(fp,key) == 0) goto werr;
8683 if (fwriteBulkObject(fp,field) == -1) return -1;
8684 if (fwriteBulkObject(fp,val) == -1) return -1;
8685 }
8686 dictReleaseIterator(di);
8687 }
9d65a1bb 8688 } else {
f83c6cb5 8689 redisPanic("Unknown object type");
9d65a1bb 8690 }
8691 /* Save the expire time */
8692 if (expiretime != -1) {
e96e4fbf 8693 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 8694 /* If this key is already expired skip it */
8695 if (expiretime < now) continue;
8696 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8697 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8698 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8699 }
b9bc0eef 8700 if (swapped) decrRefCount(o);
9d65a1bb 8701 }
8702 dictReleaseIterator(di);
8703 }
8704
8705 /* Make sure data will not remain on the OS's output buffers */
8706 fflush(fp);
8707 fsync(fileno(fp));
8708 fclose(fp);
e0a62c7f 8709
9d65a1bb 8710 /* Use RENAME to make sure the DB file is changed atomically only
8711 * if the generate DB file is ok. */
8712 if (rename(tmpfile,filename) == -1) {
8713 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8714 unlink(tmpfile);
8715 return REDIS_ERR;
8716 }
8717 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8718 return REDIS_OK;
8719
8720werr:
8721 fclose(fp);
8722 unlink(tmpfile);
e96e4fbf 8723 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 8724 if (di) dictReleaseIterator(di);
8725 return REDIS_ERR;
8726}
8727
8728/* This is how rewriting of the append only file in background works:
8729 *
8730 * 1) The user calls BGREWRITEAOF
8731 * 2) Redis calls this function, that forks():
8732 * 2a) the child rewrite the append only file in a temp file.
8733 * 2b) the parent accumulates differences in server.bgrewritebuf.
8734 * 3) When the child finished '2a' exists.
8735 * 4) The parent will trap the exit code, if it's OK, will append the
8736 * data accumulated into server.bgrewritebuf into the temp file, and
8737 * finally will rename(2) the temp file in the actual file name.
8738 * The the new file is reopened as the new append only file. Profit!
8739 */
8740static int rewriteAppendOnlyFileBackground(void) {
8741 pid_t childpid;
8742
8743 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 8744 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 8745 if ((childpid = fork()) == 0) {
8746 /* Child */
8747 char tmpfile[256];
9d65a1bb 8748
054e426d 8749 if (server.vm_enabled) vmReopenSwapFile();
8750 close(server.fd);
9d65a1bb 8751 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8752 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 8753 _exit(0);
9d65a1bb 8754 } else {
478c2c6f 8755 _exit(1);
9d65a1bb 8756 }
8757 } else {
8758 /* Parent */
8759 if (childpid == -1) {
8760 redisLog(REDIS_WARNING,
8761 "Can't rewrite append only file in background: fork: %s",
8762 strerror(errno));
8763 return REDIS_ERR;
8764 }
8765 redisLog(REDIS_NOTICE,
8766 "Background append only file rewriting started by pid %d",childpid);
8767 server.bgrewritechildpid = childpid;
884d4b39 8768 updateDictResizePolicy();
85a83172 8769 /* We set appendseldb to -1 in order to force the next call to the
8770 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8771 * accumulated by the parent into server.bgrewritebuf will start
8772 * with a SELECT statement and it will be safe to merge. */
8773 server.appendseldb = -1;
9d65a1bb 8774 return REDIS_OK;
8775 }
8776 return REDIS_OK; /* unreached */
8777}
8778
8779static void bgrewriteaofCommand(redisClient *c) {
8780 if (server.bgrewritechildpid != -1) {
8781 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8782 return;
8783 }
8784 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 8785 char *status = "+Background append only file rewriting started\r\n";
8786 addReplySds(c,sdsnew(status));
9d65a1bb 8787 } else {
8788 addReply(c,shared.err);
8789 }
8790}
8791
/* Remove the temp file "temp-rewriteaof-bg-<pid>.aof" (relative to the
 * current working directory) that belongs to the given rewrite child.
 * The result of unlink(2) is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8798
996cb5f7 8799/* Virtual Memory is composed mainly of two subsystems:
8800 * - Blocking Virtual Memory
8801 * - Threaded Virtual Memory I/O
8802 * The two parts are not fully decoupled, but functions are split among two
8803 * different sections of the source code (delimited by comments) in order to
8804 * make more clear what functionality is about the blocking VM and what about
8805 * the threaded (not blocking) VM.
8806 *
8807 * Redis VM design:
8808 *
8809 * Redis VM is a blocking VM (one that blocks reading swapped values from
8810 * disk into memory when a value swapped out is needed in memory) that is made
8811 * unblocking by trying to examine the command argument vector in order to
8812 * load in background values that will likely be needed in order to exec
8813 * the command. The command is executed only once all the relevant keys
8814 * are loaded into memory.
8815 *
8816 * This is basically almost as simple as a blocking VM, but almost as
8817 * parallel as a fully non-blocking VM.
8818 */
8819
2e5eb04e 8820/* Called when the user switches from "appendonly yes" to "appendonly no"
8821 * at runtime using the CONFIG command. */
8822static void stopAppendOnly(void) {
8823 flushAppendOnlyFile();
8824 fsync(server.appendfd);
8825 close(server.appendfd);
8826
8827 server.appendfd = -1;
8828 server.appendseldb = -1;
8829 server.appendonly = 0;
8830 /* rewrite operation in progress? kill it, wait child exit */
8831 if (server.bgsavechildpid != -1) {
8832 int statloc;
8833
30dd89b6 8834 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8835 wait3(&statloc,0,NULL);
2e5eb04e 8836 /* reset the buffer accumulating changes while the child saves */
8837 sdsfree(server.bgrewritebuf);
8838 server.bgrewritebuf = sdsempty();
30dd89b6 8839 server.bgsavechildpid = -1;
2e5eb04e 8840 }
8841}
8842
8843/* Called when the user switches from "appendonly no" to "appendonly yes"
8844 * at runtime using the CONFIG command. */
8845static int startAppendOnly(void) {
8846 server.appendonly = 1;
8847 server.lastfsync = time(NULL);
8848 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8849 if (server.appendfd == -1) {
8850 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8851 return REDIS_ERR;
8852 }
8853 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8854 server.appendonly = 0;
8855 close(server.appendfd);
8856 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8857 return REDIS_ERR;
8858 }
8859 return REDIS_OK;
8860}
8861
996cb5f7 8862/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 8863
75680a3c 8864static void vmInit(void) {
8865 off_t totsize;
996cb5f7 8866 int pipefds[2];
bcaa7a4f 8867 size_t stacksize;
8b5bb414 8868 struct flock fl;
75680a3c 8869
4ad37480 8870 if (server.vm_max_threads != 0)
8871 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8872
054e426d 8873 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8b5bb414 8874 /* Try to open the old swap file, otherwise create it */
6fa987e3 8875 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8876 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8877 }
75680a3c 8878 if (server.vm_fp == NULL) {
6fa987e3 8879 redisLog(REDIS_WARNING,
8b5bb414 8880 "Can't open the swap file: %s. Exiting.",
6fa987e3 8881 strerror(errno));
75680a3c 8882 exit(1);
8883 }
8884 server.vm_fd = fileno(server.vm_fp);
8b5bb414 8885 /* Lock the swap file for writing, this is useful in order to avoid
8886 * another instance to use the same swap file for a config error. */
8887 fl.l_type = F_WRLCK;
8888 fl.l_whence = SEEK_SET;
8889 fl.l_start = fl.l_len = 0;
8890 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8891 redisLog(REDIS_WARNING,
8892 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8893 exit(1);
8894 }
8895 /* Initialize */
75680a3c 8896 server.vm_next_page = 0;
8897 server.vm_near_pages = 0;
7d98e08c 8898 server.vm_stats_used_pages = 0;
8899 server.vm_stats_swapped_objects = 0;
8900 server.vm_stats_swapouts = 0;
8901 server.vm_stats_swapins = 0;
75680a3c 8902 totsize = server.vm_pages*server.vm_page_size;
8903 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8904 if (ftruncate(server.vm_fd,totsize) == -1) {
8905 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8906 strerror(errno));
8907 exit(1);
8908 } else {
8909 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8910 }
7d30035d 8911 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 8912 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 8913 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 8914 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 8915
996cb5f7 8916 /* Initialize threaded I/O (used by Virtual Memory) */
8917 server.io_newjobs = listCreate();
8918 server.io_processing = listCreate();
8919 server.io_processed = listCreate();
d5d55fc3 8920 server.io_ready_clients = listCreate();
92f8e882 8921 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 8922 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8923 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 8924 server.io_active_threads = 0;
996cb5f7 8925 if (pipe(pipefds) == -1) {
8926 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8927 ,strerror(errno));
8928 exit(1);
8929 }
8930 server.io_ready_pipe_read = pipefds[0];
8931 server.io_ready_pipe_write = pipefds[1];
8932 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 8933 /* LZF requires a lot of stack */
8934 pthread_attr_init(&server.io_threads_attr);
8935 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8936 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8937 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 8938 /* Listen for events in the threaded I/O pipe */
8939 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8940 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8941 oom("creating file event");
75680a3c 8942}
8943
06224fec 8944/* Mark the page as used */
8945static void vmMarkPageUsed(off_t page) {
8946 off_t byte = page/8;
8947 int bit = page&7;
970e10bb 8948 redisAssert(vmFreePage(page) == 1);
06224fec 8949 server.vm_bitmap[byte] |= 1<<bit;
8950}
8951
8952/* Mark N contiguous pages as used, with 'page' being the first. */
8953static void vmMarkPagesUsed(off_t page, off_t count) {
8954 off_t j;
8955
8956 for (j = 0; j < count; j++)
7d30035d 8957 vmMarkPageUsed(page+j);
7d98e08c 8958 server.vm_stats_used_pages += count;
7c775e09 8959 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8960 (long long)count, (long long)page);
06224fec 8961}
8962
8963/* Mark the page as free */
8964static void vmMarkPageFree(off_t page) {
8965 off_t byte = page/8;
8966 int bit = page&7;
970e10bb 8967 redisAssert(vmFreePage(page) == 0);
06224fec 8968 server.vm_bitmap[byte] &= ~(1<<bit);
8969}
8970
8971/* Mark N contiguous pages as free, with 'page' being the first. */
8972static void vmMarkPagesFree(off_t page, off_t count) {
8973 off_t j;
8974
8975 for (j = 0; j < count; j++)
7d30035d 8976 vmMarkPageFree(page+j);
7d98e08c 8977 server.vm_stats_used_pages -= count;
7c775e09 8978 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8979 (long long)count, (long long)page);
06224fec 8980}
8981
8982/* Test if the page is free */
8983static int vmFreePage(off_t page) {
8984 off_t byte = page/8;
8985 int bit = page&7;
7d30035d 8986 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 8987}
8988
8989/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 8990 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 8991 * REDIS_ERR is returned.
06224fec 8992 *
8993 * This function uses a simple algorithm: we try to allocate
8994 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8995 * again from the start of the swap file searching for free spaces.
8996 *
8997 * If it looks pretty clear that there are no free pages near our offset
8998 * we try to find less populated places doing a forward jump of
8999 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9000 * without hurry, and then we jump again and so forth...
e0a62c7f 9001 *
06224fec 9002 * This function can be improved using a free list to avoid to guess
9003 * too much, since we could collect data about freed pages.
9004 *
9005 * note: I implemented this function just after watching an episode of
9006 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9007 */
c7df85a4 9008static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 9009 off_t base, offset = 0, since_jump = 0, numfree = 0;
9010
9011 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9012 server.vm_near_pages = 0;
9013 server.vm_next_page = 0;
9014 }
9015 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9016 base = server.vm_next_page;
9017
9018 while(offset < server.vm_pages) {
9019 off_t this = base+offset;
9020
9021 /* If we overflow, restart from page zero */
9022 if (this >= server.vm_pages) {
9023 this -= server.vm_pages;
9024 if (this == 0) {
9025 /* Just overflowed, what we found on tail is no longer
9026 * interesting, as it's no longer contiguous. */
9027 numfree = 0;
9028 }
9029 }
9030 if (vmFreePage(this)) {
9031 /* This is a free page */
9032 numfree++;
9033 /* Already got N free pages? Return to the caller, with success */
9034 if (numfree == n) {
7d30035d 9035 *first = this-(n-1);
9036 server.vm_next_page = this+1;
7c775e09 9037 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 9038 return REDIS_OK;
06224fec 9039 }
9040 } else {
9041 /* The current one is not a free page */
9042 numfree = 0;
9043 }
9044
9045 /* Fast-forward if the current page is not free and we already
9046 * searched enough near this place. */
9047 since_jump++;
9048 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9049 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9050 since_jump = 0;
9051 /* Note that even if we rewind after the jump, we are don't need
9052 * to make sure numfree is set to zero as we only jump *if* it
9053 * is set to zero. */
9054 } else {
9055 /* Otherwise just check the next page */
9056 offset++;
9057 }
9058 }
3a66edc7 9059 return REDIS_ERR;
9060}
9061
a5819310 9062/* Write the specified object at the specified page of the swap file */
9063static int vmWriteObjectOnSwap(robj *o, off_t page) {
9064 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9065 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9066 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9067 redisLog(REDIS_WARNING,
9ebed7cf 9068 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 9069 strerror(errno));
9070 return REDIS_ERR;
9071 }
9072 rdbSaveObject(server.vm_fp,o);
ba76a8f9 9073 fflush(server.vm_fp);
a5819310 9074 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9075 return REDIS_OK;
9076}
9077
3a66edc7 9078/* Swap the 'val' object relative to 'key' into disk. Store all the information
9079 * needed to later retrieve the object into the key object.
9080 * If we can't find enough contiguous empty pages to swap the object on disk
9081 * REDIS_ERR is returned. */
a69a0c9c 9082static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 9083 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 9084 off_t page;
9085
9086 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 9087 assert(key->refcount == 1);
3a66edc7 9088 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 9089 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 9090 key->vm.page = page;
9091 key->vm.usedpages = pages;
9092 key->storage = REDIS_VM_SWAPPED;
d894161b 9093 key->vtype = val->type;
3a66edc7 9094 decrRefCount(val); /* Deallocate the object from memory. */
9095 vmMarkPagesUsed(page,pages);
7d30035d 9096 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
9097 (unsigned char*) key->ptr,
9098 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 9099 server.vm_stats_swapped_objects++;
9100 server.vm_stats_swapouts++;
3a66edc7 9101 return REDIS_OK;
9102}
9103
a5819310 9104static robj *vmReadObjectFromSwap(off_t page, int type) {
9105 robj *o;
3a66edc7 9106
a5819310 9107 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9108 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 9109 redisLog(REDIS_WARNING,
d5d55fc3 9110 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 9111 strerror(errno));
478c2c6f 9112 _exit(1);
3a66edc7 9113 }
a5819310 9114 o = rdbLoadObject(type,server.vm_fp);
9115 if (o == NULL) {
d5d55fc3 9116 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 9117 _exit(1);
3a66edc7 9118 }
a5819310 9119 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9120 return o;
9121}
9122
9123/* Load the value object relative to the 'key' object from swap to memory.
9124 * The newly allocated object is returned.
9125 *
9126 * If preview is true the unserialized object is returned to the caller but
9127 * no changes are made to the key object, nor the pages are marked as freed */
9128static robj *vmGenericLoadObject(robj *key, int preview) {
9129 robj *val;
9130
d5d55fc3 9131 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 9132 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 9133 if (!preview) {
9134 key->storage = REDIS_VM_MEMORY;
9135 key->vm.atime = server.unixtime;
9136 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9137 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
9138 (unsigned char*) key->ptr);
7d98e08c 9139 server.vm_stats_swapped_objects--;
38aba9a1 9140 } else {
9141 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
9142 (unsigned char*) key->ptr);
7e69548d 9143 }
7d98e08c 9144 server.vm_stats_swapins++;
3a66edc7 9145 return val;
06224fec 9146}
9147
7e69548d 9148/* Plain object loading, from swap to memory */
9149static robj *vmLoadObject(robj *key) {
996cb5f7 9150 /* If we are loading the object in background, stop it, we
9151 * need to load this object synchronously ASAP. */
9152 if (key->storage == REDIS_VM_LOADING)
9153 vmCancelThreadedIOJob(key);
7e69548d 9154 return vmGenericLoadObject(key,0);
9155}
9156
9157/* Just load the value on disk, without to modify the key.
9158 * This is useful when we want to perform some operation on the value
9159 * without to really bring it from swap to memory, like while saving the
9160 * dataset or rewriting the append only log. */
9161static robj *vmPreviewObject(robj *key) {
9162 return vmGenericLoadObject(key,1);
9163}
9164
4ef8de8a 9165/* How a good candidate is this object for swapping?
9166 * The better candidate it is, the greater the returned value.
9167 *
9168 * Currently we try to perform a fast estimation of the object size in
9169 * memory, and combine it with aging informations.
9170 *
9171 * Basically swappability = idle-time * log(estimated size)
9172 *
9173 * Bigger objects are preferred over smaller objects, but not
9174 * proportionally, this is why we use the logarithm. This algorithm is
9175 * just a first try and will probably be tuned later. */
9176static double computeObjectSwappability(robj *o) {
9177 time_t age = server.unixtime - o->vm.atime;
9178 long asize = 0;
9179 list *l;
9180 dict *d;
9181 struct dictEntry *de;
9182 int z;
9183
9184 if (age <= 0) return 0;
9185 switch(o->type) {
9186 case REDIS_STRING:
9187 if (o->encoding != REDIS_ENCODING_RAW) {
9188 asize = sizeof(*o);
9189 } else {
9190 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9191 }
9192 break;
9193 case REDIS_LIST:
9194 l = o->ptr;
9195 listNode *ln = listFirst(l);
9196
9197 asize = sizeof(list);
9198 if (ln) {
9199 robj *ele = ln->value;
9200 long elesize;
9201
9202 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9203 (sizeof(*o)+sdslen(ele->ptr)) :
9204 sizeof(*o);
9205 asize += (sizeof(listNode)+elesize)*listLength(l);
9206 }
9207 break;
9208 case REDIS_SET:
9209 case REDIS_ZSET:
9210 z = (o->type == REDIS_ZSET);
9211 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9212
9213 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9214 if (z) asize += sizeof(zset)-sizeof(dict);
9215 if (dictSize(d)) {
9216 long elesize;
9217 robj *ele;
9218
9219 de = dictGetRandomKey(d);
9220 ele = dictGetEntryKey(de);
9221 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9222 (sizeof(*o)+sdslen(ele->ptr)) :
9223 sizeof(*o);
9224 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9225 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9226 }
9227 break;
a97b9060 9228 case REDIS_HASH:
9229 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9230 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9231 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9232 unsigned int klen, vlen;
9233 unsigned char *key, *val;
9234
9235 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9236 klen = 0;
9237 vlen = 0;
9238 }
9239 asize = len*(klen+vlen+3);
9240 } else if (o->encoding == REDIS_ENCODING_HT) {
9241 d = o->ptr;
9242 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9243 if (dictSize(d)) {
9244 long elesize;
9245 robj *ele;
9246
9247 de = dictGetRandomKey(d);
9248 ele = dictGetEntryKey(de);
9249 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9250 (sizeof(*o)+sdslen(ele->ptr)) :
9251 sizeof(*o);
9252 ele = dictGetEntryVal(de);
9253 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9254 (sizeof(*o)+sdslen(ele->ptr)) :
9255 sizeof(*o);
9256 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9257 }
9258 }
9259 break;
4ef8de8a 9260 }
c8c72447 9261 return (double)age*log(1+asize);
4ef8de8a 9262}
9263
9264/* Try to swap an object that's a good candidate for swapping.
9265 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 9266 * to swap any object at all.
9267 *
9268 * If 'usethreaded' is true, Redis will try to swap the object in background
9269 * using I/O threads. */
9270static int vmSwapOneObject(int usethreads) {
4ef8de8a 9271 int j, i;
9272 struct dictEntry *best = NULL;
9273 double best_swappability = 0;
b9bc0eef 9274 redisDb *best_db = NULL;
4ef8de8a 9275 robj *key, *val;
9276
9277 for (j = 0; j < server.dbnum; j++) {
9278 redisDb *db = server.db+j;
b72f6a4b 9279 /* Why maxtries is set to 100?
9280 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9281 * are swappable objects */
b0d8747d 9282 int maxtries = 100;
4ef8de8a 9283
9284 if (dictSize(db->dict) == 0) continue;
9285 for (i = 0; i < 5; i++) {
9286 dictEntry *de;
9287 double swappability;
9288
e3cadb8a 9289 if (maxtries) maxtries--;
4ef8de8a 9290 de = dictGetRandomKey(db->dict);
9291 key = dictGetEntryKey(de);
9292 val = dictGetEntryVal(de);
1064ef87 9293 /* Only swap objects that are currently in memory.
9294 *
9295 * Also don't swap shared objects if threaded VM is on, as we
9296 * try to ensure that the main thread does not touch the
9297 * object while the I/O thread is using it, but we can't
9298 * control other keys without adding additional mutex. */
9299 if (key->storage != REDIS_VM_MEMORY ||
9300 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 9301 if (maxtries) i--; /* don't count this try */
9302 continue;
9303 }
4ef8de8a 9304 swappability = computeObjectSwappability(val);
9305 if (!best || swappability > best_swappability) {
9306 best = de;
9307 best_swappability = swappability;
b9bc0eef 9308 best_db = db;
4ef8de8a 9309 }
9310 }
9311 }
7c775e09 9312 if (best == NULL) return REDIS_ERR;
4ef8de8a 9313 key = dictGetEntryKey(best);
9314 val = dictGetEntryVal(best);
9315
e3cadb8a 9316 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 9317 key->ptr, best_swappability);
9318
9319 /* Unshare the key if needed */
9320 if (key->refcount > 1) {
9321 robj *newkey = dupStringObject(key);
9322 decrRefCount(key);
9323 key = dictGetEntryKey(best) = newkey;
9324 }
9325 /* Swap it */
a69a0c9c 9326 if (usethreads) {
b9bc0eef 9327 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 9328 return REDIS_OK;
9329 } else {
a69a0c9c 9330 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9331 dictGetEntryVal(best) = NULL;
9332 return REDIS_OK;
9333 } else {
9334 return REDIS_ERR;
9335 }
4ef8de8a 9336 }
9337}
9338
/* Swap out one object synchronously. Same return value as
 * vmSwapOneObject(). Defined with a (void) prototype so that calls with
 * arguments are rejected at compile time. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9342
/* Swap out one object using the I/O threads. Same return value as
 * vmSwapOneObject(). Defined with a (void) prototype so that calls with
 * arguments are rejected at compile time. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9346
7e69548d 9347/* Return true if it's safe to swap out objects in a given moment.
9348 * Basically we don't want to swap objects out while there is a BGSAVE
9349 * or a BGAEOREWRITE running in backgroud. */
9350static int vmCanSwapOut(void) {
9351 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9352}
9353
1b03836c 9354/* Delete a key if swapped. Returns 1 if the key was found, was swapped
9355 * and was deleted. Otherwise 0 is returned. */
9356static int deleteIfSwapped(redisDb *db, robj *key) {
9357 dictEntry *de;
9358 robj *foundkey;
9359
9360 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9361 foundkey = dictGetEntryKey(de);
9362 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9363 deleteKey(db,key);
9364 return 1;
9365}
9366
996cb5f7 9367/* =================== Virtual Memory - Threaded I/O ======================= */
9368
b9bc0eef 9369static void freeIOJob(iojob *j) {
d5d55fc3 9370 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9371 j->type == REDIS_IOJOB_DO_SWAP ||
9372 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 9373 decrRefCount(j->val);
78ebe4c8 9374 /* We don't decrRefCount the j->key field as we did't incremented
9375 * the count creating IO Jobs. This is because the key field here is
9376 * just used as an indentifier and if a key is removed the Job should
9377 * never be touched again. */
b9bc0eef 9378 zfree(j);
9379}
9380
996cb5f7 9381/* Every time a thread finished a Job, it writes a byte into the write side
9382 * of an unix pipe in order to "awake" the main thread, and this function
9383 * is called. */
/* Event handler for the read side of the threaded I/O pipe (registered by
 * vmInit() on server.io_ready_pipe_read).
 *
 * 'fd' is the descriptor to read the notification bytes from; 'el',
 * 'privdata' and 'mask' are not used.
 *
 * For every byte read, the oldest job in server.io_processed is removed
 * and post-processed according to its type (LOAD, PREPARE_SWAP, DO_SWAP).
 * At most 'toprocess' non canceled jobs are handled per call, where
 * 'toprocess' is REDIS_MAX_COMPLETED_JOBS_PROCESSED percent of the queue
 * length observed on the first iteration (at least 1). */
9384static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9385 int mask)
9386{
9387 char buf[1];
b0d8747d 9388 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 9389 REDIS_NOTUSED(el);
9390 REDIS_NOTUSED(mask);
9391 REDIS_NOTUSED(privdata);
9392
9393 /* For every byte we read in the read side of the pipe, there is one
9394 * I/O job completed to process. */
9395 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 9396 iojob *j;
9397 listNode *ln;
9398 robj *key;
9399 struct dictEntry *de;
9400
996cb5f7 9401 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 9402
9403 /* Get the processed element (the oldest one) */
9404 lockThreadedIO();
1064ef87 9405 assert(listLength(server.io_processed) != 0);
/* toprocess is computed only once per call, on the first job. */
f6c0bba8 9406 if (toprocess == -1) {
9407 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9408 if (toprocess <= 0) toprocess = 1;
9409 }
b9bc0eef 9410 ln = listFirst(server.io_processed);
9411 j = ln->value;
9412 listDelNode(server.io_processed,ln);
9413 unlockThreadedIO();
9414 /* If this job is marked as canceled, just ignore it */
/* Note: canceled jobs are not counted in 'processed'. */
9415 if (j->canceled) {
9416 freeIOJob(j);
9417 continue;
9418 }
9419 /* Post process it in the main thread, as there are things we
9420 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 9421 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 9422 de = dictFind(j->db->dict,j->key);
9423 assert(de != NULL);
9424 key = dictGetEntryKey(de);
9425 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9426 redisDb *db;
9427
b9bc0eef 9428 /* Key loaded, bring it at home */
9429 key->storage = REDIS_VM_MEMORY;
9430 key->vm.atime = server.unixtime;
9431 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9432 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9433 (unsigned char*) key->ptr);
9434 server.vm_stats_swapped_objects--;
9435 server.vm_stats_swapins++;
/* freeIOJob() drops one reference to j->val for LOAD jobs, so an extra
 * reference is taken here for the dictionary entry that now holds it.
 * j->db is saved first because 'j' is not valid after freeIOJob(). */
d5d55fc3 9436 dictGetEntryVal(de) = j->val;
9437 incrRefCount(j->val);
9438 db = j->db;
b9bc0eef 9439 freeIOJob(j);
d5d55fc3 9440 /* Handle clients waiting for this key to be loaded. */
9441 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 9442 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9443 /* Now we know the amount of pages required to swap this object.
9444 * Let's find some space for it, and queue this task again
9445 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 9446 if (!vmCanSwapOut() ||
9447 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9448 {
9449 /* Ooops... no space or we can't swap as there is
9450 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 9451 freeIOJob(j);
054e426d 9452 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 9453 } else {
c7df85a4 9454 /* Note that we need to mark this pages as used now,
9455 * if the job will be canceled, we'll mark them as freed
9456 * again. */
9457 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 9458 j->type = REDIS_IOJOB_DO_SWAP;
9459 lockThreadedIO();
9460 queueIOJob(j);
9461 unlockThreadedIO();
9462 }
9463 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9464 robj *val;
9465
9466 /* Key swapped. We can finally free some memory. */
/* Debugging aid: dump the job state on stdout right before the
 * assertion below fails. */
6c96ba7d 9467 if (key->storage != REDIS_VM_SWAPPING) {
9468 printf("key->storage: %d\n",key->storage);
9469 printf("key->name: %s\n",(char*)key->ptr);
9470 printf("key->refcount: %d\n",key->refcount);
9471 printf("val: %p\n",(void*)j->val);
9472 printf("val->type: %d\n",j->val->type);
9473 printf("val->ptr: %s\n",(char*)j->val->ptr);
9474 }
9475 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 9476 val = dictGetEntryVal(de);
9477 key->vm.page = j->page;
9478 key->vm.usedpages = j->pages;
9479 key->storage = REDIS_VM_SWAPPED;
9480 key->vtype = j->val->type;
9481 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 9482 dictGetEntryVal(de) = NULL;
b9bc0eef 9483 redisLog(REDIS_DEBUG,
9484 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9485 (unsigned char*) key->ptr,
9486 (unsigned long long) j->page, (unsigned long long) j->pages);
9487 server.vm_stats_swapped_objects++;
9488 server.vm_stats_swapouts++;
9489 freeIOJob(j);
f11b8647 9490 /* Put a few more swap requests in queue if we are still
9491 * out of memory */
/* 'trytoswap' is cleared the first time no swappable object is found,
 * so no further attempts are made for the rest of this call. */
b0d8747d 9492 if (trytoswap && vmCanSwapOut() &&
9493 zmalloc_used_memory() > server.vm_max_memory)
9494 {
f11b8647 9495 int more = 1;
/* 'more' is sampled before the swap attempt: one swap is always
 * attempted per iteration, and the loop stops after the iteration in
 * which the new jobs queue was found to already hold at least
 * vm_max_threads jobs. */
9496 while(more) {
9497 lockThreadedIO();
9498 more = listLength(server.io_newjobs) <
9499 (unsigned) server.vm_max_threads;
9500 unlockThreadedIO();
9501 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 9502 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9503 trytoswap = 0;
9504 break;
9505 }
f11b8647 9506 }
9507 }
b9bc0eef 9508 }
c953f24b 9509 processed++;
f6c0bba8 9510 if (processed == toprocess) return;
996cb5f7 9511 }
/* The loop ends when read(2) does not return exactly one byte; only
 * errors other than EAGAIN are reported. vmInit() sets
 * server.io_ready_pipe_read non-blocking, so EAGAIN presumably just means
 * the pipe is empty (assuming 'fd' is that descriptor). */
9512 if (retval < 0 && errno != EAGAIN) {
9513 redisLog(REDIS_WARNING,
9514 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9515 strerror(errno));
9516 }
9517}
9518
9519static void lockThreadedIO(void) {
9520 pthread_mutex_lock(&server.io_mutex);
9521}
9522
9523static void unlockThreadedIO(void) {
9524 pthread_mutex_unlock(&server.io_mutex);
9525}
9526
9527/* Remove the specified object from the threaded I/O queue if still not
9528 * processed, otherwise make sure to flag it as canceled. */
/* 'o' is the key object identifying the job (jobs are matched by pointer
 * against job->key); its storage must be REDIS_VM_LOADING or
 * REDIS_VM_SWAPPING. The three job lists are searched while holding the
 * threaded I/O lock. Once the job is found and canceled, the storage of 'o'
 * is reverted (LOADING -> SWAPPED, SWAPPING -> MEMORY). If the job is
 * currently being processed by a thread, the function releases the lock,
 * sleeps briefly and retries from the start until the job leaves the
 * processing list. */
9529static void vmCancelThreadedIOJob(robj *o) {
9530 list *lists[3] = {
6c96ba7d 9531 server.io_newjobs, /* 0 */
9532 server.io_processing, /* 1 */
9533 server.io_processed /* 2 */
996cb5f7 9534 };
9535 int i;
9536
9537 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 9538again:
996cb5f7 9539 lockThreadedIO();
9540 /* Search for a matching key in one of the queues */
9541 for (i = 0; i < 3; i++) {
9542 listNode *ln;
c7df85a4 9543 listIter li;
996cb5f7 9544
c7df85a4 9545 listRewind(lists[i],&li);
9546 while ((ln = listNext(&li)) != NULL) {
996cb5f7 9547 iojob *job = ln->value;
9548
6c96ba7d 9549 if (job->canceled) continue; /* Skip this, already canceled. */
78ebe4c8 9550 if (job->key == o) {
970e10bb 9551 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9552 (void*)job, (char*)o->ptr, job->type, i);
427a2153 9553 /* Mark the pages as free since the swap didn't happened
9554 * or happened but is now discarded. */
/* Skipped for the processing list (i == 1): that case retries via
 * 'goto again', and the pages are freed once the job is found in
 * another list. */
970e10bb 9555 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 9556 vmMarkPagesFree(job->page,job->pages);
9557 /* Cancel the job. It depends on the list the job is
9558 * living in. */
996cb5f7 9559 switch(i) {
9560 case 0: /* io_newjobs */
6c96ba7d 9561 /* If the job was yet not processed the best thing to do
996cb5f7 9562 * is to remove it from the queue at all */
6c96ba7d 9563 freeIOJob(job);
996cb5f7 9564 listDelNode(lists[i],ln);
9565 break;
9566 case 1: /* io_processing */
d5d55fc3 9567 /* Oh Shi- the thread is messing with the Job:
9568 *
9569 * Probably it's accessing the object if this is a
9570 * PREPARE_SWAP or DO_SWAP job.
9571 * If it's a LOAD job it may be reading from disk and
9572 * if we don't wait for the job to terminate before to
9573 * cancel it, maybe in a few microseconds data can be
9574 * corrupted in this pages. So the short story is:
9575 *
9576 * Better to wait for the job to move into the
9577 * next queue (processed)... */
9578
9579 /* We try again and again until the job is completed. */
9580 unlockThreadedIO();
9581 /* But let's wait some time for the I/O thread
9582 * to finish with this job. After all this condition
9583 * should be very rare. */
9584 usleep(1);
9585 goto again;
996cb5f7 9586 case 2: /* io_processed */
2e111efe 9587 /* The job was already processed, that's easy...
9588 * just mark it as canceled so that we'll ignore it
9589 * when processing completed jobs. */
996cb5f7 9590 job->canceled = 1;
9591 break;
9592 }
c7df85a4 9593 /* Finally we have to adjust the storage type of the object
9594 * in order to "UNDO" the operaiton. */
996cb5f7 9595 if (o->storage == REDIS_VM_LOADING)
9596 o->storage = REDIS_VM_SWAPPED;
9597 else if (o->storage == REDIS_VM_SWAPPING)
9598 o->storage = REDIS_VM_MEMORY;
9599 unlockThreadedIO();
9600 return;
9601 }
9602 }
9603 }
9604 unlockThreadedIO();
/* No job for 'o' in any list: given the storage state asserted on entry
 * this is treated as a programming error. */
9605 assert(1 != 1); /* We should never reach this */
9606}
9607
b9bc0eef 9608static void *IOThreadEntryPoint(void *arg) {
9609 iojob *j;
9610 listNode *ln;
9611 REDIS_NOTUSED(arg);
9612
9613 pthread_detach(pthread_self());
9614 while(1) {
9615 /* Get a new job to process */
9616 lockThreadedIO();
9617 if (listLength(server.io_newjobs) == 0) {
9618 /* No new jobs in queue, exit. */
9ebed7cf 9619 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9620 (long) pthread_self());
b9bc0eef 9621 server.io_active_threads--;
9622 unlockThreadedIO();
9623 return NULL;
9624 }
9625 ln = listFirst(server.io_newjobs);
9626 j = ln->value;
9627 listDelNode(server.io_newjobs,ln);
9628 /* Add the job in the processing queue */
9629 j->thread = pthread_self();
9630 listAddNodeTail(server.io_processing,j);
9631 ln = listLast(server.io_processing); /* We use ln later to remove it */
9632 unlockThreadedIO();
9ebed7cf 9633 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9634 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 9635
9636 /* Process the Job */
9637 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9638 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 9639 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9640 FILE *fp = fopen("/dev/null","w+");
9641 j->pages = rdbSavedObjectPages(j->val,fp);
9642 fclose(fp);
9643 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 9644 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9645 j->canceled = 1;
b9bc0eef 9646 }
9647
9648 /* Done: insert the job into the processed queue */
9ebed7cf 9649 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9650 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 9651 lockThreadedIO();
9652 listDelNode(server.io_processing,ln);
9653 listAddNodeTail(server.io_processed,j);
9654 unlockThreadedIO();
e0a62c7f 9655
b9bc0eef 9656 /* Signal the main thread there is new stuff to process */
9657 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9658 }
9659 return NULL; /* never reached */
9660}
9661
9662static void spawnIOThread(void) {
9663 pthread_t thread;
478c2c6f 9664 sigset_t mask, omask;
a97b9060 9665 int err;
b9bc0eef 9666
478c2c6f 9667 sigemptyset(&mask);
9668 sigaddset(&mask,SIGCHLD);
9669 sigaddset(&mask,SIGHUP);
9670 sigaddset(&mask,SIGPIPE);
9671 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 9672 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9673 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9674 strerror(err));
9675 usleep(1000000);
9676 }
478c2c6f 9677 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 9678 server.io_active_threads++;
9679}
9680
4ee9488d 9681/* We need to wait for the last thread to exit before we are able to
9682 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 9683static void waitEmptyIOJobsQueue(void) {
4ee9488d 9684 while(1) {
76b7233a 9685 int io_processed_len;
9686
4ee9488d 9687 lockThreadedIO();
054e426d 9688 if (listLength(server.io_newjobs) == 0 &&
9689 listLength(server.io_processing) == 0 &&
9690 server.io_active_threads == 0)
9691 {
4ee9488d 9692 unlockThreadedIO();
9693 return;
9694 }
76b7233a 9695 /* While waiting for empty jobs queue condition we post-process some
9696 * finshed job, as I/O threads may be hanging trying to write against
9697 * the io_ready_pipe_write FD but there are so much pending jobs that
9698 * it's blocking. */
9699 io_processed_len = listLength(server.io_processed);
4ee9488d 9700 unlockThreadedIO();
76b7233a 9701 if (io_processed_len) {
9702 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9703 usleep(1000); /* 1 millisecond */
9704 } else {
9705 usleep(10000); /* 10 milliseconds */
9706 }
4ee9488d 9707 }
9708}
9709
054e426d 9710static void vmReopenSwapFile(void) {
478c2c6f 9711 /* Note: we don't close the old one as we are in the child process
9712 * and don't want to mess at all with the original file object. */
054e426d 9713 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9714 if (server.vm_fp == NULL) {
9715 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9716 server.vm_swap_file);
478c2c6f 9717 _exit(1);
054e426d 9718 }
9719 server.vm_fd = fileno(server.vm_fp);
9720}
9721
b9bc0eef 9722/* This function must be called while with threaded IO locked */
9723static void queueIOJob(iojob *j) {
6c96ba7d 9724 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9725 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 9726 listAddNodeTail(server.io_newjobs,j);
9727 if (server.io_active_threads < server.vm_max_threads)
9728 spawnIOThread();
9729}
9730
9731static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9732 iojob *j;
e0a62c7f 9733
b9bc0eef 9734 assert(key->storage == REDIS_VM_MEMORY);
9735 assert(key->refcount == 1);
9736
9737 j = zmalloc(sizeof(*j));
9738 j->type = REDIS_IOJOB_PREPARE_SWAP;
9739 j->db = db;
78ebe4c8 9740 j->key = key;
b9bc0eef 9741 j->val = val;
9742 incrRefCount(val);
9743 j->canceled = 0;
9744 j->thread = (pthread_t) -1;
f11b8647 9745 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 9746
9747 lockThreadedIO();
9748 queueIOJob(j);
9749 unlockThreadedIO();
9750 return REDIS_OK;
9751}
9752
b0d8747d 9753/* ============ Virtual Memory - Blocking clients on missing keys =========== */
9754
d5d55fc3 9755/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9756 * If there is not already a job loading the key, it is craeted.
9757 * The key is added to the io_keys list in the client structure, and also
9758 * in the hash table mapping swapped keys to waiting clients, that is,
9759 * server.io_waited_keys. */
9760static int waitForSwappedKey(redisClient *c, robj *key) {
9761 struct dictEntry *de;
9762 robj *o;
9763 list *l;
9764
9765 /* If the key does not exist or is already in RAM we don't need to
9766 * block the client at all. */
9767 de = dictFind(c->db->dict,key);
9768 if (de == NULL) return 0;
9769 o = dictGetEntryKey(de);
9770 if (o->storage == REDIS_VM_MEMORY) {
9771 return 0;
9772 } else if (o->storage == REDIS_VM_SWAPPING) {
9773 /* We were swapping the key, undo it! */
9774 vmCancelThreadedIOJob(o);
9775 return 0;
9776 }
e0a62c7f 9777
d5d55fc3 9778 /* OK: the key is either swapped, or being loaded just now. */
9779
9780 /* Add the key to the list of keys this client is waiting for.
9781 * This maps clients to keys they are waiting for. */
9782 listAddNodeTail(c->io_keys,key);
9783 incrRefCount(key);
9784
9785 /* Add the client to the swapped keys => clients waiting map. */
9786 de = dictFind(c->db->io_keys,key);
9787 if (de == NULL) {
9788 int retval;
9789
9790 /* For every key we take a list of clients blocked for it */
9791 l = listCreate();
9792 retval = dictAdd(c->db->io_keys,key,l);
9793 incrRefCount(key);
9794 assert(retval == DICT_OK);
9795 } else {
9796 l = dictGetEntryVal(de);
9797 }
9798 listAddNodeTail(l,c);
9799
9800 /* Are we already loading the key from disk? If not create a job */
9801 if (o->storage == REDIS_VM_SWAPPED) {
9802 iojob *j;
9803
9804 o->storage = REDIS_VM_LOADING;
9805 j = zmalloc(sizeof(*j));
9806 j->type = REDIS_IOJOB_LOAD;
9807 j->db = c->db;
78ebe4c8 9808 j->key = o;
d5d55fc3 9809 j->key->vtype = o->vtype;
9810 j->page = o->vm.page;
9811 j->val = NULL;
9812 j->canceled = 0;
9813 j->thread = (pthread_t) -1;
9814 lockThreadedIO();
9815 queueIOJob(j);
9816 unlockThreadedIO();
9817 }
9818 return 1;
9819}
9820
6f078746
PN
9821/* Preload keys for any command with first, last and step values for
9822 * the command keys prototype, as defined in the command table. */
9823static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9824 int j, last;
9825 if (cmd->vm_firstkey == 0) return;
9826 last = cmd->vm_lastkey;
9827 if (last < 0) last = argc+last;
9828 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9829 redisAssert(j < argc);
9830 waitForSwappedKey(c,argv[j]);
9831 }
9832}
9833
5d373da9 9834/* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
739ba0d2
PN
9835 * Note that the number of keys to preload is user-defined, so we need to
9836 * apply a sanity check against argc. */
ca1788b5 9837static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
76583ea4 9838 int i, num;
ca1788b5 9839 REDIS_NOTUSED(cmd);
ca1788b5
PN
9840
9841 num = atoi(argv[2]->ptr);
739ba0d2 9842 if (num > (argc-3)) return;
76583ea4 9843 for (i = 0; i < num; i++) {
ca1788b5 9844 waitForSwappedKey(c,argv[3+i]);
76583ea4
PN
9845 }
9846}
9847
3805e04f
PN
9848/* Preload keys needed to execute the entire MULTI/EXEC block.
9849 *
9850 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9851 * and will block the client when any command requires a swapped out value. */
9852static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9853 int i, margc;
9854 struct redisCommand *mcmd;
9855 robj **margv;
9856 REDIS_NOTUSED(cmd);
9857 REDIS_NOTUSED(argc);
9858 REDIS_NOTUSED(argv);
9859
9860 if (!(c->flags & REDIS_MULTI)) return;
9861 for (i = 0; i < c->mstate.count; i++) {
9862 mcmd = c->mstate.commands[i].cmd;
9863 margc = c->mstate.commands[i].argc;
9864 margv = c->mstate.commands[i].argv;
9865
9866 if (mcmd->vm_preload_proc != NULL) {
9867 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9868 } else {
9869 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9870 }
76583ea4
PN
9871 }
9872}
9873
b0d8747d 9874/* Is this client attempting to run a command against swapped keys?
d5d55fc3 9875 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 9876 *
d5d55fc3 9877 * The important idea about this function is that it can fail! If keys will
9878 * still be swapped when the client is resumed, this key lookups will
9879 * just block loading keys from disk. In practical terms this should only
9880 * happen with SORT BY command or if there is a bug in this function.
9881 *
9882 * Return 1 if the client is marked as blocked, 0 if the client can
9883 * continue as the keys it is going to access appear to be in memory. */
0a6f3f0f 9884static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
76583ea4 9885 if (cmd->vm_preload_proc != NULL) {
ca1788b5 9886 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
76583ea4 9887 } else {
6f078746 9888 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
76583ea4
PN
9889 }
9890
d5d55fc3 9891 /* If the client was blocked for at least one key, mark it as blocked. */
9892 if (listLength(c->io_keys)) {
9893 c->flags |= REDIS_IO_WAIT;
9894 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9895 server.vm_blocked_clients++;
9896 return 1;
9897 } else {
9898 return 0;
9899 }
9900}
9901
9902/* Remove the 'key' from the list of blocked keys for a given client.
9903 *
9904 * The function returns 1 when there are no longer blocking keys after
9905 * the current one was removed (and the client can be unblocked). */
9906static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9907 list *l;
9908 listNode *ln;
9909 listIter li;
9910 struct dictEntry *de;
9911
9912 /* Remove the key from the list of keys this client is waiting for. */
9913 listRewind(c->io_keys,&li);
9914 while ((ln = listNext(&li)) != NULL) {
bf028098 9915 if (equalStringObjects(ln->value,key)) {
d5d55fc3 9916 listDelNode(c->io_keys,ln);
9917 break;
9918 }
9919 }
9920 assert(ln != NULL);
9921
9922 /* Remove the client form the key => waiting clients map. */
9923 de = dictFind(c->db->io_keys,key);
9924 assert(de != NULL);
9925 l = dictGetEntryVal(de);
9926 ln = listSearchKey(l,c);
9927 assert(ln != NULL);
9928 listDelNode(l,ln);
9929 if (listLength(l) == 0)
9930 dictDelete(c->db->io_keys,key);
9931
9932 return listLength(c->io_keys) == 0;
9933}
9934
9935static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9936 struct dictEntry *de;
9937 list *l;
9938 listNode *ln;
9939 int len;
9940
9941 de = dictFind(db->io_keys,key);
9942 if (!de) return;
9943
9944 l = dictGetEntryVal(de);
9945 len = listLength(l);
9946 /* Note: we can't use something like while(listLength(l)) as the list
9947 * can be freed by the calling function when we remove the last element. */
9948 while (len--) {
9949 ln = listFirst(l);
9950 redisClient *c = ln->value;
9951
9952 if (dontWaitForSwappedKey(c,key)) {
9953 /* Put the client in the list of clients ready to go as we
9954 * loaded all the keys about it. */
9955 listAddNodeTail(server.io_ready_clients,c);
9956 }
9957 }
b0d8747d 9958}
b0d8747d 9959
500ece7c 9960/* =========================== Remote Configuration ========================= */
9961
9962static void configSetCommand(redisClient *c) {
9963 robj *o = getDecodedObject(c->argv[3]);
2e5eb04e 9964 long long ll;
9965
500ece7c 9966 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9967 zfree(server.dbfilename);
9968 server.dbfilename = zstrdup(o->ptr);
9969 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9970 zfree(server.requirepass);
9971 server.requirepass = zstrdup(o->ptr);
9972 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9973 zfree(server.masterauth);
9974 server.masterauth = zstrdup(o->ptr);
9975 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
2e5eb04e 9976 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9977 ll < 0) goto badfmt;
9978 server.maxmemory = ll;
9979 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
9980 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9981 ll < 0 || ll > LONG_MAX) goto badfmt;
9982 server.maxidletime = ll;
1b677732 9983 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
9984 if (!strcasecmp(o->ptr,"no")) {
9985 server.appendfsync = APPENDFSYNC_NO;
9986 } else if (!strcasecmp(o->ptr,"everysec")) {
9987 server.appendfsync = APPENDFSYNC_EVERYSEC;
9988 } else if (!strcasecmp(o->ptr,"always")) {
9989 server.appendfsync = APPENDFSYNC_ALWAYS;
9990 } else {
9991 goto badfmt;
9992 }
2e5eb04e 9993 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
9994 int old = server.appendonly;
9995 int new = yesnotoi(o->ptr);
9996
9997 if (new == -1) goto badfmt;
9998 if (old != new) {
9999 if (new == 0) {
10000 stopAppendOnly();
10001 } else {
10002 if (startAppendOnly() == REDIS_ERR) {
10003 addReplySds(c,sdscatprintf(sdsempty(),
10004 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10005 decrRefCount(o);
10006 return;
10007 }
10008 }
10009 }
a34e0a25 10010 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10011 int vlen, j;
10012 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10013
10014 /* Perform sanity check before setting the new config:
10015 * - Even number of args
10016 * - Seconds >= 1, changes >= 0 */
10017 if (vlen & 1) {
10018 sdsfreesplitres(v,vlen);
10019 goto badfmt;
10020 }
10021 for (j = 0; j < vlen; j++) {
10022 char *eptr;
10023 long val;
10024
10025 val = strtoll(v[j], &eptr, 10);
10026 if (eptr[0] != '\0' ||
10027 ((j & 1) == 0 && val < 1) ||
10028 ((j & 1) == 1 && val < 0)) {
10029 sdsfreesplitres(v,vlen);
10030 goto badfmt;
10031 }
10032 }
10033 /* Finally set the new config */
10034 resetServerSaveParams();
10035 for (j = 0; j < vlen; j += 2) {
10036 time_t seconds;
10037 int changes;
10038
10039 seconds = strtoll(v[j],NULL,10);
10040 changes = strtoll(v[j+1],NULL,10);
10041 appendServerSaveParams(seconds, changes);
10042 }
10043 sdsfreesplitres(v,vlen);
500ece7c 10044 } else {
10045 addReplySds(c,sdscatprintf(sdsempty(),
10046 "-ERR not supported CONFIG parameter %s\r\n",
10047 (char*)c->argv[2]->ptr));
10048 decrRefCount(o);
10049 return;
10050 }
10051 decrRefCount(o);
10052 addReply(c,shared.ok);
a34e0a25 10053 return;
10054
10055badfmt: /* Bad format errors */
10056 addReplySds(c,sdscatprintf(sdsempty(),
10057 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10058 (char*)o->ptr,
10059 (char*)c->argv[2]->ptr));
10060 decrRefCount(o);
500ece7c 10061}
10062
10063static void configGetCommand(redisClient *c) {
10064 robj *o = getDecodedObject(c->argv[2]);
10065 robj *lenobj = createObject(REDIS_STRING,NULL);
10066 char *pattern = o->ptr;
10067 int matches = 0;
10068
10069 addReply(c,lenobj);
10070 decrRefCount(lenobj);
10071
10072 if (stringmatch(pattern,"dbfilename",0)) {
10073 addReplyBulkCString(c,"dbfilename");
10074 addReplyBulkCString(c,server.dbfilename);
10075 matches++;
10076 }
10077 if (stringmatch(pattern,"requirepass",0)) {
10078 addReplyBulkCString(c,"requirepass");
10079 addReplyBulkCString(c,server.requirepass);
10080 matches++;
10081 }
10082 if (stringmatch(pattern,"masterauth",0)) {
10083 addReplyBulkCString(c,"masterauth");
10084 addReplyBulkCString(c,server.masterauth);
10085 matches++;
10086 }
10087 if (stringmatch(pattern,"maxmemory",0)) {
10088 char buf[128];
10089
2e5eb04e 10090 ll2string(buf,128,server.maxmemory);
500ece7c 10091 addReplyBulkCString(c,"maxmemory");
10092 addReplyBulkCString(c,buf);
10093 matches++;
10094 }
2e5eb04e 10095 if (stringmatch(pattern,"timeout",0)) {
10096 char buf[128];
10097
10098 ll2string(buf,128,server.maxidletime);
10099 addReplyBulkCString(c,"timeout");
10100 addReplyBulkCString(c,buf);
10101 matches++;
10102 }
10103 if (stringmatch(pattern,"appendonly",0)) {
10104 addReplyBulkCString(c,"appendonly");
10105 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10106 matches++;
10107 }
1b677732 10108 if (stringmatch(pattern,"appendfsync",0)) {
10109 char *policy;
10110
10111 switch(server.appendfsync) {
10112 case APPENDFSYNC_NO: policy = "no"; break;
10113 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10114 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10115 default: policy = "unknown"; break; /* too harmless to panic */
10116 }
10117 addReplyBulkCString(c,"appendfsync");
10118 addReplyBulkCString(c,policy);
10119 matches++;
10120 }
a34e0a25 10121 if (stringmatch(pattern,"save",0)) {
10122 sds buf = sdsempty();
10123 int j;
10124
10125 for (j = 0; j < server.saveparamslen; j++) {
10126 buf = sdscatprintf(buf,"%ld %d",
10127 server.saveparams[j].seconds,
10128 server.saveparams[j].changes);
10129 if (j != server.saveparamslen-1)
10130 buf = sdscatlen(buf," ",1);
10131 }
10132 addReplyBulkCString(c,"save");
10133 addReplyBulkCString(c,buf);
10134 sdsfree(buf);
10135 matches++;
10136 }
500ece7c 10137 decrRefCount(o);
10138 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10139}
10140
10141static void configCommand(redisClient *c) {
10142 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10143 if (c->argc != 4) goto badarity;
10144 configSetCommand(c);
10145 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10146 if (c->argc != 3) goto badarity;
10147 configGetCommand(c);
10148 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10149 if (c->argc != 2) goto badarity;
10150 server.stat_numcommands = 0;
10151 server.stat_numconnections = 0;
10152 server.stat_expiredkeys = 0;
10153 server.stat_starttime = time(NULL);
10154 addReply(c,shared.ok);
10155 } else {
10156 addReplySds(c,sdscatprintf(sdsempty(),
10157 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10158 }
10159 return;
10160
10161badarity:
10162 addReplySds(c,sdscatprintf(sdsempty(),
10163 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10164 (char*) c->argv[1]->ptr));
10165}
10166
befec3cd 10167/* =========================== Pubsub implementation ======================== */
10168
ffc6b7f8 10169static void freePubsubPattern(void *p) {
10170 pubsubPattern *pat = p;
10171
10172 decrRefCount(pat->pattern);
10173 zfree(pat);
10174}
10175
10176static int listMatchPubsubPattern(void *a, void *b) {
10177 pubsubPattern *pa = a, *pb = b;
10178
10179 return (pa->client == pb->client) &&
bf028098 10180 (equalStringObjects(pa->pattern,pb->pattern));
ffc6b7f8 10181}
10182
10183/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10184 * 0 if the client was already subscribed to that channel. */
10185static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 10186 struct dictEntry *de;
10187 list *clients = NULL;
10188 int retval = 0;
10189
ffc6b7f8 10190 /* Add the channel to the client -> channels hash table */
10191 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 10192 retval = 1;
ffc6b7f8 10193 incrRefCount(channel);
10194 /* Add the client to the channel -> list of clients hash table */
10195 de = dictFind(server.pubsub_channels,channel);
befec3cd 10196 if (de == NULL) {
10197 clients = listCreate();
ffc6b7f8 10198 dictAdd(server.pubsub_channels,channel,clients);
10199 incrRefCount(channel);
befec3cd 10200 } else {
10201 clients = dictGetEntryVal(de);
10202 }
10203 listAddNodeTail(clients,c);
10204 }
10205 /* Notify the client */
10206 addReply(c,shared.mbulk3);
10207 addReply(c,shared.subscribebulk);
ffc6b7f8 10208 addReplyBulk(c,channel);
482b672d 10209 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 10210 return retval;
10211}
10212
ffc6b7f8 10213/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10214 * 0 if the client was not subscribed to the specified channel. */
10215static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 10216 struct dictEntry *de;
10217 list *clients;
10218 listNode *ln;
10219 int retval = 0;
10220
ffc6b7f8 10221 /* Remove the channel from the client -> channels hash table */
10222 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 10223 we have in the hash tables. Protect it... */
ffc6b7f8 10224 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 10225 retval = 1;
ffc6b7f8 10226 /* Remove the client from the channel -> clients list hash table */
10227 de = dictFind(server.pubsub_channels,channel);
befec3cd 10228 assert(de != NULL);
10229 clients = dictGetEntryVal(de);
10230 ln = listSearchKey(clients,c);
10231 assert(ln != NULL);
10232 listDelNode(clients,ln);
ff767a75 10233 if (listLength(clients) == 0) {
10234 /* Free the list and associated hash entry at all if this was
10235 * the latest client, so that it will be possible to abuse
ffc6b7f8 10236 * Redis PUBSUB creating millions of channels. */
10237 dictDelete(server.pubsub_channels,channel);
ff767a75 10238 }
befec3cd 10239 }
10240 /* Notify the client */
10241 if (notify) {
10242 addReply(c,shared.mbulk3);
10243 addReply(c,shared.unsubscribebulk);
ffc6b7f8 10244 addReplyBulk(c,channel);
482b672d 10245 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10246 listLength(c->pubsub_patterns));
10247
10248 }
10249 decrRefCount(channel); /* it is finally safe to release it */
10250 return retval;
10251}
10252
10253/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10254static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10255 int retval = 0;
10256
10257 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10258 retval = 1;
10259 pubsubPattern *pat;
10260 listAddNodeTail(c->pubsub_patterns,pattern);
10261 incrRefCount(pattern);
10262 pat = zmalloc(sizeof(*pat));
10263 pat->pattern = getDecodedObject(pattern);
10264 pat->client = c;
10265 listAddNodeTail(server.pubsub_patterns,pat);
10266 }
10267 /* Notify the client */
10268 addReply(c,shared.mbulk3);
10269 addReply(c,shared.psubscribebulk);
10270 addReplyBulk(c,pattern);
482b672d 10271 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
ffc6b7f8 10272 return retval;
10273}
10274
10275/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10276 * 0 if the client was not subscribed to the specified channel. */
10277static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10278 listNode *ln;
10279 pubsubPattern pat;
10280 int retval = 0;
10281
10282 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10283 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10284 retval = 1;
10285 listDelNode(c->pubsub_patterns,ln);
10286 pat.client = c;
10287 pat.pattern = pattern;
10288 ln = listSearchKey(server.pubsub_patterns,&pat);
10289 listDelNode(server.pubsub_patterns,ln);
10290 }
10291 /* Notify the client */
10292 if (notify) {
10293 addReply(c,shared.mbulk3);
10294 addReply(c,shared.punsubscribebulk);
10295 addReplyBulk(c,pattern);
482b672d 10296 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10297 listLength(c->pubsub_patterns));
befec3cd 10298 }
ffc6b7f8 10299 decrRefCount(pattern);
befec3cd 10300 return retval;
10301}
10302
ffc6b7f8 10303/* Unsubscribe from all the channels. Return the number of channels the
10304 * client was subscribed from. */
10305static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10306 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 10307 dictEntry *de;
10308 int count = 0;
10309
10310 while((de = dictNext(di)) != NULL) {
ffc6b7f8 10311 robj *channel = dictGetEntryKey(de);
befec3cd 10312
ffc6b7f8 10313 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 10314 }
10315 dictReleaseIterator(di);
10316 return count;
10317}
10318
ffc6b7f8 10319/* Unsubscribe from all the patterns. Return the number of patterns the
10320 * client was subscribed from. */
10321static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10322 listNode *ln;
10323 listIter li;
10324 int count = 0;
10325
10326 listRewind(c->pubsub_patterns,&li);
10327 while ((ln = listNext(&li)) != NULL) {
10328 robj *pattern = ln->value;
10329
10330 count += pubsubUnsubscribePattern(c,pattern,notify);
10331 }
10332 return count;
10333}
10334
befec3cd 10335/* Publish a message */
ffc6b7f8 10336static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 10337 int receivers = 0;
10338 struct dictEntry *de;
ffc6b7f8 10339 listNode *ln;
10340 listIter li;
befec3cd 10341
ffc6b7f8 10342 /* Send to clients listening for that channel */
10343 de = dictFind(server.pubsub_channels,channel);
befec3cd 10344 if (de) {
10345 list *list = dictGetEntryVal(de);
10346 listNode *ln;
10347 listIter li;
10348
10349 listRewind(list,&li);
10350 while ((ln = listNext(&li)) != NULL) {
10351 redisClient *c = ln->value;
10352
10353 addReply(c,shared.mbulk3);
10354 addReply(c,shared.messagebulk);
ffc6b7f8 10355 addReplyBulk(c,channel);
befec3cd 10356 addReplyBulk(c,message);
10357 receivers++;
10358 }
10359 }
ffc6b7f8 10360 /* Send to clients listening to matching channels */
10361 if (listLength(server.pubsub_patterns)) {
10362 listRewind(server.pubsub_patterns,&li);
10363 channel = getDecodedObject(channel);
10364 while ((ln = listNext(&li)) != NULL) {
10365 pubsubPattern *pat = ln->value;
10366
10367 if (stringmatchlen((char*)pat->pattern->ptr,
10368 sdslen(pat->pattern->ptr),
10369 (char*)channel->ptr,
10370 sdslen(channel->ptr),0)) {
c8d0ea0e 10371 addReply(pat->client,shared.mbulk4);
10372 addReply(pat->client,shared.pmessagebulk);
10373 addReplyBulk(pat->client,pat->pattern);
ffc6b7f8 10374 addReplyBulk(pat->client,channel);
10375 addReplyBulk(pat->client,message);
10376 receivers++;
10377 }
10378 }
10379 decrRefCount(channel);
10380 }
befec3cd 10381 return receivers;
10382}
10383
10384static void subscribeCommand(redisClient *c) {
10385 int j;
10386
10387 for (j = 1; j < c->argc; j++)
ffc6b7f8 10388 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 10389}
10390
10391static void unsubscribeCommand(redisClient *c) {
10392 if (c->argc == 1) {
ffc6b7f8 10393 pubsubUnsubscribeAllChannels(c,1);
10394 return;
10395 } else {
10396 int j;
10397
10398 for (j = 1; j < c->argc; j++)
10399 pubsubUnsubscribeChannel(c,c->argv[j],1);
10400 }
10401}
10402
10403static void psubscribeCommand(redisClient *c) {
10404 int j;
10405
10406 for (j = 1; j < c->argc; j++)
10407 pubsubSubscribePattern(c,c->argv[j]);
10408}
10409
10410static void punsubscribeCommand(redisClient *c) {
10411 if (c->argc == 1) {
10412 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 10413 return;
10414 } else {
10415 int j;
10416
10417 for (j = 1; j < c->argc; j++)
ffc6b7f8 10418 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 10419 }
10420}
10421
10422static void publishCommand(redisClient *c) {
10423 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
482b672d 10424 addReplyLongLong(c,receivers);
befec3cd 10425}
10426
37ab76c9 10427/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10428 *
10429 * The implementation uses a per-DB hash table mapping keys to list of clients
10430 * WATCHing those keys, so that given a key that is going to be modified
10431 * we can mark all the associated clients as dirty.
10432 *
10433 * Also every client contains a list of WATCHed keys so that's possible to
10434 * un-watch such keys when the client is freed or when UNWATCH is called. */
10435
10436/* In the client->watched_keys list we need to use watchedKey structures
10437 * as in order to identify a key in Redis we need both the key name and the
10438 * DB */
10439typedef struct watchedKey {
10440 robj *key;
10441 redisDb *db;
10442} watchedKey;
10443
10444/* Watch for the specified key */
10445static void watchForKey(redisClient *c, robj *key) {
10446 list *clients = NULL;
10447 listIter li;
10448 listNode *ln;
10449 watchedKey *wk;
10450
10451 /* Check if we are already watching for this key */
10452 listRewind(c->watched_keys,&li);
10453 while((ln = listNext(&li))) {
10454 wk = listNodeValue(ln);
10455 if (wk->db == c->db && equalStringObjects(key,wk->key))
10456 return; /* Key already watched */
10457 }
10458 /* This key is not already watched in this DB. Let's add it */
10459 clients = dictFetchValue(c->db->watched_keys,key);
10460 if (!clients) {
10461 clients = listCreate();
10462 dictAdd(c->db->watched_keys,key,clients);
10463 incrRefCount(key);
10464 }
10465 listAddNodeTail(clients,c);
10466 /* Add the new key to the lits of keys watched by this client */
10467 wk = zmalloc(sizeof(*wk));
10468 wk->key = key;
10469 wk->db = c->db;
10470 incrRefCount(key);
10471 listAddNodeTail(c->watched_keys,wk);
10472}
10473
10474/* Unwatch all the keys watched by this client. To clean the EXEC dirty
10475 * flag is up to the caller. */
10476static void unwatchAllKeys(redisClient *c) {
10477 listIter li;
10478 listNode *ln;
10479
10480 if (listLength(c->watched_keys) == 0) return;
10481 listRewind(c->watched_keys,&li);
10482 while((ln = listNext(&li))) {
10483 list *clients;
10484 watchedKey *wk;
10485
10486 /* Lookup the watched key -> clients list and remove the client
10487 * from the list */
10488 wk = listNodeValue(ln);
10489 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10490 assert(clients != NULL);
10491 listDelNode(clients,listSearchKey(clients,c));
10492 /* Kill the entry at all if this was the only client */
10493 if (listLength(clients) == 0)
10494 dictDelete(wk->db->watched_keys, wk->key);
10495 /* Remove this watched key from the client->watched list */
10496 listDelNode(c->watched_keys,ln);
10497 decrRefCount(wk->key);
10498 zfree(wk);
10499 }
10500}
10501
ca3f830b 10502/* "Touch" a key, so that if this key is being WATCHed by some client the
37ab76c9 10503 * next EXEC will fail. */
10504static void touchWatchedKey(redisDb *db, robj *key) {
10505 list *clients;
10506 listIter li;
10507 listNode *ln;
10508
10509 if (dictSize(db->watched_keys) == 0) return;
10510 clients = dictFetchValue(db->watched_keys, key);
10511 if (!clients) return;
10512
10513 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10514 /* Check if we are already watching for this key */
10515 listRewind(clients,&li);
10516 while((ln = listNext(&li))) {
10517 redisClient *c = listNodeValue(ln);
10518
10519 c->flags |= REDIS_DIRTY_CAS;
10520 }
10521}
10522
9b30e1a2 10523/* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10524 * flush but will be deleted as effect of the flushing operation should
10525 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10526 * a FLUSHALL operation (all the DBs flushed). */
10527static void touchWatchedKeysOnFlush(int dbid) {
10528 listIter li1, li2;
10529 listNode *ln;
10530
10531 /* For every client, check all the waited keys */
10532 listRewind(server.clients,&li1);
10533 while((ln = listNext(&li1))) {
10534 redisClient *c = listNodeValue(ln);
10535 listRewind(c->watched_keys,&li2);
10536 while((ln = listNext(&li2))) {
10537 watchedKey *wk = listNodeValue(ln);
10538
10539 /* For every watched key matching the specified DB, if the
10540 * key exists, mark the client as dirty, as the key will be
10541 * removed. */
10542 if (dbid == -1 || wk->db->id == dbid) {
10543 if (dictFind(wk->db->dict, wk->key) != NULL)
10544 c->flags |= REDIS_DIRTY_CAS;
10545 }
10546 }
10547 }
10548}
10549
37ab76c9 10550static void watchCommand(redisClient *c) {
10551 int j;
10552
6531c94d 10553 if (c->flags & REDIS_MULTI) {
10554 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
10555 return;
10556 }
37ab76c9 10557 for (j = 1; j < c->argc; j++)
10558 watchForKey(c,c->argv[j]);
10559 addReply(c,shared.ok);
10560}
10561
10562static void unwatchCommand(redisClient *c) {
10563 unwatchAllKeys(c);
10564 c->flags &= (~REDIS_DIRTY_CAS);
10565 addReply(c,shared.ok);
10566}
10567
7f957c92 10568/* ================================= Debugging ============================== */
10569
ba798261 10570/* Compute the sha1 of string at 's' with 'len' bytes long.
10571 * The SHA1 is then xored againt the string pointed by digest.
10572 * Since xor is commutative, this operation is used in order to
10573 * "add" digests relative to unordered elements.
10574 *
10575 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10576static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10577 SHA1_CTX ctx;
10578 unsigned char hash[20], *s = ptr;
10579 int j;
10580
10581 SHA1Init(&ctx);
10582 SHA1Update(&ctx,s,len);
10583 SHA1Final(hash,&ctx);
10584
10585 for (j = 0; j < 20; j++)
10586 digest[j] ^= hash[j];
10587}
10588
10589static void xorObjectDigest(unsigned char *digest, robj *o) {
10590 o = getDecodedObject(o);
10591 xorDigest(digest,o->ptr,sdslen(o->ptr));
10592 decrRefCount(o);
10593}
10594
10595/* This function instead of just computing the SHA1 and xoring it
10596 * against diget, also perform the digest of "digest" itself and
10597 * replace the old value with the new one.
10598 *
10599 * So the final digest will be:
10600 *
10601 * digest = SHA1(digest xor SHA1(data))
10602 *
10603 * This function is used every time we want to preserve the order so
10604 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10605 *
10606 * Also note that mixdigest("foo") followed by mixdigest("bar")
10607 * will lead to a different digest compared to "fo", "obar".
10608 */
10609static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10610 SHA1_CTX ctx;
10611 char *s = ptr;
10612
10613 xorDigest(digest,s,len);
10614 SHA1Init(&ctx);
10615 SHA1Update(&ctx,digest,20);
10616 SHA1Final(digest,&ctx);
10617}
10618
10619static void mixObjectDigest(unsigned char *digest, robj *o) {
10620 o = getDecodedObject(o);
10621 mixDigest(digest,o->ptr,sdslen(o->ptr));
10622 decrRefCount(o);
10623}
10624
10625/* Compute the dataset digest. Since keys, sets elements, hashes elements
10626 * are not ordered, we use a trick: every aggregate digest is the xor
10627 * of the digests of their elements. This way the order will not change
10628 * the result. For list instead we use a feedback entering the output digest
10629 * as input in order to ensure that a different ordered list will result in
10630 * a different digest. */
10631static void computeDatasetDigest(unsigned char *final) {
10632 unsigned char digest[20];
10633 char buf[128];
10634 dictIterator *di = NULL;
10635 dictEntry *de;
10636 int j;
10637 uint32_t aux;
10638
10639 memset(final,0,20); /* Start with a clean result */
10640
10641 for (j = 0; j < server.dbnum; j++) {
10642 redisDb *db = server.db+j;
10643
10644 if (dictSize(db->dict) == 0) continue;
10645 di = dictGetIterator(db->dict);
10646
10647 /* hash the DB id, so the same dataset moved in a different
10648 * DB will lead to a different digest */
10649 aux = htonl(j);
10650 mixDigest(final,&aux,sizeof(aux));
10651
10652 /* Iterate this DB writing every entry */
10653 while((de = dictNext(di)) != NULL) {
cbae1d34 10654 robj *key, *o, *kcopy;
ba798261 10655 time_t expiretime;
10656
10657 memset(digest,0,20); /* This key-val digest */
10658 key = dictGetEntryKey(de);
cbae1d34 10659
10660 if (!server.vm_enabled) {
10661 mixObjectDigest(digest,key);
ba798261 10662 o = dictGetEntryVal(de);
ba798261 10663 } else {
cbae1d34 10664 /* Don't work with the key directly as when VM is active
10665 * this is unsafe: TODO: fix decrRefCount to check if the
10666 * count really reached 0 to avoid this mess */
10667 kcopy = dupStringObject(key);
10668 mixObjectDigest(digest,kcopy);
10669 o = lookupKeyRead(db,kcopy);
10670 decrRefCount(kcopy);
ba798261 10671 }
10672 aux = htonl(o->type);
10673 mixDigest(digest,&aux,sizeof(aux));
10674 expiretime = getExpire(db,key);
10675
10676 /* Save the key and associated value */
10677 if (o->type == REDIS_STRING) {
10678 mixObjectDigest(digest,o);
10679 } else if (o->type == REDIS_LIST) {
10680 list *list = o->ptr;
10681 listNode *ln;
10682 listIter li;
10683
10684 listRewind(list,&li);
10685 while((ln = listNext(&li))) {
10686 robj *eleobj = listNodeValue(ln);
10687
10688 mixObjectDigest(digest,eleobj);
10689 }
10690 } else if (o->type == REDIS_SET) {
10691 dict *set = o->ptr;
10692 dictIterator *di = dictGetIterator(set);
10693 dictEntry *de;
10694
10695 while((de = dictNext(di)) != NULL) {
10696 robj *eleobj = dictGetEntryKey(de);
10697
10698 xorObjectDigest(digest,eleobj);
10699 }
10700 dictReleaseIterator(di);
10701 } else if (o->type == REDIS_ZSET) {
10702 zset *zs = o->ptr;
10703 dictIterator *di = dictGetIterator(zs->dict);
10704 dictEntry *de;
10705
10706 while((de = dictNext(di)) != NULL) {
10707 robj *eleobj = dictGetEntryKey(de);
10708 double *score = dictGetEntryVal(de);
10709 unsigned char eledigest[20];
10710
10711 snprintf(buf,sizeof(buf),"%.17g",*score);
10712 memset(eledigest,0,20);
10713 mixObjectDigest(eledigest,eleobj);
10714 mixDigest(eledigest,buf,strlen(buf));
10715 xorDigest(digest,eledigest,20);
10716 }
10717 dictReleaseIterator(di);
10718 } else if (o->type == REDIS_HASH) {
10719 hashIterator *hi;
10720 robj *obj;
10721
10722 hi = hashInitIterator(o);
10723 while (hashNext(hi) != REDIS_ERR) {
10724 unsigned char eledigest[20];
10725
10726 memset(eledigest,0,20);
10727 obj = hashCurrent(hi,REDIS_HASH_KEY);
10728 mixObjectDigest(eledigest,obj);
10729 decrRefCount(obj);
10730 obj = hashCurrent(hi,REDIS_HASH_VALUE);
10731 mixObjectDigest(eledigest,obj);
10732 decrRefCount(obj);
10733 xorDigest(digest,eledigest,20);
10734 }
10735 hashReleaseIterator(hi);
10736 } else {
10737 redisPanic("Unknown object type");
10738 }
ba798261 10739 /* If the key has an expire, add it to the mix */
10740 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
10741 /* We can finally xor the key-val digest to the final digest */
10742 xorDigest(final,digest,20);
10743 }
10744 dictReleaseIterator(di);
10745 }
10746}
10747
7f957c92 10748static void debugCommand(redisClient *c) {
10749 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10750 *((char*)-1) = 'x';
210e29f7 10751 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10752 if (rdbSave(server.dbfilename) != REDIS_OK) {
10753 addReply(c,shared.err);
10754 return;
10755 }
10756 emptyDb();
10757 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10758 addReply(c,shared.err);
10759 return;
10760 }
10761 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10762 addReply(c,shared.ok);
71c2b467 10763 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10764 emptyDb();
10765 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10766 addReply(c,shared.err);
10767 return;
10768 }
10769 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10770 addReply(c,shared.ok);
333298da 10771 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10772 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10773 robj *key, *val;
10774
10775 if (!de) {
10776 addReply(c,shared.nokeyerr);
10777 return;
10778 }
10779 key = dictGetEntryKey(de);
10780 val = dictGetEntryVal(de);
59146ef3 10781 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10782 key->storage == REDIS_VM_SWAPPING)) {
07efaf74 10783 char *strenc;
10784 char buf[128];
10785
10786 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10787 strenc = strencoding[val->encoding];
10788 } else {
10789 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10790 strenc = buf;
10791 }
ace06542 10792 addReplySds(c,sdscatprintf(sdsempty(),
10793 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 10794 "encoding:%s serializedlength:%lld\r\n",
682ac724 10795 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 10796 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 10797 } else {
10798 addReplySds(c,sdscatprintf(sdsempty(),
10799 "+Key at:%p refcount:%d, value swapped at: page %llu "
10800 "using %llu pages\r\n",
10801 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10802 (unsigned long long) key->vm.usedpages));
10803 }
78ebe4c8 10804 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10805 lookupKeyRead(c->db,c->argv[2]);
10806 addReply(c,shared.ok);
7d30035d 10807 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10808 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10809 robj *key, *val;
10810
10811 if (!server.vm_enabled) {
10812 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10813 return;
10814 }
10815 if (!de) {
10816 addReply(c,shared.nokeyerr);
10817 return;
10818 }
10819 key = dictGetEntryKey(de);
10820 val = dictGetEntryVal(de);
4ef8de8a 10821 /* If the key is shared we want to create a copy */
10822 if (key->refcount > 1) {
10823 robj *newkey = dupStringObject(key);
10824 decrRefCount(key);
10825 key = dictGetEntryKey(de) = newkey;
10826 }
10827 /* Swap it */
7d30035d 10828 if (key->storage != REDIS_VM_MEMORY) {
10829 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 10830 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 10831 dictGetEntryVal(de) = NULL;
10832 addReply(c,shared.ok);
10833 } else {
10834 addReply(c,shared.err);
10835 }
59305dc7 10836 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10837 long keys, j;
10838 robj *key, *val;
10839 char buf[128];
10840
10841 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10842 return;
10843 for (j = 0; j < keys; j++) {
10844 snprintf(buf,sizeof(buf),"key:%lu",j);
10845 key = createStringObject(buf,strlen(buf));
10846 if (lookupKeyRead(c->db,key) != NULL) {
10847 decrRefCount(key);
10848 continue;
10849 }
10850 snprintf(buf,sizeof(buf),"value:%lu",j);
10851 val = createStringObject(buf,strlen(buf));
10852 dictAdd(c->db->dict,key,val);
10853 }
10854 addReply(c,shared.ok);
ba798261 10855 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
10856 unsigned char digest[20];
10857 sds d = sdsnew("+");
10858 int j;
10859
10860 computeDatasetDigest(digest);
10861 for (j = 0; j < 20; j++)
10862 d = sdscatprintf(d, "%02x",digest[j]);
10863
10864 d = sdscatlen(d,"\r\n",2);
10865 addReplySds(c,d);
7f957c92 10866 } else {
333298da 10867 addReplySds(c,sdsnew(
bdcb92f2 10868 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 10869 }
10870}
56906eef 10871
6c96ba7d 10872static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 10873 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
fdfb02e7 10874 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
dfc5e96c 10875#ifdef HAVE_BACKTRACE
10876 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10877 *((char*)-1) = 'x';
10878#endif
10879}
10880
c651fd9e 10881static void _redisPanic(char *msg, char *file, int line) {
10882 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
17772754 10883 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
c651fd9e 10884#ifdef HAVE_BACKTRACE
10885 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10886 *((char*)-1) = 'x';
10887#endif
10888}
10889
bcfc686d 10890/* =================================== Main! ================================ */
56906eef 10891
bcfc686d 10892#ifdef __linux__
/* Return the content of /proc/sys/vm/overcommit_memory converted to an
 * integer with atoi(), or -1 if the file can't be opened or its first
 * line can't be read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    if (fgets(line,sizeof(line),fp) != NULL) value = atoi(line);
    fclose(fp);
    return value;
}
10906
10907void linuxOvercommitMemoryWarning(void) {
10908 if (linuxOvercommitMemoryValue() == 0) {
7ccd2d0a 10909 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
bcfc686d 10910 }
10911}
10912#endif /* __linux__ */
10913
10914static void daemonize(void) {
10915 int fd;
10916 FILE *fp;
10917
10918 if (fork() != 0) exit(0); /* parent exits */
10919 setsid(); /* create a new session */
10920
10921 /* Every output goes to /dev/null. If Redis is daemonized but
10922 * the 'logfile' is set to 'stdout' in the configuration file
10923 * it will not log at all. */
10924 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10925 dup2(fd, STDIN_FILENO);
10926 dup2(fd, STDOUT_FILENO);
10927 dup2(fd, STDERR_FILENO);
10928 if (fd > STDERR_FILENO) close(fd);
10929 }
10930 /* Try to write the pid file */
10931 fp = fopen(server.pidfile,"w");
10932 if (fp) {
10933 fprintf(fp,"%d\n",getpid());
10934 fclose(fp);
56906eef 10935 }
56906eef 10936}
10937
42ab0172 10938static void version() {
8a3b0d2d 10939 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
10940 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
42ab0172
AO
10941 exit(0);
10942}
10943
723fb69b
AO
/* Print the command line synopsis on standard error and exit with
 * status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
10949
bcfc686d 10950int main(int argc, char **argv) {
9651a787 10951 time_t start;
10952
bcfc686d 10953 initServerConfig();
1a132bbc 10954 sortCommandTable();
bcfc686d 10955 if (argc == 2) {
44efe66e 10956 if (strcmp(argv[1], "-v") == 0 ||
10957 strcmp(argv[1], "--version") == 0) version();
10958 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 10959 resetServerSaveParams();
10960 loadServerConfig(argv[1]);
723fb69b
AO
10961 } else if ((argc > 2)) {
10962 usage();
bcfc686d 10963 } else {
10964 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10965 }
bcfc686d 10966 if (server.daemonize) daemonize();
71c54b21 10967 initServer();
bcfc686d 10968 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10969#ifdef __linux__
10970 linuxOvercommitMemoryWarning();
10971#endif
9651a787 10972 start = time(NULL);
bcfc686d 10973 if (server.appendonly) {
10974 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 10975 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 10976 } else {
10977 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 10978 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 10979 }
bcfc686d 10980 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 10981 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 10982 aeMain(server.el);
10983 aeDeleteEventLoop(server.el);
10984 return 0;
10985}
10986
10987/* ============================= Backtrace support ========================= */
10988
10989#ifdef HAVE_BACKTRACE
10990static char *findFuncName(void *pointer, unsigned long *offset);
10991
56906eef 10992static void *getMcontextEip(ucontext_t *uc) {
10993#if defined(__FreeBSD__)
10994 return (void*) uc->uc_mcontext.mc_eip;
10995#elif defined(__dietlibc__)
10996 return (void*) uc->uc_mcontext.eip;
06db1f50 10997#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 10998 #if __x86_64__
10999 return (void*) uc->uc_mcontext->__ss.__rip;
11000 #else
56906eef 11001 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 11002 #endif
06db1f50 11003#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 11004 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 11005 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 11006 #else
11007 return (void*) uc->uc_mcontext->__ss.__eip;
e0a62c7f 11008 #endif
54bac49d 11009#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 11010 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 11011#elif defined(__ia64__) /* Linux IA64 */
11012 return (void*) uc->uc_mcontext.sc_ip;
11013#else
11014 return NULL;
56906eef 11015#endif
11016}
11017
11018static void segvHandler(int sig, siginfo_t *info, void *secret) {
11019 void *trace[100];
11020 char **messages = NULL;
11021 int i, trace_size = 0;
11022 unsigned long offset=0;
56906eef 11023 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 11024 sds infostring;
56906eef 11025 REDIS_NOTUSED(info);
11026
11027 redisLog(REDIS_WARNING,
11028 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 11029 infostring = genRedisInfoString();
11030 redisLog(REDIS_WARNING, "%s",infostring);
11031 /* It's not safe to sdsfree() the returned string under memory
11032 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 11033
56906eef 11034 trace_size = backtrace(trace, 100);
de96dbfe 11035 /* overwrite sigaction with caller's address */
b91cf5ef 11036 if (getMcontextEip(uc) != NULL) {
11037 trace[1] = getMcontextEip(uc);
11038 }
56906eef 11039 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 11040
d76412d1 11041 for (i=1; i<trace_size; ++i) {
56906eef 11042 char *fn = findFuncName(trace[i], &offset), *p;
11043
11044 p = strchr(messages[i],'+');
11045 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11046 redisLog(REDIS_WARNING,"%s", messages[i]);
11047 } else {
11048 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11049 }
11050 }
b177fd30 11051 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 11052 _exit(0);
fe3bbfbe 11053}
56906eef 11054
fab43727 11055static void sigtermHandler(int sig) {
11056 REDIS_NOTUSED(sig);
b58ba105 11057
fab43727 11058 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11059 server.shutdown_asap = 1;
b58ba105
AM
11060}
11061
56906eef 11062static void setupSigSegvAction(void) {
11063 struct sigaction act;
11064
11065 sigemptyset (&act.sa_mask);
11066 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11067 * is used. Otherwise, sa_handler is used */
11068 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11069 act.sa_sigaction = segvHandler;
11070 sigaction (SIGSEGV, &act, NULL);
11071 sigaction (SIGBUS, &act, NULL);
12fea928 11072 sigaction (SIGFPE, &act, NULL);
11073 sigaction (SIGILL, &act, NULL);
11074 sigaction (SIGBUS, &act, NULL);
b58ba105
AM
11075
11076 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
fab43727 11077 act.sa_handler = sigtermHandler;
b58ba105 11078 sigaction (SIGTERM, &act, NULL);
e65fdc78 11079 return;
56906eef 11080}
e65fdc78 11081
bcfc686d 11082#include "staticsymbols.h"
11083/* This function try to convert a pointer into a function name. It's used in
11084 * oreder to provide a backtrace under segmentation fault that's able to
11085 * display functions declared as static (otherwise the backtrace is useless). */
11086static char *findFuncName(void *pointer, unsigned long *offset){
11087 int i, ret = -1;
11088 unsigned long off, minoff = 0;
ed9b544e 11089
bcfc686d 11090 /* Try to match against the Symbol with the smallest offset */
11091 for (i=0; symsTable[i].pointer; i++) {
11092 unsigned long lp = (unsigned long) pointer;
0bc03378 11093
bcfc686d 11094 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11095 off=lp-symsTable[i].pointer;
11096 if (ret < 0 || off < minoff) {
11097 minoff=off;
11098 ret=i;
11099 }
11100 }
0bc03378 11101 }
bcfc686d 11102 if (ret == -1) return NULL;
11103 *offset = minoff;
11104 return symsTable[ret].name;
0bc03378 11105}
bcfc686d 11106#else /* HAVE_BACKTRACE */
/* Variant used when HAVE_BACKTRACE is not defined: no signal handler is
 * installed. */
static void setupSigSegvAction(void) {
    /* Intentionally empty */
}
bcfc686d 11109#endif /* HAVE_BACKTRACE */
0bc03378 11110
ed9b544e 11111
ed9b544e 11112
bcfc686d 11113/* The End */
11114
11115
ed9b544e 11116