]> git.saurik.com Git - redis.git/blame - redis.c
use qsort and bsearch to lookup commands in O(log(N)) instead of O(N)
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
9005896c 30#define REDIS_VERSION "2.1.1"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
40#include <signal.h>
fbf9bcdb 41
42#ifdef HAVE_BACKTRACE
c9468bcf 43#include <execinfo.h>
44#include <ucontext.h>
fbf9bcdb 45#endif /* HAVE_BACKTRACE */
46
ed9b544e 47#include <sys/wait.h>
48#include <errno.h>
49#include <assert.h>
50#include <ctype.h>
51#include <stdarg.h>
52#include <inttypes.h>
53#include <arpa/inet.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#include <sys/time.h>
57#include <sys/resource.h>
2895e862 58#include <sys/uio.h>
f78fd11b 59#include <limits.h>
fb82e75c 60#include <float.h>
a7866db6 61#include <math.h>
92f8e882 62#include <pthread.h>
0bc1b2f6 63
64#if defined(__sun)
5043dff3 65#include "solarisfixes.h"
66#endif
ed9b544e 67
c9468bcf 68#include "redis.h"
ed9b544e 69#include "ae.h" /* Event driven programming library */
70#include "sds.h" /* Dynamic safe strings */
71#include "anet.h" /* Networking the easy way */
72#include "dict.h" /* Hash tables */
73#include "adlist.h" /* Linked lists */
74#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 75#include "lzf.h" /* LZF compression library */
76#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
ba798261 77#include "zipmap.h" /* Compact dictionary-alike data structure */
78#include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
5436146c 79#include "release.h" /* Release and/or git repository information */
ed9b544e 80
81/* Error codes */
82#define REDIS_OK 0
83#define REDIS_ERR -1
84
85/* Static server configuration */
86#define REDIS_SERVERPORT 6379 /* TCP port */
87#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 88#define REDIS_IOBUF_LEN 1024
ed9b544e 89#define REDIS_LOADBUF_LEN 1024
248ea310 90#define REDIS_STATIC_ARGS 8
ed9b544e 91#define REDIS_DEFAULT_DBNUM 16
92#define REDIS_CONFIGLINE_MAX 1024
93#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
94#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
8ca3e9d1 95#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
6f376729 96#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 97#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
98
99/* If more than REDIS_WRITEV_THRESHOLD write packets are pending, use writev */
100#define REDIS_WRITEV_THRESHOLD 3
101/* Max number of iovecs used for each writev call */
102#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 103
104/* Hash table parameters */
105#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 106
107/* Command flags */
3fd78bcd 108#define REDIS_CMD_BULK 1 /* Bulk write command */
109#define REDIS_CMD_INLINE 2 /* Inline command */
110/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
111 this flag will return an error when the 'maxmemory' option is set in the
112 config file and the server is using more than maxmemory bytes of memory.
113 In short, these commands are denied on low memory conditions. */
114#define REDIS_CMD_DENYOOM 4
4005fef1 115#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 116
117/* Object types */
118#define REDIS_STRING 0
119#define REDIS_LIST 1
120#define REDIS_SET 2
1812e024 121#define REDIS_ZSET 3
122#define REDIS_HASH 4
f78fd11b 123
5234952b 124/* Objects encoding. Some kinds of objects, like Strings and Hashes, can be
125 * internally represented in multiple ways. The 'encoding' field of the object
126 * is set to one of these values for this object. */
942a3961 127#define REDIS_ENCODING_RAW 0 /* Raw representation */
128#define REDIS_ENCODING_INT 1 /* Encoded as integer */
5234952b 129#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
130#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
942a3961 131
/* Printable names of the object encodings. The position of each string
 * matches the numeric value of the corresponding REDIS_ENCODING_* define. */
static char *strencoding[] = {
    "raw",       /* REDIS_ENCODING_RAW    (0) */
    "int",       /* REDIS_ENCODING_INT    (1) */
    "zipmap",    /* REDIS_ENCODING_ZIPMAP (2) */
    "hashtable"  /* REDIS_ENCODING_HT     (3) */
};
135
f78fd11b 136/* Object types only used for dumping to disk */
bb32ede5 137#define REDIS_EXPIRETIME 253
ed9b544e 138#define REDIS_SELECTDB 254
139#define REDIS_EOF 255
140
f78fd11b 141/* Defines related to the dump file format. To store 32 bits lengths for short
142 * keys requires a lot of space, so we check the most significant 2 bits of
143 * the first byte to interpret the length:
144 *
145 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
146 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
147 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 148 * 11|000000 this means: specially encoded object will follow. The six bits
149 * number specify the kind of object that follows.
150 * See the REDIS_RDB_ENC_* defines.
f78fd11b 151 *
10c43610 152 * Lengths up to 63 are stored using a single byte, most DB keys, and many
153 * values, will fit inside. */
f78fd11b 154#define REDIS_RDB_6BITLEN 0
155#define REDIS_RDB_14BITLEN 1
156#define REDIS_RDB_32BITLEN 2
17be1a4a 157#define REDIS_RDB_ENCVAL 3
f78fd11b 158#define REDIS_RDB_LENERR UINT_MAX
159
a4d1ba9a 160/* When a length of a string object stored on disk has the first two bits
161 * set, the remaining six bits specify a special encoding for the object
162 * according to the following defines: */
163#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
164#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
165#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 166#define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF */
a4d1ba9a 167
75680a3c 168/* Virtual memory object->where field. */
169#define REDIS_VM_MEMORY 0 /* The object is on memory */
170#define REDIS_VM_SWAPPED 1 /* The object is on disk */
171#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
172#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
173
06224fec 174/* Virtual memory static configuration stuff.
175 * Check vmFindContiguousPages() to know more about these magic numbers. */
176#define REDIS_VM_MAX_NEAR_PAGES 65536
177#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 178#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 179#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
f6c0bba8 180/* The following is the *percentage* of completed I/O jobs to process when the
181 * handler is called. While Virtual Memory I/O operations are performed by
182 * threads, these operations must be processed by the main thread when completed
183 * in order to take effect. */
c953f24b 184#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 185
ed9b544e 186/* Client flags */
d5d55fc3 187#define REDIS_SLAVE 1 /* This client is a slave server */
188#define REDIS_MASTER 2 /* This client is a master server */
189#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
190#define REDIS_MULTI 8 /* This client is in a MULTI context */
191#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
192#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
37ab76c9 193#define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
ed9b544e 194
40d224a9 195/* Slave replication state - slave side */
ed9b544e 196#define REDIS_REPL_NONE 0 /* No active replication */
197#define REDIS_REPL_CONNECT 1 /* Must connect to master */
198#define REDIS_REPL_CONNECTED 2 /* Connected to master */
199
40d224a9 200/* Slave replication state - from the point of view of master
201 * Note that in SEND_BULK and ONLINE state the slave receives new updates
202 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
203 * to start the next background saving in order to send updates to it. */
204#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
205#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
206#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
207#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
208
ed9b544e 209/* List related stuff */
210#define REDIS_HEAD 0
211#define REDIS_TAIL 1
212
213/* Sort operations */
214#define REDIS_SORT_GET 0
443c6409 215#define REDIS_SORT_ASC 1
216#define REDIS_SORT_DESC 2
ed9b544e 217#define REDIS_SORTKEY_MAX 1024
218
219/* Log levels */
220#define REDIS_DEBUG 0
f870935d 221#define REDIS_VERBOSE 1
222#define REDIS_NOTICE 2
223#define REDIS_WARNING 3
ed9b544e 224
225/* Anti-warning macro... */
226#define REDIS_NOTUSED(V) ((void) V)
227
6b47e12e 228#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
229#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 230
48f0308a 231/* Append only defines */
232#define APPENDFSYNC_NO 0
233#define APPENDFSYNC_ALWAYS 1
234#define APPENDFSYNC_EVERYSEC 2
235
cbba7dd7 236/* Hashes related defaults */
237#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
238#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
239
/* Assertion helpers. We can print the stack trace, so our assert is defined
 * this way instead of using the libc one: the failure is reported through
 * _redisAssert() / _redisPanic() (declared here, defined elsewhere in this
 * file) and the process is then terminated with _exit(1).
 *
 *   redisAssert(_e): evaluates _e once; reports and exits when it is false.
 *   redisPanic(_e):  unconditionally reports the stringified argument and
 *                    exits.
 *
 * Both macros expand to one fully parenthesized expression of type void, so
 * they bind as a unit when used as an operand of ?: or inside a larger
 * expression (the comma operator would otherwise split the expansion). */
#define redisAssert(_e) ((_e) ? (void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
dfc5e96c 245
ed9b544e 246/*================================= Data types ============================== */
247
248/* A redis object, that is a type able to hold a string / list / set */
75680a3c 249
/* The VM object structure: where an object lives in the swap file and when
 * it was last accessed. */
struct redisObjectVM {
    /* The page at which the object is stored on disk */
    off_t page;
    /* Number of pages used on disk */
    off_t usedpages;
    /* Last access time */
    time_t atime;
};

/* NOTE(review): the original declaration ended in "} vm;", which also
 * defines this file-scope object. Nothing in the visible part of the file
 * reads it; it is kept so the set of file-scope symbols is unchanged.
 * Confirm it is unused before removing it. */
struct redisObjectVM vm;
256
257/* The actual Redis Object */
ed9b544e 258typedef struct redisObject {
ed9b544e 259 void *ptr;
942a3961 260 unsigned char type;
261 unsigned char encoding;
d894161b 262 unsigned char storage; /* If this object is a key, where is the value?
263 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
264 unsigned char vtype; /* If this object is a key, and value is swapped out,
265 * this is the type of the swapped out object. */
ed9b544e 266 int refcount;
75680a3c 267 /* VM fields, this are only allocated if VM is active, otherwise the
268 * object allocation function will just allocate
269 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
270 * Redis without VM active will not have any overhead. */
271 struct redisObjectVM vm;
ed9b544e 272} robj;
273
/* Initialize a Redis string object allocated on the stack.
 *
 *   _var: an robj lvalue (the object itself, not a pointer to it)
 *   _ptr: the value stored in the object's 'ptr' field
 *
 * Sets refcount to 1, type to REDIS_STRING and encoding to
 * REDIS_ENCODING_RAW. The 'storage' field is written only when
 * server.vm_enabled is set; 'vtype' and 'vm' are never touched.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * The expansion intentionally has no trailing semicolon: the caller's own
 * ';' terminates the do/while(0), so the macro behaves as exactly one
 * statement, including as the body of an unbraced if/else. Arguments are
 * parenthesized so arbitrary expressions can be passed. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
285
3305306f 286typedef struct redisDb {
4409877e 287 dict *dict; /* The keyspace for this DB */
288 dict *expires; /* Timeout of keys with a timeout set */
37ab76c9 289 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 290 dict *io_keys; /* Keys with clients waiting for VM I/O */
37ab76c9 291 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
3305306f 292 int id;
293} redisDb;
294
6e469882 295/* Client MULTI/EXEC state */
296typedef struct multiCmd {
297 robj **argv;
298 int argc;
299 struct redisCommand *cmd;
300} multiCmd;
301
302typedef struct multiState {
303 multiCmd *commands; /* Array of MULTI commands */
304 int count; /* Total number of MULTI commands */
305} multiState;
306
ed9b544e 307/* With multiplexing we need to take per-clinet state.
308 * Clients are taken in a liked list. */
309typedef struct redisClient {
310 int fd;
3305306f 311 redisDb *db;
ed9b544e 312 int dictid;
313 sds querybuf;
e8a74421 314 robj **argv, **mbargv;
315 int argc, mbargc;
40d224a9 316 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 317 int multibulk; /* multi bulk command format active */
ed9b544e 318 list *reply;
319 int sentlen;
320 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 321 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 322 int slaveseldb; /* slave selected db, if this client is a slave */
323 int authenticated; /* when requirepass is non-NULL */
324 int replstate; /* replication state if this is a slave */
325 int repldbfd; /* replication DB file descriptor */
6e469882 326 long repldboff; /* replication DB file offset */
40d224a9 327 off_t repldbsize; /* replication DB file size */
6e469882 328 multiState mstate; /* MULTI/EXEC state */
37ab76c9 329 robj **blocking_keys; /* The key we are waiting to terminate a blocking
4409877e 330 * operation such as BLPOP. Otherwise NULL. */
37ab76c9 331 int blocking_keys_num; /* Number of blocking keys */
4409877e 332 time_t blockingto; /* Blocking operation timeout. If UNIX current time
333 * is >= blockingto then the operation timed out. */
92f8e882 334 list *io_keys; /* Keys this client is waiting to be loaded from the
335 * swap file in order to continue. */
37ab76c9 336 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
ffc6b7f8 337 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
338 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 339} redisClient;
340
/* One (seconds, changes) pair; an array of these is referenced by the
 * 'saveparams' field of struct redisServer.
 * NOTE(review): presumably one "save <seconds> <changes>" rule from the
 * configuration -- confirm against the config parser. */
struct saveparam {
    time_t seconds;
    int changes;
};
345
346/* Global server state structure */
347struct redisServer {
348 int port;
349 int fd;
3305306f 350 redisDb *db;
ed9b544e 351 long long dirty; /* changes to DB from the last save */
352 list *clients;
87eca727 353 list *slaves, *monitors;
ed9b544e 354 char neterr[ANET_ERR_LEN];
355 aeEventLoop *el;
356 int cronloops; /* number of times the cron function run */
357 list *objfreelist; /* A list of freed objects to avoid malloc() */
358 time_t lastsave; /* Unix time of last save succeeede */
ed9b544e 359 /* Fields used only for stats */
360 time_t stat_starttime; /* server start time */
361 long long stat_numcommands; /* number of processed commands */
362 long long stat_numconnections; /* number of connections received */
2a6a2ed1 363 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 364 /* Configuration */
365 int verbosity;
366 int glueoutputbuf;
367 int maxidletime;
368 int dbnum;
369 int daemonize;
44b38ef4 370 int appendonly;
48f0308a 371 int appendfsync;
fab43727 372 int shutdown_asap;
48f0308a 373 time_t lastfsync;
44b38ef4 374 int appendfd;
375 int appendseldb;
ed329fcf 376 char *pidfile;
9f3c422c 377 pid_t bgsavechildpid;
9d65a1bb 378 pid_t bgrewritechildpid;
379 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
28ed1f33 380 sds aofbuf; /* AOF buffer, written before entering the event loop */
ed9b544e 381 struct saveparam *saveparams;
382 int saveparamslen;
383 char *logfile;
384 char *bindaddr;
385 char *dbfilename;
44b38ef4 386 char *appendfilename;
abcb223e 387 char *requirepass;
121f70cf 388 int rdbcompression;
8ca3e9d1 389 int activerehashing;
ed9b544e 390 /* Replication related */
391 int isslave;
d0ccebcf 392 char *masterauth;
ed9b544e 393 char *masterhost;
394 int masterport;
40d224a9 395 redisClient *master; /* client that is master for this slave */
ed9b544e 396 int replstate;
285add55 397 unsigned int maxclients;
4ef8de8a 398 unsigned long long maxmemory;
d5d55fc3 399 unsigned int blpop_blocked_clients;
400 unsigned int vm_blocked_clients;
ed9b544e 401 /* Sort parameters - qsort_r() is only available under BSD so we
402 * have to take this state global, in order to pass it to sortCompare() */
403 int sort_desc;
404 int sort_alpha;
405 int sort_bypattern;
75680a3c 406 /* Virtual memory configuration */
407 int vm_enabled;
054e426d 408 char *vm_swap_file;
75680a3c 409 off_t vm_page_size;
410 off_t vm_pages;
4ef8de8a 411 unsigned long long vm_max_memory;
cbba7dd7 412 /* Hashes config */
413 size_t hash_max_zipmap_entries;
414 size_t hash_max_zipmap_value;
75680a3c 415 /* Virtual memory state */
416 FILE *vm_fp;
417 int vm_fd;
418 off_t vm_next_page; /* Next probably empty page */
419 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 420 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 421 time_t unixtime; /* Unix time sampled every second. */
92f8e882 422 /* Virtual memory I/O threads stuff */
92f8e882 423 /* An I/O thread process an element taken from the io_jobs queue and
996cb5f7 424 * put the result of the operation in the io_done list. While the
425 * job is being processed, it's put on io_processing queue. */
426 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
427 list *io_processing; /* List of VM I/O jobs being processed */
428 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 429 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 430 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 431 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
432 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 433 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 434 int io_active_threads; /* Number of running I/O threads */
435 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 436 /* Our main thread is blocked on the event loop, locking for sockets ready
437 * to be read or written, so when a threaded I/O operation is ready to be
438 * processed by the main thread, the I/O thread will use a unix pipe to
439 * awake the main thread. The followings are the two pipe FDs. */
440 int io_ready_pipe_read;
441 int io_ready_pipe_write;
7d98e08c 442 /* Virtual memory stats */
443 unsigned long long vm_stats_used_pages;
444 unsigned long long vm_stats_swapped_objects;
445 unsigned long long vm_stats_swapouts;
446 unsigned long long vm_stats_swapins;
befec3cd 447 /* Pubsub */
ffc6b7f8 448 dict *pubsub_channels; /* Map channels to list of subscribed clients */
449 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 450 /* Misc */
b9bc0eef 451 FILE *devnull;
ed9b544e 452};
453
ffc6b7f8 454typedef struct pubsubPattern {
455 redisClient *client;
456 robj *pattern;
457} pubsubPattern;
458
ed9b544e 459typedef void redisCommandProc(redisClient *c);
ca1788b5 460typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
ed9b544e 461struct redisCommand {
462 char *name;
463 redisCommandProc *proc;
464 int arity;
465 int flags;
76583ea4
PN
466 /* Use a function to determine which keys need to be loaded
467 * in the background prior to executing this command. Takes precedence
468 * over vm_firstkey and others, ignored when NULL */
ca1788b5 469 redisVmPreloadProc *vm_preload_proc;
7c775e09 470 /* What keys should be loaded in background when calling this command? */
471 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
472 int vm_lastkey; /* THe last argument that's a key */
473 int vm_keystep; /* The step between first and last key */
ed9b544e 474};
475
/* Associates a name with an address stored as an unsigned long. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
480
ed9b544e 481typedef struct _redisSortObject {
482 robj *obj;
483 union {
484 double score;
485 robj *cmpobj;
486 } u;
487} redisSortObject;
488
489typedef struct _redisSortOperation {
490 int type;
491 robj *pattern;
492} redisSortOperation;
493
6b47e12e 494/* ZSETs use a specialized version of Skiplists */
495
496typedef struct zskiplistNode {
497 struct zskiplistNode **forward;
e3870fab 498 struct zskiplistNode *backward;
912b9165 499 unsigned int *span;
6b47e12e 500 double score;
501 robj *obj;
502} zskiplistNode;
503
/* The skiplist itself: its two end nodes, a length and a level. */
typedef struct zskiplist {
    struct zskiplistNode *header;
    struct zskiplistNode *tail;
    unsigned long length;
    int level;
} zskiplist;
509
1812e024 510typedef struct zset {
511 dict *dict;
6b47e12e 512 zskiplist *zsl;
1812e024 513} zset;
514
6b47e12e 515/* Our shared "common" objects */
516
05df7621 517#define REDIS_SHARED_INTEGERS 10000
ed9b544e 518struct sharedObjectsStruct {
c937aa89 519 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 520 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 521 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
522 *outofrangeerr, *plus,
ed9b544e 523 *select0, *select1, *select2, *select3, *select4,
befec3cd 524 *select5, *select6, *select7, *select8, *select9,
c8d0ea0e 525 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
526 *mbulk4, *psubscribebulk, *punsubscribebulk,
527 *integers[REDIS_SHARED_INTEGERS];
ed9b544e 528} shared;
529
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
535
92f8e882 536/* VM threaded I/O request message */
b9bc0eef 537#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
538#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
539#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 540typedef struct iojob {
996cb5f7 541 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 542 redisDb *db;/* Redis database */
92f8e882 543 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 544 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
92f8e882 545 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
546 off_t page; /* Swap page where to read/write the object */
248ea310 547 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 548 int canceled; /* True if this command was canceled by blocking side of VM */
549 pthread_t thread; /* ID of the thread processing this entry */
550} iojob;
92f8e882 551
ed9b544e 552/*================================ Prototypes =============================== */
553
554static void freeStringObject(robj *o);
555static void freeListObject(robj *o);
556static void freeSetObject(robj *o);
557static void decrRefCount(void *o);
558static robj *createObject(int type, void *ptr);
559static void freeClient(redisClient *c);
f78fd11b 560static int rdbLoad(char *filename);
ed9b544e 561static void addReply(redisClient *c, robj *obj);
562static void addReplySds(redisClient *c, sds s);
563static void incrRefCount(robj *o);
f78fd11b 564static int rdbSaveBackground(char *filename);
ed9b544e 565static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 566static robj *dupStringObject(robj *o);
248ea310 567static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
dd142b9c 568static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
28ed1f33 569static void flushAppendOnlyFile(void);
44b38ef4 570static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 571static int syncWithMaster(void);
05df7621 572static robj *tryObjectEncoding(robj *o);
9d65a1bb 573static robj *getDecodedObject(robj *o);
3305306f 574static int removeExpire(redisDb *db, robj *key);
575static int expireIfNeeded(redisDb *db, robj *key);
576static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 577static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 578static int deleteKey(redisDb *db, robj *key);
bb32ede5 579static time_t getExpire(redisDb *db, robj *key);
580static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 581static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 582static void freeMemoryIfNeeded(void);
de96dbfe 583static int processCommand(redisClient *c);
56906eef 584static void setupSigSegvAction(void);
a3b21203 585static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 586static void aofRemoveTempFile(pid_t childpid);
0ea663ea 587static size_t stringObjectLen(robj *o);
638e42ac 588static void processInputBuffer(redisClient *c);
6b47e12e 589static zskiplist *zslCreate(void);
fd8ccf44 590static void zslFree(zskiplist *zsl);
2b59cfdf 591static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 592static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 593static void initClientMultiState(redisClient *c);
594static void freeClientMultiState(redisClient *c);
595static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 596static void unblockClientWaitingData(redisClient *c);
4409877e 597static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 598static void vmInit(void);
a35ddf12 599static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 600static robj *vmLoadObject(robj *key);
7e69548d 601static robj *vmPreviewObject(robj *key);
a69a0c9c 602static int vmSwapOneObjectBlocking(void);
603static int vmSwapOneObjectThreaded(void);
7e69548d 604static int vmCanSwapOut(void);
a5819310 605static int tryFreeOneObjectFromFreelist(void);
996cb5f7 606static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
607static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
608static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 609static void lockThreadedIO(void);
610static void unlockThreadedIO(void);
611static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
612static void freeIOJob(iojob *j);
613static void queueIOJob(iojob *j);
a5819310 614static int vmWriteObjectOnSwap(robj *o, off_t page);
615static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 616static void waitEmptyIOJobsQueue(void);
617static void vmReopenSwapFile(void);
970e10bb 618static int vmFreePage(off_t page);
ca1788b5 619static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
3805e04f 620static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
0a6f3f0f 621static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
d5d55fc3 622static int dontWaitForSwappedKey(redisClient *c, robj *key);
623static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
624static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
625static struct redisCommand *lookupCommand(char *name);
626static void call(redisClient *c, struct redisCommand *cmd);
627static void resetClient(redisClient *c);
ada386b2 628static void convertToRealHash(robj *o);
ffc6b7f8 629static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
630static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
631static void freePubsubPattern(void *p);
632static int listMatchPubsubPattern(void *a, void *b);
633static int compareStringObjects(robj *a, robj *b);
bf028098 634static int equalStringObjects(robj *a, robj *b);
befec3cd 635static void usage();
8f63ddca 636static int rewriteAppendOnlyFileBackground(void);
242a64f3 637static int vmSwapObjectBlocking(robj *key, robj *val);
fab43727 638static int prepareForShutdown();
37ab76c9 639static void touchWatchedKey(redisDb *db, robj *key);
9b30e1a2 640static void touchWatchedKeysOnFlush(int dbid);
37ab76c9 641static void unwatchAllKeys(redisClient *c);
ed9b544e 642
abcb223e 643static void authCommand(redisClient *c);
ed9b544e 644static void pingCommand(redisClient *c);
645static void echoCommand(redisClient *c);
646static void setCommand(redisClient *c);
647static void setnxCommand(redisClient *c);
526d00a5 648static void setexCommand(redisClient *c);
ed9b544e 649static void getCommand(redisClient *c);
650static void delCommand(redisClient *c);
651static void existsCommand(redisClient *c);
652static void incrCommand(redisClient *c);
653static void decrCommand(redisClient *c);
654static void incrbyCommand(redisClient *c);
655static void decrbyCommand(redisClient *c);
656static void selectCommand(redisClient *c);
657static void randomkeyCommand(redisClient *c);
658static void keysCommand(redisClient *c);
659static void dbsizeCommand(redisClient *c);
660static void lastsaveCommand(redisClient *c);
661static void saveCommand(redisClient *c);
662static void bgsaveCommand(redisClient *c);
9d65a1bb 663static void bgrewriteaofCommand(redisClient *c);
ed9b544e 664static void shutdownCommand(redisClient *c);
665static void moveCommand(redisClient *c);
666static void renameCommand(redisClient *c);
667static void renamenxCommand(redisClient *c);
668static void lpushCommand(redisClient *c);
669static void rpushCommand(redisClient *c);
670static void lpopCommand(redisClient *c);
671static void rpopCommand(redisClient *c);
672static void llenCommand(redisClient *c);
673static void lindexCommand(redisClient *c);
674static void lrangeCommand(redisClient *c);
675static void ltrimCommand(redisClient *c);
676static void typeCommand(redisClient *c);
677static void lsetCommand(redisClient *c);
678static void saddCommand(redisClient *c);
679static void sremCommand(redisClient *c);
a4460ef4 680static void smoveCommand(redisClient *c);
ed9b544e 681static void sismemberCommand(redisClient *c);
682static void scardCommand(redisClient *c);
12fea928 683static void spopCommand(redisClient *c);
2abb95a9 684static void srandmemberCommand(redisClient *c);
ed9b544e 685static void sinterCommand(redisClient *c);
686static void sinterstoreCommand(redisClient *c);
40d224a9 687static void sunionCommand(redisClient *c);
688static void sunionstoreCommand(redisClient *c);
f4f56e1d 689static void sdiffCommand(redisClient *c);
690static void sdiffstoreCommand(redisClient *c);
ed9b544e 691static void syncCommand(redisClient *c);
692static void flushdbCommand(redisClient *c);
693static void flushallCommand(redisClient *c);
694static void sortCommand(redisClient *c);
695static void lremCommand(redisClient *c);
0f5f7e9a 696static void rpoplpushcommand(redisClient *c);
ed9b544e 697static void infoCommand(redisClient *c);
70003d28 698static void mgetCommand(redisClient *c);
87eca727 699static void monitorCommand(redisClient *c);
3305306f 700static void expireCommand(redisClient *c);
802e8373 701static void expireatCommand(redisClient *c);
f6b141c5 702static void getsetCommand(redisClient *c);
fd88489a 703static void ttlCommand(redisClient *c);
321b0e13 704static void slaveofCommand(redisClient *c);
7f957c92 705static void debugCommand(redisClient *c);
f6b141c5 706static void msetCommand(redisClient *c);
707static void msetnxCommand(redisClient *c);
fd8ccf44 708static void zaddCommand(redisClient *c);
7db723ad 709static void zincrbyCommand(redisClient *c);
cc812361 710static void zrangeCommand(redisClient *c);
50c55df5 711static void zrangebyscoreCommand(redisClient *c);
f44dd428 712static void zcountCommand(redisClient *c);
e3870fab 713static void zrevrangeCommand(redisClient *c);
3c41331e 714static void zcardCommand(redisClient *c);
1b7106e7 715static void zremCommand(redisClient *c);
6e333bbe 716static void zscoreCommand(redisClient *c);
1807985b 717static void zremrangebyscoreCommand(redisClient *c);
6e469882 718static void multiCommand(redisClient *c);
719static void execCommand(redisClient *c);
18b6cb76 720static void discardCommand(redisClient *c);
4409877e 721static void blpopCommand(redisClient *c);
722static void brpopCommand(redisClient *c);
4b00bebd 723static void appendCommand(redisClient *c);
39191553 724static void substrCommand(redisClient *c);
69d95c3e 725static void zrankCommand(redisClient *c);
798d9e55 726static void zrevrankCommand(redisClient *c);
978c2c94 727static void hsetCommand(redisClient *c);
1f1c7695 728static void hsetnxCommand(redisClient *c);
978c2c94 729static void hgetCommand(redisClient *c);
09aeb579
PN
730static void hmsetCommand(redisClient *c);
731static void hmgetCommand(redisClient *c);
07efaf74 732static void hdelCommand(redisClient *c);
92b27fe9 733static void hlenCommand(redisClient *c);
9212eafd 734static void zremrangebyrankCommand(redisClient *c);
5d373da9 735static void zunionstoreCommand(redisClient *c);
736static void zinterstoreCommand(redisClient *c);
78409a0f 737static void hkeysCommand(redisClient *c);
738static void hvalsCommand(redisClient *c);
739static void hgetallCommand(redisClient *c);
a86f14b1 740static void hexistsCommand(redisClient *c);
500ece7c 741static void configCommand(redisClient *c);
01426b05 742static void hincrbyCommand(redisClient *c);
befec3cd 743static void subscribeCommand(redisClient *c);
744static void unsubscribeCommand(redisClient *c);
ffc6b7f8 745static void psubscribeCommand(redisClient *c);
746static void punsubscribeCommand(redisClient *c);
befec3cd 747static void publishCommand(redisClient *c);
37ab76c9 748static void watchCommand(redisClient *c);
749static void unwatchCommand(redisClient *c);
f6b141c5 750
ed9b544e 751/*================================= Globals ================================= */
752
753/* Global vars */
754static struct redisServer server; /* server global state */
1a132bbc
PN
755static struct redisCommand *commandTable;
756static unsigned int commandTableSize;
757static struct redisCommand readonlyCommandTable[] = {
76583ea4
PN
758 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
760 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
526d00a5 761 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
762 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
763 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
765 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
766 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
767 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
768 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
769 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
770 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
771 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
772 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
778 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
781 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
782 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
783 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
784 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
785 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
786 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
790 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
791 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
792 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
793 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
794 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
795 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
796 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
798 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
799 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
800 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
5d373da9 801 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
802 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
803 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
806 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
807 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
808 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
809 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
810 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
811 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
1f1c7695 812 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 813 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
d33278d1 814 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 815 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
01426b05 816 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
817 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
818 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
819 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
820 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
821 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 822 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
823 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
824 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
825 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
826 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
827 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
828 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
831 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
832 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
833 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
837 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
838 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
840 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
842 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
843 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
844 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
845 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
846 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
3805e04f 847 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
848 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
849 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
850 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
851 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
852 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
853 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
854 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
855 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
856 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
857 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 858 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 859 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
860 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 861 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
862 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 863 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
37ab76c9 864 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
865 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
76583ea4 866 {NULL,NULL,0,0,NULL,0,0,0}
ed9b544e 867};
bcfc686d 868
ed9b544e 869/*============================ Utility functions ============================ */
870
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [...]   one byte out of a class: [abc] is a set, [a-z] a range,
 *           [^...] negates the class and a backslash inside the class
 *           takes the next byte literally
 *   \x      the byte x taken literally
 *
 * If 'nocase' is non zero bytes are compared after tolower().
 *
 * Returns 1 on match, 0 otherwise. No byte past patternLen / stringLen is
 * ever read, so neither buffer needs to be nul terminated. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of '*', never peeking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one byte of the string. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the pattern++
                     * after the switch lands exactly on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal byte needs a byte of the string to compare with. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing '*' may still match. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
993
/* Glob-style match of two nul terminated strings: a thin wrapper that
 * measures both arguments with strlen() and calls stringmatchlen(). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
997
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The accepted format is an optional '-', a run of decimal digits and an
 * optional, case insensitive, unit: "b" (or no unit) for bytes, "k", "m",
 * "g" for powers of 1000 and "kb", "mb", "gb" for powers of 1024.
 *
 * If err is not NULL, *err is set to 1 on parsing error and to 0 otherwise.
 * Errors are:
 *  - an unknown unit: the number is returned as if it was in bytes;
 *  - a number with too many digits for the internal buffer: LLONG_MAX is
 *    returned;
 *  - a number, or a number multiplied by its unit, that does not fit in a
 *    long long: LLONG_MAX or LLONG_MIN is returned depending on the sign. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long long mul; /* unit multiplier */
    long long val;
    size_t digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* isdigit() is only defined for unsigned char values and EOF. */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000LL*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024LL*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000LL*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024LL*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    errno = 0;
    val = strtoll(buf,NULL,10);
    if (errno == ERANGE) {
        /* strtoll() already clamped val to LLONG_MIN / LLONG_MAX. */
        if (err) *err = 1;
        return val;
    }
    /* mul is always positive: check that val*mul fits before computing it,
     * as signed overflow is undefined behavior. */
    if (val > 0 && val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < 0 && val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1044
/* Convert a long long into a decimal string stored in 's', a buffer of
 * 'len' bytes. The output is always nul terminated when len is not zero and
 * is truncated (keeping the leading characters) if the buffer is too small.
 *
 * Returns the number of characters stored in 's', not counting the nul
 * terminator, or 0 if len is 0 (in which case 's' is not touched). */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Take the absolute value in the unsigned domain: -value would be a
     * signed overflow (undefined behavior) when value is LLONG_MIN. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    /* Emit the digits right to left, starting from the least significant. */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1068
56906eef 1069static void redisLog(int level, const char *fmt, ...) {
ed9b544e 1070 va_list ap;
1071 FILE *fp;
1072
1073 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1074 if (!fp) return;
1075
1076 va_start(ap, fmt);
1077 if (level >= server.verbosity) {
6766f45e 1078 char *c = ".-*#";
1904ecc1 1079 char buf[64];
1080 time_t now;
1081
1082 now = time(NULL);
6c9385e0 1083 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 1084 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 1085 vfprintf(fp, fmt, ap);
1086 fprintf(fp,"\n");
1087 fflush(fp);
1088 }
1089 va_end(ap);
1090
1091 if (server.logfile) fclose(fp);
1092}
1093
1094/*====================== Hash table type implementation ==================== */
1095
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1099
/* Dict destructor that releases 'val' with a plain zfree() call.
 * The privdata argument is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
1105
4409877e 1106static void dictListDestructor(void *privdata, void *val)
1107{
1108 DICT_NOTUSED(privdata);
1109 listRelease((list*)val);
1110}
1111
ed9b544e 1112static int sdsDictKeyCompare(void *privdata, const void *key1,
1113 const void *key2)
1114{
1115 int l1,l2;
1116 DICT_NOTUSED(privdata);
1117
1118 l1 = sdslen((sds)key1);
1119 l2 = sdslen((sds)key2);
1120 if (l1 != l2) return 0;
1121 return memcmp(key1, key2, l1) == 0;
1122}
1123
/* Dict destructor for redis objects: drops one reference from 'val'.
 * NULL is accepted and ignored, as values of swapped out keys are set
 * to NULL. privdata is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1131
942a3961 1132static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1133 const void *key2)
1134{
1135 const robj *o1 = key1, *o2 = key2;
1136 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1137}
1138
942a3961 1139static unsigned int dictObjHash(const void *key) {
ed9b544e 1140 const robj *o = key;
1141 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1142}
1143
942a3961 1144static int dictEncObjKeyCompare(void *privdata, const void *key1,
1145 const void *key2)
1146{
9d65a1bb 1147 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1148 int cmp;
942a3961 1149
2a1198b4 1150 if (o1->encoding == REDIS_ENCODING_INT &&
dc05abde 1151 o2->encoding == REDIS_ENCODING_INT)
1152 return o1->ptr == o2->ptr;
2a1198b4 1153
9d65a1bb 1154 o1 = getDecodedObject(o1);
1155 o2 = getDecodedObject(o2);
1156 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1157 decrRefCount(o1);
1158 decrRefCount(o2);
1159 return cmp;
942a3961 1160}
1161
1162static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1163 robj *o = (robj*) key;
942a3961 1164
ed9e4966 1165 if (o->encoding == REDIS_ENCODING_RAW) {
1166 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1167 } else {
1168 if (o->encoding == REDIS_ENCODING_INT) {
1169 char buf[32];
1170 int len;
1171
ee14da56 1172 len = ll2string(buf,32,(long)o->ptr);
ed9e4966 1173 return dictGenHashFunction((unsigned char*)buf, len);
1174 } else {
1175 unsigned int hash;
1176
1177 o = getDecodedObject(o);
1178 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1179 decrRefCount(o);
1180 return hash;
1181 }
1182 }
942a3961 1183}
1184
f2d9f50f 1185/* Sets type and expires */
ed9b544e 1186static dictType setDictType = {
942a3961 1187 dictEncObjHash, /* hash function */
ed9b544e 1188 NULL, /* key dup */
1189 NULL, /* val dup */
942a3961 1190 dictEncObjKeyCompare, /* key compare */
ed9b544e 1191 dictRedisObjectDestructor, /* key destructor */
1192 NULL /* val destructor */
1193};
1194
f2d9f50f 1195/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1812e024 1196static dictType zsetDictType = {
1197 dictEncObjHash, /* hash function */
1198 NULL, /* key dup */
1199 NULL, /* val dup */
1200 dictEncObjKeyCompare, /* key compare */
1201 dictRedisObjectDestructor, /* key destructor */
da0a1620 1202 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1203};
1204
f2d9f50f 1205/* Db->dict */
5234952b 1206static dictType dbDictType = {
942a3961 1207 dictObjHash, /* hash function */
ed9b544e 1208 NULL, /* key dup */
1209 NULL, /* val dup */
942a3961 1210 dictObjKeyCompare, /* key compare */
ed9b544e 1211 dictRedisObjectDestructor, /* key destructor */
1212 dictRedisObjectDestructor /* val destructor */
1213};
1214
f2d9f50f 1215/* Db->expires */
1216static dictType keyptrDictType = {
1217 dictObjHash, /* hash function */
1218 NULL, /* key dup */
1219 NULL, /* val dup */
1220 dictObjKeyCompare, /* key compare */
1221 dictRedisObjectDestructor, /* key destructor */
1222 NULL /* val destructor */
1223};
1224
5234952b 1225/* Hash type hash table (note that small hashes are represented with zimpaps) */
1226static dictType hashDictType = {
1227 dictEncObjHash, /* hash function */
1228 NULL, /* key dup */
1229 NULL, /* val dup */
1230 dictEncObjKeyCompare, /* key compare */
1231 dictRedisObjectDestructor, /* key destructor */
1232 dictRedisObjectDestructor /* val destructor */
1233};
1234
4409877e 1235/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1236 * lists as values. It's used for blocking operations (BLPOP) and to
1237 * map swapped keys to a list of clients waiting for this keys to be loaded. */
4409877e 1238static dictType keylistDictType = {
1239 dictObjHash, /* hash function */
1240 NULL, /* key dup */
1241 NULL, /* val dup */
1242 dictObjKeyCompare, /* key compare */
1243 dictRedisObjectDestructor, /* key destructor */
1244 dictListDestructor /* val destructor */
1245};
1246
/* Forward declaration. Declared with (void) so that this is a real
 * prototype: an empty parameter list leaves the arguments unchecked. */
static void version(void);
1248
ed9b544e 1249/* ========================= Random utility functions ======================= */
1250
1251/* Redis generally does not try to recover from out of memory conditions
1252 * when allocating objects or strings, it is not clear if it will be possible
1253 * to report this condition to the client since the networking layer itself
1254 * is based on heap allocation for send buffers, so we simply abort.
1255 * At least the code will be simpler to read... */
1256static void oom(const char *msg) {
71c54b21 1257 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1258 sleep(1);
1259 abort();
1260}
1261
1262/* ====================== Redis server networking stuff ===================== */
56906eef 1263static void closeTimedoutClients(void) {
ed9b544e 1264 redisClient *c;
ed9b544e 1265 listNode *ln;
1266 time_t now = time(NULL);
c7df85a4 1267 listIter li;
ed9b544e 1268
c7df85a4 1269 listRewind(server.clients,&li);
1270 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1271 c = listNodeValue(ln);
f86a74e9 1272 if (server.maxidletime &&
1273 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1274 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1275 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1276 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1277 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1278 {
f870935d 1279 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1280 freeClient(c);
f86a74e9 1281 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1282 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1283 addReply(c,shared.nullmultibulk);
b0d8747d 1284 unblockClientWaitingData(c);
f86a74e9 1285 }
ed9b544e 1286 }
1287 }
ed9b544e 1288}
1289
12fea928 1290static int htNeedsResize(dict *dict) {
1291 long long size, used;
1292
1293 size = dictSlots(dict);
1294 used = dictSize(dict);
1295 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1296 (used*100/size < REDIS_HT_MINFILL));
1297}
1298
0bc03378 1299/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1300 * we resize the hash table to save memory */
56906eef 1301static void tryResizeHashTables(void) {
0bc03378 1302 int j;
1303
1304 for (j = 0; j < server.dbnum; j++) {
5413c40d 1305 if (htNeedsResize(server.db[j].dict))
0bc03378 1306 dictResize(server.db[j].dict);
12fea928 1307 if (htNeedsResize(server.db[j].expires))
1308 dictResize(server.db[j].expires);
0bc03378 1309 }
1310}
1311
8ca3e9d1 1312/* Our hash table implementation performs rehashing incrementally while
1313 * we write/read from the hash table. Still if the server is idle, the hash
1314 * table will use two tables for a long time. So we try to use 1 millisecond
1315 * of CPU time at every serverCron() loop in order to rehash some key. */
1316static void incrementallyRehash(void) {
1317 int j;
1318
1319 for (j = 0; j < server.dbnum; j++) {
1320 if (dictIsRehashing(server.db[j].dict)) {
1321 dictRehashMilliseconds(server.db[j].dict,1);
1322 break; /* already used our millisecond for this loop... */
1323 }
1324 }
1325}
1326
9d65a1bb 1327/* A background saving child (BGSAVE) terminated its work. Handle this. */
1328void backgroundSaveDoneHandler(int statloc) {
1329 int exitcode = WEXITSTATUS(statloc);
1330 int bysignal = WIFSIGNALED(statloc);
1331
1332 if (!bysignal && exitcode == 0) {
1333 redisLog(REDIS_NOTICE,
1334 "Background saving terminated with success");
1335 server.dirty = 0;
1336 server.lastsave = time(NULL);
1337 } else if (!bysignal && exitcode != 0) {
1338 redisLog(REDIS_WARNING, "Background saving error");
1339 } else {
1340 redisLog(REDIS_WARNING,
454eea7c 1341 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1342 rdbRemoveTempFile(server.bgsavechildpid);
1343 }
1344 server.bgsavechildpid = -1;
1345 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1346 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1347 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1348}
1349
1350/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1351 * Handle this. */
1352void backgroundRewriteDoneHandler(int statloc) {
1353 int exitcode = WEXITSTATUS(statloc);
1354 int bysignal = WIFSIGNALED(statloc);
1355
1356 if (!bysignal && exitcode == 0) {
1357 int fd;
1358 char tmpfile[256];
1359
1360 redisLog(REDIS_NOTICE,
1361 "Background append only file rewriting terminated with success");
1362 /* Now it's time to flush the differences accumulated by the parent */
1363 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1364 fd = open(tmpfile,O_WRONLY|O_APPEND);
1365 if (fd == -1) {
1366 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1367 goto cleanup;
1368 }
1369 /* Flush our data... */
1370 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1371 (signed) sdslen(server.bgrewritebuf)) {
1372 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1373 close(fd);
1374 goto cleanup;
1375 }
b32627cd 1376 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1377 /* Now our work is to rename the temp file into the stable file. And
1378 * switch the file descriptor used by the server for append only. */
1379 if (rename(tmpfile,server.appendfilename) == -1) {
1380 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1381 close(fd);
1382 goto cleanup;
1383 }
1384 /* Mission completed... almost */
1385 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1386 if (server.appendfd != -1) {
1387 /* If append only is actually enabled... */
1388 close(server.appendfd);
1389 server.appendfd = fd;
1390 fsync(fd);
85a83172 1391 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1392 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1393 } else {
1394 /* If append only is disabled we just generate a dump in this
1395 * format. Why not? */
1396 close(fd);
1397 }
1398 } else if (!bysignal && exitcode != 0) {
1399 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1400 } else {
1401 redisLog(REDIS_WARNING,
454eea7c 1402 "Background append only file rewriting terminated by signal %d",
1403 WTERMSIG(statloc));
9d65a1bb 1404 }
1405cleanup:
1406 sdsfree(server.bgrewritebuf);
1407 server.bgrewritebuf = sdsempty();
1408 aofRemoveTempFile(server.bgrewritechildpid);
1409 server.bgrewritechildpid = -1;
1410}
1411
884d4b39 1412/* This function is called once a background process of some kind terminates,
1413 * as we want to avoid resizing the hash tables when there is a child in order
1414 * to play well with copy-on-write (otherwise when a resize happens lots of
1415 * memory pages are copied). The goal of this function is to update the ability
1416 * for dict.c to resize the hash tables accordingly to the fact we have o not
1417 * running childs. */
1418static void updateDictResizePolicy(void) {
1419 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1420 dictEnableResize();
1421 else
1422 dictDisableResize();
1423}
1424
56906eef 1425static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1426 int j, loops = server.cronloops++;
ed9b544e 1427 REDIS_NOTUSED(eventLoop);
1428 REDIS_NOTUSED(id);
1429 REDIS_NOTUSED(clientData);
1430
3a66edc7 1431 /* We take a cached value of the unix time in the global state because
1432 * with virtual memory and aging there is to store the current time
1433 * in objects at every object access, and accuracy is not needed.
1434 * To access a global var is faster than calling time(NULL) */
1435 server.unixtime = time(NULL);
1436
fab43727 1437 /* We received a SIGTERM, shutting down here in a safe way, as it is
1438 * not ok doing so inside the signal handler. */
1439 if (server.shutdown_asap) {
1440 if (prepareForShutdown() == REDIS_OK) exit(0);
1441 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1442 }
1443
0bc03378 1444 /* Show some info about non-empty databases */
ed9b544e 1445 for (j = 0; j < server.dbnum; j++) {
dec423d9 1446 long long size, used, vkeys;
94754ccc 1447
3305306f 1448 size = dictSlots(server.db[j].dict);
1449 used = dictSize(server.db[j].dict);
94754ccc 1450 vkeys = dictSize(server.db[j].expires);
1763929f 1451 if (!(loops % 50) && (used || vkeys)) {
f870935d 1452 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1453 /* dictPrintStats(server.dict); */
ed9b544e 1454 }
ed9b544e 1455 }
1456
0bc03378 1457 /* We don't want to resize the hash tables while a bacground saving
1458 * is in progress: the saving child is created using fork() that is
1459 * implemented with a copy-on-write semantic in most modern systems, so
1460 * if we resize the HT while there is the saving child at work actually
1461 * a lot of memory movements in the parent will cause a lot of pages
1462 * copied. */
8ca3e9d1 1463 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1464 if (!(loops % 10)) tryResizeHashTables();
1465 if (server.activerehashing) incrementallyRehash();
884d4b39 1466 }
0bc03378 1467
ed9b544e 1468 /* Show information about connected clients */
1763929f 1469 if (!(loops % 50)) {
bdcb92f2 1470 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1471 listLength(server.clients)-listLength(server.slaves),
1472 listLength(server.slaves),
bdcb92f2 1473 zmalloc_used_memory());
ed9b544e 1474 }
1475
1476 /* Close connections of timedout clients */
1763929f 1477 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1478 closeTimedoutClients();
1479
9d65a1bb 1480 /* Check if a background saving or AOF rewrite in progress terminated */
1481 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1482 int statloc;
9d65a1bb 1483 pid_t pid;
1484
1485 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1486 if (pid == server.bgsavechildpid) {
1487 backgroundSaveDoneHandler(statloc);
ed9b544e 1488 } else {
9d65a1bb 1489 backgroundRewriteDoneHandler(statloc);
ed9b544e 1490 }
884d4b39 1491 updateDictResizePolicy();
ed9b544e 1492 }
1493 } else {
1494 /* If there is not a background saving in progress check if
1495 * we have to save now */
1496 time_t now = time(NULL);
1497 for (j = 0; j < server.saveparamslen; j++) {
1498 struct saveparam *sp = server.saveparams+j;
1499
1500 if (server.dirty >= sp->changes &&
1501 now-server.lastsave > sp->seconds) {
1502 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1503 sp->changes, sp->seconds);
f78fd11b 1504 rdbSaveBackground(server.dbfilename);
ed9b544e 1505 break;
1506 }
1507 }
1508 }
94754ccc 1509
f2324293 1510 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1511 * will use few CPU cycles if there are few expiring keys, otherwise
1512 * it will get more aggressive to avoid that too much memory is used by
1513 * keys that can be removed from the keyspace. */
94754ccc 1514 for (j = 0; j < server.dbnum; j++) {
f2324293 1515 int expired;
94754ccc 1516 redisDb *db = server.db+j;
94754ccc 1517
f2324293 1518 /* Continue to expire if at the end of the cycle more than 25%
1519 * of the keys were expired. */
1520 do {
4ef8de8a 1521 long num = dictSize(db->expires);
94754ccc 1522 time_t now = time(NULL);
1523
f2324293 1524 expired = 0;
94754ccc 1525 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1526 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1527 while (num--) {
1528 dictEntry *de;
1529 time_t t;
1530
1531 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1532 t = (time_t) dictGetEntryVal(de);
1533 if (now > t) {
1534 deleteKey(db,dictGetEntryKey(de));
f2324293 1535 expired++;
2a6a2ed1 1536 server.stat_expiredkeys++;
94754ccc 1537 }
1538 }
f2324293 1539 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1540 }
1541
4ef8de8a 1542 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1543 * is enabled. Try to free objects from the free list first. */
7e69548d 1544 if (vmCanSwapOut()) {
1545 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1546 server.vm_max_memory)
1547 {
72e9fd40 1548 int retval;
1549
a5819310 1550 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1551 retval = (server.vm_max_threads == 0) ?
1552 vmSwapOneObjectBlocking() :
1553 vmSwapOneObjectThreaded();
1763929f 1554 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1555 zmalloc_used_memory() >
1556 (server.vm_max_memory+server.vm_max_memory/10))
1557 {
1558 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1559 }
72e9fd40 1560 /* Note that when using threaded I/O we free just one object,
1561 * because anyway when the I/O thread in charge to swap this
1562 * object out will finish, the handler of completed jobs
1563 * will try to swap more objects if we are still out of memory. */
1564 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1565 }
1566 }
1567
ed9b544e 1568 /* Check if we should connect to a MASTER */
1763929f 1569 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1570 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1571 if (syncWithMaster() == REDIS_OK) {
1572 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
8f63ddca 1573 if (server.appendonly) rewriteAppendOnlyFileBackground();
ed9b544e 1574 }
1575 }
1763929f 1576 return 100;
ed9b544e 1577}
1578
d5d55fc3 1579/* This function gets called every time Redis is entering the
1580 * main loop of the event driven library, that is, before to sleep
1581 * for ready file descriptors. */
1582static void beforeSleep(struct aeEventLoop *eventLoop) {
1583 REDIS_NOTUSED(eventLoop);
1584
28ed1f33 1585 /* Awake clients that got all the swapped keys they requested */
d5d55fc3 1586 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1587 listIter li;
1588 listNode *ln;
1589
1590 listRewind(server.io_ready_clients,&li);
1591 while((ln = listNext(&li))) {
1592 redisClient *c = ln->value;
1593 struct redisCommand *cmd;
1594
1595 /* Resume the client. */
1596 listDelNode(server.io_ready_clients,ln);
1597 c->flags &= (~REDIS_IO_WAIT);
1598 server.vm_blocked_clients--;
1599 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1600 readQueryFromClient, c);
1601 cmd = lookupCommand(c->argv[0]->ptr);
1602 assert(cmd != NULL);
1603 call(c,cmd);
1604 resetClient(c);
1605 /* There may be more data to process in the input buffer. */
1606 if (c->querybuf && sdslen(c->querybuf) > 0)
1607 processInputBuffer(c);
1608 }
1609 }
28ed1f33 1610 /* Write the AOF buffer on disk */
1611 flushAppendOnlyFile();
d5d55fc3 1612}
1613
ed9b544e 1614static void createSharedObjects(void) {
05df7621 1615 int j;
1616
ed9b544e 1617 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1618 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1619 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1620 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1621 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1622 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1623 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1624 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1625 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1626 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1627 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1628 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1629 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1630 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1631 "-ERR no such key\r\n"));
ed9b544e 1632 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1633 "-ERR syntax error\r\n"));
c937aa89 1634 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1635 "-ERR source and destination objects are the same\r\n"));
1636 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1637 "-ERR index out of range\r\n"));
ed9b544e 1638 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1639 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1640 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1641 shared.select0 = createStringObject("select 0\r\n",10);
1642 shared.select1 = createStringObject("select 1\r\n",10);
1643 shared.select2 = createStringObject("select 2\r\n",10);
1644 shared.select3 = createStringObject("select 3\r\n",10);
1645 shared.select4 = createStringObject("select 4\r\n",10);
1646 shared.select5 = createStringObject("select 5\r\n",10);
1647 shared.select6 = createStringObject("select 6\r\n",10);
1648 shared.select7 = createStringObject("select 7\r\n",10);
1649 shared.select8 = createStringObject("select 8\r\n",10);
1650 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1651 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
c8d0ea0e 1652 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
befec3cd 1653 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1654 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1655 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1656 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1657 shared.mbulk3 = createStringObject("*3\r\n",4);
c8d0ea0e 1658 shared.mbulk4 = createStringObject("*4\r\n",4);
05df7621 1659 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1660 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1661 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1662 }
ed9b544e 1663}
1664
1665static void appendServerSaveParams(time_t seconds, int changes) {
1666 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1667 server.saveparams[server.saveparamslen].seconds = seconds;
1668 server.saveparams[server.saveparamslen].changes = changes;
1669 server.saveparamslen++;
1670}
1671
bcfc686d 1672static void resetServerSaveParams() {
ed9b544e 1673 zfree(server.saveparams);
1674 server.saveparams = NULL;
1675 server.saveparamslen = 0;
1676}
1677
1678static void initServerConfig() {
1679 server.dbnum = REDIS_DEFAULT_DBNUM;
1680 server.port = REDIS_SERVERPORT;
f870935d 1681 server.verbosity = REDIS_VERBOSE;
ed9b544e 1682 server.maxidletime = REDIS_MAXIDLETIME;
1683 server.saveparams = NULL;
1684 server.logfile = NULL; /* NULL = log on standard output */
1685 server.bindaddr = NULL;
1686 server.glueoutputbuf = 1;
1687 server.daemonize = 0;
44b38ef4 1688 server.appendonly = 0;
1b677732 1689 server.appendfsync = APPENDFSYNC_EVERYSEC;
48f0308a 1690 server.lastfsync = time(NULL);
44b38ef4 1691 server.appendfd = -1;
1692 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1693 server.pidfile = zstrdup("/var/run/redis.pid");
1694 server.dbfilename = zstrdup("dump.rdb");
1695 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1696 server.requirepass = NULL;
b0553789 1697 server.rdbcompression = 1;
8ca3e9d1 1698 server.activerehashing = 1;
285add55 1699 server.maxclients = 0;
d5d55fc3 1700 server.blpop_blocked_clients = 0;
3fd78bcd 1701 server.maxmemory = 0;
75680a3c 1702 server.vm_enabled = 0;
054e426d 1703 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1704 server.vm_page_size = 256; /* 256 bytes per page */
1705 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1706 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1707 server.vm_max_threads = 4;
d5d55fc3 1708 server.vm_blocked_clients = 0;
cbba7dd7 1709 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1710 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
fab43727 1711 server.shutdown_asap = 0;
75680a3c 1712
bcfc686d 1713 resetServerSaveParams();
ed9b544e 1714
1715 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1716 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1717 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1718 /* Replication related */
1719 server.isslave = 0;
d0ccebcf 1720 server.masterauth = NULL;
ed9b544e 1721 server.masterhost = NULL;
1722 server.masterport = 6379;
1723 server.master = NULL;
1724 server.replstate = REDIS_REPL_NONE;
a7866db6 1725
1726 /* Double constants initialization */
1727 R_Zero = 0.0;
1728 R_PosInf = 1.0/R_Zero;
1729 R_NegInf = -1.0/R_Zero;
1730 R_Nan = R_Zero/R_Zero;
ed9b544e 1731}
1732
1733static void initServer() {
1734 int j;
1735
1736 signal(SIGHUP, SIG_IGN);
1737 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1738 setupSigSegvAction();
ed9b544e 1739
b9bc0eef 1740 server.devnull = fopen("/dev/null","w");
1741 if (server.devnull == NULL) {
1742 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1743 exit(1);
1744 }
ed9b544e 1745 server.clients = listCreate();
1746 server.slaves = listCreate();
87eca727 1747 server.monitors = listCreate();
ed9b544e 1748 server.objfreelist = listCreate();
1749 createSharedObjects();
1750 server.el = aeCreateEventLoop();
3305306f 1751 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1752 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1753 if (server.fd == -1) {
1754 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1755 exit(1);
1756 }
3305306f 1757 for (j = 0; j < server.dbnum; j++) {
5234952b 1758 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1759 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
37ab76c9 1760 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1761 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1762 if (server.vm_enabled)
1763 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1764 server.db[j].id = j;
1765 }
ffc6b7f8 1766 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1767 server.pubsub_patterns = listCreate();
1768 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1769 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1770 server.cronloops = 0;
9f3c422c 1771 server.bgsavechildpid = -1;
9d65a1bb 1772 server.bgrewritechildpid = -1;
1773 server.bgrewritebuf = sdsempty();
28ed1f33 1774 server.aofbuf = sdsempty();
ed9b544e 1775 server.lastsave = time(NULL);
1776 server.dirty = 0;
ed9b544e 1777 server.stat_numcommands = 0;
1778 server.stat_numconnections = 0;
2a6a2ed1 1779 server.stat_expiredkeys = 0;
ed9b544e 1780 server.stat_starttime = time(NULL);
3a66edc7 1781 server.unixtime = time(NULL);
d8f8b666 1782 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1783 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1784 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1785
1786 if (server.appendonly) {
3bb225d6 1787 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1788 if (server.appendfd == -1) {
1789 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1790 strerror(errno));
1791 exit(1);
1792 }
1793 }
75680a3c 1794
1795 if (server.vm_enabled) vmInit();
ed9b544e 1796}
1797
1798/* Empty the whole database */
ca37e9cd 1799static long long emptyDb() {
ed9b544e 1800 int j;
ca37e9cd 1801 long long removed = 0;
ed9b544e 1802
3305306f 1803 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1804 removed += dictSize(server.db[j].dict);
3305306f 1805 dictEmpty(server.db[j].dict);
1806 dictEmpty(server.db[j].expires);
1807 }
ca37e9cd 1808 return removed;
ed9b544e 1809}
1810
/* Convert a "yes" / "no" string (compared case insensitively) into an
 * integer: 1 for "yes", 0 for "no", -1 for anything else. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1816
ed9b544e 1817/* I agree, this is a very rudimental way to load a configuration...
1818 will improve later if the config gets more complex */
1819static void loadServerConfig(char *filename) {
c9a111ac 1820 FILE *fp;
ed9b544e 1821 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1822 int linenum = 0;
1823 sds line = NULL;
c9a111ac 1824
1825 if (filename[0] == '-' && filename[1] == '\0')
1826 fp = stdin;
1827 else {
1828 if ((fp = fopen(filename,"r")) == NULL) {
9a22de82 1829 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
c9a111ac 1830 exit(1);
1831 }
ed9b544e 1832 }
c9a111ac 1833
ed9b544e 1834 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1835 sds *argv;
1836 int argc, j;
1837
1838 linenum++;
1839 line = sdsnew(buf);
1840 line = sdstrim(line," \t\r\n");
1841
1842 /* Skip comments and blank lines*/
1843 if (line[0] == '#' || line[0] == '\0') {
1844 sdsfree(line);
1845 continue;
1846 }
1847
1848 /* Split into arguments */
1849 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1850 sdstolower(argv[0]);
1851
1852 /* Execute config directives */
bb0b03a3 1853 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1854 server.maxidletime = atoi(argv[1]);
0150db36 1855 if (server.maxidletime < 0) {
ed9b544e 1856 err = "Invalid timeout value"; goto loaderr;
1857 }
bb0b03a3 1858 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1859 server.port = atoi(argv[1]);
1860 if (server.port < 1 || server.port > 65535) {
1861 err = "Invalid port"; goto loaderr;
1862 }
bb0b03a3 1863 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1864 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1865 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1866 int seconds = atoi(argv[1]);
1867 int changes = atoi(argv[2]);
1868 if (seconds < 1 || changes < 0) {
1869 err = "Invalid save parameters"; goto loaderr;
1870 }
1871 appendServerSaveParams(seconds,changes);
bb0b03a3 1872 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1873 if (chdir(argv[1]) == -1) {
1874 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1875 argv[1], strerror(errno));
1876 exit(1);
1877 }
bb0b03a3 1878 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1879 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1880 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1881 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1882 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1883 else {
1884 err = "Invalid log level. Must be one of debug, notice, warning";
1885 goto loaderr;
1886 }
bb0b03a3 1887 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1888 FILE *logfp;
ed9b544e 1889
1890 server.logfile = zstrdup(argv[1]);
bb0b03a3 1891 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1892 zfree(server.logfile);
1893 server.logfile = NULL;
1894 }
1895 if (server.logfile) {
1896 /* Test if we are able to open the file. The server will not
1897 * be able to abort just for this problem later... */
c9a111ac 1898 logfp = fopen(server.logfile,"a");
1899 if (logfp == NULL) {
ed9b544e 1900 err = sdscatprintf(sdsempty(),
1901 "Can't open the log file: %s", strerror(errno));
1902 goto loaderr;
1903 }
c9a111ac 1904 fclose(logfp);
ed9b544e 1905 }
bb0b03a3 1906 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1907 server.dbnum = atoi(argv[1]);
1908 if (server.dbnum < 1) {
1909 err = "Invalid number of databases"; goto loaderr;
1910 }
b3f83f12
JZ
1911 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1912 loadServerConfig(argv[1]);
285add55 1913 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1914 server.maxclients = atoi(argv[1]);
3fd78bcd 1915 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
2b619329 1916 server.maxmemory = memtoll(argv[1],NULL);
bb0b03a3 1917 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1918 server.masterhost = sdsnew(argv[1]);
1919 server.masterport = atoi(argv[2]);
1920 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1921 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1922 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1923 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1924 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1925 err = "argument must be 'yes' or 'no'"; goto loaderr;
1926 }
121f70cf 1927 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1928 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
8ca3e9d1 1929 err = "argument must be 'yes' or 'no'"; goto loaderr;
1930 }
1931 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1932 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
121f70cf 1933 err = "argument must be 'yes' or 'no'"; goto loaderr;
1934 }
bb0b03a3 1935 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1936 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1937 err = "argument must be 'yes' or 'no'"; goto loaderr;
1938 }
44b38ef4 1939 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1940 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1941 err = "argument must be 'yes' or 'no'"; goto loaderr;
1942 }
f3b52411
PN
1943 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1944 zfree(server.appendfilename);
1945 server.appendfilename = zstrdup(argv[1]);
48f0308a 1946 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1947 if (!strcasecmp(argv[1],"no")) {
48f0308a 1948 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1949 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1950 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1951 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1952 server.appendfsync = APPENDFSYNC_EVERYSEC;
1953 } else {
1954 err = "argument must be 'no', 'always' or 'everysec'";
1955 goto loaderr;
1956 }
bb0b03a3 1957 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1958 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1959 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 1960 zfree(server.pidfile);
054e426d 1961 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1962 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 1963 zfree(server.dbfilename);
054e426d 1964 server.dbfilename = zstrdup(argv[1]);
75680a3c 1965 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1966 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1967 err = "argument must be 'yes' or 'no'"; goto loaderr;
1968 }
054e426d 1969 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1970 zfree(server.vm_swap_file);
054e426d 1971 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1972 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2b619329 1973 server.vm_max_memory = memtoll(argv[1],NULL);
4ef8de8a 1974 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2b619329 1975 server.vm_page_size = memtoll(argv[1], NULL);
4ef8de8a 1976 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2b619329 1977 server.vm_pages = memtoll(argv[1], NULL);
92f8e882 1978 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1979 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 1980 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2b619329 1981 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
cbba7dd7 1982 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2b619329 1983 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
ed9b544e 1984 } else {
1985 err = "Bad directive or wrong number of arguments"; goto loaderr;
1986 }
1987 for (j = 0; j < argc; j++)
1988 sdsfree(argv[j]);
1989 zfree(argv);
1990 sdsfree(line);
1991 }
c9a111ac 1992 if (fp != stdin) fclose(fp);
ed9b544e 1993 return;
1994
1995loaderr:
1996 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1997 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1998 fprintf(stderr, ">>> '%s'\n", line);
1999 fprintf(stderr, "%s\n", err);
2000 exit(1);
2001}
2002
2003static void freeClientArgv(redisClient *c) {
2004 int j;
2005
2006 for (j = 0; j < c->argc; j++)
2007 decrRefCount(c->argv[j]);
e8a74421 2008 for (j = 0; j < c->mbargc; j++)
2009 decrRefCount(c->mbargv[j]);
ed9b544e 2010 c->argc = 0;
e8a74421 2011 c->mbargc = 0;
ed9b544e 2012}
2013
2014static void freeClient(redisClient *c) {
2015 listNode *ln;
2016
4409877e 2017 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 2018 * call, we have to set querybuf to NULL *before* to call
2019 * unblockClientWaitingData() to avoid processInputBuffer() will get
2020 * called. Also it is important to remove the file events after
2021 * this, because this call adds the READABLE event. */
4409877e 2022 sdsfree(c->querybuf);
2023 c->querybuf = NULL;
2024 if (c->flags & REDIS_BLOCKED)
b0d8747d 2025 unblockClientWaitingData(c);
4409877e 2026
37ab76c9 2027 /* UNWATCH all the keys */
2028 unwatchAllKeys(c);
2029 listRelease(c->watched_keys);
ffc6b7f8 2030 /* Unsubscribe from all the pubsub channels */
2031 pubsubUnsubscribeAllChannels(c,0);
2032 pubsubUnsubscribeAllPatterns(c,0);
2033 dictRelease(c->pubsub_channels);
2034 listRelease(c->pubsub_patterns);
befec3cd 2035 /* Obvious cleanup */
ed9b544e 2036 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2037 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 2038 listRelease(c->reply);
2039 freeClientArgv(c);
2040 close(c->fd);
92f8e882 2041 /* Remove from the list of clients */
ed9b544e 2042 ln = listSearchKey(server.clients,c);
dfc5e96c 2043 redisAssert(ln != NULL);
ed9b544e 2044 listDelNode(server.clients,ln);
37ab76c9 2045 /* Remove from the list of clients that are now ready to be restarted
2046 * after waiting for swapped keys */
d5d55fc3 2047 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2048 ln = listSearchKey(server.io_ready_clients,c);
2049 if (ln) {
2050 listDelNode(server.io_ready_clients,ln);
2051 server.vm_blocked_clients--;
2052 }
2053 }
37ab76c9 2054 /* Remove from the list of clients waiting for swapped keys */
d5d55fc3 2055 while (server.vm_enabled && listLength(c->io_keys)) {
2056 ln = listFirst(c->io_keys);
2057 dontWaitForSwappedKey(c,ln->value);
92f8e882 2058 }
b3e3d0d7 2059 listRelease(c->io_keys);
befec3cd 2060 /* Master/slave cleanup */
ed9b544e 2061 if (c->flags & REDIS_SLAVE) {
6208b3a7 2062 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2063 close(c->repldbfd);
87eca727 2064 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2065 ln = listSearchKey(l,c);
dfc5e96c 2066 redisAssert(ln != NULL);
87eca727 2067 listDelNode(l,ln);
ed9b544e 2068 }
2069 if (c->flags & REDIS_MASTER) {
2070 server.master = NULL;
2071 server.replstate = REDIS_REPL_CONNECT;
2072 }
befec3cd 2073 /* Release memory */
93ea3759 2074 zfree(c->argv);
e8a74421 2075 zfree(c->mbargv);
6e469882 2076 freeClientMultiState(c);
ed9b544e 2077 zfree(c);
2078}
2079
cc30e368 2080#define GLUEREPLY_UP_TO (1024)
ed9b544e 2081static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 2082 int copylen = 0;
2083 char buf[GLUEREPLY_UP_TO];
6208b3a7 2084 listNode *ln;
c7df85a4 2085 listIter li;
ed9b544e 2086 robj *o;
2087
c7df85a4 2088 listRewind(c->reply,&li);
2089 while((ln = listNext(&li))) {
c28b42ac 2090 int objlen;
2091
ed9b544e 2092 o = ln->value;
c28b42ac 2093 objlen = sdslen(o->ptr);
2094 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2095 memcpy(buf+copylen,o->ptr,objlen);
2096 copylen += objlen;
ed9b544e 2097 listDelNode(c->reply,ln);
c28b42ac 2098 } else {
2099 if (copylen == 0) return;
2100 break;
ed9b544e 2101 }
ed9b544e 2102 }
c28b42ac 2103 /* Now the output buffer is empty, add the new single element */
2104 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2105 listAddNodeHead(c->reply,o);
ed9b544e 2106}
2107
2108static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2109 redisClient *c = privdata;
2110 int nwritten = 0, totwritten = 0, objlen;
2111 robj *o;
2112 REDIS_NOTUSED(el);
2113 REDIS_NOTUSED(mask);
2114
2895e862 2115 /* Use writev() if we have enough buffers to send */
7ea870c0 2116 if (!server.glueoutputbuf &&
e0a62c7f 2117 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 2118 !(c->flags & REDIS_MASTER))
2895e862 2119 {
2120 sendReplyToClientWritev(el, fd, privdata, mask);
2121 return;
2122 }
2895e862 2123
ed9b544e 2124 while(listLength(c->reply)) {
c28b42ac 2125 if (server.glueoutputbuf && listLength(c->reply) > 1)
2126 glueReplyBuffersIfNeeded(c);
2127
ed9b544e 2128 o = listNodeValue(listFirst(c->reply));
2129 objlen = sdslen(o->ptr);
2130
2131 if (objlen == 0) {
2132 listDelNode(c->reply,listFirst(c->reply));
2133 continue;
2134 }
2135
2136 if (c->flags & REDIS_MASTER) {
6f376729 2137 /* Don't reply to a master */
ed9b544e 2138 nwritten = objlen - c->sentlen;
2139 } else {
a4d1ba9a 2140 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2141 if (nwritten <= 0) break;
2142 }
2143 c->sentlen += nwritten;
2144 totwritten += nwritten;
2145 /* If we fully sent the object on head go to the next one */
2146 if (c->sentlen == objlen) {
2147 listDelNode(c->reply,listFirst(c->reply));
2148 c->sentlen = 0;
2149 }
6f376729 2150 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2151 * bytes, in a single threaded server it's a good idea to serve
6f376729 2152 * other clients as well, even if a very large request comes from
2153 * super fast link that is always able to accept data (in real world
12f9d551 2154 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2155 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2156 }
2157 if (nwritten == -1) {
2158 if (errno == EAGAIN) {
2159 nwritten = 0;
2160 } else {
f870935d 2161 redisLog(REDIS_VERBOSE,
ed9b544e 2162 "Error writing to client: %s", strerror(errno));
2163 freeClient(c);
2164 return;
2165 }
2166 }
2167 if (totwritten > 0) c->lastinteraction = time(NULL);
2168 if (listLength(c->reply) == 0) {
2169 c->sentlen = 0;
2170 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2171 }
2172}
2173
2895e862 2174static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2175{
2176 redisClient *c = privdata;
2177 int nwritten = 0, totwritten = 0, objlen, willwrite;
2178 robj *o;
2179 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2180 int offset, ion = 0;
2181 REDIS_NOTUSED(el);
2182 REDIS_NOTUSED(mask);
2183
2184 listNode *node;
2185 while (listLength(c->reply)) {
2186 offset = c->sentlen;
2187 ion = 0;
2188 willwrite = 0;
2189
2190 /* fill-in the iov[] array */
2191 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2192 o = listNodeValue(node);
2193 objlen = sdslen(o->ptr);
2194
e0a62c7f 2195 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2196 break;
2197
2198 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2199 break; /* no more iovecs */
2200
2201 iov[ion].iov_base = ((char*)o->ptr) + offset;
2202 iov[ion].iov_len = objlen - offset;
2203 willwrite += objlen - offset;
2204 offset = 0; /* just for the first item */
2205 ion++;
2206 }
2207
2208 if(willwrite == 0)
2209 break;
2210
2211 /* write all collected blocks at once */
2212 if((nwritten = writev(fd, iov, ion)) < 0) {
2213 if (errno != EAGAIN) {
f870935d 2214 redisLog(REDIS_VERBOSE,
2895e862 2215 "Error writing to client: %s", strerror(errno));
2216 freeClient(c);
2217 return;
2218 }
2219 break;
2220 }
2221
2222 totwritten += nwritten;
2223 offset = c->sentlen;
2224
2225 /* remove written robjs from c->reply */
2226 while (nwritten && listLength(c->reply)) {
2227 o = listNodeValue(listFirst(c->reply));
2228 objlen = sdslen(o->ptr);
2229
2230 if(nwritten >= objlen - offset) {
2231 listDelNode(c->reply, listFirst(c->reply));
2232 nwritten -= objlen - offset;
2233 c->sentlen = 0;
2234 } else {
2235 /* partial write */
2236 c->sentlen += nwritten;
2237 break;
2238 }
2239 offset = 0;
2240 }
2241 }
2242
e0a62c7f 2243 if (totwritten > 0)
2895e862 2244 c->lastinteraction = time(NULL);
2245
2246 if (listLength(c->reply) == 0) {
2247 c->sentlen = 0;
2248 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2249 }
2250}
2251
1a132bbc
PN
2252static int qsortRedisCommands(const void *r1, const void *r2) {
2253 return strcasecmp(
2254 ((struct redisCommand*)r1)->name,
2255 ((struct redisCommand*)r2)->name);
2256}
2257
2258static void sortCommandTable() {
2259 int i = 0, size = 0;
2260
2261 /* Determine and store the size of the command table */
2262 while(readonlyCommandTable[i++].name != NULL) size++;
2263 commandTableSize = size;
2264
2265 /* Copy and sort the read-only version of the command table */
2266 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2267 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
2268 qsort(commandTable,size,sizeof(struct redisCommand),qsortRedisCommands);
2269}
2270
ed9b544e 2271static struct redisCommand *lookupCommand(char *name) {
1a132bbc
PN
2272 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2273 return bsearch(
2274 &tmp,
2275 commandTable,
2276 commandTableSize,
2277 sizeof(struct redisCommand),
2278 qsortRedisCommands);
ed9b544e 2279}
2280
2281/* resetClient prepare the client to process the next command */
2282static void resetClient(redisClient *c) {
2283 freeClientArgv(c);
2284 c->bulklen = -1;
e8a74421 2285 c->multibulk = 0;
ed9b544e 2286}
2287
6e469882 2288/* Call() is the core of Redis execution of a command */
2289static void call(redisClient *c, struct redisCommand *cmd) {
2290 long long dirty;
2291
2292 dirty = server.dirty;
2293 cmd->proc(c);
4005fef1 2294 dirty = server.dirty-dirty;
2295
2296 if (server.appendonly && dirty)
6e469882 2297 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2298 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2299 listLength(server.slaves))
248ea310 2300 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2301 if (listLength(server.monitors))
dd142b9c 2302 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2303 server.stat_numcommands++;
2304}
2305
ed9b544e 2306/* If this function gets called we already read a whole
2307 * command, arguments are in the client argv/argc fields.
2308 * processCommand() execute the command or prepare the
2309 * server for a bulk read from the client.
2310 *
2311 * If 1 is returned the client is still alive and valid and
2312 * and other operations can be performed by the caller. Otherwise
2313 * if 0 is returned the client was destroied (i.e. after QUIT). */
2314static int processCommand(redisClient *c) {
2315 struct redisCommand *cmd;
ed9b544e 2316
3fd78bcd 2317 /* Free some memory if needed (maxmemory setting) */
2318 if (server.maxmemory) freeMemoryIfNeeded();
2319
e8a74421 2320 /* Handle the multi bulk command type. This is an alternative protocol
2321 * supported by Redis in order to receive commands that are composed of
2322 * multiple binary-safe "bulk" arguments. The latency of processing is
2323 * a bit higher but this allows things like multi-sets, so if this
2324 * protocol is used only for MSET and similar commands this is a big win. */
2325 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2326 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2327 if (c->multibulk <= 0) {
2328 resetClient(c);
2329 return 1;
2330 } else {
2331 decrRefCount(c->argv[c->argc-1]);
2332 c->argc--;
2333 return 1;
2334 }
2335 } else if (c->multibulk) {
2336 if (c->bulklen == -1) {
2337 if (((char*)c->argv[0]->ptr)[0] != '$') {
2338 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2339 resetClient(c);
2340 return 1;
2341 } else {
2342 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2343 decrRefCount(c->argv[0]);
2344 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2345 c->argc--;
2346 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2347 resetClient(c);
2348 return 1;
2349 }
2350 c->argc--;
2351 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2352 return 1;
2353 }
2354 } else {
2355 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2356 c->mbargv[c->mbargc] = c->argv[0];
2357 c->mbargc++;
2358 c->argc--;
2359 c->multibulk--;
2360 if (c->multibulk == 0) {
2361 robj **auxargv;
2362 int auxargc;
2363
2364 /* Here we need to swap the multi-bulk argc/argv with the
2365 * normal argc/argv of the client structure. */
2366 auxargv = c->argv;
2367 c->argv = c->mbargv;
2368 c->mbargv = auxargv;
2369
2370 auxargc = c->argc;
2371 c->argc = c->mbargc;
2372 c->mbargc = auxargc;
2373
2374 /* We need to set bulklen to something different than -1
2375 * in order for the code below to process the command without
2376 * to try to read the last argument of a bulk command as
2377 * a special argument. */
2378 c->bulklen = 0;
2379 /* continue below and process the command */
2380 } else {
2381 c->bulklen = -1;
2382 return 1;
2383 }
2384 }
2385 }
2386 /* -- end of multi bulk commands processing -- */
2387
ed9b544e 2388 /* The QUIT command is handled as a special case. Normal command
2389 * procs are unable to close the client connection safely */
bb0b03a3 2390 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2391 freeClient(c);
2392 return 0;
2393 }
d5d55fc3 2394
2395 /* Now lookup the command and check ASAP about trivial error conditions
2396 * such wrong arity, bad command name and so forth. */
ed9b544e 2397 cmd = lookupCommand(c->argv[0]->ptr);
2398 if (!cmd) {
2c14807b 2399 addReplySds(c,
2400 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2401 (char*)c->argv[0]->ptr));
ed9b544e 2402 resetClient(c);
2403 return 1;
2404 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2405 (c->argc < -cmd->arity)) {
454d4e43 2406 addReplySds(c,
2407 sdscatprintf(sdsempty(),
2408 "-ERR wrong number of arguments for '%s' command\r\n",
2409 cmd->name));
ed9b544e 2410 resetClient(c);
2411 return 1;
2412 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2413 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2414 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2415
2416 decrRefCount(c->argv[c->argc-1]);
2417 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2418 c->argc--;
2419 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2420 resetClient(c);
2421 return 1;
2422 }
2423 c->argc--;
2424 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2425 /* It is possible that the bulk read is already in the
8d0490e7 2426 * buffer. Check this condition and handle it accordingly.
2427 * This is just a fast path, alternative to call processInputBuffer().
2428 * It's a good idea since the code is small and this condition
2429 * happens most of the times. */
ed9b544e 2430 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2431 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2432 c->argc++;
2433 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2434 } else {
d5d55fc3 2435 /* Otherwise return... there is to read the last argument
2436 * from the socket. */
ed9b544e 2437 return 1;
2438 }
2439 }
942a3961 2440 /* Let's try to encode the bulk object to save space. */
2441 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2442 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2443
e63943a4 2444 /* Check if the user is authenticated */
2445 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2446 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2447 resetClient(c);
2448 return 1;
2449 }
2450
b61a28fe 2451 /* Handle the maxmemory directive */
2452 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2453 zmalloc_used_memory() > server.maxmemory)
2454 {
2455 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2456 resetClient(c);
2457 return 1;
2458 }
2459
d6cc8867 2460 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
e6cca5db 2461 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2462 &&
ffc6b7f8 2463 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2464 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2465 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2466 resetClient(c);
2467 return 1;
2468 }
2469
ed9b544e 2470 /* Exec the command */
6531c94d 2471 if (c->flags & REDIS_MULTI &&
2472 cmd->proc != execCommand && cmd->proc != discardCommand &&
2473 cmd->proc != multiCommand && cmd->proc != watchCommand)
2474 {
6e469882 2475 queueMultiCommand(c,cmd);
2476 addReply(c,shared.queued);
2477 } else {
d5d55fc3 2478 if (server.vm_enabled && server.vm_max_threads > 0 &&
0a6f3f0f 2479 blockClientOnSwappedKeys(c,cmd)) return 1;
6e469882 2480 call(c,cmd);
2481 }
ed9b544e 2482
2483 /* Prepare the client for the next command */
ed9b544e 2484 resetClient(c);
2485 return 1;
2486}
2487
248ea310 2488static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2489 listNode *ln;
c7df85a4 2490 listIter li;
ed9b544e 2491 int outc = 0, j;
93ea3759 2492 robj **outv;
248ea310 2493 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2494 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2495 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2496 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2497 robj *lenobj;
93ea3759 2498
2499 if (argc <= REDIS_STATIC_ARGS) {
2500 outv = static_outv;
2501 } else {
248ea310 2502 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2503 }
248ea310 2504
2505 lenobj = createObject(REDIS_STRING,
2506 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2507 lenobj->refcount = 0;
2508 outv[outc++] = lenobj;
ed9b544e 2509 for (j = 0; j < argc; j++) {
248ea310 2510 lenobj = createObject(REDIS_STRING,
2511 sdscatprintf(sdsempty(),"$%lu\r\n",
2512 (unsigned long) stringObjectLen(argv[j])));
2513 lenobj->refcount = 0;
2514 outv[outc++] = lenobj;
ed9b544e 2515 outv[outc++] = argv[j];
248ea310 2516 outv[outc++] = shared.crlf;
ed9b544e 2517 }
ed9b544e 2518
40d224a9 2519 /* Increment all the refcounts at start and decrement at end in order to
2520 * be sure to free objects if there is no slave in a replication state
2521 * able to be feed with commands */
2522 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2523 listRewind(slaves,&li);
2524 while((ln = listNext(&li))) {
ed9b544e 2525 redisClient *slave = ln->value;
40d224a9 2526
2527 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2528 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2529
2530 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2531 if (slave->slaveseldb != dictid) {
2532 robj *selectcmd;
2533
2534 switch(dictid) {
2535 case 0: selectcmd = shared.select0; break;
2536 case 1: selectcmd = shared.select1; break;
2537 case 2: selectcmd = shared.select2; break;
2538 case 3: selectcmd = shared.select3; break;
2539 case 4: selectcmd = shared.select4; break;
2540 case 5: selectcmd = shared.select5; break;
2541 case 6: selectcmd = shared.select6; break;
2542 case 7: selectcmd = shared.select7; break;
2543 case 8: selectcmd = shared.select8; break;
2544 case 9: selectcmd = shared.select9; break;
2545 default:
2546 selectcmd = createObject(REDIS_STRING,
2547 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2548 selectcmd->refcount = 0;
2549 break;
2550 }
2551 addReply(slave,selectcmd);
2552 slave->slaveseldb = dictid;
2553 }
2554 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2555 }
40d224a9 2556 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2557 if (outv != static_outv) zfree(outv);
ed9b544e 2558}
2559
dd142b9c 2560static sds sdscatrepr(sds s, char *p, size_t len) {
2561 s = sdscatlen(s,"\"",1);
2562 while(len--) {
2563 switch(*p) {
2564 case '\\':
2565 case '"':
2566 s = sdscatprintf(s,"\\%c",*p);
2567 break;
2568 case '\n': s = sdscatlen(s,"\\n",1); break;
2569 case '\r': s = sdscatlen(s,"\\r",1); break;
2570 case '\t': s = sdscatlen(s,"\\t",1); break;
2571 case '\a': s = sdscatlen(s,"\\a",1); break;
2572 case '\b': s = sdscatlen(s,"\\b",1); break;
2573 default:
2574 if (isprint(*p))
2575 s = sdscatprintf(s,"%c",*p);
2576 else
2577 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2578 break;
2579 }
2580 p++;
2581 }
2582 return sdscatlen(s,"\"",1);
2583}
2584
2585static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2586 listNode *ln;
2587 listIter li;
2588 int j;
2589 sds cmdrepr = sdsnew("+");
2590 robj *cmdobj;
2591 struct timeval tv;
2592
2593 gettimeofday(&tv,NULL);
2594 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2595 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2596
2597 for (j = 0; j < argc; j++) {
2598 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2599 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2600 } else {
2601 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2602 sdslen(argv[j]->ptr));
2603 }
2604 if (j != argc-1)
2605 cmdrepr = sdscatlen(cmdrepr," ",1);
2606 }
2607 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2608 cmdobj = createObject(REDIS_STRING,cmdrepr);
2609
2610 listRewind(monitors,&li);
2611 while((ln = listNext(&li))) {
2612 redisClient *monitor = ln->value;
2613 addReply(monitor,cmdobj);
2614 }
2615 decrRefCount(cmdobj);
2616}
2617
638e42ac 2618static void processInputBuffer(redisClient *c) {
ed9b544e 2619again:
4409877e 2620 /* Before to process the input buffer, make sure the client is not
2621 * waitig for a blocking operation such as BLPOP. Note that the first
2622 * iteration the client is never blocked, otherwise the processInputBuffer
2623 * would not be called at all, but after the execution of the first commands
2624 * in the input buffer the client may be blocked, and the "goto again"
2625 * will try to reiterate. The following line will make it return asap. */
92f8e882 2626 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2627 if (c->bulklen == -1) {
2628 /* Read the first line of the query */
2629 char *p = strchr(c->querybuf,'\n');
2630 size_t querylen;
644fafa3 2631
ed9b544e 2632 if (p) {
2633 sds query, *argv;
2634 int argc, j;
e0a62c7f 2635
ed9b544e 2636 query = c->querybuf;
2637 c->querybuf = sdsempty();
2638 querylen = 1+(p-(query));
2639 if (sdslen(query) > querylen) {
2640 /* leave data after the first line of the query in the buffer */
2641 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2642 }
2643 *p = '\0'; /* remove "\n" */
2644 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2645 sdsupdatelen(query);
2646
2647 /* Now we can split the query in arguments */
ed9b544e 2648 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2649 sdsfree(query);
2650
2651 if (c->argv) zfree(c->argv);
2652 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2653
2654 for (j = 0; j < argc; j++) {
ed9b544e 2655 if (sdslen(argv[j])) {
2656 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2657 c->argc++;
2658 } else {
2659 sdsfree(argv[j]);
2660 }
2661 }
2662 zfree(argv);
7c49733c 2663 if (c->argc) {
2664 /* Execute the command. If the client is still valid
2665 * after processCommand() return and there is something
2666 * on the query buffer try to process the next command. */
2667 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2668 } else {
2669 /* Nothing to process, argc == 0. Just process the query
2670 * buffer if it's not empty or return to the caller */
2671 if (sdslen(c->querybuf)) goto again;
2672 }
ed9b544e 2673 return;
644fafa3 2674 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2675 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2676 freeClient(c);
2677 return;
2678 }
2679 } else {
2680 /* Bulk read handling. Note that if we are at this point
2681 the client already sent a command terminated with a newline,
2682 we are reading the bulk data that is actually the last
2683 argument of the command. */
2684 int qbl = sdslen(c->querybuf);
2685
2686 if (c->bulklen <= qbl) {
2687 /* Copy everything but the final CRLF as final argument */
2688 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2689 c->argc++;
2690 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2691 /* Process the command. If the client is still valid after
2692 * the processing and there is more data in the buffer
2693 * try to parse it. */
2694 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2695 return;
2696 }
2697 }
2698}
2699
638e42ac 2700static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2701 redisClient *c = (redisClient*) privdata;
2702 char buf[REDIS_IOBUF_LEN];
2703 int nread;
2704 REDIS_NOTUSED(el);
2705 REDIS_NOTUSED(mask);
2706
2707 nread = read(fd, buf, REDIS_IOBUF_LEN);
2708 if (nread == -1) {
2709 if (errno == EAGAIN) {
2710 nread = 0;
2711 } else {
f870935d 2712 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2713 freeClient(c);
2714 return;
2715 }
2716 } else if (nread == 0) {
f870935d 2717 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2718 freeClient(c);
2719 return;
2720 }
2721 if (nread) {
2722 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2723 c->lastinteraction = time(NULL);
2724 } else {
2725 return;
2726 }
168ac5c6 2727 processInputBuffer(c);
638e42ac 2728}
2729
ed9b544e 2730static int selectDb(redisClient *c, int id) {
2731 if (id < 0 || id >= server.dbnum)
2732 return REDIS_ERR;
3305306f 2733 c->db = &server.db[id];
ed9b544e 2734 return REDIS_OK;
2735}
2736
40d224a9 2737static void *dupClientReplyValue(void *o) {
2738 incrRefCount((robj*)o);
12d090d2 2739 return o;
40d224a9 2740}
2741
/* Match method for lists of string objects: returns what
 * equalStringObjects() returns for the two values. */
static int listMatchObjects(void *a, void *b) {
    int equal = equalStringObjects(a,b);

    return equal;
}
2745
ed9b544e 2746static redisClient *createClient(int fd) {
2747 redisClient *c = zmalloc(sizeof(*c));
2748
2749 anetNonBlock(NULL,fd);
2750 anetTcpNoDelay(NULL,fd);
2751 if (!c) return NULL;
2752 selectDb(c,0);
2753 c->fd = fd;
2754 c->querybuf = sdsempty();
2755 c->argc = 0;
93ea3759 2756 c->argv = NULL;
ed9b544e 2757 c->bulklen = -1;
e8a74421 2758 c->multibulk = 0;
2759 c->mbargc = 0;
2760 c->mbargv = NULL;
ed9b544e 2761 c->sentlen = 0;
2762 c->flags = 0;
2763 c->lastinteraction = time(NULL);
abcb223e 2764 c->authenticated = 0;
40d224a9 2765 c->replstate = REDIS_REPL_NONE;
6b47e12e 2766 c->reply = listCreate();
ed9b544e 2767 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2768 listSetDupMethod(c->reply,dupClientReplyValue);
37ab76c9 2769 c->blocking_keys = NULL;
2770 c->blocking_keys_num = 0;
92f8e882 2771 c->io_keys = listCreate();
87c68815 2772 c->watched_keys = listCreate();
92f8e882 2773 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2774 c->pubsub_channels = dictCreate(&setDictType,NULL);
2775 c->pubsub_patterns = listCreate();
2776 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2777 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2778 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2779 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2780 freeClient(c);
2781 return NULL;
2782 }
6b47e12e 2783 listAddNodeTail(server.clients,c);
6e469882 2784 initClientMultiState(c);
ed9b544e 2785 return c;
2786}
2787
2788static void addReply(redisClient *c, robj *obj) {
2789 if (listLength(c->reply) == 0 &&
6208b3a7 2790 (c->replstate == REDIS_REPL_NONE ||
2791 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2792 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2793 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2794
2795 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2796 obj = dupStringObject(obj);
2797 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2798 }
9d65a1bb 2799 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2800}
2801
2802static void addReplySds(redisClient *c, sds s) {
2803 robj *o = createObject(REDIS_STRING,s);
2804 addReply(c,o);
2805 decrRefCount(o);
2806}
2807
e2665397 2808static void addReplyDouble(redisClient *c, double d) {
2809 char buf[128];
2810
2811 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2812 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2813 (unsigned long) strlen(buf),buf));
e2665397 2814}
2815
aa7c2934
PN
2816static void addReplyLongLong(redisClient *c, long long ll) {
2817 char buf[128];
2818 size_t len;
2819
2820 if (ll == 0) {
2821 addReply(c,shared.czero);
2822 return;
2823 } else if (ll == 1) {
2824 addReply(c,shared.cone);
2825 return;
2826 }
482b672d 2827 buf[0] = ':';
2828 len = ll2string(buf+1,sizeof(buf)-1,ll);
2829 buf[len+1] = '\r';
2830 buf[len+2] = '\n';
2831 addReplySds(c,sdsnewlen(buf,len+3));
aa7c2934
PN
2832}
2833
92b27fe9 2834static void addReplyUlong(redisClient *c, unsigned long ul) {
2835 char buf[128];
2836 size_t len;
2837
dd88747b 2838 if (ul == 0) {
2839 addReply(c,shared.czero);
2840 return;
2841 } else if (ul == 1) {
2842 addReply(c,shared.cone);
2843 return;
2844 }
92b27fe9 2845 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2846 addReplySds(c,sdsnewlen(buf,len));
2847}
2848
942a3961 2849static void addReplyBulkLen(redisClient *c, robj *obj) {
482b672d 2850 size_t len, intlen;
2851 char buf[128];
942a3961 2852
2853 if (obj->encoding == REDIS_ENCODING_RAW) {
2854 len = sdslen(obj->ptr);
2855 } else {
2856 long n = (long)obj->ptr;
2857
e054afda 2858 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2859 len = 1;
2860 if (n < 0) {
2861 len++;
2862 n = -n;
2863 }
2864 while((n = n/10) != 0) {
2865 len++;
2866 }
2867 }
482b672d 2868 buf[0] = '$';
2869 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2870 buf[intlen+1] = '\r';
2871 buf[intlen+2] = '\n';
2872 addReplySds(c,sdsnewlen(buf,intlen+3));
942a3961 2873}
2874
dd88747b 2875static void addReplyBulk(redisClient *c, robj *obj) {
2876 addReplyBulkLen(c,obj);
2877 addReply(c,obj);
2878 addReply(c,shared.crlf);
2879}
2880
500ece7c 2881/* In the CONFIG command we need to add vanilla C string as bulk replies */
2882static void addReplyBulkCString(redisClient *c, char *s) {
2883 if (s == NULL) {
2884 addReply(c,shared.nullbulk);
2885 } else {
2886 robj *o = createStringObject(s,strlen(s));
2887 addReplyBulk(c,o);
2888 decrRefCount(o);
2889 }
2890}
2891
ed9b544e 2892static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2893 int cport, cfd;
2894 char cip[128];
285add55 2895 redisClient *c;
ed9b544e 2896 REDIS_NOTUSED(el);
2897 REDIS_NOTUSED(mask);
2898 REDIS_NOTUSED(privdata);
2899
2900 cfd = anetAccept(server.neterr, fd, cip, &cport);
2901 if (cfd == AE_ERR) {
f870935d 2902 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2903 return;
2904 }
f870935d 2905 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2906 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2907 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2908 close(cfd); /* May be already closed, just ingore errors */
2909 return;
2910 }
285add55 2911 /* If maxclient directive is set and this is one client more... close the
2912 * connection. Note that we create the client instead to check before
2913 * for this condition, since now the socket is already set in nonblocking
2914 * mode and we can send an error for free using the Kernel I/O */
2915 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2916 char *err = "-ERR max number of clients reached\r\n";
2917
2918 /* That's a best effort error message, don't check write errors */
fee803ba 2919 if (write(c->fd,err,strlen(err)) == -1) {
2920 /* Nothing to do, Just to avoid the warning... */
2921 }
285add55 2922 freeClient(c);
2923 return;
2924 }
ed9b544e 2925 server.stat_numconnections++;
2926}
2927
2928/* ======================= Redis objects implementation ===================== */
2929
2930static robj *createObject(int type, void *ptr) {
2931 robj *o;
2932
a5819310 2933 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2934 if (listLength(server.objfreelist)) {
2935 listNode *head = listFirst(server.objfreelist);
2936 o = listNodeValue(head);
2937 listDelNode(server.objfreelist,head);
a5819310 2938 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2939 } else {
75680a3c 2940 if (server.vm_enabled) {
a5819310 2941 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2942 o = zmalloc(sizeof(*o));
2943 } else {
2944 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2945 }
ed9b544e 2946 }
ed9b544e 2947 o->type = type;
942a3961 2948 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2949 o->ptr = ptr;
2950 o->refcount = 1;
3a66edc7 2951 if (server.vm_enabled) {
1064ef87 2952 /* Note that this code may run in the context of an I/O thread
2953 * and accessing to server.unixtime in theory is an error
2954 * (no locks). But in practice this is safe, and even if we read
2955 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2956 o->vm.atime = server.unixtime;
2957 o->storage = REDIS_VM_MEMORY;
2958 }
ed9b544e 2959 return o;
2960}
2961
2962static robj *createStringObject(char *ptr, size_t len) {
2963 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2964}
2965
3f973463
PN
2966static robj *createStringObjectFromLongLong(long long value) {
2967 robj *o;
2968 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2969 incrRefCount(shared.integers[value]);
2970 o = shared.integers[value];
2971 } else {
3f973463 2972 if (value >= LONG_MIN && value <= LONG_MAX) {
10dea8dc 2973 o = createObject(REDIS_STRING, NULL);
3f973463
PN
2974 o->encoding = REDIS_ENCODING_INT;
2975 o->ptr = (void*)((long)value);
2976 } else {
ee14da56 2977 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3f973463
PN
2978 }
2979 }
2980 return o;
2981}
2982
4ef8de8a 2983static robj *dupStringObject(robj *o) {
b9bc0eef 2984 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2985 return createStringObject(o->ptr,sdslen(o->ptr));
2986}
2987
ed9b544e 2988static robj *createListObject(void) {
2989 list *l = listCreate();
2990
ed9b544e 2991 listSetFreeMethod(l,decrRefCount);
2992 return createObject(REDIS_LIST,l);
2993}
2994
2995static robj *createSetObject(void) {
2996 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2997 return createObject(REDIS_SET,d);
2998}
2999
5234952b 3000static robj *createHashObject(void) {
3001 /* All the Hashes start as zipmaps. Will be automatically converted
3002 * into hash tables if there are enough elements or big elements
3003 * inside. */
3004 unsigned char *zm = zipmapNew();
3005 robj *o = createObject(REDIS_HASH,zm);
3006 o->encoding = REDIS_ENCODING_ZIPMAP;
3007 return o;
3008}
3009
1812e024 3010static robj *createZsetObject(void) {
6b47e12e 3011 zset *zs = zmalloc(sizeof(*zs));
3012
3013 zs->dict = dictCreate(&zsetDictType,NULL);
3014 zs->zsl = zslCreate();
3015 return createObject(REDIS_ZSET,zs);
1812e024 3016}
3017
ed9b544e 3018static void freeStringObject(robj *o) {
942a3961 3019 if (o->encoding == REDIS_ENCODING_RAW) {
3020 sdsfree(o->ptr);
3021 }
ed9b544e 3022}
3023
3024static void freeListObject(robj *o) {
3025 listRelease((list*) o->ptr);
3026}
3027
3028static void freeSetObject(robj *o) {
3029 dictRelease((dict*) o->ptr);
3030}
3031
fd8ccf44 3032static void freeZsetObject(robj *o) {
3033 zset *zs = o->ptr;
3034
3035 dictRelease(zs->dict);
3036 zslFree(zs->zsl);
3037 zfree(zs);
3038}
3039
ed9b544e 3040static void freeHashObject(robj *o) {
cbba7dd7 3041 switch (o->encoding) {
3042 case REDIS_ENCODING_HT:
3043 dictRelease((dict*) o->ptr);
3044 break;
3045 case REDIS_ENCODING_ZIPMAP:
3046 zfree(o->ptr);
3047 break;
3048 default:
f83c6cb5 3049 redisPanic("Unknown hash encoding type");
cbba7dd7 3050 break;
3051 }
ed9b544e 3052}
3053
3054static void incrRefCount(robj *o) {
3055 o->refcount++;
3056}
3057
3058static void decrRefCount(void *obj) {
3059 robj *o = obj;
94754ccc 3060
c651fd9e 3061 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
970e10bb 3062 /* Object is a key of a swapped out value, or in the process of being
3063 * loaded. */
996cb5f7 3064 if (server.vm_enabled &&
3065 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3066 {
996cb5f7 3067 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 3068 redisAssert(o->type == REDIS_STRING);
a35ddf12 3069 freeStringObject(o);
3070 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 3071 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 3072 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3073 !listAddNodeHead(server.objfreelist,o))
3074 zfree(o);
a5819310 3075 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 3076 server.vm_stats_swapped_objects--;
a35ddf12 3077 return;
3078 }
996cb5f7 3079 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 3080 if (--(o->refcount) == 0) {
996cb5f7 3081 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3082 vmCancelThreadedIOJob(obj);
ed9b544e 3083 switch(o->type) {
3084 case REDIS_STRING: freeStringObject(o); break;
3085 case REDIS_LIST: freeListObject(o); break;
3086 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 3087 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 3088 case REDIS_HASH: freeHashObject(o); break;
f83c6cb5 3089 default: redisPanic("Unknown object type"); break;
ed9b544e 3090 }
a5819310 3091 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 3092 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3093 !listAddNodeHead(server.objfreelist,o))
3094 zfree(o);
a5819310 3095 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3096 }
3097}
3098
942a3961 3099static robj *lookupKey(redisDb *db, robj *key) {
3100 dictEntry *de = dictFind(db->dict,key);
3a66edc7 3101 if (de) {
55cf8433 3102 robj *key = dictGetEntryKey(de);
3103 robj *val = dictGetEntryVal(de);
3a66edc7 3104
55cf8433 3105 if (server.vm_enabled) {
996cb5f7 3106 if (key->storage == REDIS_VM_MEMORY ||
3107 key->storage == REDIS_VM_SWAPPING)
3108 {
3109 /* If we were swapping the object out, stop it, this key
3110 * was requested. */
3111 if (key->storage == REDIS_VM_SWAPPING)
3112 vmCancelThreadedIOJob(key);
55cf8433 3113 /* Update the access time of the key for the aging algorithm. */
3114 key->vm.atime = server.unixtime;
3115 } else {
d5d55fc3 3116 int notify = (key->storage == REDIS_VM_LOADING);
3117
55cf8433 3118 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 3119 redisAssert(val == NULL);
55cf8433 3120 val = vmLoadObject(key);
3121 dictGetEntryVal(de) = val;
d5d55fc3 3122
3123 /* Clients blocked by the VM subsystem may be waiting for
3124 * this key... */
3125 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 3126 }
3127 }
3128 return val;
3a66edc7 3129 } else {
3130 return NULL;
3131 }
942a3961 3132}
3133
3134static robj *lookupKeyRead(redisDb *db, robj *key) {
3135 expireIfNeeded(db,key);
3136 return lookupKey(db,key);
3137}
3138
3139static robj *lookupKeyWrite(redisDb *db, robj *key) {
3140 deleteIfVolatile(db,key);
37ab76c9 3141 touchWatchedKey(db,key);
942a3961 3142 return lookupKey(db,key);
3143}
3144
92b27fe9 3145static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3146 robj *o = lookupKeyRead(c->db, key);
3147 if (!o) addReply(c,reply);
3148 return o;
3149}
3150
3151static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3152 robj *o = lookupKeyWrite(c->db, key);
3153 if (!o) addReply(c,reply);
3154 return o;
3155}
3156
3157static int checkType(redisClient *c, robj *o, int type) {
3158 if (o->type != type) {
3159 addReply(c,shared.wrongtypeerr);
3160 return 1;
3161 }
3162 return 0;
3163}
3164
942a3961 3165static int deleteKey(redisDb *db, robj *key) {
3166 int retval;
3167
3168 /* We need to protect key from destruction: after the first dictDelete()
3169 * it may happen that 'key' is no longer valid if we don't increment
3170 * it's count. This may happen when we get the object reference directly
3171 * from the hash table with dictRandomKey() or dict iterators */
3172 incrRefCount(key);
3173 if (dictSize(db->expires)) dictDelete(db->expires,key);
3174 retval = dictDelete(db->dict,key);
3175 decrRefCount(key);
3176
3177 return retval == DICT_OK;
3178}
3179
724a51b1 3180/* Check if the nul-terminated string 's' can be represented by a long
3181 * (that is, is a number that fits into long without any other space or
3182 * character before or after the digits).
3183 *
3184 * If so, the function returns REDIS_OK and *longval is set to the value
3185 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 3186static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 3187 char buf[32], *endptr;
3188 long value;
3189 int slen;
e0a62c7f 3190
724a51b1 3191 value = strtol(s, &endptr, 10);
3192 if (endptr[0] != '\0') return REDIS_ERR;
ee14da56 3193 slen = ll2string(buf,32,value);
724a51b1 3194
3195 /* If the number converted back into a string is not identical
3196 * then it's not possible to encode the string as integer */
f69f2cba 3197 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 3198 if (longval) *longval = value;
3199 return REDIS_OK;
3200}
3201
942a3961 3202/* Try to encode a string object in order to save space */
05df7621 3203static robj *tryObjectEncoding(robj *o) {
942a3961 3204 long value;
942a3961 3205 sds s = o->ptr;
3305306f 3206
942a3961 3207 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 3208 return o; /* Already encoded */
3305306f 3209
05df7621 3210 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 3211 * everywhere in the "object space" of Redis. Encoded objects can only
3212 * appear as "values" (and not, for instance, as keys) */
05df7621 3213 if (o->refcount > 1) return o;
3305306f 3214
942a3961 3215 /* Currently we try to encode only strings */
dfc5e96c 3216 redisAssert(o->type == REDIS_STRING);
94754ccc 3217
724a51b1 3218 /* Check if we can represent this string as a long integer */
05df7621 3219 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 3220
3221 /* Ok, this object can be encoded */
05df7621 3222 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3223 decrRefCount(o);
3224 incrRefCount(shared.integers[value]);
3225 return shared.integers[value];
3226 } else {
3227 o->encoding = REDIS_ENCODING_INT;
3228 sdsfree(o->ptr);
3229 o->ptr = (void*) value;
3230 return o;
3231 }
942a3961 3232}
3233
9d65a1bb 3234/* Get a decoded version of an encoded object (returned as a new object).
3235 * If the object is already raw-encoded just increment the ref count. */
3236static robj *getDecodedObject(robj *o) {
942a3961 3237 robj *dec;
e0a62c7f 3238
9d65a1bb 3239 if (o->encoding == REDIS_ENCODING_RAW) {
3240 incrRefCount(o);
3241 return o;
3242 }
942a3961 3243 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3244 char buf[32];
3245
ee14da56 3246 ll2string(buf,32,(long)o->ptr);
942a3961 3247 dec = createStringObject(buf,strlen(buf));
3248 return dec;
3249 } else {
08ee9b57 3250 redisPanic("Unknown encoding type");
942a3961 3251 }
3305306f 3252}
3253
d7f43c08 3254/* Compare two string objects via strcmp() or alike.
3255 * Note that the objects may be integer-encoded. In such a case we
ee14da56 3256 * use ll2string() to get a string representation of the numbers on the stack
1fd9bc8a 3257 * and compare the strings, it's much faster than calling getDecodedObject().
3258 *
3259 * Important note: if objects are not integer encoded, but binary-safe strings,
3260 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3261 * binary safe. */
724a51b1 3262static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3263 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3264 char bufa[128], bufb[128], *astr, *bstr;
3265 int bothsds = 1;
724a51b1 3266
e197b441 3267 if (a == b) return 0;
d7f43c08 3268 if (a->encoding != REDIS_ENCODING_RAW) {
ee14da56 3269 ll2string(bufa,sizeof(bufa),(long) a->ptr);
d7f43c08 3270 astr = bufa;
3271 bothsds = 0;
724a51b1 3272 } else {
d7f43c08 3273 astr = a->ptr;
724a51b1 3274 }
d7f43c08 3275 if (b->encoding != REDIS_ENCODING_RAW) {
ee14da56 3276 ll2string(bufb,sizeof(bufb),(long) b->ptr);
d7f43c08 3277 bstr = bufb;
3278 bothsds = 0;
3279 } else {
3280 bstr = b->ptr;
3281 }
3282 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3283}
3284
bf028098 3285/* Equal string objects return 1 if the two objects are the same from the
3286 * point of view of a string comparison, otherwise 0 is returned. Note that
3287 * this function is faster then checking for (compareStringObject(a,b) == 0)
3288 * because it can perform some more optimization. */
3289static int equalStringObjects(robj *a, robj *b) {
3290 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3291 return a->ptr == b->ptr;
3292 } else {
3293 return compareStringObjects(a,b) == 0;
3294 }
3295}
3296
0ea663ea 3297static size_t stringObjectLen(robj *o) {
dfc5e96c 3298 redisAssert(o->type == REDIS_STRING);
0ea663ea 3299 if (o->encoding == REDIS_ENCODING_RAW) {
3300 return sdslen(o->ptr);
3301 } else {
3302 char buf[32];
3303
ee14da56 3304 return ll2string(buf,32,(long)o->ptr);
0ea663ea 3305 }
3306}
3307
bd79a6bd
PN
3308static int getDoubleFromObject(robj *o, double *target) {
3309 double value;
682c73e8 3310 char *eptr;
bbe025e0 3311
bd79a6bd
PN
3312 if (o == NULL) {
3313 value = 0;
3314 } else {
3315 redisAssert(o->type == REDIS_STRING);
3316 if (o->encoding == REDIS_ENCODING_RAW) {
3317 value = strtod(o->ptr, &eptr);
682c73e8 3318 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3319 } else if (o->encoding == REDIS_ENCODING_INT) {
3320 value = (long)o->ptr;
3321 } else {
946342c1 3322 redisPanic("Unknown string encoding");
bd79a6bd
PN
3323 }
3324 }
3325
bd79a6bd
PN
3326 *target = value;
3327 return REDIS_OK;
3328}
bbe025e0 3329
bd79a6bd
PN
3330static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3331 double value;
3332 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3333 if (msg != NULL) {
3334 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3335 } else {
3336 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3337 }
bbe025e0
AM
3338 return REDIS_ERR;
3339 }
3340
bd79a6bd 3341 *target = value;
bbe025e0
AM
3342 return REDIS_OK;
3343}
3344
bd79a6bd
PN
3345static int getLongLongFromObject(robj *o, long long *target) {
3346 long long value;
682c73e8 3347 char *eptr;
bbe025e0 3348
bd79a6bd
PN
3349 if (o == NULL) {
3350 value = 0;
3351 } else {
3352 redisAssert(o->type == REDIS_STRING);
3353 if (o->encoding == REDIS_ENCODING_RAW) {
3354 value = strtoll(o->ptr, &eptr, 10);
682c73e8 3355 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3356 } else if (o->encoding == REDIS_ENCODING_INT) {
3357 value = (long)o->ptr;
3358 } else {
946342c1 3359 redisPanic("Unknown string encoding");
bd79a6bd
PN
3360 }
3361 }
3362
bd79a6bd
PN
3363 *target = value;
3364 return REDIS_OK;
3365}
bbe025e0 3366
bd79a6bd
PN
3367static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3368 long long value;
3369 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3370 if (msg != NULL) {
3371 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3372 } else {
3373 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3374 }
bbe025e0
AM
3375 return REDIS_ERR;
3376 }
3377
bd79a6bd 3378 *target = value;
bbe025e0
AM
3379 return REDIS_OK;
3380}
3381
bd79a6bd
PN
3382static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3383 long long value;
bbe025e0 3384
bd79a6bd
PN
3385 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3386 if (value < LONG_MIN || value > LONG_MAX) {
3387 if (msg != NULL) {
3388 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3389 } else {
3390 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3391 }
bbe025e0
AM
3392 return REDIS_ERR;
3393 }
3394
bd79a6bd 3395 *target = value;
bbe025e0
AM
3396 return REDIS_OK;
3397}
3398
06233c45 3399/*============================ RDB saving/loading =========================== */
ed9b544e 3400
/* Write the one-byte 'type' to 'fp'.
 * Returns 0 on success, -1 if the byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3405
/* Write the time 't' to 'fp' as a 4-byte int32_t in host byte order (the
 * value is narrowed to 32 bits before being written).
 * Returns 0 on success, -1 if the write failed. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t narrowed = (int32_t) t;
    size_t written = fwrite(&narrowed,4,1,fp);

    return (written == 0) ? -1 : 0;
}
3411
e3566d4b 3412/* check rdbLoadLen() comments for more info */
f78fd11b 3413static int rdbSaveLen(FILE *fp, uint32_t len) {
3414 unsigned char buf[2];
3415
3416 if (len < (1<<6)) {
3417 /* Save a 6 bit len */
10c43610 3418 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3419 if (fwrite(buf,1,1,fp) == 0) return -1;
3420 } else if (len < (1<<14)) {
3421 /* Save a 14 bit len */
10c43610 3422 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3423 buf[1] = len&0xFF;
17be1a4a 3424 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3425 } else {
3426 /* Save a 32 bit len */
10c43610 3427 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3428 if (fwrite(buf,1,1,fp) == 0) return -1;
3429 len = htonl(len);
3430 if (fwrite(&len,4,1,fp) == 0) return -1;
3431 }
3432 return 0;
3433}
3434
32a66513 3435/* Encode 'value' as an integer if possible (if integer will fit the
3436 * supported range). If the function sucessful encoded the integer
3437 * then the (up to 5 bytes) encoded representation is written in the
3438 * string pointed by 'enc' and the length is returned. Otherwise
3439 * 0 is returned. */
3440static int rdbEncodeInteger(long long value, unsigned char *enc) {
e3566d4b 3441 /* Finally check if it fits in our ranges */
3442 if (value >= -(1<<7) && value <= (1<<7)-1) {
3443 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3444 enc[1] = value&0xFF;
3445 return 2;
3446 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3447 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3448 enc[1] = value&0xFF;
3449 enc[2] = (value>>8)&0xFF;
3450 return 3;
3451 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3452 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3453 enc[1] = value&0xFF;
3454 enc[2] = (value>>8)&0xFF;
3455 enc[3] = (value>>16)&0xFF;
3456 enc[4] = (value>>24)&0xFF;
3457 return 5;
3458 } else {
3459 return 0;
3460 }
3461}
3462
/* Strings such as "2391" or "-100", with no spaces and with a value that
 * fits in a signed 8, 16 or 32 bit integer, can be stored as integers to
 * save space.
 *
 * 's' points to 'len' bytes and does NOT need to be nul terminated (the
 * caller in this file also passes zipmap fields, which are only delimited
 * by their length). On success the encoding is written to 'enc' and its
 * length is returned; 0 is returned when the string can't be encoded. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, buf[32], copy[32];

    /* An empty string is not a number, and a string that does not fit in
     * 'copy' is longer than any text ll2string() could produce into the
     * 32-byte 'buf' below, so the round-trip check could never succeed. */
    if (len == 0 || len >= sizeof(copy)) return 0;

    /* Work on a nul terminated private copy: strtoll() scans until it
     * finds a non-digit and would otherwise read past the end of 's'. */
    memcpy(copy,s,len);
    copy[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(copy, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3481
b1befe6a 3482static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3483 size_t comprlen, outlen;
774e3047 3484 unsigned char byte;
3485 void *out;
3486
3487 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3488 if (len <= 4) return 0;
3489 outlen = len-4;
3a2694c4 3490 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3491 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3492 if (comprlen == 0) {
88e85998 3493 zfree(out);
774e3047 3494 return 0;
3495 }
3496 /* Data compressed! Let's save it on disk */
3497 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3498 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3499 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3500 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3501 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3502 zfree(out);
774e3047 3503 return comprlen;
3504
3505writeerr:
88e85998 3506 zfree(out);
774e3047 3507 return -1;
3508}
3509
e3566d4b 3510/* Save a string objet as [len][data] on disk. If the object is a string
3511 * representation of an integer value we try to safe it in a special form */
b1befe6a 3512static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3513 int enclen;
10c43610 3514
774e3047 3515 /* Try integer encoding */
e3566d4b 3516 if (len <= 11) {
3517 unsigned char buf[5];
b1befe6a 3518 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3519 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3520 return 0;
3521 }
3522 }
774e3047 3523
3524 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3525 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3526 if (server.rdbcompression && len > 20) {
774e3047 3527 int retval;
3528
b1befe6a 3529 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3530 if (retval == -1) return -1;
3531 if (retval > 0) return 0;
3532 /* retval == 0 means data can't be compressed, save the old way */
3533 }
3534
3535 /* Store verbatim */
10c43610 3536 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3537 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3538 return 0;
3539}
3540
942a3961 3541/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3542static int rdbSaveStringObject(FILE *fp, robj *obj) {
3543 int retval;
942a3961 3544
32a66513 3545 /* Avoid to decode the object, then encode it again, if the
3546 * object is alrady integer encoded. */
3547 if (obj->encoding == REDIS_ENCODING_INT) {
3548 long val = (long) obj->ptr;
3549 unsigned char buf[5];
3550 int enclen;
3551
3552 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3553 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3554 return 0;
3555 }
3556 /* otherwise... fall throught and continue with the usual
3557 * code path. */
3558 }
3559
f2d9f50f 3560 /* Avoid incr/decr ref count business when possible.
3561 * This plays well with copy-on-write given that we are probably
3562 * in a child process (BGSAVE). Also this makes sure key objects
3563 * of swapped objects are not incRefCount-ed (an assert does not allow
3564 * this in order to avoid bugs) */
3565 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3566 obj = getDecodedObject(obj);
b1befe6a 3567 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3568 decrRefCount(obj);
3569 } else {
b1befe6a 3570 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3571 }
9d65a1bb 3572 return retval;
942a3961 3573}
3574
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so one byte less is available. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3617
06233c45 3618/* Save a Redis object. */
3619static int rdbSaveObject(FILE *fp, robj *o) {
3620 if (o->type == REDIS_STRING) {
3621 /* Save a string value */
3622 if (rdbSaveStringObject(fp,o) == -1) return -1;
3623 } else if (o->type == REDIS_LIST) {
3624 /* Save a list value */
3625 list *list = o->ptr;
c7df85a4 3626 listIter li;
06233c45 3627 listNode *ln;
3628
06233c45 3629 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3630 listRewind(list,&li);
3631 while((ln = listNext(&li))) {
06233c45 3632 robj *eleobj = listNodeValue(ln);
3633
3634 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3635 }
3636 } else if (o->type == REDIS_SET) {
3637 /* Save a set value */
3638 dict *set = o->ptr;
3639 dictIterator *di = dictGetIterator(set);
3640 dictEntry *de;
3641
3642 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3643 while((de = dictNext(di)) != NULL) {
3644 robj *eleobj = dictGetEntryKey(de);
3645
3646 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3647 }
3648 dictReleaseIterator(di);
3649 } else if (o->type == REDIS_ZSET) {
3650 /* Save a set value */
3651 zset *zs = o->ptr;
3652 dictIterator *di = dictGetIterator(zs->dict);
3653 dictEntry *de;
3654
3655 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3656 while((de = dictNext(di)) != NULL) {
3657 robj *eleobj = dictGetEntryKey(de);
3658 double *score = dictGetEntryVal(de);
3659
3660 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3661 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3662 }
3663 dictReleaseIterator(di);
b1befe6a 3664 } else if (o->type == REDIS_HASH) {
3665 /* Save a hash value */
3666 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3667 unsigned char *p = zipmapRewind(o->ptr);
3668 unsigned int count = zipmapLen(o->ptr);
3669 unsigned char *key, *val;
3670 unsigned int klen, vlen;
3671
3672 if (rdbSaveLen(fp,count) == -1) return -1;
3673 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3674 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3675 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3676 }
3677 } else {
3678 dictIterator *di = dictGetIterator(o->ptr);
3679 dictEntry *de;
3680
3681 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3682 while((de = dictNext(di)) != NULL) {
3683 robj *key = dictGetEntryKey(de);
3684 robj *val = dictGetEntryVal(de);
3685
3686 if (rdbSaveStringObject(fp,key) == -1) return -1;
3687 if (rdbSaveStringObject(fp,val) == -1) return -1;
3688 }
3689 dictReleaseIterator(di);
3690 }
06233c45 3691 } else {
f83c6cb5 3692 redisPanic("Unknown object type");
06233c45 3693 }
3694 return 0;
3695}
3696
3697/* Return the length the object will have on disk if saved with
3698 * the rdbSaveObject() function. Currently we use a trick to get
3699 * this length with very little changes to the code. In the future
3700 * we could switch to a faster solution. */
b9bc0eef 3701static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3702 if (fp == NULL) fp = server.devnull;
06233c45 3703 rewind(fp);
3704 assert(rdbSaveObject(fp,o) != 1);
3705 return ftello(fp);
3706}
3707
06224fec 3708/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3709static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3710 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3711
06224fec 3712 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3713}
3714
ed9b544e 3715/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3716static int rdbSave(char *filename) {
ed9b544e 3717 dictIterator *di = NULL;
3718 dictEntry *de;
ed9b544e 3719 FILE *fp;
3720 char tmpfile[256];
3721 int j;
bb32ede5 3722 time_t now = time(NULL);
ed9b544e 3723
2316bb3b 3724 /* Wait for I/O therads to terminate, just in case this is a
3725 * foreground-saving, to avoid seeking the swap file descriptor at the
3726 * same time. */
3727 if (server.vm_enabled)
3728 waitEmptyIOJobsQueue();
3729
a3b21203 3730 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3731 fp = fopen(tmpfile,"w");
3732 if (!fp) {
3733 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3734 return REDIS_ERR;
3735 }
f78fd11b 3736 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3737 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3738 redisDb *db = server.db+j;
3739 dict *d = db->dict;
3305306f 3740 if (dictSize(d) == 0) continue;
ed9b544e 3741 di = dictGetIterator(d);
3742 if (!di) {
3743 fclose(fp);
3744 return REDIS_ERR;
3745 }
3746
3747 /* Write the SELECT DB opcode */
f78fd11b 3748 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3749 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3750
3751 /* Iterate this DB writing every entry */
3752 while((de = dictNext(di)) != NULL) {
3753 robj *key = dictGetEntryKey(de);
3754 robj *o = dictGetEntryVal(de);
bb32ede5 3755 time_t expiretime = getExpire(db,key);
3756
3757 /* Save the expire time */
3758 if (expiretime != -1) {
3759 /* If this key is already expired skip it */
3760 if (expiretime < now) continue;
3761 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3762 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3763 }
7e69548d 3764 /* Save the key and associated value. This requires special
3765 * handling if the value is swapped out. */
996cb5f7 3766 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3767 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3768 /* Save type, key, value */
3769 if (rdbSaveType(fp,o->type) == -1) goto werr;
3770 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3771 if (rdbSaveObject(fp,o) == -1) goto werr;
3772 } else {
996cb5f7 3773 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3774 robj *po;
7e69548d 3775 /* Get a preview of the object in memory */
3776 po = vmPreviewObject(key);
7e69548d 3777 /* Save type, key, value */
3778 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3779 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3780 if (rdbSaveObject(fp,po) == -1) goto werr;
3781 /* Remove the loaded object from memory */
3782 decrRefCount(po);
7e69548d 3783 }
ed9b544e 3784 }
3785 dictReleaseIterator(di);
3786 }
3787 /* EOF opcode */
f78fd11b 3788 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3789
3790 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3791 fflush(fp);
3792 fsync(fileno(fp));
3793 fclose(fp);
e0a62c7f 3794
ed9b544e 3795 /* Use RENAME to make sure the DB file is changed atomically only
3796 * if the generate DB file is ok. */
3797 if (rename(tmpfile,filename) == -1) {
325d1eb4 3798 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3799 unlink(tmpfile);
3800 return REDIS_ERR;
3801 }
3802 redisLog(REDIS_NOTICE,"DB saved on disk");
3803 server.dirty = 0;
3804 server.lastsave = time(NULL);
3805 return REDIS_OK;
3806
3807werr:
3808 fclose(fp);
3809 unlink(tmpfile);
3810 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3811 if (di) dictReleaseIterator(di);
3812 return REDIS_ERR;
3813}
3814
f78fd11b 3815static int rdbSaveBackground(char *filename) {
ed9b544e 3816 pid_t childpid;
3817
9d65a1bb 3818 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3819 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3820 if ((childpid = fork()) == 0) {
3821 /* Child */
054e426d 3822 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3823 close(server.fd);
f78fd11b 3824 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3825 _exit(0);
ed9b544e 3826 } else {
478c2c6f 3827 _exit(1);
ed9b544e 3828 }
3829 } else {
3830 /* Parent */
5a7c647e 3831 if (childpid == -1) {
3832 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3833 strerror(errno));
3834 return REDIS_ERR;
3835 }
ed9b544e 3836 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3837 server.bgsavechildpid = childpid;
884d4b39 3838 updateDictResizePolicy();
ed9b544e 3839 return REDIS_OK;
3840 }
3841 return REDIS_OK; /* unreached */
3842}
3843
/* Remove the temporary RDB file named after the process id 'childpid'
 * ("temp-<pid>.rdb" in the current directory). The result of unlink() is
 * ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3850
/* Read a one-byte type from 'fp'.
 * Returns the byte (0-255), or -1 if it could not be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3856
/* Read a time from 'fp', stored as a 4-byte int32_t in host byte order.
 * Returns the time, or -1 if the four bytes could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stored;

    if (fread(&stored,4,1,fp) == 0) return -1;
    return (time_t) stored;
}
3862
e3566d4b 3863/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3864 * of this file for a description of how this are stored on disk.
3865 *
3866 * isencoded is set to 1 if the readed length is not actually a length but
3867 * an "encoding type", check the above comments for more info */
c78a8ccc 3868static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3869 unsigned char buf[2];
3870 uint32_t len;
c78a8ccc 3871 int type;
f78fd11b 3872
e3566d4b 3873 if (isencoded) *isencoded = 0;
c78a8ccc 3874 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3875 type = (buf[0]&0xC0)>>6;
3876 if (type == REDIS_RDB_6BITLEN) {
3877 /* Read a 6 bit len */
3878 return buf[0]&0x3F;
3879 } else if (type == REDIS_RDB_ENCVAL) {
3880 /* Read a 6 bit len encoding type */
3881 if (isencoded) *isencoded = 1;
3882 return buf[0]&0x3F;
3883 } else if (type == REDIS_RDB_14BITLEN) {
3884 /* Read a 14 bit len */
3885 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3886 return ((buf[0]&0x3F)<<8)|buf[1];
3887 } else {
3888 /* Read a 32 bit len */
f78fd11b 3889 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3890 return ntohl(len);
f78fd11b 3891 }
f78fd11b 3892}
3893
ad30aa60 3894/* Load an integer-encoded object from file 'fp', with the specified
3895 * encoding type 'enctype'. If encode is true the function may return
3896 * an integer-encoded object as reply, otherwise the returned object
3897 * will always be encoded as a raw string. */
3898static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
e3566d4b 3899 unsigned char enc[4];
3900 long long val;
3901
3902 if (enctype == REDIS_RDB_ENC_INT8) {
3903 if (fread(enc,1,1,fp) == 0) return NULL;
3904 val = (signed char)enc[0];
3905 } else if (enctype == REDIS_RDB_ENC_INT16) {
3906 uint16_t v;
3907 if (fread(enc,2,1,fp) == 0) return NULL;
3908 v = enc[0]|(enc[1]<<8);
3909 val = (int16_t)v;
3910 } else if (enctype == REDIS_RDB_ENC_INT32) {
3911 uint32_t v;
3912 if (fread(enc,4,1,fp) == 0) return NULL;
3913 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3914 val = (int32_t)v;
3915 } else {
3916 val = 0; /* anti-warning */
f83c6cb5 3917 redisPanic("Unknown RDB integer encoding type");
e3566d4b 3918 }
ad30aa60 3919 if (encode)
3920 return createStringObjectFromLongLong(val);
3921 else
3922 return createObject(REDIS_STRING,sdsfromlonglong(val));
e3566d4b 3923}
3924
c78a8ccc 3925static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3926 unsigned int len, clen;
3927 unsigned char *c = NULL;
3928 sds val = NULL;
3929
c78a8ccc 3930 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3931 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3932 if ((c = zmalloc(clen)) == NULL) goto err;
3933 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3934 if (fread(c,clen,1,fp) == 0) goto err;
3935 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3936 zfree(c);
88e85998 3937 return createObject(REDIS_STRING,val);
3938err:
3939 zfree(c);
3940 sdsfree(val);
3941 return NULL;
3942}
3943
ad30aa60 3944static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
e3566d4b 3945 int isencoded;
3946 uint32_t len;
f78fd11b 3947 sds val;
3948
c78a8ccc 3949 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3950 if (isencoded) {
3951 switch(len) {
3952 case REDIS_RDB_ENC_INT8:
3953 case REDIS_RDB_ENC_INT16:
3954 case REDIS_RDB_ENC_INT32:
ad30aa60 3955 return rdbLoadIntegerObject(fp,len,encode);
88e85998 3956 case REDIS_RDB_ENC_LZF:
bdcb92f2 3957 return rdbLoadLzfStringObject(fp);
e3566d4b 3958 default:
f83c6cb5 3959 redisPanic("Unknown RDB encoding type");
e3566d4b 3960 }
3961 }
3962
f78fd11b 3963 if (len == REDIS_RDB_LENERR) return NULL;
3964 val = sdsnewlen(NULL,len);
3965 if (len && fread(val,len,1,fp) == 0) {
3966 sdsfree(val);
3967 return NULL;
3968 }
bdcb92f2 3969 return createObject(REDIS_STRING,val);
f78fd11b 3970}
3971
ad30aa60 3972static robj *rdbLoadStringObject(FILE *fp) {
3973 return rdbGenericLoadStringObject(fp,0);
3974}
3975
3976static robj *rdbLoadEncodedStringObject(FILE *fp) {
3977 return rdbGenericLoadStringObject(fp,1);
3978}
3979
a7866db6 3980/* For information about double serialization check rdbSaveDoubleValue() */
3981static int rdbLoadDoubleValue(FILE *fp, double *val) {
3982 char buf[128];
3983 unsigned char len;
3984
3985 if (fread(&len,1,1,fp) == 0) return -1;
3986 switch(len) {
3987 case 255: *val = R_NegInf; return 0;
3988 case 254: *val = R_PosInf; return 0;
3989 case 253: *val = R_Nan; return 0;
3990 default:
3991 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3992 buf[len] = '\0';
a7866db6 3993 sscanf(buf, "%lg", val);
3994 return 0;
3995 }
3996}
3997
c78a8ccc 3998/* Load a Redis object of the specified type from the specified file.
3999 * On success a newly allocated object is returned, otherwise NULL. */
4000static robj *rdbLoadObject(int type, FILE *fp) {
4001 robj *o;
4002
bcd11906 4003 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 4004 if (type == REDIS_STRING) {
4005 /* Read string value */
ad30aa60 4006 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4007 o = tryObjectEncoding(o);
c78a8ccc 4008 } else if (type == REDIS_LIST || type == REDIS_SET) {
4009 /* Read list/set value */
4010 uint32_t listlen;
4011
4012 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4013 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 4014 /* It's faster to expand the dict to the right size asap in order
4015 * to avoid rehashing */
4016 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
4017 dictExpand(o->ptr,listlen);
c78a8ccc 4018 /* Load every single element of the list/set */
4019 while(listlen--) {
4020 robj *ele;
4021
ad30aa60 4022 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4023 ele = tryObjectEncoding(ele);
c78a8ccc 4024 if (type == REDIS_LIST) {
4025 listAddNodeTail((list*)o->ptr,ele);
4026 } else {
4027 dictAdd((dict*)o->ptr,ele,NULL);
4028 }
4029 }
4030 } else if (type == REDIS_ZSET) {
4031 /* Read list/set value */
ada386b2 4032 size_t zsetlen;
c78a8ccc 4033 zset *zs;
4034
4035 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4036 o = createZsetObject();
4037 zs = o->ptr;
4038 /* Load every single element of the list/set */
4039 while(zsetlen--) {
4040 robj *ele;
4041 double *score = zmalloc(sizeof(double));
4042
ad30aa60 4043 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4044 ele = tryObjectEncoding(ele);
c78a8ccc 4045 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4046 dictAdd(zs->dict,ele,score);
4047 zslInsert(zs->zsl,*score,ele);
4048 incrRefCount(ele); /* added to skiplist */
4049 }
ada386b2 4050 } else if (type == REDIS_HASH) {
4051 size_t hashlen;
4052
4053 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4054 o = createHashObject();
4055 /* Too many entries? Use an hash table. */
4056 if (hashlen > server.hash_max_zipmap_entries)
4057 convertToRealHash(o);
4058 /* Load every key/value, then set it into the zipmap or hash
4059 * table, as needed. */
4060 while(hashlen--) {
4061 robj *key, *val;
4062
4063 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4064 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4065 /* If we are using a zipmap and there are too big values
4066 * the object is converted to real hash table encoding. */
4067 if (o->encoding != REDIS_ENCODING_HT &&
4068 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4069 sdslen(val->ptr) > server.hash_max_zipmap_value))
4070 {
4071 convertToRealHash(o);
4072 }
4073
4074 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4075 unsigned char *zm = o->ptr;
4076
4077 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4078 val->ptr,sdslen(val->ptr),NULL);
4079 o->ptr = zm;
4080 decrRefCount(key);
4081 decrRefCount(val);
4082 } else {
05df7621 4083 key = tryObjectEncoding(key);
4084 val = tryObjectEncoding(val);
ada386b2 4085 dictAdd((dict*)o->ptr,key,val);
ada386b2 4086 }
4087 }
c78a8ccc 4088 } else {
f83c6cb5 4089 redisPanic("Unknown object type");
c78a8ccc 4090 }
4091 return o;
4092}
4093
f78fd11b 4094static int rdbLoad(char *filename) {
ed9b544e 4095 FILE *fp;
f78fd11b 4096 uint32_t dbid;
bb32ede5 4097 int type, retval, rdbver;
585af7e2 4098 int swap_all_values = 0;
3305306f 4099 dict *d = server.db[0].dict;
bb32ede5 4100 redisDb *db = server.db+0;
f78fd11b 4101 char buf[1024];
242a64f3 4102 time_t expiretime, now = time(NULL);
b492cf00 4103 long long loadedkeys = 0;
bb32ede5 4104
ed9b544e 4105 fp = fopen(filename,"r");
4106 if (!fp) return REDIS_ERR;
4107 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 4108 buf[9] = '\0';
4109 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 4110 fclose(fp);
4111 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4112 return REDIS_ERR;
4113 }
f78fd11b 4114 rdbver = atoi(buf+5);
c78a8ccc 4115 if (rdbver != 1) {
f78fd11b 4116 fclose(fp);
4117 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4118 return REDIS_ERR;
4119 }
ed9b544e 4120 while(1) {
585af7e2 4121 robj *key, *val;
ed9b544e 4122
585af7e2 4123 expiretime = -1;
ed9b544e 4124 /* Read type. */
f78fd11b 4125 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 4126 if (type == REDIS_EXPIRETIME) {
4127 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4128 /* We read the time so we need to read the object type again */
4129 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4130 }
ed9b544e 4131 if (type == REDIS_EOF) break;
4132 /* Handle SELECT DB opcode as a special case */
4133 if (type == REDIS_SELECTDB) {
c78a8ccc 4134 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 4135 goto eoferr;
ed9b544e 4136 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 4137 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 4138 exit(1);
4139 }
bb32ede5 4140 db = server.db+dbid;
4141 d = db->dict;
ed9b544e 4142 continue;
4143 }
4144 /* Read key */
585af7e2 4145 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
c78a8ccc 4146 /* Read value */
585af7e2 4147 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
89e689c5 4148 /* Check if the key already expired */
4149 if (expiretime != -1 && expiretime < now) {
4150 decrRefCount(key);
4151 decrRefCount(val);
4152 continue;
4153 }
ed9b544e 4154 /* Add the new object in the hash table */
585af7e2 4155 retval = dictAdd(d,key,val);
ed9b544e 4156 if (retval == DICT_ERR) {
585af7e2 4157 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
ed9b544e 4158 exit(1);
4159 }
242a64f3 4160 loadedkeys++;
bb32ede5 4161 /* Set the expire time if needed */
89e689c5 4162 if (expiretime != -1) setExpire(db,key,expiretime);
242a64f3 4163
b492cf00 4164 /* Handle swapping while loading big datasets when VM is on */
242a64f3 4165
4166 /* If we detecter we are hopeless about fitting something in memory
4167 * we just swap every new key on disk. Directly...
4168 * Note that's important to check for this condition before resorting
4169 * to random sampling, otherwise we may try to swap already
4170 * swapped keys. */
585af7e2 4171 if (swap_all_values) {
4172 dictEntry *de = dictFind(d,key);
242a64f3 4173
4174 /* de may be NULL since the key already expired */
4175 if (de) {
585af7e2 4176 key = dictGetEntryKey(de);
4177 val = dictGetEntryVal(de);
242a64f3 4178
585af7e2 4179 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
242a64f3 4180 dictGetEntryVal(de) = NULL;
4181 }
4182 }
4183 continue;
4184 }
4185
4186 /* If we have still some hope of having some value fitting memory
4187 * then we try random sampling. */
585af7e2 4188 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
b492cf00 4189 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 4190 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 4191 }
242a64f3 4192 if (zmalloc_used_memory() > server.vm_max_memory)
585af7e2 4193 swap_all_values = 1; /* We are already using too much mem */
b492cf00 4194 }
ed9b544e 4195 }
4196 fclose(fp);
4197 return REDIS_OK;
4198
4199eoferr: /* unexpected end of file is handled here with a fatal exit */
f80dff62 4200 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 4201 exit(1);
4202 return REDIS_ERR; /* Just to avoid warning */
4203}
4204
b58ba105 4205/*================================== Shutdown =============================== */
fab43727 4206static int prepareForShutdown() {
b58ba105
AM
4207 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4208 /* Kill the saving child if there is a background saving in progress.
4209 We want to avoid race conditions, for instance our saving child may
4210 overwrite the synchronous saving did by SHUTDOWN. */
4211 if (server.bgsavechildpid != -1) {
4212 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4213 kill(server.bgsavechildpid,SIGKILL);
4214 rdbRemoveTempFile(server.bgsavechildpid);
4215 }
4216 if (server.appendonly) {
4217 /* Append only file: fsync() the AOF and exit */
4218 fsync(server.appendfd);
4219 if (server.vm_enabled) unlink(server.vm_swap_file);
b58ba105
AM
4220 } else {
4221 /* Snapshotting. Perform a SYNC SAVE and exit */
4222 if (rdbSave(server.dbfilename) == REDIS_OK) {
4223 if (server.daemonize)
4224 unlink(server.pidfile);
4225 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
b58ba105
AM
4226 } else {
4227 /* Ooops.. error saving! The best we can do is to continue
4228 * operating. Note that if there was a background saving process,
4229 * in the next cron() Redis will be notified that the background
4230 * saving aborted, handling special stuff like slaves pending for
4231 * synchronization... */
4232 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
fab43727 4233 return REDIS_ERR;
b58ba105
AM
4234 }
4235 }
8513a757 4236 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
fab43727 4237 return REDIS_OK;
b58ba105
AM
4238}
4239
ed9b544e 4240/*================================== Commands =============================== */
4241
abcb223e 4242static void authCommand(redisClient *c) {
2e77c2ee 4243 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
4244 c->authenticated = 1;
4245 addReply(c,shared.ok);
4246 } else {
4247 c->authenticated = 0;
fa4c0aba 4248 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
4249 }
4250}
4251
ed9b544e 4252static void pingCommand(redisClient *c) {
4253 addReply(c,shared.pong);
4254}
4255
4256static void echoCommand(redisClient *c) {
dd88747b 4257 addReplyBulk(c,c->argv[1]);
ed9b544e 4258}
4259
4260/*=================================== Strings =============================== */
4261
526d00a5 4262static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
ed9b544e 4263 int retval;
10ce1276 4264 long seconds = 0; /* initialized to avoid an harmness warning */
ed9b544e 4265
526d00a5 4266 if (expire) {
4267 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4268 return;
4269 if (seconds <= 0) {
4270 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4271 return;
4272 }
4273 }
4274
37ab76c9 4275 touchWatchedKey(c->db,key);
526d00a5 4276 if (nx) deleteIfVolatile(c->db,key);
4277 retval = dictAdd(c->db->dict,key,val);
ed9b544e 4278 if (retval == DICT_ERR) {
4279 if (!nx) {
1b03836c 4280 /* If the key is about a swapped value, we want a new key object
4281 * to overwrite the old. So we delete the old key in the database.
4282 * This will also make sure that swap pages about the old object
4283 * will be marked as free. */
526d00a5 4284 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4285 incrRefCount(key);
4286 dictReplace(c->db->dict,key,val);
4287 incrRefCount(val);
ed9b544e 4288 } else {
c937aa89 4289 addReply(c,shared.czero);
ed9b544e 4290 return;
4291 }
4292 } else {
526d00a5 4293 incrRefCount(key);
4294 incrRefCount(val);
ed9b544e 4295 }
4296 server.dirty++;
526d00a5 4297 removeExpire(c->db,key);
4298 if (expire) setExpire(c->db,key,time(NULL)+seconds);
c937aa89 4299 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 4300}
4301
4302static void setCommand(redisClient *c) {
526d00a5 4303 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
ed9b544e 4304}
4305
4306static void setnxCommand(redisClient *c) {
526d00a5 4307 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4308}
4309
4310static void setexCommand(redisClient *c) {
4311 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
ed9b544e 4312}
4313
322fc7d8 4314static int getGenericCommand(redisClient *c) {
dd88747b 4315 robj *o;
e0a62c7f 4316
dd88747b 4317 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 4318 return REDIS_OK;
dd88747b 4319
4320 if (o->type != REDIS_STRING) {
4321 addReply(c,shared.wrongtypeerr);
4322 return REDIS_ERR;
ed9b544e 4323 } else {
dd88747b 4324 addReplyBulk(c,o);
4325 return REDIS_OK;
ed9b544e 4326 }
4327}
4328
322fc7d8 4329static void getCommand(redisClient *c) {
4330 getGenericCommand(c);
4331}
4332
f6b141c5 4333static void getsetCommand(redisClient *c) {
322fc7d8 4334 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 4335 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4336 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4337 } else {
4338 incrRefCount(c->argv[1]);
4339 }
4340 incrRefCount(c->argv[2]);
4341 server.dirty++;
4342 removeExpire(c->db,c->argv[1]);
4343}
4344
70003d28 4345static void mgetCommand(redisClient *c) {
70003d28 4346 int j;
e0a62c7f 4347
c937aa89 4348 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 4349 for (j = 1; j < c->argc; j++) {
3305306f 4350 robj *o = lookupKeyRead(c->db,c->argv[j]);
4351 if (o == NULL) {
c937aa89 4352 addReply(c,shared.nullbulk);
70003d28 4353 } else {
70003d28 4354 if (o->type != REDIS_STRING) {
c937aa89 4355 addReply(c,shared.nullbulk);
70003d28 4356 } else {
dd88747b 4357 addReplyBulk(c,o);
70003d28 4358 }
4359 }
4360 }
4361}
4362
6c446631 4363static void msetGenericCommand(redisClient *c, int nx) {
906573e7 4364 int j, busykeys = 0;
6c446631 4365
4366 if ((c->argc % 2) == 0) {
454d4e43 4367 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 4368 return;
4369 }
4370 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4371 * set nothing at all if at least one already key exists. */
4372 if (nx) {
4373 for (j = 1; j < c->argc; j += 2) {
906573e7 4374 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4375 busykeys++;
6c446631 4376 }
4377 }
4378 }
906573e7 4379 if (busykeys) {
4380 addReply(c, shared.czero);
4381 return;
4382 }
6c446631 4383
4384 for (j = 1; j < c->argc; j += 2) {
4385 int retval;
4386
05df7621 4387 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
6c446631 4388 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4389 if (retval == DICT_ERR) {
4390 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4391 incrRefCount(c->argv[j+1]);
4392 } else {
4393 incrRefCount(c->argv[j]);
4394 incrRefCount(c->argv[j+1]);
4395 }
4396 removeExpire(c->db,c->argv[j]);
4397 }
4398 server.dirty += (c->argc-1)/2;
4399 addReply(c, nx ? shared.cone : shared.ok);
4400}
4401
4402static void msetCommand(redisClient *c) {
4403 msetGenericCommand(c,0);
4404}
4405
4406static void msetnxCommand(redisClient *c) {
4407 msetGenericCommand(c,1);
4408}
4409
d68ed120 4410static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4411 long long value;
4412 int retval;
4413 robj *o;
e0a62c7f 4414
3305306f 4415 o = lookupKeyWrite(c->db,c->argv[1]);
6485f293
PN
4416 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4417 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
ed9b544e 4418
4419 value += incr;
d6f4c262 4420 o = createStringObjectFromLongLong(value);
3305306f 4421 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 4422 if (retval == DICT_ERR) {
3305306f 4423 dictReplace(c->db->dict,c->argv[1],o);
4424 removeExpire(c->db,c->argv[1]);
ed9b544e 4425 } else {
4426 incrRefCount(c->argv[1]);
4427 }
4428 server.dirty++;
c937aa89 4429 addReply(c,shared.colon);
ed9b544e 4430 addReply(c,o);
4431 addReply(c,shared.crlf);
4432}
4433
4434static void incrCommand(redisClient *c) {
a4d1ba9a 4435 incrDecrCommand(c,1);
ed9b544e 4436}
4437
4438static void decrCommand(redisClient *c) {
a4d1ba9a 4439 incrDecrCommand(c,-1);
ed9b544e 4440}
4441
4442static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4443 long long incr;
4444
bd79a6bd 4445 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4446 incrDecrCommand(c,incr);
ed9b544e 4447}
4448
4449static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4450 long long incr;
4451
bd79a6bd 4452 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4453 incrDecrCommand(c,-incr);
ed9b544e 4454}
4455
4b00bebd 4456static void appendCommand(redisClient *c) {
4457 int retval;
4458 size_t totlen;
4459 robj *o;
4460
4461 o = lookupKeyWrite(c->db,c->argv[1]);
4462 if (o == NULL) {
4463 /* Create the key */
4464 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4465 incrRefCount(c->argv[1]);
4466 incrRefCount(c->argv[2]);
4467 totlen = stringObjectLen(c->argv[2]);
4468 } else {
4469 dictEntry *de;
e0a62c7f 4470
4b00bebd 4471 de = dictFind(c->db->dict,c->argv[1]);
4472 assert(de != NULL);
4473
4474 o = dictGetEntryVal(de);
4475 if (o->type != REDIS_STRING) {
4476 addReply(c,shared.wrongtypeerr);
4477 return;
4478 }
4479 /* If the object is specially encoded or shared we have to make
4480 * a copy */
4481 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4482 robj *decoded = getDecodedObject(o);
4483
4484 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4485 decrRefCount(decoded);
4486 dictReplace(c->db->dict,c->argv[1],o);
4487 }
4488 /* APPEND! */
4489 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4490 o->ptr = sdscatlen(o->ptr,
4491 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4492 } else {
4493 o->ptr = sdscatprintf(o->ptr, "%ld",
4494 (unsigned long) c->argv[2]->ptr);
4495 }
4496 totlen = sdslen(o->ptr);
4497 }
4498 server.dirty++;
4499 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4500}
4501
39191553 4502static void substrCommand(redisClient *c) {
4503 robj *o;
4504 long start = atoi(c->argv[2]->ptr);
4505 long end = atoi(c->argv[3]->ptr);
dd88747b 4506 size_t rangelen, strlen;
4507 sds range;
39191553 4508
dd88747b 4509 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4510 checkType(c,o,REDIS_STRING)) return;
39191553 4511
dd88747b 4512 o = getDecodedObject(o);
4513 strlen = sdslen(o->ptr);
8fe7fad7 4514
dd88747b 4515 /* convert negative indexes */
4516 if (start < 0) start = strlen+start;
4517 if (end < 0) end = strlen+end;
4518 if (start < 0) start = 0;
4519 if (end < 0) end = 0;
39191553 4520
dd88747b 4521 /* indexes sanity checks */
4522 if (start > end || (size_t)start >= strlen) {
4523 /* Out of range start or start > end result in null reply */
4524 addReply(c,shared.nullbulk);
4525 decrRefCount(o);
4526 return;
39191553 4527 }
dd88747b 4528 if ((size_t)end >= strlen) end = strlen-1;
4529 rangelen = (end-start)+1;
4530
4531 /* Return the result */
4532 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4533 range = sdsnewlen((char*)o->ptr+start,rangelen);
4534 addReplySds(c,range);
4535 addReply(c,shared.crlf);
4536 decrRefCount(o);
39191553 4537}
4538
ed9b544e 4539/* ========================= Type agnostic commands ========================= */
4540
4541static void delCommand(redisClient *c) {
5109cdff 4542 int deleted = 0, j;
4543
4544 for (j = 1; j < c->argc; j++) {
4545 if (deleteKey(c->db,c->argv[j])) {
37ab76c9 4546 touchWatchedKey(c->db,c->argv[j]);
5109cdff 4547 server.dirty++;
4548 deleted++;
4549 }
4550 }
482b672d 4551 addReplyLongLong(c,deleted);
ed9b544e 4552}
4553
4554static void existsCommand(redisClient *c) {
f4f06efc
PN
4555 expireIfNeeded(c->db,c->argv[1]);
4556 if (dictFind(c->db->dict,c->argv[1])) {
4557 addReply(c, shared.cone);
4558 } else {
4559 addReply(c, shared.czero);
4560 }
ed9b544e 4561}
4562
4563static void selectCommand(redisClient *c) {
4564 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4565
ed9b544e 4566 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4567 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4568 } else {
4569 addReply(c,shared.ok);
4570 }
4571}
4572
4573static void randomkeyCommand(redisClient *c) {
4574 dictEntry *de;
dc4be23e 4575 robj *key;
e0a62c7f 4576
3305306f 4577 while(1) {
4578 de = dictGetRandomKey(c->db->dict);
ce7bef07 4579 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4580 }
2b619329 4581
ed9b544e 4582 if (de == NULL) {
dc4be23e 4583 addReply(c,shared.nullbulk);
4584 return;
4585 }
4586
4587 key = dictGetEntryKey(de);
4588 if (server.vm_enabled) {
4589 key = dupStringObject(key);
4590 addReplyBulk(c,key);
4591 decrRefCount(key);
ed9b544e 4592 } else {
dc4be23e 4593 addReplyBulk(c,key);
ed9b544e 4594 }
4595}
4596
4597static void keysCommand(redisClient *c) {
4598 dictIterator *di;
4599 dictEntry *de;
4600 sds pattern = c->argv[1]->ptr;
4601 int plen = sdslen(pattern);
a3f9eec2 4602 unsigned long numkeys = 0;
ed9b544e 4603 robj *lenobj = createObject(REDIS_STRING,NULL);
4604
3305306f 4605 di = dictGetIterator(c->db->dict);
ed9b544e 4606 addReply(c,lenobj);
4607 decrRefCount(lenobj);
4608 while((de = dictNext(di)) != NULL) {
4609 robj *keyobj = dictGetEntryKey(de);
3305306f 4610
ed9b544e 4611 sds key = keyobj->ptr;
4612 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4613 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4614 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4615 addReplyBulk(c,keyobj);
3305306f 4616 numkeys++;
3305306f 4617 }
ed9b544e 4618 }
4619 }
4620 dictReleaseIterator(di);
a3f9eec2 4621 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4622}
4623
4624static void dbsizeCommand(redisClient *c) {
4625 addReplySds(c,
3305306f 4626 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4627}
4628
4629static void lastsaveCommand(redisClient *c) {
4630 addReplySds(c,
c937aa89 4631 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4632}
4633
4634static void typeCommand(redisClient *c) {
3305306f 4635 robj *o;
ed9b544e 4636 char *type;
3305306f 4637
4638 o = lookupKeyRead(c->db,c->argv[1]);
4639 if (o == NULL) {
c937aa89 4640 type = "+none";
ed9b544e 4641 } else {
ed9b544e 4642 switch(o->type) {
c937aa89 4643 case REDIS_STRING: type = "+string"; break;
4644 case REDIS_LIST: type = "+list"; break;
4645 case REDIS_SET: type = "+set"; break;
412a8bce 4646 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4647 case REDIS_HASH: type = "+hash"; break;
4648 default: type = "+unknown"; break;
ed9b544e 4649 }
4650 }
4651 addReplySds(c,sdsnew(type));
4652 addReply(c,shared.crlf);
4653}
4654
4655static void saveCommand(redisClient *c) {
9d65a1bb 4656 if (server.bgsavechildpid != -1) {
05557f6d 4657 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4658 return;
4659 }
f78fd11b 4660 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4661 addReply(c,shared.ok);
4662 } else {
4663 addReply(c,shared.err);
4664 }
4665}
4666
4667static void bgsaveCommand(redisClient *c) {
9d65a1bb 4668 if (server.bgsavechildpid != -1) {
ed9b544e 4669 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4670 return;
4671 }
f78fd11b 4672 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4673 char *status = "+Background saving started\r\n";
4674 addReplySds(c,sdsnew(status));
ed9b544e 4675 } else {
4676 addReply(c,shared.err);
4677 }
4678}
4679
4680static void shutdownCommand(redisClient *c) {
fab43727 4681 if (prepareForShutdown() == REDIS_OK)
4682 exit(0);
4683 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
ed9b544e 4684}
4685
4686static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4687 robj *o;
4688
4689 /* To use the same key as src and dst is probably an error */
4690 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4691 addReply(c,shared.sameobjecterr);
ed9b544e 4692 return;
4693 }
4694
dd88747b 4695 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4696 return;
dd88747b 4697
ed9b544e 4698 incrRefCount(o);
3305306f 4699 deleteIfVolatile(c->db,c->argv[2]);
4700 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4701 if (nx) {
4702 decrRefCount(o);
c937aa89 4703 addReply(c,shared.czero);
ed9b544e 4704 return;
4705 }
3305306f 4706 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4707 } else {
4708 incrRefCount(c->argv[2]);
4709 }
3305306f 4710 deleteKey(c->db,c->argv[1]);
b167f877 4711 touchWatchedKey(c->db,c->argv[2]);
ed9b544e 4712 server.dirty++;
c937aa89 4713 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4714}
4715
4716static void renameCommand(redisClient *c) {
4717 renameGenericCommand(c,0);
4718}
4719
4720static void renamenxCommand(redisClient *c) {
4721 renameGenericCommand(c,1);
4722}
4723
4724static void moveCommand(redisClient *c) {
3305306f 4725 robj *o;
4726 redisDb *src, *dst;
ed9b544e 4727 int srcid;
4728
4729 /* Obtain source and target DB pointers */
3305306f 4730 src = c->db;
4731 srcid = c->db->id;
ed9b544e 4732 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4733 addReply(c,shared.outofrangeerr);
ed9b544e 4734 return;
4735 }
3305306f 4736 dst = c->db;
4737 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4738
4739 /* If the user is moving using as target the same
4740 * DB as the source DB it is probably an error. */
4741 if (src == dst) {
c937aa89 4742 addReply(c,shared.sameobjecterr);
ed9b544e 4743 return;
4744 }
4745
4746 /* Check if the element exists and get a reference */
3305306f 4747 o = lookupKeyWrite(c->db,c->argv[1]);
4748 if (!o) {
c937aa89 4749 addReply(c,shared.czero);
ed9b544e 4750 return;
4751 }
4752
4753 /* Try to add the element to the target DB */
3305306f 4754 deleteIfVolatile(dst,c->argv[1]);
4755 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4756 addReply(c,shared.czero);
ed9b544e 4757 return;
4758 }
3305306f 4759 incrRefCount(c->argv[1]);
ed9b544e 4760 incrRefCount(o);
4761
4762 /* OK! key moved, free the entry in the source DB */
3305306f 4763 deleteKey(src,c->argv[1]);
ed9b544e 4764 server.dirty++;
c937aa89 4765 addReply(c,shared.cone);
ed9b544e 4766}
4767
4768/* =================================== Lists ================================ */
4769static void pushGenericCommand(redisClient *c, int where) {
4770 robj *lobj;
ed9b544e 4771 list *list;
3305306f 4772
4773 lobj = lookupKeyWrite(c->db,c->argv[1]);
4774 if (lobj == NULL) {
95242ab5 4775 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4776 addReply(c,shared.cone);
95242ab5 4777 return;
4778 }
ed9b544e 4779 lobj = createListObject();
4780 list = lobj->ptr;
4781 if (where == REDIS_HEAD) {
6b47e12e 4782 listAddNodeHead(list,c->argv[2]);
ed9b544e 4783 } else {
6b47e12e 4784 listAddNodeTail(list,c->argv[2]);
ed9b544e 4785 }
3305306f 4786 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4787 incrRefCount(c->argv[1]);
4788 incrRefCount(c->argv[2]);
4789 } else {
ed9b544e 4790 if (lobj->type != REDIS_LIST) {
4791 addReply(c,shared.wrongtypeerr);
4792 return;
4793 }
95242ab5 4794 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4795 addReply(c,shared.cone);
95242ab5 4796 return;
4797 }
ed9b544e 4798 list = lobj->ptr;
4799 if (where == REDIS_HEAD) {
6b47e12e 4800 listAddNodeHead(list,c->argv[2]);
ed9b544e 4801 } else {
6b47e12e 4802 listAddNodeTail(list,c->argv[2]);
ed9b544e 4803 }
4804 incrRefCount(c->argv[2]);
4805 }
4806 server.dirty++;
482b672d 4807 addReplyLongLong(c,listLength(list));
ed9b544e 4808}
4809
4810static void lpushCommand(redisClient *c) {
4811 pushGenericCommand(c,REDIS_HEAD);
4812}
4813
4814static void rpushCommand(redisClient *c) {
4815 pushGenericCommand(c,REDIS_TAIL);
4816}
4817
4818static void llenCommand(redisClient *c) {
3305306f 4819 robj *o;
ed9b544e 4820 list *l;
dd88747b 4821
4822 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4823 checkType(c,o,REDIS_LIST)) return;
e0a62c7f 4824
dd88747b 4825 l = o->ptr;
4826 addReplyUlong(c,listLength(l));
ed9b544e 4827}
4828
4829static void lindexCommand(redisClient *c) {
3305306f 4830 robj *o;
ed9b544e 4831 int index = atoi(c->argv[2]->ptr);
dd88747b 4832 list *list;
4833 listNode *ln;
4834
4835 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4836 checkType(c,o,REDIS_LIST)) return;
4837 list = o->ptr;
4838
4839 ln = listIndex(list, index);
4840 if (ln == NULL) {
c937aa89 4841 addReply(c,shared.nullbulk);
ed9b544e 4842 } else {
dd88747b 4843 robj *ele = listNodeValue(ln);
4844 addReplyBulk(c,ele);
ed9b544e 4845 }
4846}
4847
4848static void lsetCommand(redisClient *c) {
3305306f 4849 robj *o;
ed9b544e 4850 int index = atoi(c->argv[2]->ptr);
dd88747b 4851 list *list;
4852 listNode *ln;
4853
4854 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4855 checkType(c,o,REDIS_LIST)) return;
4856 list = o->ptr;
4857
4858 ln = listIndex(list, index);
4859 if (ln == NULL) {
4860 addReply(c,shared.outofrangeerr);
ed9b544e 4861 } else {
dd88747b 4862 robj *ele = listNodeValue(ln);
ed9b544e 4863
dd88747b 4864 decrRefCount(ele);
4865 listNodeValue(ln) = c->argv[3];
4866 incrRefCount(c->argv[3]);
4867 addReply(c,shared.ok);
4868 server.dirty++;
ed9b544e 4869 }
4870}
4871
4872static void popGenericCommand(redisClient *c, int where) {
3305306f 4873 robj *o;
dd88747b 4874 list *list;
4875 listNode *ln;
3305306f 4876
dd88747b 4877 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4878 checkType(c,o,REDIS_LIST)) return;
4879 list = o->ptr;
ed9b544e 4880
dd88747b 4881 if (where == REDIS_HEAD)
4882 ln = listFirst(list);
4883 else
4884 ln = listLast(list);
ed9b544e 4885
dd88747b 4886 if (ln == NULL) {
4887 addReply(c,shared.nullbulk);
4888 } else {
4889 robj *ele = listNodeValue(ln);
4890 addReplyBulk(c,ele);
4891 listDelNode(list,ln);
3ea27d37 4892 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4893 server.dirty++;
ed9b544e 4894 }
4895}
4896
4897static void lpopCommand(redisClient *c) {
4898 popGenericCommand(c,REDIS_HEAD);
4899}
4900
4901static void rpopCommand(redisClient *c) {
4902 popGenericCommand(c,REDIS_TAIL);
4903}
4904
4905static void lrangeCommand(redisClient *c) {
3305306f 4906 robj *o;
ed9b544e 4907 int start = atoi(c->argv[2]->ptr);
4908 int end = atoi(c->argv[3]->ptr);
dd88747b 4909 int llen;
4910 int rangelen, j;
4911 list *list;
4912 listNode *ln;
4913 robj *ele;
4914
4e27f268 4915 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4916 || checkType(c,o,REDIS_LIST)) return;
dd88747b 4917 list = o->ptr;
4918 llen = listLength(list);
4919
4920 /* convert negative indexes */
4921 if (start < 0) start = llen+start;
4922 if (end < 0) end = llen+end;
4923 if (start < 0) start = 0;
4924 if (end < 0) end = 0;
4925
4926 /* indexes sanity checks */
4927 if (start > end || start >= llen) {
4928 /* Out of range start or start > end result in empty list */
4929 addReply(c,shared.emptymultibulk);
4930 return;
4931 }
4932 if (end >= llen) end = llen-1;
4933 rangelen = (end-start)+1;
3305306f 4934
dd88747b 4935 /* Return the result in form of a multi-bulk reply */
4936 ln = listIndex(list, start);
4937 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4938 for (j = 0; j < rangelen; j++) {
4939 ele = listNodeValue(ln);
4940 addReplyBulk(c,ele);
4941 ln = ln->next;
ed9b544e 4942 }
4943}
4944
4945static void ltrimCommand(redisClient *c) {
3305306f 4946 robj *o;
ed9b544e 4947 int start = atoi(c->argv[2]->ptr);
4948 int end = atoi(c->argv[3]->ptr);
dd88747b 4949 int llen;
4950 int j, ltrim, rtrim;
4951 list *list;
4952 listNode *ln;
4953
4954 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4955 checkType(c,o,REDIS_LIST)) return;
4956 list = o->ptr;
4957 llen = listLength(list);
4958
4959 /* convert negative indexes */
4960 if (start < 0) start = llen+start;
4961 if (end < 0) end = llen+end;
4962 if (start < 0) start = 0;
4963 if (end < 0) end = 0;
4964
4965 /* indexes sanity checks */
4966 if (start > end || start >= llen) {
4967 /* Out of range start or start > end result in empty list */
4968 ltrim = llen;
4969 rtrim = 0;
ed9b544e 4970 } else {
dd88747b 4971 if (end >= llen) end = llen-1;
4972 ltrim = start;
4973 rtrim = llen-end-1;
4974 }
ed9b544e 4975
dd88747b 4976 /* Remove list elements to perform the trim */
4977 for (j = 0; j < ltrim; j++) {
4978 ln = listFirst(list);
4979 listDelNode(list,ln);
4980 }
4981 for (j = 0; j < rtrim; j++) {
4982 ln = listLast(list);
4983 listDelNode(list,ln);
ed9b544e 4984 }
3ea27d37 4985 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4986 server.dirty++;
4987 addReply(c,shared.ok);
ed9b544e 4988}
4989
4990static void lremCommand(redisClient *c) {
3305306f 4991 robj *o;
dd88747b 4992 list *list;
4993 listNode *ln, *next;
4994 int toremove = atoi(c->argv[2]->ptr);
4995 int removed = 0;
4996 int fromtail = 0;
a4d1ba9a 4997
dd88747b 4998 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4999 checkType(c,o,REDIS_LIST)) return;
5000 list = o->ptr;
5001
5002 if (toremove < 0) {
5003 toremove = -toremove;
5004 fromtail = 1;
5005 }
5006 ln = fromtail ? list->tail : list->head;
5007 while (ln) {
5008 robj *ele = listNodeValue(ln);
5009
5010 next = fromtail ? ln->prev : ln->next;
bf028098 5011 if (equalStringObjects(ele,c->argv[3])) {
dd88747b 5012 listDelNode(list,ln);
5013 server.dirty++;
5014 removed++;
5015 if (toremove && removed == toremove) break;
ed9b544e 5016 }
dd88747b 5017 ln = next;
ed9b544e 5018 }
3ea27d37 5019 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5020 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 5021}
5022
12f9d551 5023/* This is the semantic of this command:
0f5f7e9a 5024 * RPOPLPUSH srclist dstlist:
12f9d551 5025 * IF LLEN(srclist) > 0
5026 * element = RPOP srclist
5027 * LPUSH dstlist element
5028 * RETURN element
5029 * ELSE
5030 * RETURN nil
5031 * END
5032 * END
5033 *
5034 * The idea is to be able to get an element from a list in a reliable way
5035 * since the element is not just returned but pushed against another list
5036 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5037 */
0f5f7e9a 5038static void rpoplpushcommand(redisClient *c) {
12f9d551 5039 robj *sobj;
dd88747b 5040 list *srclist;
5041 listNode *ln;
5042
5043 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5044 checkType(c,sobj,REDIS_LIST)) return;
5045 srclist = sobj->ptr;
5046 ln = listLast(srclist);
12f9d551 5047
dd88747b 5048 if (ln == NULL) {
12f9d551 5049 addReply(c,shared.nullbulk);
5050 } else {
dd88747b 5051 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5052 robj *ele = listNodeValue(ln);
5053 list *dstlist;
e20fb74f 5054
dd88747b 5055 if (dobj && dobj->type != REDIS_LIST) {
5056 addReply(c,shared.wrongtypeerr);
5057 return;
5058 }
12f9d551 5059
dd88747b 5060 /* Add the element to the target list (unless it's directly
5061 * passed to some BLPOP-ing client */
5062 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
5063 if (dobj == NULL) {
5064 /* Create the list if the key does not exist */
5065 dobj = createListObject();
5066 dictAdd(c->db->dict,c->argv[2],dobj);
5067 incrRefCount(c->argv[2]);
12f9d551 5068 }
dd88747b 5069 dstlist = dobj->ptr;
5070 listAddNodeHead(dstlist,ele);
5071 incrRefCount(ele);
12f9d551 5072 }
dd88747b 5073
5074 /* Send the element to the client as reply as well */
5075 addReplyBulk(c,ele);
5076
5077 /* Finally remove the element from the source list */
5078 listDelNode(srclist,ln);
3ea27d37 5079 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5080 server.dirty++;
12f9d551 5081 }
5082}
5083
ed9b544e 5084/* ==================================== Sets ================================ */
5085
5086static void saddCommand(redisClient *c) {
ed9b544e 5087 robj *set;
5088
3305306f 5089 set = lookupKeyWrite(c->db,c->argv[1]);
5090 if (set == NULL) {
ed9b544e 5091 set = createSetObject();
3305306f 5092 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 5093 incrRefCount(c->argv[1]);
5094 } else {
ed9b544e 5095 if (set->type != REDIS_SET) {
c937aa89 5096 addReply(c,shared.wrongtypeerr);
ed9b544e 5097 return;
5098 }
5099 }
5100 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5101 incrRefCount(c->argv[2]);
5102 server.dirty++;
c937aa89 5103 addReply(c,shared.cone);
ed9b544e 5104 } else {
c937aa89 5105 addReply(c,shared.czero);
ed9b544e 5106 }
5107}
5108
5109static void sremCommand(redisClient *c) {
3305306f 5110 robj *set;
ed9b544e 5111
dd88747b 5112 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5113 checkType(c,set,REDIS_SET)) return;
5114
5115 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5116 server.dirty++;
5117 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5118 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5119 addReply(c,shared.cone);
ed9b544e 5120 } else {
dd88747b 5121 addReply(c,shared.czero);
ed9b544e 5122 }
5123}
5124
a4460ef4 5125static void smoveCommand(redisClient *c) {
5126 robj *srcset, *dstset;
5127
5128 srcset = lookupKeyWrite(c->db,c->argv[1]);
5129 dstset = lookupKeyWrite(c->db,c->argv[2]);
5130
5131 /* If the source key does not exist return 0, if it's of the wrong type
5132 * raise an error */
5133 if (srcset == NULL || srcset->type != REDIS_SET) {
5134 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5135 return;
5136 }
5137 /* Error if the destination key is not a set as well */
5138 if (dstset && dstset->type != REDIS_SET) {
5139 addReply(c,shared.wrongtypeerr);
5140 return;
5141 }
5142 /* Remove the element from the source set */
5143 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5144 /* Key not found in the src set! return zero */
5145 addReply(c,shared.czero);
5146 return;
5147 }
3ea27d37 5148 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5149 deleteKey(c->db,c->argv[1]);
a4460ef4 5150 server.dirty++;
5151 /* Add the element to the destination set */
5152 if (!dstset) {
5153 dstset = createSetObject();
5154 dictAdd(c->db->dict,c->argv[2],dstset);
5155 incrRefCount(c->argv[2]);
5156 }
5157 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5158 incrRefCount(c->argv[3]);
5159 addReply(c,shared.cone);
5160}
5161
ed9b544e 5162static void sismemberCommand(redisClient *c) {
3305306f 5163 robj *set;
ed9b544e 5164
dd88747b 5165 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5166 checkType(c,set,REDIS_SET)) return;
5167
5168 if (dictFind(set->ptr,c->argv[2]))
5169 addReply(c,shared.cone);
5170 else
c937aa89 5171 addReply(c,shared.czero);
ed9b544e 5172}
5173
5174static void scardCommand(redisClient *c) {
3305306f 5175 robj *o;
ed9b544e 5176 dict *s;
dd88747b 5177
5178 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5179 checkType(c,o,REDIS_SET)) return;
e0a62c7f 5180
dd88747b 5181 s = o->ptr;
5182 addReplyUlong(c,dictSize(s));
ed9b544e 5183}
5184
12fea928 5185static void spopCommand(redisClient *c) {
5186 robj *set;
5187 dictEntry *de;
5188
dd88747b 5189 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5190 checkType(c,set,REDIS_SET)) return;
5191
5192 de = dictGetRandomKey(set->ptr);
5193 if (de == NULL) {
12fea928 5194 addReply(c,shared.nullbulk);
5195 } else {
dd88747b 5196 robj *ele = dictGetEntryKey(de);
12fea928 5197
dd88747b 5198 addReplyBulk(c,ele);
5199 dictDelete(set->ptr,ele);
5200 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5201 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5202 server.dirty++;
12fea928 5203 }
5204}
5205
2abb95a9 5206static void srandmemberCommand(redisClient *c) {
5207 robj *set;
5208 dictEntry *de;
5209
dd88747b 5210 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5211 checkType(c,set,REDIS_SET)) return;
5212
5213 de = dictGetRandomKey(set->ptr);
5214 if (de == NULL) {
2abb95a9 5215 addReply(c,shared.nullbulk);
5216 } else {
dd88747b 5217 robj *ele = dictGetEntryKey(de);
2abb95a9 5218
dd88747b 5219 addReplyBulk(c,ele);
2abb95a9 5220 }
5221}
5222
ed9b544e 5223static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5224 dict **d1 = (void*) s1, **d2 = (void*) s2;
5225
3305306f 5226 return dictSize(*d1)-dictSize(*d2);
ed9b544e 5227}
5228
682ac724 5229static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 5230 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5231 dictIterator *di;
5232 dictEntry *de;
5233 robj *lenobj = NULL, *dstset = NULL;
682ac724 5234 unsigned long j, cardinality = 0;
ed9b544e 5235
ed9b544e 5236 for (j = 0; j < setsnum; j++) {
5237 robj *setobj;
3305306f 5238
5239 setobj = dstkey ?
5240 lookupKeyWrite(c->db,setskeys[j]) :
5241 lookupKeyRead(c->db,setskeys[j]);
5242 if (!setobj) {
ed9b544e 5243 zfree(dv);
5faa6025 5244 if (dstkey) {
fdcaae84 5245 if (deleteKey(c->db,dstkey))
5246 server.dirty++;
0d36ded0 5247 addReply(c,shared.czero);
5faa6025 5248 } else {
4e27f268 5249 addReply(c,shared.emptymultibulk);
5faa6025 5250 }
ed9b544e 5251 return;
5252 }
ed9b544e 5253 if (setobj->type != REDIS_SET) {
5254 zfree(dv);
c937aa89 5255 addReply(c,shared.wrongtypeerr);
ed9b544e 5256 return;
5257 }
5258 dv[j] = setobj->ptr;
5259 }
5260 /* Sort sets from the smallest to largest, this will improve our
5261 * algorithm's performace */
5262 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5263
5264 /* The first thing we should output is the total number of elements...
5265 * since this is a multi-bulk write, but at this stage we don't know
5266 * the intersection set size, so we use a trick, append an empty object
5267 * to the output list and save the pointer to later modify it with the
5268 * right length */
5269 if (!dstkey) {
5270 lenobj = createObject(REDIS_STRING,NULL);
5271 addReply(c,lenobj);
5272 decrRefCount(lenobj);
5273 } else {
5274 /* If we have a target key where to store the resulting set
5275 * create this key with an empty set inside */
5276 dstset = createSetObject();
ed9b544e 5277 }
5278
5279 /* Iterate all the elements of the first (smallest) set, and test
5280 * the element against all the other sets, if at least one set does
5281 * not include the element it is discarded */
5282 di = dictGetIterator(dv[0]);
ed9b544e 5283
5284 while((de = dictNext(di)) != NULL) {
5285 robj *ele;
5286
5287 for (j = 1; j < setsnum; j++)
5288 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5289 if (j != setsnum)
5290 continue; /* at least one set does not contain the member */
5291 ele = dictGetEntryKey(de);
5292 if (!dstkey) {
dd88747b 5293 addReplyBulk(c,ele);
ed9b544e 5294 cardinality++;
5295 } else {
5296 dictAdd(dstset->ptr,ele,NULL);
5297 incrRefCount(ele);
5298 }
5299 }
5300 dictReleaseIterator(di);
5301
83cdfe18 5302 if (dstkey) {
3ea27d37 5303 /* Store the resulting set into the target, if the intersection
5304 * is not an empty set. */
83cdfe18 5305 deleteKey(c->db,dstkey);
3ea27d37 5306 if (dictSize((dict*)dstset->ptr) > 0) {
5307 dictAdd(c->db->dict,dstkey,dstset);
5308 incrRefCount(dstkey);
482b672d 5309 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5310 } else {
5311 decrRefCount(dstset);
d36c4e97 5312 addReply(c,shared.czero);
3ea27d37 5313 }
40d224a9 5314 server.dirty++;
d36c4e97 5315 } else {
5316 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 5317 }
ed9b544e 5318 zfree(dv);
5319}
5320
5321static void sinterCommand(redisClient *c) {
5322 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5323}
5324
5325static void sinterstoreCommand(redisClient *c) {
5326 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5327}
5328
f4f56e1d 5329#define REDIS_OP_UNION 0
5330#define REDIS_OP_DIFF 1
2830ca53 5331#define REDIS_OP_INTER 2
f4f56e1d 5332
5333static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 5334 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5335 dictIterator *di;
5336 dictEntry *de;
f4f56e1d 5337 robj *dstset = NULL;
40d224a9 5338 int j, cardinality = 0;
5339
40d224a9 5340 for (j = 0; j < setsnum; j++) {
5341 robj *setobj;
5342
5343 setobj = dstkey ?
5344 lookupKeyWrite(c->db,setskeys[j]) :
5345 lookupKeyRead(c->db,setskeys[j]);
5346 if (!setobj) {
5347 dv[j] = NULL;
5348 continue;
5349 }
5350 if (setobj->type != REDIS_SET) {
5351 zfree(dv);
5352 addReply(c,shared.wrongtypeerr);
5353 return;
5354 }
5355 dv[j] = setobj->ptr;
5356 }
5357
5358 /* We need a temp set object to store our union. If the dstkey
5359 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5360 * this set object will be the resulting object to set into the target key*/
5361 dstset = createSetObject();
5362
40d224a9 5363 /* Iterate all the elements of all the sets, add every element a single
5364 * time to the result set */
5365 for (j = 0; j < setsnum; j++) {
51829ed3 5366 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 5367 if (!dv[j]) continue; /* non existing keys are like empty sets */
5368
5369 di = dictGetIterator(dv[j]);
40d224a9 5370
5371 while((de = dictNext(di)) != NULL) {
5372 robj *ele;
5373
5374 /* dictAdd will not add the same element multiple times */
5375 ele = dictGetEntryKey(de);
f4f56e1d 5376 if (op == REDIS_OP_UNION || j == 0) {
5377 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5378 incrRefCount(ele);
40d224a9 5379 cardinality++;
5380 }
f4f56e1d 5381 } else if (op == REDIS_OP_DIFF) {
5382 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5383 cardinality--;
5384 }
40d224a9 5385 }
5386 }
5387 dictReleaseIterator(di);
51829ed3 5388
d36c4e97 5389 /* result set is empty? Exit asap. */
5390 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5391 }
5392
f4f56e1d 5393 /* Output the content of the resulting set, if not in STORE mode */
5394 if (!dstkey) {
5395 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5396 di = dictGetIterator(dstset->ptr);
f4f56e1d 5397 while((de = dictNext(di)) != NULL) {
5398 robj *ele;
5399
5400 ele = dictGetEntryKey(de);
dd88747b 5401 addReplyBulk(c,ele);
f4f56e1d 5402 }
5403 dictReleaseIterator(di);
d36c4e97 5404 decrRefCount(dstset);
83cdfe18
AG
5405 } else {
5406 /* If we have a target key where to store the resulting set
5407 * create this key with the result set inside */
5408 deleteKey(c->db,dstkey);
3ea27d37 5409 if (dictSize((dict*)dstset->ptr) > 0) {
5410 dictAdd(c->db->dict,dstkey,dstset);
5411 incrRefCount(dstkey);
482b672d 5412 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5413 } else {
5414 decrRefCount(dstset);
d36c4e97 5415 addReply(c,shared.czero);
3ea27d37 5416 }
40d224a9 5417 server.dirty++;
5418 }
5419 zfree(dv);
5420}
5421
5422static void sunionCommand(redisClient *c) {
f4f56e1d 5423 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 5424}
5425
5426static void sunionstoreCommand(redisClient *c) {
f4f56e1d 5427 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5428}
5429
5430static void sdiffCommand(redisClient *c) {
5431 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5432}
5433
5434static void sdiffstoreCommand(redisClient *c) {
5435 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 5436}
5437
6b47e12e 5438/* ==================================== ZSets =============================== */
5439
5440/* ZSETs are ordered sets using two data structures to hold the same elements
5441 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5442 * data structure.
5443 *
5444 * The elements are added to an hash table mapping Redis objects to scores.
5445 * At the same time the elements are added to a skip list mapping scores
5446 * to Redis objects (so objects are sorted by scores in this "view"). */
5447
5448/* This skiplist implementation is almost a C translation of the original
5449 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5450 * Alternative to Balanced Trees", modified in three ways:
5451 * a) this implementation allows for repeated values.
5452 * b) the comparison is not just by key (our 'score') but by satellite data.
5453 * c) there is a back pointer, so it's a doubly linked list with the back
5454 * pointers being only at "level 1". This allows to traverse the list
5455 * from tail to head, useful for ZREVRANGE. */
5456
5457static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5458 zskiplistNode *zn = zmalloc(sizeof(*zn));
5459
5460 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2f4dd7e0 5461 if (level > 1)
2b37892e 5462 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
2f4dd7e0 5463 else
5464 zn->span = NULL;
6b47e12e 5465 zn->score = score;
5466 zn->obj = obj;
5467 return zn;
5468}
5469
5470static zskiplist *zslCreate(void) {
5471 int j;
5472 zskiplist *zsl;
e0a62c7f 5473
6b47e12e 5474 zsl = zmalloc(sizeof(*zsl));
5475 zsl->level = 1;
cc812361 5476 zsl->length = 0;
6b47e12e 5477 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5478 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5479 zsl->header->forward[j] = NULL;
94e543b5 5480
5481 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5482 if (j < ZSKIPLIST_MAXLEVEL-1)
5483 zsl->header->span[j] = 0;
69d95c3e 5484 }
e3870fab 5485 zsl->header->backward = NULL;
5486 zsl->tail = NULL;
6b47e12e 5487 return zsl;
5488}
5489
fd8ccf44 5490static void zslFreeNode(zskiplistNode *node) {
5491 decrRefCount(node->obj);
ad807e6f 5492 zfree(node->forward);
69d95c3e 5493 zfree(node->span);
fd8ccf44 5494 zfree(node);
5495}
5496
5497static void zslFree(zskiplist *zsl) {
ad807e6f 5498 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5499
ad807e6f 5500 zfree(zsl->header->forward);
69d95c3e 5501 zfree(zsl->header->span);
ad807e6f 5502 zfree(zsl->header);
fd8ccf44 5503 while(node) {
599379dd 5504 next = node->forward[0];
fd8ccf44 5505 zslFreeNode(node);
5506 node = next;
5507 }
ad807e6f 5508 zfree(zsl);
fd8ccf44 5509}
5510
6b47e12e 5511static int zslRandomLevel(void) {
5512 int level = 1;
5513 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5514 level += 1;
10c2baa5 5515 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5516}
5517
5518static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5519 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5520 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5521 int i, level;
5522
5523 x = zsl->header;
5524 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5525 /* store rank that is crossed to reach the insert position */
5526 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5527
9d60e6e4 5528 while (x->forward[i] &&
5529 (x->forward[i]->score < score ||
5530 (x->forward[i]->score == score &&
69d95c3e 5531 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5532 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5533 x = x->forward[i];
69d95c3e 5534 }
6b47e12e 5535 update[i] = x;
5536 }
6b47e12e 5537 /* we assume the key is not already inside, since we allow duplicated
5538 * scores, and the re-insertion of score and redis object should never
5539 * happpen since the caller of zslInsert() should test in the hash table
5540 * if the element is already inside or not. */
5541 level = zslRandomLevel();
5542 if (level > zsl->level) {
69d95c3e 5543 for (i = zsl->level; i < level; i++) {
2b37892e 5544 rank[i] = 0;
6b47e12e 5545 update[i] = zsl->header;
2b37892e 5546 update[i]->span[i-1] = zsl->length;
69d95c3e 5547 }
6b47e12e 5548 zsl->level = level;
5549 }
5550 x = zslCreateNode(level,score,obj);
5551 for (i = 0; i < level; i++) {
5552 x->forward[i] = update[i]->forward[i];
5553 update[i]->forward[i] = x;
69d95c3e
PN
5554
5555 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5556 if (i > 0) {
5557 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5558 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5559 }
6b47e12e 5560 }
69d95c3e
PN
5561
5562 /* increment span for untouched levels */
5563 for (i = level; i < zsl->level; i++) {
2b37892e 5564 update[i]->span[i-1]++;
69d95c3e
PN
5565 }
5566
bb975144 5567 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5568 if (x->forward[0])
5569 x->forward[0]->backward = x;
5570 else
5571 zsl->tail = x;
cc812361 5572 zsl->length++;
6b47e12e 5573}
5574
84105336
PN
5575/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5576void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5577 int i;
5578 for (i = 0; i < zsl->level; i++) {
5579 if (update[i]->forward[i] == x) {
5580 if (i > 0) {
5581 update[i]->span[i-1] += x->span[i-1] - 1;
5582 }
5583 update[i]->forward[i] = x->forward[i];
5584 } else {
5585 /* invariant: i > 0, because update[0]->forward[0]
5586 * is always equal to x */
5587 update[i]->span[i-1] -= 1;
5588 }
5589 }
5590 if (x->forward[0]) {
5591 x->forward[0]->backward = x->backward;
5592 } else {
5593 zsl->tail = x->backward;
5594 }
5595 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5596 zsl->level--;
5597 zsl->length--;
5598}
5599
50c55df5 5600/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5601static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5602 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5603 int i;
5604
5605 x = zsl->header;
5606 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5607 while (x->forward[i] &&
5608 (x->forward[i]->score < score ||
5609 (x->forward[i]->score == score &&
5610 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5611 x = x->forward[i];
5612 update[i] = x;
5613 }
5614 /* We may have multiple elements with the same score, what we need
5615 * is to find the element with both the right score and object. */
5616 x = x->forward[0];
bf028098 5617 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
84105336 5618 zslDeleteNode(zsl, x, update);
9d60e6e4 5619 zslFreeNode(x);
9d60e6e4 5620 return 1;
5621 } else {
5622 return 0; /* not found */
e197b441 5623 }
5624 return 0; /* not found */
fd8ccf44 5625}
5626
1807985b 5627/* Delete all the elements with score between min and max from the skiplist.
5628 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5629 * Note that this function takes the reference to the hash table view of the
5630 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5631static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5632 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5633 unsigned long removed = 0;
5634 int i;
5635
5636 x = zsl->header;
5637 for (i = zsl->level-1; i >= 0; i--) {
5638 while (x->forward[i] && x->forward[i]->score < min)
5639 x = x->forward[i];
5640 update[i] = x;
5641 }
5642 /* We may have multiple elements with the same score, what we need
5643 * is to find the element with both the right score and object. */
5644 x = x->forward[0];
5645 while (x && x->score <= max) {
84105336
PN
5646 zskiplistNode *next = x->forward[0];
5647 zslDeleteNode(zsl, x, update);
1807985b 5648 dictDelete(dict,x->obj);
5649 zslFreeNode(x);
1807985b 5650 removed++;
5651 x = next;
5652 }
5653 return removed; /* not found */
5654}
1807985b 5655
9212eafd 5656/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5657 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5658static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5659 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5660 unsigned long traversed = 0, removed = 0;
5661 int i;
5662
9212eafd
PN
5663 x = zsl->header;
5664 for (i = zsl->level-1; i >= 0; i--) {
5665 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5666 traversed += i > 0 ? x->span[i-1] : 1;
5667 x = x->forward[i];
1807985b 5668 }
9212eafd
PN
5669 update[i] = x;
5670 }
5671
5672 traversed++;
5673 x = x->forward[0];
5674 while (x && traversed <= end) {
84105336
PN
5675 zskiplistNode *next = x->forward[0];
5676 zslDeleteNode(zsl, x, update);
1807985b 5677 dictDelete(dict,x->obj);
5678 zslFreeNode(x);
1807985b 5679 removed++;
9212eafd 5680 traversed++;
1807985b 5681 x = next;
5682 }
9212eafd 5683 return removed;
1807985b 5684}
5685
50c55df5 5686/* Find the first node having a score equal or greater than the specified one.
5687 * Returns NULL if there is no match. */
5688static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5689 zskiplistNode *x;
5690 int i;
5691
5692 x = zsl->header;
5693 for (i = zsl->level-1; i >= 0; i--) {
5694 while (x->forward[i] && x->forward[i]->score < score)
5695 x = x->forward[i];
5696 }
5697 /* We may have multiple elements with the same score, what we need
5698 * is to find the element with both the right score and object. */
5699 return x->forward[0];
5700}
5701
27b0ccca
PN
5702/* Find the rank for an element by both score and key.
5703 * Returns 0 when the element cannot be found, rank otherwise.
5704 * Note that the rank is 1-based due to the span of zsl->header to the
5705 * first element. */
5706static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5707 zskiplistNode *x;
5708 unsigned long rank = 0;
5709 int i;
5710
5711 x = zsl->header;
5712 for (i = zsl->level-1; i >= 0; i--) {
5713 while (x->forward[i] &&
5714 (x->forward[i]->score < score ||
5715 (x->forward[i]->score == score &&
5716 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5717 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5718 x = x->forward[i];
5719 }
5720
5721 /* x might be equal to zsl->header, so test if obj is non-NULL */
bf028098 5722 if (x->obj && equalStringObjects(x->obj,o)) {
27b0ccca
PN
5723 return rank;
5724 }
5725 }
5726 return 0;
5727}
5728
e74825c2
PN
5729/* Finds an element by its rank. The rank argument needs to be 1-based. */
5730zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5731 zskiplistNode *x;
5732 unsigned long traversed = 0;
5733 int i;
5734
5735 x = zsl->header;
5736 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 5737 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5738 {
a50ea45c 5739 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5740 x = x->forward[i];
5741 }
e74825c2
PN
5742 if (traversed == rank) {
5743 return x;
5744 }
5745 }
5746 return NULL;
5747}
5748
fd8ccf44 5749/* The actual Z-commands implementations */
5750
7db723ad 5751/* This generic command implements both ZADD and ZINCRBY.
e2665397 5752 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5753 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5754static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5755 robj *zsetobj;
5756 zset *zs;
5757 double *score;
5758
5fc9229c 5759 if (isnan(scoreval)) {
5760 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
5761 return;
5762 }
5763
e2665397 5764 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5765 if (zsetobj == NULL) {
5766 zsetobj = createZsetObject();
e2665397 5767 dictAdd(c->db->dict,key,zsetobj);
5768 incrRefCount(key);
fd8ccf44 5769 } else {
5770 if (zsetobj->type != REDIS_ZSET) {
5771 addReply(c,shared.wrongtypeerr);
5772 return;
5773 }
5774 }
fd8ccf44 5775 zs = zsetobj->ptr;
e2665397 5776
7db723ad 5777 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5778 * needs to handle the two different conditions. It's all about setting
5779 * '*score', that is, the new score to set, to the right value. */
5780 score = zmalloc(sizeof(double));
5781 if (doincrement) {
5782 dictEntry *de;
5783
5784 /* Read the old score. If the element was not present starts from 0 */
5785 de = dictFind(zs->dict,ele);
5786 if (de) {
5787 double *oldscore = dictGetEntryVal(de);
5788 *score = *oldscore + scoreval;
5789 } else {
5790 *score = scoreval;
5791 }
5fc9229c 5792 if (isnan(*score)) {
5793 addReplySds(c,
5794 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
5795 zfree(score);
5796 /* Note that we don't need to check if the zset may be empty and
5797 * should be removed here, as we can only obtain Nan as score if
5798 * there was already an element in the sorted set. */
5799 return;
5800 }
e2665397 5801 } else {
5802 *score = scoreval;
5803 }
5804
5805 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5806 * to both ZADD and ZINCRBY... */
e2665397 5807 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5808 /* case 1: New element */
e2665397 5809 incrRefCount(ele); /* added to hash */
5810 zslInsert(zs->zsl,*score,ele);
5811 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5812 server.dirty++;
e2665397 5813 if (doincrement)
e2665397 5814 addReplyDouble(c,*score);
91d71bfc 5815 else
5816 addReply(c,shared.cone);
fd8ccf44 5817 } else {
5818 dictEntry *de;
5819 double *oldscore;
e0a62c7f 5820
fd8ccf44 5821 /* case 2: Score update operation */
e2665397 5822 de = dictFind(zs->dict,ele);
dfc5e96c 5823 redisAssert(de != NULL);
fd8ccf44 5824 oldscore = dictGetEntryVal(de);
5825 if (*score != *oldscore) {
5826 int deleted;
5827
e2665397 5828 /* Remove and insert the element in the skip list with new score */
5829 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5830 redisAssert(deleted != 0);
e2665397 5831 zslInsert(zs->zsl,*score,ele);
5832 incrRefCount(ele);
5833 /* Update the score in the hash table */
5834 dictReplace(zs->dict,ele,score);
fd8ccf44 5835 server.dirty++;
2161a965 5836 } else {
5837 zfree(score);
fd8ccf44 5838 }
e2665397 5839 if (doincrement)
5840 addReplyDouble(c,*score);
5841 else
5842 addReply(c,shared.czero);
fd8ccf44 5843 }
5844}
5845
e2665397 5846static void zaddCommand(redisClient *c) {
5847 double scoreval;
5848
bd79a6bd 5849 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5850 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5851}
5852
7db723ad 5853static void zincrbyCommand(redisClient *c) {
e2665397 5854 double scoreval;
5855
bd79a6bd 5856 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5857 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5858}
5859
1b7106e7 5860static void zremCommand(redisClient *c) {
5861 robj *zsetobj;
5862 zset *zs;
dd88747b 5863 dictEntry *de;
5864 double *oldscore;
5865 int deleted;
1b7106e7 5866
dd88747b 5867 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5868 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 5869
dd88747b 5870 zs = zsetobj->ptr;
5871 de = dictFind(zs->dict,c->argv[2]);
5872 if (de == NULL) {
5873 addReply(c,shared.czero);
5874 return;
1b7106e7 5875 }
dd88747b 5876 /* Delete from the skiplist */
5877 oldscore = dictGetEntryVal(de);
5878 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5879 redisAssert(deleted != 0);
5880
5881 /* Delete from the hash table */
5882 dictDelete(zs->dict,c->argv[2]);
5883 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5884 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5885 server.dirty++;
5886 addReply(c,shared.cone);
1b7106e7 5887}
5888
1807985b 5889static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
5890 double min;
5891 double max;
dd88747b 5892 long deleted;
1807985b 5893 robj *zsetobj;
5894 zset *zs;
5895
bd79a6bd
PN
5896 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5897 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
bbe025e0 5898
dd88747b 5899 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5900 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 5901
dd88747b 5902 zs = zsetobj->ptr;
5903 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5904 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5905 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5906 server.dirty += deleted;
482b672d 5907 addReplyLongLong(c,deleted);
1807985b 5908}
5909
9212eafd 5910static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
5911 long start;
5912 long end;
dd88747b 5913 int llen;
5914 long deleted;
9212eafd
PN
5915 robj *zsetobj;
5916 zset *zs;
5917
bd79a6bd
PN
5918 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5919 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 5920
dd88747b 5921 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5922 checkType(c,zsetobj,REDIS_ZSET)) return;
5923 zs = zsetobj->ptr;
5924 llen = zs->zsl->length;
9212eafd 5925
dd88747b 5926 /* convert negative indexes */
5927 if (start < 0) start = llen+start;
5928 if (end < 0) end = llen+end;
5929 if (start < 0) start = 0;
5930 if (end < 0) end = 0;
9212eafd 5931
dd88747b 5932 /* indexes sanity checks */
5933 if (start > end || start >= llen) {
5934 addReply(c,shared.czero);
5935 return;
9212eafd 5936 }
dd88747b 5937 if (end >= llen) end = llen-1;
5938
5939 /* increment start and end because zsl*Rank functions
5940 * use 1-based rank */
5941 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5942 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5943 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5944 server.dirty += deleted;
482b672d 5945 addReplyLongLong(c, deleted);
9212eafd
PN
5946}
5947
8f92e768
PN
5948typedef struct {
5949 dict *dict;
5950 double weight;
5951} zsetopsrc;
5952
5953static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5954 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5955 unsigned long size1, size2;
5956 size1 = d1->dict ? dictSize(d1->dict) : 0;
5957 size2 = d2->dict ? dictSize(d2->dict) : 0;
5958 return size1 - size2;
5959}
5960
d2764cd6
PN
/* How scores of the same element coming from different inputs are combined. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score stored in a dict entry, or 1.0 when the entry has a NULL value.
 * Note: the argument is evaluated more than once. */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))

/* Fold 'val' into '*target' according to 'aggregate': add it for
 * REDIS_AGGR_SUM, keep the smaller of the two for REDIS_AGGR_MIN, keep the
 * larger for REDIS_AGGR_MAX. Any other aggregate value triggers
 * redisPanic(). */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target += val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
        break;
    }
}
5978
2830ca53 5979static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
bc000c1d 5980 int i, j, setnum;
d2764cd6 5981 int aggregate = REDIS_AGGR_SUM;
8f92e768 5982 zsetopsrc *src;
2830ca53
PN
5983 robj *dstobj;
5984 zset *dstzset;
b287c9bb
PN
5985 dictIterator *di;
5986 dictEntry *de;
5987
bc000c1d
JC
5988 /* expect setnum input keys to be given */
5989 setnum = atoi(c->argv[2]->ptr);
5990 if (setnum < 1) {
5d373da9 5991 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
2830ca53 5992 return;
b287c9bb 5993 }
2830ca53
PN
5994
5995 /* test if the expected number of keys would overflow */
bc000c1d 5996 if (3+setnum > c->argc) {
b287c9bb
PN
5997 addReply(c,shared.syntaxerr);
5998 return;
5999 }
6000
2830ca53 6001 /* read keys to be used for input */
bc000c1d
JC
6002 src = zmalloc(sizeof(zsetopsrc) * setnum);
6003 for (i = 0, j = 3; i < setnum; i++, j++) {
6004 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6005 if (!obj) {
8f92e768 6006 src[i].dict = NULL;
b287c9bb 6007 } else {
bc000c1d
JC
6008 if (obj->type == REDIS_ZSET) {
6009 src[i].dict = ((zset*)obj->ptr)->dict;
6010 } else if (obj->type == REDIS_SET) {
6011 src[i].dict = (obj->ptr);
6012 } else {
8f92e768 6013 zfree(src);
b287c9bb
PN
6014 addReply(c,shared.wrongtypeerr);
6015 return;
6016 }
b287c9bb 6017 }
2830ca53
PN
6018
6019 /* default all weights to 1 */
8f92e768 6020 src[i].weight = 1.0;
b287c9bb
PN
6021 }
6022
2830ca53
PN
6023 /* parse optional extra arguments */
6024 if (j < c->argc) {
d2764cd6 6025 int remaining = c->argc - j;
b287c9bb 6026
2830ca53 6027 while (remaining) {
bc000c1d 6028 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 6029 j++; remaining--;
bc000c1d 6030 for (i = 0; i < setnum; i++, j++, remaining--) {
bd79a6bd 6031 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
bbe025e0 6032 return;
2830ca53 6033 }
d2764cd6
PN
6034 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6035 j++; remaining--;
6036 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6037 aggregate = REDIS_AGGR_SUM;
6038 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6039 aggregate = REDIS_AGGR_MIN;
6040 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6041 aggregate = REDIS_AGGR_MAX;
6042 } else {
6043 zfree(src);
6044 addReply(c,shared.syntaxerr);
6045 return;
6046 }
6047 j++; remaining--;
2830ca53 6048 } else {
8f92e768 6049 zfree(src);
2830ca53
PN
6050 addReply(c,shared.syntaxerr);
6051 return;
6052 }
6053 }
6054 }
b287c9bb 6055
d2764cd6
PN
6056 /* sort sets from the smallest to largest, this will improve our
6057 * algorithm's performance */
bc000c1d 6058 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
d2764cd6 6059
2830ca53
PN
6060 dstobj = createZsetObject();
6061 dstzset = dstobj->ptr;
6062
6063 if (op == REDIS_OP_INTER) {
8f92e768
PN
6064 /* skip going over all entries if the smallest zset is NULL or empty */
6065 if (src[0].dict && dictSize(src[0].dict) > 0) {
6066 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6067 * from small to large, all src[i > 0].dict are non-empty too */
6068 di = dictGetIterator(src[0].dict);
2830ca53 6069 while((de = dictNext(di)) != NULL) {
d2764cd6 6070 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6071 *score = src[0].weight * zunionInterDictValue(de);
2830ca53 6072
bc000c1d 6073 for (j = 1; j < setnum; j++) {
d2764cd6 6074 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6075 if (other) {
bc000c1d 6076 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6077 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6078 } else {
6079 break;
6080 }
6081 }
b287c9bb 6082
2830ca53 6083 /* skip entry when not present in every source dict */
bc000c1d 6084 if (j != setnum) {
2830ca53
PN
6085 zfree(score);
6086 } else {
6087 robj *o = dictGetEntryKey(de);
6088 dictAdd(dstzset->dict,o,score);
6089 incrRefCount(o); /* added to dictionary */
6090 zslInsert(dstzset->zsl,*score,o);
6091 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
6092 }
6093 }
2830ca53
PN
6094 dictReleaseIterator(di);
6095 }
6096 } else if (op == REDIS_OP_UNION) {
bc000c1d 6097 for (i = 0; i < setnum; i++) {
8f92e768 6098 if (!src[i].dict) continue;
2830ca53 6099
8f92e768 6100 di = dictGetIterator(src[i].dict);
2830ca53
PN
6101 while((de = dictNext(di)) != NULL) {
6102 /* skip key when already processed */
6103 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6104
d2764cd6 6105 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6106 *score = src[i].weight * zunionInterDictValue(de);
2830ca53 6107
d2764cd6
PN
6108 /* because the zsets are sorted by size, its only possible
6109 * for sets at larger indices to hold this entry */
bc000c1d 6110 for (j = (i+1); j < setnum; j++) {
d2764cd6 6111 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6112 if (other) {
bc000c1d 6113 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6114 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6115 }
6116 }
b287c9bb 6117
2830ca53
PN
6118 robj *o = dictGetEntryKey(de);
6119 dictAdd(dstzset->dict,o,score);
6120 incrRefCount(o); /* added to dictionary */
6121 zslInsert(dstzset->zsl,*score,o);
6122 incrRefCount(o); /* added to skiplist */
6123 }
6124 dictReleaseIterator(di);
b287c9bb 6125 }
2830ca53
PN
6126 } else {
6127 /* unknown operator */
6128 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
6129 }
6130
6131 deleteKey(c->db,dstkey);
3ea27d37 6132 if (dstzset->zsl->length) {
6133 dictAdd(c->db->dict,dstkey,dstobj);
6134 incrRefCount(dstkey);
482b672d 6135 addReplyLongLong(c, dstzset->zsl->length);
3ea27d37 6136 server.dirty++;
6137 } else {
8bca8773 6138 decrRefCount(dstobj);
3ea27d37 6139 addReply(c, shared.czero);
6140 }
8f92e768 6141 zfree(src);
b287c9bb
PN
6142}
6143
5d373da9 6144static void zunionstoreCommand(redisClient *c) {
2830ca53 6145 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
6146}
6147
5d373da9 6148static void zinterstoreCommand(redisClient *c) {
2830ca53 6149 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
6150}
6151
e3870fab 6152static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 6153 robj *o;
bbe025e0
AM
6154 long start;
6155 long end;
752da584 6156 int withscores = 0;
dd88747b 6157 int llen;
6158 int rangelen, j;
6159 zset *zsetobj;
6160 zskiplist *zsl;
6161 zskiplistNode *ln;
6162 robj *ele;
752da584 6163
bd79a6bd
PN
6164 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6165 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6166
752da584 6167 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6168 withscores = 1;
6169 } else if (c->argc >= 5) {
6170 addReply(c,shared.syntaxerr);
6171 return;
6172 }
cc812361 6173
4e27f268 6174 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6175 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 6176 zsetobj = o->ptr;
6177 zsl = zsetobj->zsl;
6178 llen = zsl->length;
cc812361 6179
dd88747b 6180 /* convert negative indexes */
6181 if (start < 0) start = llen+start;
6182 if (end < 0) end = llen+end;
6183 if (start < 0) start = 0;
6184 if (end < 0) end = 0;
cc812361 6185
dd88747b 6186 /* indexes sanity checks */
6187 if (start > end || start >= llen) {
6188 /* Out of range start or start > end result in empty list */
6189 addReply(c,shared.emptymultibulk);
6190 return;
6191 }
6192 if (end >= llen) end = llen-1;
6193 rangelen = (end-start)+1;
cc812361 6194
dd88747b 6195 /* check if starting point is trivial, before searching
6196 * the element in log(N) time */
6197 if (reverse) {
6198 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6199 } else {
6200 ln = start == 0 ?
6201 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6202 }
cc812361 6203
dd88747b 6204 /* Return the result in form of a multi-bulk reply */
6205 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6206 withscores ? (rangelen*2) : rangelen));
6207 for (j = 0; j < rangelen; j++) {
6208 ele = ln->obj;
6209 addReplyBulk(c,ele);
6210 if (withscores)
6211 addReplyDouble(c,ln->score);
6212 ln = reverse ? ln->backward : ln->forward[0];
cc812361 6213 }
6214}
6215
e3870fab 6216static void zrangeCommand(redisClient *c) {
6217 zrangeGenericCommand(c,0);
6218}
6219
6220static void zrevrangeCommand(redisClient *c) {
6221 zrangeGenericCommand(c,1);
6222}
6223
f44dd428 6224/* This command implements both ZRANGEBYSCORE and ZCOUNT.
6225 * If justcount is non-zero, just the count is returned. */
6226static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 6227 robj *o;
f44dd428 6228 double min, max;
6229 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 6230 int offset = 0, limit = -1;
0500ef27
SH
6231 int withscores = 0;
6232 int badsyntax = 0;
6233
f44dd428 6234 /* Parse the min-max interval. If one of the values is prefixed
6235 * by the "(" character, it's considered "open". For instance
6236 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6237 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6238 if (((char*)c->argv[2]->ptr)[0] == '(') {
6239 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6240 minex = 1;
6241 } else {
6242 min = strtod(c->argv[2]->ptr,NULL);
6243 }
6244 if (((char*)c->argv[3]->ptr)[0] == '(') {
6245 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6246 maxex = 1;
6247 } else {
6248 max = strtod(c->argv[3]->ptr,NULL);
6249 }
6250
6251 /* Parse "WITHSCORES": note that if the command was called with
6252 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6253 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 6254 if (c->argc == 5 || c->argc == 8) {
3a3978b1 6255 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6256 withscores = 1;
6257 else
6258 badsyntax = 1;
0500ef27 6259 }
3a3978b1 6260 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 6261 badsyntax = 1;
0500ef27 6262 if (badsyntax) {
454d4e43 6263 addReplySds(c,
6264 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 6265 return;
0500ef27
SH
6266 }
6267
f44dd428 6268 /* Parse "LIMIT" */
0500ef27 6269 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 6270 addReply(c,shared.syntaxerr);
6271 return;
0500ef27 6272 } else if (c->argc == (7 + withscores)) {
80181f78 6273 offset = atoi(c->argv[5]->ptr);
6274 limit = atoi(c->argv[6]->ptr);
0b13687c 6275 if (offset < 0) offset = 0;
80181f78 6276 }
50c55df5 6277
f44dd428 6278 /* Ok, lookup the key and get the range */
50c55df5 6279 o = lookupKeyRead(c->db,c->argv[1]);
6280 if (o == NULL) {
4e27f268 6281 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6282 } else {
6283 if (o->type != REDIS_ZSET) {
6284 addReply(c,shared.wrongtypeerr);
6285 } else {
6286 zset *zsetobj = o->ptr;
6287 zskiplist *zsl = zsetobj->zsl;
6288 zskiplistNode *ln;
f44dd428 6289 robj *ele, *lenobj = NULL;
6290 unsigned long rangelen = 0;
50c55df5 6291
f44dd428 6292 /* Get the first node with the score >= min, or with
6293 * score > min if 'minex' is true. */
50c55df5 6294 ln = zslFirstWithScore(zsl,min);
f44dd428 6295 while (minex && ln && ln->score == min) ln = ln->forward[0];
6296
50c55df5 6297 if (ln == NULL) {
6298 /* No element matching the speciifed interval */
f44dd428 6299 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6300 return;
6301 }
6302
6303 /* We don't know in advance how many matching elements there
6304 * are in the list, so we push this object that will represent
6305 * the multi-bulk length in the output buffer, and will "fix"
6306 * it later */
f44dd428 6307 if (!justcount) {
6308 lenobj = createObject(REDIS_STRING,NULL);
6309 addReply(c,lenobj);
6310 decrRefCount(lenobj);
6311 }
50c55df5 6312
f44dd428 6313 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 6314 if (offset) {
6315 offset--;
6316 ln = ln->forward[0];
6317 continue;
6318 }
6319 if (limit == 0) break;
f44dd428 6320 if (!justcount) {
6321 ele = ln->obj;
dd88747b 6322 addReplyBulk(c,ele);
f44dd428 6323 if (withscores)
6324 addReplyDouble(c,ln->score);
6325 }
50c55df5 6326 ln = ln->forward[0];
6327 rangelen++;
80181f78 6328 if (limit > 0) limit--;
50c55df5 6329 }
f44dd428 6330 if (justcount) {
482b672d 6331 addReplyLongLong(c,(long)rangelen);
f44dd428 6332 } else {
6333 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6334 withscores ? (rangelen*2) : rangelen);
6335 }
50c55df5 6336 }
6337 }
6338}
6339
f44dd428 6340static void zrangebyscoreCommand(redisClient *c) {
6341 genericZrangebyscoreCommand(c,0);
6342}
6343
6344static void zcountCommand(redisClient *c) {
6345 genericZrangebyscoreCommand(c,1);
6346}
6347
3c41331e 6348static void zcardCommand(redisClient *c) {
e197b441 6349 robj *o;
6350 zset *zs;
dd88747b 6351
6352 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6353 checkType(c,o,REDIS_ZSET)) return;
6354
6355 zs = o->ptr;
6356 addReplyUlong(c,zs->zsl->length);
e197b441 6357}
6358
6e333bbe 6359static void zscoreCommand(redisClient *c) {
6360 robj *o;
6361 zset *zs;
dd88747b 6362 dictEntry *de;
6363
6364 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6365 checkType(c,o,REDIS_ZSET)) return;
6366
6367 zs = o->ptr;
6368 de = dictFind(zs->dict,c->argv[2]);
6369 if (!de) {
96d8b4ee 6370 addReply(c,shared.nullbulk);
6e333bbe 6371 } else {
dd88747b 6372 double *score = dictGetEntryVal(de);
6e333bbe 6373
dd88747b 6374 addReplyDouble(c,*score);
6e333bbe 6375 }
6376}
6377
798d9e55 6378static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 6379 robj *o;
dd88747b 6380 zset *zs;
6381 zskiplist *zsl;
6382 dictEntry *de;
6383 unsigned long rank;
6384 double *score;
6385
6386 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6387 checkType(c,o,REDIS_ZSET)) return;
6388
6389 zs = o->ptr;
6390 zsl = zs->zsl;
6391 de = dictFind(zs->dict,c->argv[2]);
6392 if (!de) {
69d95c3e
PN
6393 addReply(c,shared.nullbulk);
6394 return;
6395 }
69d95c3e 6396
dd88747b 6397 score = dictGetEntryVal(de);
6398 rank = zslGetRank(zsl, *score, c->argv[2]);
6399 if (rank) {
6400 if (reverse) {
482b672d 6401 addReplyLongLong(c, zsl->length - rank);
27b0ccca 6402 } else {
482b672d 6403 addReplyLongLong(c, rank-1);
69d95c3e 6404 }
dd88747b 6405 } else {
6406 addReply(c,shared.nullbulk);
978c2c94 6407 }
6408}
6409
798d9e55
PN
6410static void zrankCommand(redisClient *c) {
6411 zrankGenericCommand(c, 0);
6412}
6413
6414static void zrevrankCommand(redisClient *c) {
6415 zrankGenericCommand(c, 1);
6416}
6417
7fb16bac
PN
/* ========================= Hashes utility functions ======================= */

/* Bit flags selecting which part of a hash entry is wanted. They are tested
 * with '&' by the code below and can be OR-ed together to ask for both the
 * field and the value. */
#define REDIS_HASH_KEY 1
#define REDIS_HASH_VALUE 2
978c2c94 6421
7fb16bac
PN
6422/* Check the length of a number of objects to see if we need to convert a
6423 * zipmap to a real hash. Note that we only check string encoded objects
6424 * as their string length can be queried in constant time. */
6425static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6426 int i;
6427 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
978c2c94 6428
7fb16bac
PN
6429 for (i = start; i <= end; i++) {
6430 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6431 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6432 {
6433 convertToRealHash(subject);
978c2c94 6434 return;
6435 }
6436 }
7fb16bac 6437}
bae2c7ec 6438
97224de7
PN
6439/* Encode given objects in-place when the hash uses a dict. */
6440static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6441 if (subject->encoding == REDIS_ENCODING_HT) {
3f973463
PN
6442 if (o1) *o1 = tryObjectEncoding(*o1);
6443 if (o2) *o2 = tryObjectEncoding(*o2);
97224de7
PN
6444 }
6445}
6446
7fb16bac 6447/* Get the value from a hash identified by key. Returns either a string
a3f3af86
PN
6448 * object or NULL if the value cannot be found. The refcount of the object
6449 * is always increased by 1 when the value was found. */
7fb16bac
PN
6450static robj *hashGet(robj *o, robj *key) {
6451 robj *value = NULL;
978c2c94 6452 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7fb16bac
PN
6453 unsigned char *v;
6454 unsigned int vlen;
6455 key = getDecodedObject(key);
6456 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6457 value = createStringObject((char*)v,vlen);
6458 }
6459 decrRefCount(key);
6460 } else {
6461 dictEntry *de = dictFind(o->ptr,key);
6462 if (de != NULL) {
6463 value = dictGetEntryVal(de);
a3f3af86 6464 incrRefCount(value);
7fb16bac
PN
6465 }
6466 }
6467 return value;
6468}
978c2c94 6469
7fb16bac
PN
6470/* Test if the key exists in the given hash. Returns 1 if the key
6471 * exists and 0 when it doesn't. */
6472static int hashExists(robj *o, robj *key) {
6473 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6474 key = getDecodedObject(key);
6475 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6476 decrRefCount(key);
6477 return 1;
6478 }
6479 decrRefCount(key);
6480 } else {
6481 if (dictFind(o->ptr,key) != NULL) {
6482 return 1;
6483 }
6484 }
6485 return 0;
6486}
bae2c7ec 6487
7fb16bac
PN
6488/* Add an element, discard the old if the key already exists.
6489 * Return 0 on insert and 1 on update. */
feb8d7e6 6490static int hashSet(robj *o, robj *key, robj *value) {
7fb16bac
PN
6491 int update = 0;
6492 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6493 key = getDecodedObject(key);
6494 value = getDecodedObject(value);
6495 o->ptr = zipmapSet(o->ptr,
6496 key->ptr,sdslen(key->ptr),
6497 value->ptr,sdslen(value->ptr), &update);
6498 decrRefCount(key);
6499 decrRefCount(value);
6500
6501 /* Check if the zipmap needs to be upgraded to a real hash table */
6502 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
bae2c7ec 6503 convertToRealHash(o);
978c2c94 6504 } else {
7fb16bac
PN
6505 if (dictReplace(o->ptr,key,value)) {
6506 /* Insert */
6507 incrRefCount(key);
978c2c94 6508 } else {
7fb16bac 6509 /* Update */
978c2c94 6510 update = 1;
6511 }
7fb16bac 6512 incrRefCount(value);
978c2c94 6513 }
7fb16bac 6514 return update;
978c2c94 6515}
6516
7fb16bac
PN
6517/* Delete an element from a hash.
6518 * Return 1 on deleted and 0 on not found. */
6519static int hashDelete(robj *o, robj *key) {
6520 int deleted = 0;
6521 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6522 key = getDecodedObject(key);
6523 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6524 decrRefCount(key);
6525 } else {
6526 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6527 /* Always check if the dictionary needs a resize after a delete. */
6528 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
d33278d1 6529 }
7fb16bac
PN
6530 return deleted;
6531}
d33278d1 6532
7fb16bac 6533/* Return the number of elements in a hash. */
c811bb38 6534static unsigned long hashLength(robj *o) {
7fb16bac
PN
6535 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6536 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6537}
6538
6539/* Structure to hold hash iteration abstration. Note that iteration over
6540 * hashes involves both fields and values. Because it is possible that
6541 * not both are required, store pointers in the iterator to avoid
6542 * unnecessary memory allocation for fields/values. */
6543typedef struct {
6544 int encoding;
6545 unsigned char *zi;
6546 unsigned char *zk, *zv;
6547 unsigned int zklen, zvlen;
6548
6549 dictIterator *di;
6550 dictEntry *de;
6551} hashIterator;
6552
c44d3b56
PN
6553static hashIterator *hashInitIterator(robj *subject) {
6554 hashIterator *hi = zmalloc(sizeof(hashIterator));
7fb16bac
PN
6555 hi->encoding = subject->encoding;
6556 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6557 hi->zi = zipmapRewind(subject->ptr);
6558 } else if (hi->encoding == REDIS_ENCODING_HT) {
6559 hi->di = dictGetIterator(subject->ptr);
d33278d1 6560 } else {
7fb16bac 6561 redisAssert(NULL);
d33278d1 6562 }
c44d3b56 6563 return hi;
7fb16bac 6564}
d33278d1 6565
7fb16bac
PN
6566static void hashReleaseIterator(hashIterator *hi) {
6567 if (hi->encoding == REDIS_ENCODING_HT) {
6568 dictReleaseIterator(hi->di);
d33278d1 6569 }
c44d3b56 6570 zfree(hi);
7fb16bac 6571}
d33278d1 6572
7fb16bac
PN
6573/* Move to the next entry in the hash. Return REDIS_OK when the next entry
6574 * could be found and REDIS_ERR when the iterator reaches the end. */
c811bb38 6575static int hashNext(hashIterator *hi) {
7fb16bac
PN
6576 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6577 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6578 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6579 } else {
6580 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6581 }
6582 return REDIS_OK;
6583}
d33278d1 6584
0c390abc 6585/* Get key or value object at current iteration position.
a3f3af86 6586 * This increases the refcount of the field object by 1. */
c811bb38 6587static robj *hashCurrent(hashIterator *hi, int what) {
7fb16bac
PN
6588 robj *o;
6589 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6590 if (what & REDIS_HASH_KEY) {
6591 o = createStringObject((char*)hi->zk,hi->zklen);
6592 } else {
6593 o = createStringObject((char*)hi->zv,hi->zvlen);
d33278d1 6594 }
d33278d1 6595 } else {
7fb16bac
PN
6596 if (what & REDIS_HASH_KEY) {
6597 o = dictGetEntryKey(hi->de);
6598 } else {
6599 o = dictGetEntryVal(hi->de);
d33278d1 6600 }
a3f3af86 6601 incrRefCount(o);
d33278d1 6602 }
7fb16bac 6603 return o;
d33278d1
PN
6604}
6605
7fb16bac
PN
6606static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6607 robj *o = lookupKeyWrite(c->db,key);
01426b05
PN
6608 if (o == NULL) {
6609 o = createHashObject();
7fb16bac
PN
6610 dictAdd(c->db->dict,key,o);
6611 incrRefCount(key);
01426b05
PN
6612 } else {
6613 if (o->type != REDIS_HASH) {
6614 addReply(c,shared.wrongtypeerr);
7fb16bac 6615 return NULL;
01426b05
PN
6616 }
6617 }
7fb16bac
PN
6618 return o;
6619}
01426b05 6620
7fb16bac
PN
6621/* ============================= Hash commands ============================== */
6622static void hsetCommand(redisClient *c) {
6e9e463f 6623 int update;
7fb16bac 6624 robj *o;
bbe025e0 6625
7fb16bac
PN
6626 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6627 hashTryConversion(o,c->argv,2,3);
97224de7 6628 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6629 update = hashSet(o,c->argv[2],c->argv[3]);
6e9e463f 6630 addReply(c, update ? shared.czero : shared.cone);
7fb16bac
PN
6631 server.dirty++;
6632}
01426b05 6633
1f1c7695
PN
6634static void hsetnxCommand(redisClient *c) {
6635 robj *o;
6636 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6637 hashTryConversion(o,c->argv,2,3);
6638
6639 if (hashExists(o, c->argv[2])) {
6640 addReply(c, shared.czero);
01426b05 6641 } else {
97224de7 6642 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6643 hashSet(o,c->argv[2],c->argv[3]);
1f1c7695
PN
6644 addReply(c, shared.cone);
6645 server.dirty++;
6646 }
6647}
01426b05 6648
7fb16bac
PN
6649static void hmsetCommand(redisClient *c) {
6650 int i;
6651 robj *o;
01426b05 6652
7fb16bac
PN
6653 if ((c->argc % 2) == 1) {
6654 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6655 return;
6656 }
01426b05 6657
7fb16bac
PN
6658 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6659 hashTryConversion(o,c->argv,2,c->argc-1);
6660 for (i = 2; i < c->argc; i += 2) {
97224de7 6661 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
feb8d7e6 6662 hashSet(o,c->argv[i],c->argv[i+1]);
7fb16bac
PN
6663 }
6664 addReply(c, shared.ok);
edc2f63a 6665 server.dirty++;
7fb16bac
PN
6666}
6667
6668static void hincrbyCommand(redisClient *c) {
6669 long long value, incr;
6670 robj *o, *current, *new;
6671
bd79a6bd 6672 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7fb16bac
PN
6673 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6674 if ((current = hashGet(o,c->argv[2])) != NULL) {
946342c1
PN
6675 if (getLongLongFromObjectOrReply(c,current,&value,
6676 "hash value is not an integer") != REDIS_OK) {
6677 decrRefCount(current);
6678 return;
6679 }
a3f3af86 6680 decrRefCount(current);
7fb16bac
PN
6681 } else {
6682 value = 0;
01426b05
PN
6683 }
6684
7fb16bac 6685 value += incr;
3f973463
PN
6686 new = createStringObjectFromLongLong(value);
6687 hashTryObjectEncoding(o,&c->argv[2],NULL);
feb8d7e6 6688 hashSet(o,c->argv[2],new);
7fb16bac
PN
6689 decrRefCount(new);
6690 addReplyLongLong(c,value);
01426b05 6691 server.dirty++;
01426b05
PN
6692}
6693
978c2c94 6694static void hgetCommand(redisClient *c) {
7fb16bac 6695 robj *o, *value;
dd88747b 6696 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6697 checkType(c,o,REDIS_HASH)) return;
6698
7fb16bac
PN
6699 if ((value = hashGet(o,c->argv[2])) != NULL) {
6700 addReplyBulk(c,value);
a3f3af86 6701 decrRefCount(value);
dd88747b 6702 } else {
7fb16bac 6703 addReply(c,shared.nullbulk);
69d95c3e 6704 }
69d95c3e
PN
6705}
6706
09aeb579
PN
6707static void hmgetCommand(redisClient *c) {
6708 int i;
7fb16bac
PN
6709 robj *o, *value;
6710 o = lookupKeyRead(c->db,c->argv[1]);
6711 if (o != NULL && o->type != REDIS_HASH) {
6712 addReply(c,shared.wrongtypeerr);
09aeb579
PN
6713 }
6714
7fb16bac
PN
6715 /* Note the check for o != NULL happens inside the loop. This is
6716 * done because objects that cannot be found are considered to be
6717 * an empty hash. The reply should then be a series of NULLs. */
09aeb579 6718 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7fb16bac
PN
6719 for (i = 2; i < c->argc; i++) {
6720 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6721 addReplyBulk(c,value);
a3f3af86 6722 decrRefCount(value);
7fb16bac
PN
6723 } else {
6724 addReply(c,shared.nullbulk);
09aeb579
PN
6725 }
6726 }
6727}
6728
07efaf74 6729static void hdelCommand(redisClient *c) {
dd88747b 6730 robj *o;
dd88747b 6731 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6732 checkType(c,o,REDIS_HASH)) return;
07efaf74 6733
7fb16bac
PN
6734 if (hashDelete(o,c->argv[2])) {
6735 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6736 addReply(c,shared.cone);
6737 server.dirty++;
dd88747b 6738 } else {
7fb16bac 6739 addReply(c,shared.czero);
07efaf74 6740 }
6741}
6742
92b27fe9 6743static void hlenCommand(redisClient *c) {
6744 robj *o;
dd88747b 6745 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 6746 checkType(c,o,REDIS_HASH)) return;
6747
7fb16bac 6748 addReplyUlong(c,hashLength(o));
92b27fe9 6749}
6750
78409a0f 6751static void genericHgetallCommand(redisClient *c, int flags) {
7fb16bac 6752 robj *o, *lenobj, *obj;
78409a0f 6753 unsigned long count = 0;
c44d3b56 6754 hashIterator *hi;
78409a0f 6755
4e27f268 6756 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 6757 || checkType(c,o,REDIS_HASH)) return;
6758
6759 lenobj = createObject(REDIS_STRING,NULL);
6760 addReply(c,lenobj);
6761 decrRefCount(lenobj);
6762
c44d3b56
PN
6763 hi = hashInitIterator(o);
6764 while (hashNext(hi) != REDIS_ERR) {
7fb16bac 6765 if (flags & REDIS_HASH_KEY) {
c44d3b56 6766 obj = hashCurrent(hi,REDIS_HASH_KEY);
7fb16bac 6767 addReplyBulk(c,obj);
a3f3af86 6768 decrRefCount(obj);
7fb16bac 6769 count++;
78409a0f 6770 }
7fb16bac 6771 if (flags & REDIS_HASH_VALUE) {
c44d3b56 6772 obj = hashCurrent(hi,REDIS_HASH_VALUE);
7fb16bac 6773 addReplyBulk(c,obj);
a3f3af86 6774 decrRefCount(obj);
7fb16bac 6775 count++;
78409a0f 6776 }
78409a0f 6777 }
c44d3b56 6778 hashReleaseIterator(hi);
7fb16bac 6779
78409a0f 6780 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6781}
6782
6783static void hkeysCommand(redisClient *c) {
7fb16bac 6784 genericHgetallCommand(c,REDIS_HASH_KEY);
78409a0f 6785}
6786
6787static void hvalsCommand(redisClient *c) {
7fb16bac 6788 genericHgetallCommand(c,REDIS_HASH_VALUE);
78409a0f 6789}
6790
6791static void hgetallCommand(redisClient *c) {
7fb16bac 6792 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
78409a0f 6793}
6794
a86f14b1 6795static void hexistsCommand(redisClient *c) {
6796 robj *o;
a86f14b1 6797 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6798 checkType(c,o,REDIS_HASH)) return;
6799
7fb16bac 6800 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
a86f14b1 6801}
6802
ada386b2 6803static void convertToRealHash(robj *o) {
6804 unsigned char *key, *val, *p, *zm = o->ptr;
6805 unsigned int klen, vlen;
6806 dict *dict = dictCreate(&hashDictType,NULL);
6807
6808 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6809 p = zipmapRewind(zm);
6810 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6811 robj *keyobj, *valobj;
6812
6813 keyobj = createStringObject((char*)key,klen);
6814 valobj = createStringObject((char*)val,vlen);
05df7621 6815 keyobj = tryObjectEncoding(keyobj);
6816 valobj = tryObjectEncoding(valobj);
ada386b2 6817 dictAdd(dict,keyobj,valobj);
6818 }
6819 o->encoding = REDIS_ENCODING_HT;
6820 o->ptr = dict;
6821 zfree(zm);
6822}
6823
6b47e12e 6824/* ========================= Non type-specific commands ==================== */
6825
ed9b544e 6826static void flushdbCommand(redisClient *c) {
ca37e9cd 6827 server.dirty += dictSize(c->db->dict);
9b30e1a2 6828 touchWatchedKeysOnFlush(c->db->id);
3305306f 6829 dictEmpty(c->db->dict);
6830 dictEmpty(c->db->expires);
ed9b544e 6831 addReply(c,shared.ok);
ed9b544e 6832}
6833
6834static void flushallCommand(redisClient *c) {
9b30e1a2 6835 touchWatchedKeysOnFlush(-1);
ca37e9cd 6836 server.dirty += emptyDb();
ed9b544e 6837 addReply(c,shared.ok);
500ece7c 6838 if (server.bgsavechildpid != -1) {
6839 kill(server.bgsavechildpid,SIGKILL);
6840 rdbRemoveTempFile(server.bgsavechildpid);
6841 }
f78fd11b 6842 rdbSave(server.dbfilename);
ca37e9cd 6843 server.dirty++;
ed9b544e 6844}
6845
56906eef 6846static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 6847 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 6848 so->type = type;
6849 so->pattern = pattern;
6850 return so;
6851}
6852
6853/* Return the value associated to the key with a name obtained
55017f9d
PN
6854 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6855 * The returned object will always have its refcount increased by 1
6856 * when it is non-NULL. */
56906eef 6857static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6d7d1370 6858 char *p, *f;
ed9b544e 6859 sds spat, ssub;
6d7d1370
PN
6860 robj keyobj, fieldobj, *o;
6861 int prefixlen, sublen, postfixlen, fieldlen;
ed9b544e 6862 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6863 struct {
f1017b3f 6864 long len;
6865 long free;
ed9b544e 6866 char buf[REDIS_SORTKEY_MAX+1];
6d7d1370 6867 } keyname, fieldname;
ed9b544e 6868
28173a49 6869 /* If the pattern is "#" return the substitution object itself in order
6870 * to implement the "SORT ... GET #" feature. */
6871 spat = pattern->ptr;
6872 if (spat[0] == '#' && spat[1] == '\0') {
55017f9d 6873 incrRefCount(subst);
28173a49 6874 return subst;
6875 }
6876
6877 /* The substitution object may be specially encoded. If so we create
9d65a1bb 6878 * a decoded object on the fly. Otherwise getDecodedObject will just
6879 * increment the ref count, that we'll decrement later. */
6880 subst = getDecodedObject(subst);
942a3961 6881
ed9b544e 6882 ssub = subst->ptr;
6883 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6884 p = strchr(spat,'*');
ed5a857a 6885 if (!p) {
6886 decrRefCount(subst);
6887 return NULL;
6888 }
ed9b544e 6889
6d7d1370
PN
6890 /* Find out if we're dealing with a hash dereference. */
6891 if ((f = strstr(p+1, "->")) != NULL) {
6892 fieldlen = sdslen(spat)-(f-spat);
6893 /* this also copies \0 character */
6894 memcpy(fieldname.buf,f+2,fieldlen-1);
6895 fieldname.len = fieldlen-2;
6896 } else {
6897 fieldlen = 0;
6898 }
6899
ed9b544e 6900 prefixlen = p-spat;
6901 sublen = sdslen(ssub);
6d7d1370 6902 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
ed9b544e 6903 memcpy(keyname.buf,spat,prefixlen);
6904 memcpy(keyname.buf+prefixlen,ssub,sublen);
6905 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6906 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6907 keyname.len = prefixlen+sublen+postfixlen;
942a3961 6908 decrRefCount(subst);
6909
6d7d1370
PN
6910 /* Lookup substituted key */
6911 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6912 o = lookupKeyRead(db,&keyobj);
55017f9d
PN
6913 if (o == NULL) return NULL;
6914
6915 if (fieldlen > 0) {
6916 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6d7d1370 6917
705dad38
PN
6918 /* Retrieve value from hash by the field name. This operation
6919 * already increases the refcount of the returned object. */
6d7d1370
PN
6920 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6921 o = hashGet(o, &fieldobj);
705dad38 6922 } else {
55017f9d 6923 if (o->type != REDIS_STRING) return NULL;
b6f07345 6924
705dad38
PN
6925 /* Every object that this function returns needs to have its refcount
6926 * increased. sortCommand decreases it again. */
6927 incrRefCount(o);
6d7d1370
PN
6928 }
6929
6930 return o;
ed9b544e 6931}
6932
6933/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6934 * the additional parameter is not standard but a BSD-specific we have to
6935 * pass sorting parameters via the global 'server' structure */
6936static int sortCompare(const void *s1, const void *s2) {
6937 const redisSortObject *so1 = s1, *so2 = s2;
6938 int cmp;
6939
6940 if (!server.sort_alpha) {
6941 /* Numeric sorting. Here it's trivial as we precomputed scores */
6942 if (so1->u.score > so2->u.score) {
6943 cmp = 1;
6944 } else if (so1->u.score < so2->u.score) {
6945 cmp = -1;
6946 } else {
6947 cmp = 0;
6948 }
6949 } else {
6950 /* Alphanumeric sorting */
6951 if (server.sort_bypattern) {
6952 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6953 /* At least one compare object is NULL */
6954 if (so1->u.cmpobj == so2->u.cmpobj)
6955 cmp = 0;
6956 else if (so1->u.cmpobj == NULL)
6957 cmp = -1;
6958 else
6959 cmp = 1;
6960 } else {
6961 /* We have both the objects, use strcoll */
6962 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6963 }
6964 } else {
08ee9b57 6965 /* Compare elements directly. */
6966 cmp = compareStringObjects(so1->obj,so2->obj);
ed9b544e 6967 }
6968 }
6969 return server.sort_desc ? -cmp : cmp;
6970}
6971
6972/* The SORT command is the most complex command in Redis. Warning: this code
6973 * is optimized for speed and a bit less for readability */
6974static void sortCommand(redisClient *c) {
ed9b544e 6975 list *operations;
6976 int outputlen = 0;
6977 int desc = 0, alpha = 0;
6978 int limit_start = 0, limit_count = -1, start, end;
6979 int j, dontsort = 0, vectorlen;
6980 int getop = 0; /* GET operation counter */
443c6409 6981 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 6982 redisSortObject *vector; /* Resulting vector to sort */
6983
6984 /* Lookup the key to sort. It must be of the right types */
3305306f 6985 sortval = lookupKeyRead(c->db,c->argv[1]);
6986 if (sortval == NULL) {
4e27f268 6987 addReply(c,shared.emptymultibulk);
ed9b544e 6988 return;
6989 }
a5eb649b 6990 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6991 sortval->type != REDIS_ZSET)
6992 {
c937aa89 6993 addReply(c,shared.wrongtypeerr);
ed9b544e 6994 return;
6995 }
6996
6997 /* Create a list of operations to perform for every sorted element.
6998 * Operations can be GET/DEL/INCR/DECR */
6999 operations = listCreate();
092dac2a 7000 listSetFreeMethod(operations,zfree);
ed9b544e 7001 j = 2;
7002
7003 /* Now we need to protect sortval incrementing its count, in the future
7004 * SORT may have options able to overwrite/delete keys during the sorting
7005 * and the sorted key itself may get destroied */
7006 incrRefCount(sortval);
7007
7008 /* The SORT command has an SQL-alike syntax, parse it */
7009 while(j < c->argc) {
7010 int leftargs = c->argc-j-1;
7011 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7012 desc = 0;
7013 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7014 desc = 1;
7015 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7016 alpha = 1;
7017 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7018 limit_start = atoi(c->argv[j+1]->ptr);
7019 limit_count = atoi(c->argv[j+2]->ptr);
7020 j+=2;
443c6409 7021 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7022 storekey = c->argv[j+1];
7023 j++;
ed9b544e 7024 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7025 sortby = c->argv[j+1];
7026 /* If the BY pattern does not contain '*', i.e. it is constant,
7027 * we don't need to sort nor to lookup the weight keys. */
7028 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7029 j++;
7030 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7031 listAddNodeTail(operations,createSortOperation(
7032 REDIS_SORT_GET,c->argv[j+1]));
7033 getop++;
7034 j++;
ed9b544e 7035 } else {
7036 decrRefCount(sortval);
7037 listRelease(operations);
c937aa89 7038 addReply(c,shared.syntaxerr);
ed9b544e 7039 return;
7040 }
7041 j++;
7042 }
7043
7044 /* Load the sorting vector with all the objects to sort */
a5eb649b 7045 switch(sortval->type) {
7046 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
7047 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7048 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
f83c6cb5 7049 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
a5eb649b 7050 }
ed9b544e 7051 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 7052 j = 0;
a5eb649b 7053
ed9b544e 7054 if (sortval->type == REDIS_LIST) {
7055 list *list = sortval->ptr;
6208b3a7 7056 listNode *ln;
c7df85a4 7057 listIter li;
6208b3a7 7058
c7df85a4 7059 listRewind(list,&li);
7060 while((ln = listNext(&li))) {
ed9b544e 7061 robj *ele = ln->value;
7062 vector[j].obj = ele;
7063 vector[j].u.score = 0;
7064 vector[j].u.cmpobj = NULL;
ed9b544e 7065 j++;
7066 }
7067 } else {
a5eb649b 7068 dict *set;
ed9b544e 7069 dictIterator *di;
7070 dictEntry *setele;
7071
a5eb649b 7072 if (sortval->type == REDIS_SET) {
7073 set = sortval->ptr;
7074 } else {
7075 zset *zs = sortval->ptr;
7076 set = zs->dict;
7077 }
7078
ed9b544e 7079 di = dictGetIterator(set);
ed9b544e 7080 while((setele = dictNext(di)) != NULL) {
7081 vector[j].obj = dictGetEntryKey(setele);
7082 vector[j].u.score = 0;
7083 vector[j].u.cmpobj = NULL;
7084 j++;
7085 }
7086 dictReleaseIterator(di);
7087 }
dfc5e96c 7088 redisAssert(j == vectorlen);
ed9b544e 7089
7090 /* Now it's time to load the right scores in the sorting vector */
7091 if (dontsort == 0) {
7092 for (j = 0; j < vectorlen; j++) {
6d7d1370 7093 robj *byval;
ed9b544e 7094 if (sortby) {
6d7d1370 7095 /* lookup value to sort by */
3305306f 7096 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
705dad38 7097 if (!byval) continue;
ed9b544e 7098 } else {
6d7d1370
PN
7099 /* use object itself to sort by */
7100 byval = vector[j].obj;
7101 }
7102
7103 if (alpha) {
08ee9b57 7104 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6d7d1370
PN
7105 } else {
7106 if (byval->encoding == REDIS_ENCODING_RAW) {
7107 vector[j].u.score = strtod(byval->ptr,NULL);
16fa22f1 7108 } else if (byval->encoding == REDIS_ENCODING_INT) {
6d7d1370
PN
7109 /* Don't need to decode the object if it's
7110 * integer-encoded (the only encoding supported) so
7111 * far. We can just cast it */
16fa22f1
PN
7112 vector[j].u.score = (long)byval->ptr;
7113 } else {
7114 redisAssert(1 != 1);
942a3961 7115 }
ed9b544e 7116 }
6d7d1370 7117
705dad38
PN
7118 /* when the object was retrieved using lookupKeyByPattern,
7119 * its refcount needs to be decreased. */
7120 if (sortby) {
7121 decrRefCount(byval);
ed9b544e 7122 }
7123 }
7124 }
7125
7126 /* We are ready to sort the vector... perform a bit of sanity check
7127 * on the LIMIT option too. We'll use a partial version of quicksort. */
7128 start = (limit_start < 0) ? 0 : limit_start;
7129 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7130 if (start >= vectorlen) {
7131 start = vectorlen-1;
7132 end = vectorlen-2;
7133 }
7134 if (end >= vectorlen) end = vectorlen-1;
7135
7136 if (dontsort == 0) {
7137 server.sort_desc = desc;
7138 server.sort_alpha = alpha;
7139 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 7140 if (sortby && (start != 0 || end != vectorlen-1))
7141 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7142 else
7143 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 7144 }
7145
7146 /* Send command output to the output buffer, performing the specified
7147 * GET/DEL/INCR/DECR operations if any. */
7148 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 7149 if (storekey == NULL) {
7150 /* STORE option not specified, sent the sorting result to client */
7151 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7152 for (j = start; j <= end; j++) {
7153 listNode *ln;
c7df85a4 7154 listIter li;
7155
dd88747b 7156 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 7157 listRewind(operations,&li);
7158 while((ln = listNext(&li))) {
443c6409 7159 redisSortOperation *sop = ln->value;
7160 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7161 vector[j].obj);
7162
7163 if (sop->type == REDIS_SORT_GET) {
55017f9d 7164 if (!val) {
443c6409 7165 addReply(c,shared.nullbulk);
7166 } else {
dd88747b 7167 addReplyBulk(c,val);
55017f9d 7168 decrRefCount(val);
443c6409 7169 }
7170 } else {
dfc5e96c 7171 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 7172 }
7173 }
ed9b544e 7174 }
443c6409 7175 } else {
7176 robj *listObject = createListObject();
7177 list *listPtr = (list*) listObject->ptr;
7178
7179 /* STORE option specified, set the sorting result as a List object */
7180 for (j = start; j <= end; j++) {
7181 listNode *ln;
c7df85a4 7182 listIter li;
7183
443c6409 7184 if (!getop) {
7185 listAddNodeTail(listPtr,vector[j].obj);
7186 incrRefCount(vector[j].obj);
7187 }
c7df85a4 7188 listRewind(operations,&li);
7189 while((ln = listNext(&li))) {
443c6409 7190 redisSortOperation *sop = ln->value;
7191 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7192 vector[j].obj);
7193
7194 if (sop->type == REDIS_SORT_GET) {
55017f9d 7195 if (!val) {
443c6409 7196 listAddNodeTail(listPtr,createStringObject("",0));
7197 } else {
55017f9d
PN
7198 /* We should do a incrRefCount on val because it is
7199 * added to the list, but also a decrRefCount because
7200 * it is returned by lookupKeyByPattern. This results
7201 * in doing nothing at all. */
443c6409 7202 listAddNodeTail(listPtr,val);
443c6409 7203 }
ed9b544e 7204 } else {
dfc5e96c 7205 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 7206 }
ed9b544e 7207 }
ed9b544e 7208 }
121796f7 7209 if (dictReplace(c->db->dict,storekey,listObject)) {
7210 incrRefCount(storekey);
7211 }
443c6409 7212 /* Note: we add 1 because the DB is dirty anyway since even if the
7213 * SORT result is empty a new key is set and maybe the old content
7214 * replaced. */
7215 server.dirty += 1+outputlen;
7216 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 7217 }
7218
7219 /* Cleanup */
7220 decrRefCount(sortval);
7221 listRelease(operations);
7222 for (j = 0; j < vectorlen; j++) {
16fa22f1 7223 if (alpha && vector[j].u.cmpobj)
ed9b544e 7224 decrRefCount(vector[j].u.cmpobj);
7225 }
7226 zfree(vector);
7227}
7228
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, and so forth.
 *
 * 's' is the destination buffer, 'n' the number of bytes. The buffer is
 * always written: values of 1 TiB and above are rendered with the T and P
 * suffixes instead of being silently left untouched. The longest possible
 * output (a full 64 bit value expressed in P) is well below 32 characters
 * including the terminator, so callers must provide at least that much. */
static void bytesToHuman(char *s, unsigned long long n) {
    const unsigned long long kib = 1024ULL;
    const unsigned long long mib = kib*1024;
    const unsigned long long gib = mib*1024;
    const unsigned long long tib = gib*1024;
    const unsigned long long pib = tib*1024;

    if (n < kib) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < mib) {
        sprintf(s,"%.2fK",(double)n/kib);
    } else if (n < gib) {
        sprintf(s,"%.2fM",(double)n/mib);
    } else if (n < tib) {
        sprintf(s,"%.2fG",(double)n/gib);
    } else if (n < pib) {
        sprintf(s,"%.2fT",(double)n/tib);
    } else {
        sprintf(s,"%.2fP",(double)n/pib);
    }
}
7249
1c85b79f 7250/* Create the string returned by the INFO command. This is decoupled
7251 * by the INFO command itself as we need to report the same information
7252 * on memory corruption problems. */
7253static sds genRedisInfoString(void) {
ed9b544e 7254 sds info;
7255 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 7256 int j;
ec6c7a1d 7257 char hmem[64];
55a8298f 7258
b72f6a4b 7259 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 7260 info = sdscatprintf(sdsempty(),
7261 "redis_version:%s\r\n"
5436146c
PN
7262 "redis_git_sha1:%s\r\n"
7263 "redis_git_dirty:%d\r\n"
f1017b3f 7264 "arch_bits:%s\r\n"
7a932b74 7265 "multiplexing_api:%s\r\n"
0d7170a4 7266 "process_id:%ld\r\n"
682ac724 7267 "uptime_in_seconds:%ld\r\n"
7268 "uptime_in_days:%ld\r\n"
ed9b544e 7269 "connected_clients:%d\r\n"
7270 "connected_slaves:%d\r\n"
f86a74e9 7271 "blocked_clients:%d\r\n"
5fba9f71 7272 "used_memory:%zu\r\n"
ec6c7a1d 7273 "used_memory_human:%s\r\n"
ed9b544e 7274 "changes_since_last_save:%lld\r\n"
be2bb6b0 7275 "bgsave_in_progress:%d\r\n"
682ac724 7276 "last_save_time:%ld\r\n"
b3fad521 7277 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 7278 "total_connections_received:%lld\r\n"
7279 "total_commands_processed:%lld\r\n"
2a6a2ed1 7280 "expired_keys:%lld\r\n"
3be2c9d7 7281 "hash_max_zipmap_entries:%zu\r\n"
7282 "hash_max_zipmap_value:%zu\r\n"
ffc6b7f8 7283 "pubsub_channels:%ld\r\n"
7284 "pubsub_patterns:%u\r\n"
7d98e08c 7285 "vm_enabled:%d\r\n"
a0f643ea 7286 "role:%s\r\n"
ed9b544e 7287 ,REDIS_VERSION,
5436146c 7288 REDIS_GIT_SHA1,
274e45e3 7289 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
f1017b3f 7290 (sizeof(long) == 8) ? "64" : "32",
7a932b74 7291 aeGetApiName(),
0d7170a4 7292 (long) getpid(),
a0f643ea 7293 uptime,
7294 uptime/(3600*24),
ed9b544e 7295 listLength(server.clients)-listLength(server.slaves),
7296 listLength(server.slaves),
d5d55fc3 7297 server.blpop_blocked_clients,
b72f6a4b 7298 zmalloc_used_memory(),
ec6c7a1d 7299 hmem,
ed9b544e 7300 server.dirty,
9d65a1bb 7301 server.bgsavechildpid != -1,
ed9b544e 7302 server.lastsave,
b3fad521 7303 server.bgrewritechildpid != -1,
ed9b544e 7304 server.stat_numconnections,
7305 server.stat_numcommands,
2a6a2ed1 7306 server.stat_expiredkeys,
55a8298f 7307 server.hash_max_zipmap_entries,
7308 server.hash_max_zipmap_value,
ffc6b7f8 7309 dictSize(server.pubsub_channels),
7310 listLength(server.pubsub_patterns),
7d98e08c 7311 server.vm_enabled != 0,
a0f643ea 7312 server.masterhost == NULL ? "master" : "slave"
ed9b544e 7313 );
a0f643ea 7314 if (server.masterhost) {
7315 info = sdscatprintf(info,
7316 "master_host:%s\r\n"
7317 "master_port:%d\r\n"
7318 "master_link_status:%s\r\n"
7319 "master_last_io_seconds_ago:%d\r\n"
7320 ,server.masterhost,
7321 server.masterport,
7322 (server.replstate == REDIS_REPL_CONNECTED) ?
7323 "up" : "down",
f72b934d 7324 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 7325 );
7326 }
7d98e08c 7327 if (server.vm_enabled) {
1064ef87 7328 lockThreadedIO();
7d98e08c 7329 info = sdscatprintf(info,
7330 "vm_conf_max_memory:%llu\r\n"
7331 "vm_conf_page_size:%llu\r\n"
7332 "vm_conf_pages:%llu\r\n"
7333 "vm_stats_used_pages:%llu\r\n"
7334 "vm_stats_swapped_objects:%llu\r\n"
7335 "vm_stats_swappin_count:%llu\r\n"
7336 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 7337 "vm_stats_io_newjobs_len:%lu\r\n"
7338 "vm_stats_io_processing_len:%lu\r\n"
7339 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 7340 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 7341 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 7342 ,(unsigned long long) server.vm_max_memory,
7343 (unsigned long long) server.vm_page_size,
7344 (unsigned long long) server.vm_pages,
7345 (unsigned long long) server.vm_stats_used_pages,
7346 (unsigned long long) server.vm_stats_swapped_objects,
7347 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 7348 (unsigned long long) server.vm_stats_swapouts,
7349 (unsigned long) listLength(server.io_newjobs),
7350 (unsigned long) listLength(server.io_processing),
7351 (unsigned long) listLength(server.io_processed),
d5d55fc3 7352 (unsigned long) server.io_active_threads,
7353 (unsigned long) server.vm_blocked_clients
7d98e08c 7354 );
1064ef87 7355 unlockThreadedIO();
7d98e08c 7356 }
c3cb078d 7357 for (j = 0; j < server.dbnum; j++) {
7358 long long keys, vkeys;
7359
7360 keys = dictSize(server.db[j].dict);
7361 vkeys = dictSize(server.db[j].expires);
7362 if (keys || vkeys) {
9d65a1bb 7363 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 7364 j, keys, vkeys);
7365 }
7366 }
1c85b79f 7367 return info;
7368}
7369
7370static void infoCommand(redisClient *c) {
7371 sds info = genRedisInfoString();
83c6a618 7372 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7373 (unsigned long)sdslen(info)));
ed9b544e 7374 addReplySds(c,info);
70003d28 7375 addReply(c,shared.crlf);
ed9b544e 7376}
7377
3305306f 7378static void monitorCommand(redisClient *c) {
7379 /* ignore MONITOR if aleady slave or in monitor mode */
7380 if (c->flags & REDIS_SLAVE) return;
7381
7382 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7383 c->slaveseldb = 0;
6b47e12e 7384 listAddNodeTail(server.monitors,c);
3305306f 7385 addReply(c,shared.ok);
7386}
7387
7388/* ================================= Expire ================================= */
7389static int removeExpire(redisDb *db, robj *key) {
7390 if (dictDelete(db->expires,key) == DICT_OK) {
7391 return 1;
7392 } else {
7393 return 0;
7394 }
7395}
7396
7397static int setExpire(redisDb *db, robj *key, time_t when) {
7398 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7399 return 0;
7400 } else {
7401 incrRefCount(key);
7402 return 1;
7403 }
7404}
7405
bb32ede5 7406/* Return the expire time of the specified key, or -1 if no expire
7407 * is associated with this key (i.e. the key is non volatile) */
7408static time_t getExpire(redisDb *db, robj *key) {
7409 dictEntry *de;
7410
7411 /* No expire? return ASAP */
7412 if (dictSize(db->expires) == 0 ||
7413 (de = dictFind(db->expires,key)) == NULL) return -1;
7414
7415 return (time_t) dictGetEntryVal(de);
7416}
7417
3305306f 7418static int expireIfNeeded(redisDb *db, robj *key) {
7419 time_t when;
7420 dictEntry *de;
7421
7422 /* No expire? return ASAP */
7423 if (dictSize(db->expires) == 0 ||
7424 (de = dictFind(db->expires,key)) == NULL) return 0;
7425
7426 /* Lookup the expire */
7427 when = (time_t) dictGetEntryVal(de);
7428 if (time(NULL) <= when) return 0;
7429
7430 /* Delete the key */
7431 dictDelete(db->expires,key);
2a6a2ed1 7432 server.stat_expiredkeys++;
3305306f 7433 return dictDelete(db->dict,key) == DICT_OK;
7434}
7435
7436static int deleteIfVolatile(redisDb *db, robj *key) {
7437 dictEntry *de;
7438
7439 /* No expire? return ASAP */
7440 if (dictSize(db->expires) == 0 ||
7441 (de = dictFind(db->expires,key)) == NULL) return 0;
7442
7443 /* Delete the key */
0c66a471 7444 server.dirty++;
2a6a2ed1 7445 server.stat_expiredkeys++;
3305306f 7446 dictDelete(db->expires,key);
7447 return dictDelete(db->dict,key) == DICT_OK;
7448}
7449
bbe025e0 7450static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 7451 dictEntry *de;
bbe025e0
AM
7452 time_t seconds;
7453
bd79a6bd 7454 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
bbe025e0
AM
7455
7456 seconds -= offset;
3305306f 7457
802e8373 7458 de = dictFind(c->db->dict,key);
3305306f 7459 if (de == NULL) {
7460 addReply(c,shared.czero);
7461 return;
7462 }
d4dd6556 7463 if (seconds <= 0) {
43e5ccdf 7464 if (deleteKey(c->db,key)) server.dirty++;
7465 addReply(c, shared.cone);
3305306f 7466 return;
7467 } else {
7468 time_t when = time(NULL)+seconds;
802e8373 7469 if (setExpire(c->db,key,when)) {
3305306f 7470 addReply(c,shared.cone);
77423026 7471 server.dirty++;
7472 } else {
3305306f 7473 addReply(c,shared.czero);
77423026 7474 }
3305306f 7475 return;
7476 }
7477}
7478
802e8373 7479static void expireCommand(redisClient *c) {
bbe025e0 7480 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 7481}
7482
7483static void expireatCommand(redisClient *c) {
bbe025e0 7484 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 7485}
7486
fd88489a 7487static void ttlCommand(redisClient *c) {
7488 time_t expire;
7489 int ttl = -1;
7490
7491 expire = getExpire(c->db,c->argv[1]);
7492 if (expire != -1) {
7493 ttl = (int) (expire-time(NULL));
7494 if (ttl < 0) ttl = -1;
7495 }
7496 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7497}
7498
6e469882 7499/* ================================ MULTI/EXEC ============================== */
7500
7501/* Client state initialization for MULTI/EXEC */
7502static void initClientMultiState(redisClient *c) {
7503 c->mstate.commands = NULL;
7504 c->mstate.count = 0;
7505}
7506
7507/* Release all the resources associated with MULTI/EXEC state */
7508static void freeClientMultiState(redisClient *c) {
7509 int j;
7510
7511 for (j = 0; j < c->mstate.count; j++) {
7512 int i;
7513 multiCmd *mc = c->mstate.commands+j;
7514
7515 for (i = 0; i < mc->argc; i++)
7516 decrRefCount(mc->argv[i]);
7517 zfree(mc->argv);
7518 }
7519 zfree(c->mstate.commands);
7520}
7521
7522/* Add a new command into the MULTI commands queue */
7523static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7524 multiCmd *mc;
7525 int j;
7526
7527 c->mstate.commands = zrealloc(c->mstate.commands,
7528 sizeof(multiCmd)*(c->mstate.count+1));
7529 mc = c->mstate.commands+c->mstate.count;
7530 mc->cmd = cmd;
7531 mc->argc = c->argc;
7532 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7533 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7534 for (j = 0; j < c->argc; j++)
7535 incrRefCount(mc->argv[j]);
7536 c->mstate.count++;
7537}
7538
7539static void multiCommand(redisClient *c) {
6531c94d 7540 if (c->flags & REDIS_MULTI) {
7541 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7542 return;
7543 }
6e469882 7544 c->flags |= REDIS_MULTI;
36c548f0 7545 addReply(c,shared.ok);
6e469882 7546}
7547
18b6cb76
DJ
7548static void discardCommand(redisClient *c) {
7549 if (!(c->flags & REDIS_MULTI)) {
7550 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7551 return;
7552 }
7553
7554 freeClientMultiState(c);
7555 initClientMultiState(c);
7556 c->flags &= (~REDIS_MULTI);
7557 addReply(c,shared.ok);
7558}
7559
66c8853f 7560/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7561 * implememntation for more information. */
7562static void execCommandReplicateMulti(redisClient *c) {
7563 struct redisCommand *cmd;
7564 robj *multistring = createStringObject("MULTI",5);
7565
7566 cmd = lookupCommand("multi");
7567 if (server.appendonly)
7568 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7569 if (listLength(server.slaves))
7570 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7571 decrRefCount(multistring);
7572}
7573
6e469882 7574static void execCommand(redisClient *c) {
7575 int j;
7576 robj **orig_argv;
7577 int orig_argc;
7578
7579 if (!(c->flags & REDIS_MULTI)) {
7580 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7581 return;
7582 }
7583
37ab76c9 7584 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7585 * A failed EXEC will return a multi bulk nil object. */
7586 if (c->flags & REDIS_DIRTY_CAS) {
7587 freeClientMultiState(c);
7588 initClientMultiState(c);
7589 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7590 unwatchAllKeys(c);
7591 addReply(c,shared.nullmultibulk);
7592 return;
7593 }
7594
66c8853f 7595 /* Replicate a MULTI request now that we are sure the block is executed.
7596 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7597 * both the AOF and the replication link will have the same consistency
7598 * and atomicity guarantees. */
7599 execCommandReplicateMulti(c);
7600
7601 /* Exec all the queued commands */
1ad4d316 7602 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
6e469882 7603 orig_argv = c->argv;
7604 orig_argc = c->argc;
7605 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7606 for (j = 0; j < c->mstate.count; j++) {
7607 c->argc = c->mstate.commands[j].argc;
7608 c->argv = c->mstate.commands[j].argv;
7609 call(c,c->mstate.commands[j].cmd);
7610 }
7611 c->argv = orig_argv;
7612 c->argc = orig_argc;
7613 freeClientMultiState(c);
7614 initClientMultiState(c);
1ad4d316 7615 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
66c8853f 7616 /* Make sure the EXEC command is always replicated / AOF, since we
7617 * always send the MULTI command (we can't know beforehand if the
7618 * next operations will contain at least a modification to the DB). */
7619 server.dirty++;
6e469882 7620}
7621
4409877e 7622/* =========================== Blocking Operations ========================= */
7623
7624/* Currently Redis blocking operations support is limited to list POP ops,
7625 * so the current implementation is not fully generic, but it is also not
7626 * completely specific so it will not require a rewrite to support new
7627 * kind of blocking operations in the future.
7628 *
7629 * Still it's important to note that list blocking operations can be already
7630 * used as a notification mechanism in order to implement other blocking
7631 * operations at application level, so there must be a very strong evidence
7632 * of usefulness and generality before new blocking operations are implemented.
7633 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blocking_keys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
7648 *
7649 * The above comment and the source code should be enough in order to understand
7650 * the implementation and modify / fix it later.
7651 */
7652
7653/* Set a client in blocking mode for the specified key, with the specified
7654 * timeout */
b177fd30 7655static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 7656 dictEntry *de;
7657 list *l;
b177fd30 7658 int j;
4409877e 7659
37ab76c9 7660 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7661 c->blocking_keys_num = numkeys;
4409877e 7662 c->blockingto = timeout;
b177fd30 7663 for (j = 0; j < numkeys; j++) {
7664 /* Add the key in the client structure, to map clients -> keys */
37ab76c9 7665 c->blocking_keys[j] = keys[j];
b177fd30 7666 incrRefCount(keys[j]);
4409877e 7667
b177fd30 7668 /* And in the other "side", to map keys -> clients */
37ab76c9 7669 de = dictFind(c->db->blocking_keys,keys[j]);
b177fd30 7670 if (de == NULL) {
7671 int retval;
7672
7673 /* For every key we take a list of clients blocked for it */
7674 l = listCreate();
37ab76c9 7675 retval = dictAdd(c->db->blocking_keys,keys[j],l);
b177fd30 7676 incrRefCount(keys[j]);
7677 assert(retval == DICT_OK);
7678 } else {
7679 l = dictGetEntryVal(de);
7680 }
7681 listAddNodeTail(l,c);
4409877e 7682 }
b177fd30 7683 /* Mark the client as a blocked client */
4409877e 7684 c->flags |= REDIS_BLOCKED;
d5d55fc3 7685 server.blpop_blocked_clients++;
4409877e 7686}
7687
7688/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 7689static void unblockClientWaitingData(redisClient *c) {
4409877e 7690 dictEntry *de;
7691 list *l;
b177fd30 7692 int j;
4409877e 7693
37ab76c9 7694 assert(c->blocking_keys != NULL);
b177fd30 7695 /* The client may wait for multiple keys, so unblock it for every key. */
37ab76c9 7696 for (j = 0; j < c->blocking_keys_num; j++) {
b177fd30 7697 /* Remove this client from the list of clients waiting for this key. */
37ab76c9 7698 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
b177fd30 7699 assert(de != NULL);
7700 l = dictGetEntryVal(de);
7701 listDelNode(l,listSearchKey(l,c));
7702 /* If the list is empty we need to remove it to avoid wasting memory */
7703 if (listLength(l) == 0)
37ab76c9 7704 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
7705 decrRefCount(c->blocking_keys[j]);
b177fd30 7706 }
7707 /* Cleanup the client structure */
37ab76c9 7708 zfree(c->blocking_keys);
7709 c->blocking_keys = NULL;
4409877e 7710 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 7711 server.blpop_blocked_clients--;
5921aa36 7712 /* We want to process data if there is some command waiting
b0d8747d 7713 * in the input buffer. Note that this is safe even if
7714 * unblockClientWaitingData() gets called from freeClient() because
7715 * freeClient() will be smart enough to call this function
7716 * *after* c->querybuf was set to NULL. */
4409877e 7717 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7718}
7719
7720/* This should be called from any function PUSHing into lists.
7721 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7722 * 'ele' is the element pushed.
7723 *
7724 * If the function returns 0 there was no client waiting for a list push
7725 * against this key.
7726 *
7727 * If the function returns 1 there was a client waiting for a list push
7728 * against this key, the element was passed to this client thus it's not
7729 * needed to actually add it to the list and the caller should return asap. */
7730static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7731 struct dictEntry *de;
7732 redisClient *receiver;
7733 list *l;
7734 listNode *ln;
7735
37ab76c9 7736 de = dictFind(c->db->blocking_keys,key);
4409877e 7737 if (de == NULL) return 0;
7738 l = dictGetEntryVal(de);
7739 ln = listFirst(l);
7740 assert(ln != NULL);
7741 receiver = ln->value;
4409877e 7742
b177fd30 7743 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 7744 addReplyBulk(receiver,key);
7745 addReplyBulk(receiver,ele);
b0d8747d 7746 unblockClientWaitingData(receiver);
4409877e 7747 return 1;
7748}
7749
7750/* Blocking RPOP/LPOP */
7751static void blockingPopGenericCommand(redisClient *c, int where) {
7752 robj *o;
7753 time_t timeout;
b177fd30 7754 int j;
4409877e 7755
b177fd30 7756 for (j = 1; j < c->argc-1; j++) {
7757 o = lookupKeyWrite(c->db,c->argv[j]);
7758 if (o != NULL) {
7759 if (o->type != REDIS_LIST) {
7760 addReply(c,shared.wrongtypeerr);
4409877e 7761 return;
b177fd30 7762 } else {
7763 list *list = o->ptr;
7764 if (listLength(list) != 0) {
7765 /* If the list contains elements fall back to the usual
7766 * non-blocking POP operation */
7767 robj *argv[2], **orig_argv;
7768 int orig_argc;
e0a62c7f 7769
b177fd30 7770 /* We need to alter the command arguments before to call
7771 * popGenericCommand() as the command takes a single key. */
7772 orig_argv = c->argv;
7773 orig_argc = c->argc;
7774 argv[1] = c->argv[j];
7775 c->argv = argv;
7776 c->argc = 2;
7777
7778 /* Also the return value is different, we need to output
7779 * the multi bulk reply header and the key name. The
7780 * "real" command will add the last element (the value)
7781 * for us. If this souds like an hack to you it's just
7782 * because it is... */
7783 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 7784 addReplyBulk(c,argv[1]);
b177fd30 7785 popGenericCommand(c,where);
7786
7787 /* Fix the client structure with the original stuff */
7788 c->argv = orig_argv;
7789 c->argc = orig_argc;
7790 return;
7791 }
4409877e 7792 }
7793 }
7794 }
7795 /* If the list is empty or the key does not exists we must block */
b177fd30 7796 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 7797 if (timeout > 0) timeout += time(NULL);
b177fd30 7798 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 7799}
7800
7801static void blpopCommand(redisClient *c) {
7802 blockingPopGenericCommand(c,REDIS_HEAD);
7803}
7804
7805static void brpopCommand(redisClient *c) {
7806 blockingPopGenericCommand(c,REDIS_TAIL);
7807}
7808
ed9b544e 7809/* =============================== Replication ============================= */
7810
a4d1ba9a 7811static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7812 ssize_t nwritten, ret = size;
7813 time_t start = time(NULL);
7814
7815 timeout++;
7816 while(size) {
7817 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7818 nwritten = write(fd,ptr,size);
7819 if (nwritten == -1) return -1;
7820 ptr += nwritten;
7821 size -= nwritten;
7822 }
7823 if ((time(NULL)-start) > timeout) {
7824 errno = ETIMEDOUT;
7825 return -1;
7826 }
7827 }
7828 return ret;
7829}
7830
a4d1ba9a 7831static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7832 ssize_t nread, totread = 0;
7833 time_t start = time(NULL);
7834
7835 timeout++;
7836 while(size) {
7837 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7838 nread = read(fd,ptr,size);
7839 if (nread == -1) return -1;
7840 ptr += nread;
7841 size -= nread;
7842 totread += nread;
7843 }
7844 if ((time(NULL)-start) > timeout) {
7845 errno = ETIMEDOUT;
7846 return -1;
7847 }
7848 }
7849 return totread;
7850}
7851
/* Read a line from 'fd' into 'ptr', one byte at a time, using syncRead()
 * with the given per-byte timeout.
 *
 * 'size' is the size of the destination buffer: at most size-1 characters
 * are stored, and the buffer is null terminated after every stored
 * character. Reading stops at the first '\n', which is not stored; a '\r'
 * right before it is replaced by the terminator.
 *
 * Returns the number of characters stored before the newline (a trailing
 * '\r' is included in the count), or -1 if syncRead() fails. When the
 * buffer fills up before a newline is seen, the characters read so far
 * are returned and the rest of the line is left unread.
 *
 * The remaining space is decremented after every stored character, so a
 * line longer than the buffer can never be written past its end. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* Reserve room for the null terminator */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            size--;
        }
    }
    return nread;
}
7872
7873static void syncCommand(redisClient *c) {
40d224a9 7874 /* ignore SYNC if aleady slave or in monitor mode */
7875 if (c->flags & REDIS_SLAVE) return;
7876
7877 /* SYNC can't be issued when the server has pending data to send to
7878 * the client about already issued commands. We need a fresh reply
7879 * buffer registering the differences between the BGSAVE and the current
7880 * dataset, so that we can copy to other slaves if needed. */
7881 if (listLength(c->reply) != 0) {
7882 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7883 return;
7884 }
7885
7886 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7887 /* Here we need to check if there is a background saving operation
7888 * in progress, or if it is required to start one */
9d65a1bb 7889 if (server.bgsavechildpid != -1) {
40d224a9 7890 /* Ok a background save is in progress. Let's check if it is a good
7891 * one for replication, i.e. if there is another slave that is
7892 * registering differences since the server forked to save */
7893 redisClient *slave;
7894 listNode *ln;
c7df85a4 7895 listIter li;
40d224a9 7896
c7df85a4 7897 listRewind(server.slaves,&li);
7898 while((ln = listNext(&li))) {
40d224a9 7899 slave = ln->value;
7900 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 7901 }
7902 if (ln) {
7903 /* Perfect, the server is already registering differences for
7904 * another slave. Set the right state, and copy the buffer. */
7905 listRelease(c->reply);
7906 c->reply = listDup(slave->reply);
40d224a9 7907 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7908 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7909 } else {
7910 /* No way, we need to wait for the next BGSAVE in order to
7911 * register differences */
7912 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7913 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7914 }
7915 } else {
7916 /* Ok we don't have a BGSAVE in progress, let's start one */
7917 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7918 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7919 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7920 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7921 return;
7922 }
7923 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7924 }
6208b3a7 7925 c->repldbfd = -1;
40d224a9 7926 c->flags |= REDIS_SLAVE;
7927 c->slaveseldb = 0;
6b47e12e 7928 listAddNodeTail(server.slaves,c);
40d224a9 7929 return;
7930}
7931
6208b3a7 7932static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7933 redisClient *slave = privdata;
7934 REDIS_NOTUSED(el);
7935 REDIS_NOTUSED(mask);
7936 char buf[REDIS_IOBUF_LEN];
7937 ssize_t nwritten, buflen;
7938
7939 if (slave->repldboff == 0) {
7940 /* Write the bulk write count before to transfer the DB. In theory here
7941 * we don't know how much room there is in the output buffer of the
7942 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7943 * operations) will never be smaller than the few bytes we need. */
7944 sds bulkcount;
7945
7946 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7947 slave->repldbsize);
7948 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7949 {
7950 sdsfree(bulkcount);
7951 freeClient(slave);
7952 return;
7953 }
7954 sdsfree(bulkcount);
7955 }
7956 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7957 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7958 if (buflen <= 0) {
7959 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7960 (buflen == 0) ? "premature EOF" : strerror(errno));
7961 freeClient(slave);
7962 return;
7963 }
7964 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 7965 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 7966 strerror(errno));
7967 freeClient(slave);
7968 return;
7969 }
7970 slave->repldboff += nwritten;
7971 if (slave->repldboff == slave->repldbsize) {
7972 close(slave->repldbfd);
7973 slave->repldbfd = -1;
7974 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7975 slave->replstate = REDIS_REPL_ONLINE;
7976 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 7977 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 7978 freeClient(slave);
7979 return;
7980 }
7981 addReplySds(slave,sdsempty());
7982 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7983 }
7984}
ed9b544e 7985
a3b21203 7986/* This function is called at the end of every backgrond saving.
7987 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7988 * otherwise REDIS_ERR is passed to the function.
7989 *
7990 * The goal of this function is to handle slaves waiting for a successful
7991 * background saving in order to perform non-blocking synchronization. */
7992static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 7993 listNode *ln;
7994 int startbgsave = 0;
c7df85a4 7995 listIter li;
ed9b544e 7996
c7df85a4 7997 listRewind(server.slaves,&li);
7998 while((ln = listNext(&li))) {
6208b3a7 7999 redisClient *slave = ln->value;
ed9b544e 8000
6208b3a7 8001 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8002 startbgsave = 1;
8003 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8004 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 8005 struct redis_stat buf;
e0a62c7f 8006
6208b3a7 8007 if (bgsaveerr != REDIS_OK) {
8008 freeClient(slave);
8009 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8010 continue;
8011 }
8012 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 8013 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 8014 freeClient(slave);
8015 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8016 continue;
8017 }
8018 slave->repldboff = 0;
8019 slave->repldbsize = buf.st_size;
8020 slave->replstate = REDIS_REPL_SEND_BULK;
8021 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 8022 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 8023 freeClient(slave);
8024 continue;
8025 }
8026 }
ed9b544e 8027 }
6208b3a7 8028 if (startbgsave) {
8029 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 8030 listIter li;
8031
8032 listRewind(server.slaves,&li);
6208b3a7 8033 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 8034 while((ln = listNext(&li))) {
6208b3a7 8035 redisClient *slave = ln->value;
ed9b544e 8036
6208b3a7 8037 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8038 freeClient(slave);
8039 }
8040 }
8041 }
ed9b544e 8042}
8043
8044static int syncWithMaster(void) {
d0ccebcf 8045 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 8046 long dumpsize;
ed9b544e 8047 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 8048 int dfd, maxtries = 5;
ed9b544e 8049
8050 if (fd == -1) {
8051 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8052 strerror(errno));
8053 return REDIS_ERR;
8054 }
d0ccebcf 8055
8056 /* AUTH with the master if required. */
8057 if(server.masterauth) {
8058 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8059 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8060 close(fd);
8061 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8062 strerror(errno));
8063 return REDIS_ERR;
8064 }
8065 /* Read the AUTH result. */
8066 if (syncReadLine(fd,buf,1024,3600) == -1) {
8067 close(fd);
8068 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8069 strerror(errno));
8070 return REDIS_ERR;
8071 }
8072 if (buf[0] != '+') {
8073 close(fd);
8074 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8075 return REDIS_ERR;
8076 }
8077 }
8078
ed9b544e 8079 /* Issue the SYNC command */
8080 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8081 close(fd);
8082 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8083 strerror(errno));
8084 return REDIS_ERR;
8085 }
8086 /* Read the bulk write count */
8c4d91fc 8087 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 8088 close(fd);
8089 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8090 strerror(errno));
8091 return REDIS_ERR;
8092 }
4aa701c1 8093 if (buf[0] != '$') {
8094 close(fd);
8095 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8096 return REDIS_ERR;
8097 }
18e61fa2 8098 dumpsize = strtol(buf+1,NULL,10);
8099 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 8100 /* Read the bulk write data on a temp file */
8c5abee8 8101 while(maxtries--) {
8102 snprintf(tmpfile,256,
8103 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8104 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8105 if (dfd != -1) break;
5de9ad7c 8106 sleep(1);
8c5abee8 8107 }
ed9b544e 8108 if (dfd == -1) {
8109 close(fd);
8110 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8111 return REDIS_ERR;
8112 }
8113 while(dumpsize) {
8114 int nread, nwritten;
8115
8116 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8117 if (nread == -1) {
8118 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8119 strerror(errno));
8120 close(fd);
8121 close(dfd);
8122 return REDIS_ERR;
8123 }
8124 nwritten = write(dfd,buf,nread);
8125 if (nwritten == -1) {
8126 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8127 close(fd);
8128 close(dfd);
8129 return REDIS_ERR;
8130 }
8131 dumpsize -= nread;
8132 }
8133 close(dfd);
8134 if (rename(tmpfile,server.dbfilename) == -1) {
8135 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8136 unlink(tmpfile);
8137 close(fd);
8138 return REDIS_ERR;
8139 }
8140 emptyDb();
f78fd11b 8141 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 8142 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8143 close(fd);
8144 return REDIS_ERR;
8145 }
8146 server.master = createClient(fd);
8147 server.master->flags |= REDIS_MASTER;
179b3952 8148 server.master->authenticated = 1;
ed9b544e 8149 server.replstate = REDIS_REPL_CONNECTED;
8150 return REDIS_OK;
8151}
8152
321b0e13 8153static void slaveofCommand(redisClient *c) {
8154 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8155 !strcasecmp(c->argv[2]->ptr,"one")) {
8156 if (server.masterhost) {
8157 sdsfree(server.masterhost);
8158 server.masterhost = NULL;
8159 if (server.master) freeClient(server.master);
8160 server.replstate = REDIS_REPL_NONE;
8161 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8162 }
8163 } else {
8164 sdsfree(server.masterhost);
8165 server.masterhost = sdsdup(c->argv[1]->ptr);
8166 server.masterport = atoi(c->argv[2]->ptr);
8167 if (server.master) freeClient(server.master);
8168 server.replstate = REDIS_REPL_CONNECT;
8169 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8170 server.masterhost, server.masterport);
8171 }
8172 addReply(c,shared.ok);
8173}
8174
3fd78bcd 8175/* ============================ Maxmemory directive ======================== */
8176
a5819310 8177/* Try to free one object form the pre-allocated objects free list.
8178 * This is useful under low mem conditions as by default we take 1 million
8179 * free objects allocated. On success REDIS_OK is returned, otherwise
8180 * REDIS_ERR. */
8181static int tryFreeOneObjectFromFreelist(void) {
f870935d 8182 robj *o;
8183
a5819310 8184 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8185 if (listLength(server.objfreelist)) {
8186 listNode *head = listFirst(server.objfreelist);
8187 o = listNodeValue(head);
8188 listDelNode(server.objfreelist,head);
8189 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8190 zfree(o);
8191 return REDIS_OK;
8192 } else {
8193 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8194 return REDIS_ERR;
8195 }
f870935d 8196}
8197
3fd78bcd 8198/* This function gets called when 'maxmemory' is set on the config file to limit
8199 * the max memory used by the server, and we are out of memory.
8200 * This function will try to, in order:
8201 *
8202 * - Free objects from the free list
8203 * - Try to remove keys with an EXPIRE set
8204 *
8205 * It is not possible to free enough memory to reach used-memory < maxmemory
8206 * the server will start refusing commands that will enlarge even more the
8207 * memory usage.
8208 */
8209static void freeMemoryIfNeeded(void) {
8210 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 8211 int j, k, freed = 0;
8212
8213 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8214 for (j = 0; j < server.dbnum; j++) {
8215 int minttl = -1;
8216 robj *minkey = NULL;
8217 struct dictEntry *de;
8218
8219 if (dictSize(server.db[j].expires)) {
8220 freed = 1;
8221 /* From a sample of three keys drop the one nearest to
8222 * the natural expire */
8223 for (k = 0; k < 3; k++) {
8224 time_t t;
8225
8226 de = dictGetRandomKey(server.db[j].expires);
8227 t = (time_t) dictGetEntryVal(de);
8228 if (minttl == -1 || t < minttl) {
8229 minkey = dictGetEntryKey(de);
8230 minttl = t;
3fd78bcd 8231 }
3fd78bcd 8232 }
a5819310 8233 deleteKey(server.db+j,minkey);
3fd78bcd 8234 }
3fd78bcd 8235 }
a5819310 8236 if (!freed) return; /* nothing to free... */
3fd78bcd 8237 }
8238}
8239
f80dff62 8240/* ============================== Append Only file ========================== */
8241
28ed1f33 8242/* Write the append only file buffer on disk.
8243 *
8244 * Since we are required to write the AOF before replying to the client,
8245 * and the only way the client socket can get a write is entering when the
8246 * the event loop, we accumulate all the AOF writes in a memory
8247 * buffer and write it on disk using this function just before entering
8248 * the event loop again. */
8249static void flushAppendOnlyFile(void) {
8250 time_t now;
8251 ssize_t nwritten;
8252
8253 if (sdslen(server.aofbuf) == 0) return;
8254
8255 /* We want to perform a single write. This should be guaranteed atomic
8256 * at least if the filesystem we are writing is a real physical one.
8257 * While this will save us against the server being killed I don't think
8258 * there is much to do about the whole server stopping for power problems
8259 * or alike */
8260 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8261 if (nwritten != (signed)sdslen(server.aofbuf)) {
8262 /* Ooops, we are in troubles. The best thing to do for now is
8263 * aborting instead of giving the illusion that everything is
8264 * working as expected. */
8265 if (nwritten == -1) {
8266 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8267 } else {
8268 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8269 }
8270 exit(1);
8271 }
8272 sdsfree(server.aofbuf);
8273 server.aofbuf = sdsempty();
8274
8275 /* Fsync if needed */
8276 now = time(NULL);
8277 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8278 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8279 now-server.lastfsync > 1))
8280 {
8281 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8282 * flushing metadata. */
8283 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8284 server.lastfsync = now;
8285 }
8286}
8287
9376e434
PN
8288static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8289 int j;
8290 buf = sdscatprintf(buf,"*%d\r\n",argc);
8291 for (j = 0; j < argc; j++) {
8292 robj *o = getDecodedObject(argv[j]);
8293 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8294 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8295 buf = sdscatlen(buf,"\r\n",2);
8296 decrRefCount(o);
8297 }
8298 return buf;
8299}
8300
8301static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8302 int argc = 3;
8303 long when;
8304 robj *argv[3];
8305
8306 /* Make sure we can use strtol */
8307 seconds = getDecodedObject(seconds);
8308 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8309 decrRefCount(seconds);
8310
8311 argv[0] = createStringObject("EXPIREAT",8);
8312 argv[1] = key;
8313 argv[2] = createObject(REDIS_STRING,
8314 sdscatprintf(sdsempty(),"%ld",when));
8315 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8316 decrRefCount(argv[0]);
8317 decrRefCount(argv[2]);
8318 return buf;
8319}
8320
f80dff62 8321static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8322 sds buf = sdsempty();
f80dff62 8323 robj *tmpargv[3];
8324
8325 /* The DB this command was targetting is not the same as the last command
8326 * we appendend. To issue a SELECT command is needed. */
8327 if (dictid != server.appendseldb) {
8328 char seldb[64];
8329
8330 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 8331 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 8332 (unsigned long)strlen(seldb),seldb);
f80dff62 8333 server.appendseldb = dictid;
8334 }
8335
f80dff62 8336 if (cmd->proc == expireCommand) {
9376e434
PN
8337 /* Translate EXPIRE into EXPIREAT */
8338 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8339 } else if (cmd->proc == setexCommand) {
8340 /* Translate SETEX to SET and EXPIREAT */
8341 tmpargv[0] = createStringObject("SET",3);
f80dff62 8342 tmpargv[1] = argv[1];
9376e434
PN
8343 tmpargv[2] = argv[3];
8344 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8345 decrRefCount(tmpargv[0]);
8346 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8347 } else {
8348 buf = catAppendOnlyGenericCommand(buf,argc,argv);
f80dff62 8349 }
8350
28ed1f33 8351 /* Append to the AOF buffer. This will be flushed on disk just before
8352 * of re-entering the event loop, so before the client will get a
8353 * positive reply about the operation performed. */
8354 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8355
85a83172 8356 /* If a background append only file rewriting is in progress we want to
8357 * accumulate the differences between the child DB and the current one
8358 * in a buffer, so that when the child process will do its work we
8359 * can append the differences to the new append only file. */
8360 if (server.bgrewritechildpid != -1)
8361 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8362
8363 sdsfree(buf);
f80dff62 8364}
8365
8366/* In Redis commands are always executed in the context of a client, so in
8367 * order to load the append only file we need to create a fake client. */
8368static struct redisClient *createFakeClient(void) {
8369 struct redisClient *c = zmalloc(sizeof(*c));
8370
8371 selectDb(c,0);
8372 c->fd = -1;
8373 c->querybuf = sdsempty();
8374 c->argc = 0;
8375 c->argv = NULL;
8376 c->flags = 0;
9387d17d 8377 /* We set the fake client as a slave waiting for the synchronization
8378 * so that Redis will not try to send replies to this client. */
8379 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 8380 c->reply = listCreate();
8381 listSetFreeMethod(c->reply,decrRefCount);
8382 listSetDupMethod(c->reply,dupClientReplyValue);
4132ad8d 8383 initClientMultiState(c);
f80dff62 8384 return c;
8385}
8386
8387static void freeFakeClient(struct redisClient *c) {
8388 sdsfree(c->querybuf);
8389 listRelease(c->reply);
4132ad8d 8390 freeClientMultiState(c);
f80dff62 8391 zfree(c);
8392}
8393
8394/* Replay the append log file. On error REDIS_OK is returned. On non fatal
8395 * error (the append only file is zero-length) REDIS_ERR is returned. On
8396 * fatal error an error message is logged and the program exists. */
8397int loadAppendOnlyFile(char *filename) {
8398 struct redisClient *fakeClient;
8399 FILE *fp = fopen(filename,"r");
8400 struct redis_stat sb;
b492cf00 8401 unsigned long long loadedkeys = 0;
4132ad8d 8402 int appendonly = server.appendonly;
f80dff62 8403
8404 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8405 return REDIS_ERR;
8406
8407 if (fp == NULL) {
8408 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8409 exit(1);
8410 }
8411
4132ad8d
PN
8412 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8413 * to the same file we're about to read. */
8414 server.appendonly = 0;
8415
f80dff62 8416 fakeClient = createFakeClient();
8417 while(1) {
8418 int argc, j;
8419 unsigned long len;
8420 robj **argv;
8421 char buf[128];
8422 sds argsds;
8423 struct redisCommand *cmd;
8424
8425 if (fgets(buf,sizeof(buf),fp) == NULL) {
8426 if (feof(fp))
8427 break;
8428 else
8429 goto readerr;
8430 }
8431 if (buf[0] != '*') goto fmterr;
8432 argc = atoi(buf+1);
8433 argv = zmalloc(sizeof(robj*)*argc);
8434 for (j = 0; j < argc; j++) {
8435 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8436 if (buf[0] != '$') goto fmterr;
8437 len = strtol(buf+1,NULL,10);
8438 argsds = sdsnewlen(NULL,len);
0f151ef1 8439 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 8440 argv[j] = createObject(REDIS_STRING,argsds);
8441 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8442 }
8443
8444 /* Command lookup */
8445 cmd = lookupCommand(argv[0]->ptr);
8446 if (!cmd) {
8447 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8448 exit(1);
8449 }
bdcb92f2 8450 /* Try object encoding */
f80dff62 8451 if (cmd->flags & REDIS_CMD_BULK)
05df7621 8452 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 8453 /* Run the command in the context of a fake client */
8454 fakeClient->argc = argc;
8455 fakeClient->argv = argv;
8456 cmd->proc(fakeClient);
8457 /* Discard the reply objects list from the fake client */
8458 while(listLength(fakeClient->reply))
8459 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8460 /* Clean up, ready for the next command */
8461 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8462 zfree(argv);
b492cf00 8463 /* Handle swapping while loading big datasets when VM is on */
8464 loadedkeys++;
8465 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8466 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 8467 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 8468 }
8469 }
f80dff62 8470 }
4132ad8d
PN
8471
8472 /* This point can only be reached when EOF is reached without errors.
8473 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8474 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8475
f80dff62 8476 fclose(fp);
8477 freeFakeClient(fakeClient);
4132ad8d 8478 server.appendonly = appendonly;
f80dff62 8479 return REDIS_OK;
8480
8481readerr:
8482 if (feof(fp)) {
8483 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8484 } else {
8485 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8486 }
8487 exit(1);
8488fmterr:
8489 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8490 exit(1);
8491}
8492
9d65a1bb 8493/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
9c8e3cee 8494static int fwriteBulkObject(FILE *fp, robj *obj) {
9d65a1bb 8495 char buf[128];
b9bc0eef 8496 int decrrc = 0;
8497
f2d9f50f 8498 /* Avoid the incr/decr ref count business if possible to help
8499 * copy-on-write (we are often in a child process when this function
8500 * is called).
8501 * Also makes sure that key objects don't get incrRefCount-ed when VM
8502 * is enabled */
8503 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 8504 obj = getDecodedObject(obj);
8505 decrrc = 1;
8506 }
9d65a1bb 8507 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8508 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 8509 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8510 goto err;
9d65a1bb 8511 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 8512 if (decrrc) decrRefCount(obj);
9d65a1bb 8513 return 1;
8514err:
b9bc0eef 8515 if (decrrc) decrRefCount(obj);
9d65a1bb 8516 return 0;
8517}
8518
/* Write a binary-safe string into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes (it may contain NUL bytes; it is not read when
 * len is 0). Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* %lu: the argument is unsigned long, %ld would mismatch. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8530
/* Write a double into 'fp' in the bulk format $<count>\r\n<payload>\r\n,
 * where the payload is the value printed with "%.17g".
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t payload_len;

    /* The payload already carries its trailing CRLF, which must not be
     * counted in the announced length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    payload_len = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)payload_len-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,payload_len,1,fp) == 0)
        return 0;
    return 1;
}
8541
/* Write a long into 'fp' in the bulk format $<count>\r\n<payload>\r\n,
 * where the payload is the value in decimal.
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t payload_len;

    /* The payload already carries its trailing CRLF, which must not be
     * counted in the announced length. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    payload_len = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)payload_len-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,payload_len,1,fp) == 0)
        return 0;
    return 1;
}
8552
8553/* Write a sequence of commands able to fully rebuild the dataset into
8554 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8555static int rewriteAppendOnlyFile(char *filename) {
8556 dictIterator *di = NULL;
8557 dictEntry *de;
8558 FILE *fp;
8559 char tmpfile[256];
8560 int j;
8561 time_t now = time(NULL);
8562
8563 /* Note that we have to use a different temp name here compared to the
8564 * one used by rewriteAppendOnlyFileBackground() function. */
8565 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8566 fp = fopen(tmpfile,"w");
8567 if (!fp) {
8568 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8569 return REDIS_ERR;
8570 }
8571 for (j = 0; j < server.dbnum; j++) {
8572 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8573 redisDb *db = server.db+j;
8574 dict *d = db->dict;
8575 if (dictSize(d) == 0) continue;
8576 di = dictGetIterator(d);
8577 if (!di) {
8578 fclose(fp);
8579 return REDIS_ERR;
8580 }
8581
8582 /* SELECT the new DB */
8583 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 8584 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 8585
8586 /* Iterate this DB writing every entry */
8587 while((de = dictNext(di)) != NULL) {
e7546c63 8588 robj *key, *o;
8589 time_t expiretime;
8590 int swapped;
8591
8592 key = dictGetEntryKey(de);
b9bc0eef 8593 /* If the value for this key is swapped, load a preview in memory.
8594 * We use a "swapped" flag to remember if we need to free the
8595 * value object instead to just increment the ref count anyway
8596 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 8597 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8598 key->storage == REDIS_VM_SWAPPING) {
e7546c63 8599 o = dictGetEntryVal(de);
8600 swapped = 0;
8601 } else {
8602 o = vmPreviewObject(key);
e7546c63 8603 swapped = 1;
8604 }
8605 expiretime = getExpire(db,key);
9d65a1bb 8606
8607 /* Save the key and associated value */
9d65a1bb 8608 if (o->type == REDIS_STRING) {
8609 /* Emit a SET command */
8610 char cmd[]="*3\r\n$3\r\nSET\r\n";
8611 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8612 /* Key and value */
9c8e3cee 8613 if (fwriteBulkObject(fp,key) == 0) goto werr;
8614 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 8615 } else if (o->type == REDIS_LIST) {
8616 /* Emit the RPUSHes needed to rebuild the list */
8617 list *list = o->ptr;
8618 listNode *ln;
c7df85a4 8619 listIter li;
9d65a1bb 8620
c7df85a4 8621 listRewind(list,&li);
8622 while((ln = listNext(&li))) {
9d65a1bb 8623 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8624 robj *eleobj = listNodeValue(ln);
8625
8626 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8627 if (fwriteBulkObject(fp,key) == 0) goto werr;
8628 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8629 }
8630 } else if (o->type == REDIS_SET) {
8631 /* Emit the SADDs needed to rebuild the set */
8632 dict *set = o->ptr;
8633 dictIterator *di = dictGetIterator(set);
8634 dictEntry *de;
8635
8636 while((de = dictNext(di)) != NULL) {
8637 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8638 robj *eleobj = dictGetEntryKey(de);
8639
8640 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8641 if (fwriteBulkObject(fp,key) == 0) goto werr;
8642 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8643 }
8644 dictReleaseIterator(di);
8645 } else if (o->type == REDIS_ZSET) {
8646 /* Emit the ZADDs needed to rebuild the sorted set */
8647 zset *zs = o->ptr;
8648 dictIterator *di = dictGetIterator(zs->dict);
8649 dictEntry *de;
8650
8651 while((de = dictNext(di)) != NULL) {
8652 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8653 robj *eleobj = dictGetEntryKey(de);
8654 double *score = dictGetEntryVal(de);
8655
8656 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8657 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8658 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 8659 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8660 }
8661 dictReleaseIterator(di);
9c8e3cee 8662 } else if (o->type == REDIS_HASH) {
8663 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8664
8665 /* Emit the HSETs needed to rebuild the hash */
8666 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8667 unsigned char *p = zipmapRewind(o->ptr);
8668 unsigned char *field, *val;
8669 unsigned int flen, vlen;
8670
8671 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8672 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8673 if (fwriteBulkObject(fp,key) == 0) goto werr;
8674 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8675 return -1;
8676 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8677 return -1;
8678 }
8679 } else {
8680 dictIterator *di = dictGetIterator(o->ptr);
8681 dictEntry *de;
8682
8683 while((de = dictNext(di)) != NULL) {
8684 robj *field = dictGetEntryKey(de);
8685 robj *val = dictGetEntryVal(de);
8686
8687 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8688 if (fwriteBulkObject(fp,key) == 0) goto werr;
8689 if (fwriteBulkObject(fp,field) == -1) return -1;
8690 if (fwriteBulkObject(fp,val) == -1) return -1;
8691 }
8692 dictReleaseIterator(di);
8693 }
9d65a1bb 8694 } else {
f83c6cb5 8695 redisPanic("Unknown object type");
9d65a1bb 8696 }
8697 /* Save the expire time */
8698 if (expiretime != -1) {
e96e4fbf 8699 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 8700 /* If this key is already expired skip it */
8701 if (expiretime < now) continue;
8702 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8703 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8704 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8705 }
b9bc0eef 8706 if (swapped) decrRefCount(o);
9d65a1bb 8707 }
8708 dictReleaseIterator(di);
8709 }
8710
8711 /* Make sure data will not remain on the OS's output buffers */
8712 fflush(fp);
8713 fsync(fileno(fp));
8714 fclose(fp);
e0a62c7f 8715
9d65a1bb 8716 /* Use RENAME to make sure the DB file is changed atomically only
8717 * if the generate DB file is ok. */
8718 if (rename(tmpfile,filename) == -1) {
8719 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8720 unlink(tmpfile);
8721 return REDIS_ERR;
8722 }
8723 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8724 return REDIS_OK;
8725
8726werr:
8727 fclose(fp);
8728 unlink(tmpfile);
e96e4fbf 8729 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 8730 if (di) dictReleaseIterator(di);
8731 return REDIS_ERR;
8732}
8733
8734/* This is how rewriting of the append only file in background works:
8735 *
8736 * 1) The user calls BGREWRITEAOF
8737 * 2) Redis calls this function, that forks():
8738 * 2a) the child rewrite the append only file in a temp file.
8739 * 2b) the parent accumulates differences in server.bgrewritebuf.
8740 * 3) When the child finished '2a' exists.
8741 * 4) The parent will trap the exit code, if it's OK, will append the
8742 * data accumulated into server.bgrewritebuf into the temp file, and
8743 * finally will rename(2) the temp file in the actual file name.
8744 * The the new file is reopened as the new append only file. Profit!
8745 */
8746static int rewriteAppendOnlyFileBackground(void) {
8747 pid_t childpid;
8748
8749 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 8750 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 8751 if ((childpid = fork()) == 0) {
8752 /* Child */
8753 char tmpfile[256];
9d65a1bb 8754
054e426d 8755 if (server.vm_enabled) vmReopenSwapFile();
8756 close(server.fd);
9d65a1bb 8757 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8758 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 8759 _exit(0);
9d65a1bb 8760 } else {
478c2c6f 8761 _exit(1);
9d65a1bb 8762 }
8763 } else {
8764 /* Parent */
8765 if (childpid == -1) {
8766 redisLog(REDIS_WARNING,
8767 "Can't rewrite append only file in background: fork: %s",
8768 strerror(errno));
8769 return REDIS_ERR;
8770 }
8771 redisLog(REDIS_NOTICE,
8772 "Background append only file rewriting started by pid %d",childpid);
8773 server.bgrewritechildpid = childpid;
884d4b39 8774 updateDictResizePolicy();
85a83172 8775 /* We set appendseldb to -1 in order to force the next call to the
8776 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8777 * accumulated by the parent into server.bgrewritebuf will start
8778 * with a SELECT statement and it will be safe to merge. */
8779 server.appendseldb = -1;
9d65a1bb 8780 return REDIS_OK;
8781 }
8782 return REDIS_OK; /* unreached */
8783}
8784
8785static void bgrewriteaofCommand(redisClient *c) {
8786 if (server.bgrewritechildpid != -1) {
8787 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8788 return;
8789 }
8790 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 8791 char *status = "+Background append only file rewriting started\r\n";
8792 addReplySds(c,sdsnew(status));
9d65a1bb 8793 } else {
8794 addReply(c,shared.err);
8795 }
8796}
8797
/* Unlink the temp file that the background rewrite child with the given pid
 * writes to (same name pattern used by rewriteAppendOnlyFileBackground()). */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8804
996cb5f7 8805/* Virtual Memory is composed mainly of two subsystems:
8806 * - Blocking Virtual Memory
8807 * - Threaded Virtual Memory I/O
8808 * The two parts are not fully decoupled, but functions are split among two
8809 * different sections of the source code (delimited by comments) in order to
8810 * make more clear what functionality is about the blocking VM and what about
8811 * the threaded (not blocking) VM.
8812 *
8813 * Redis VM design:
8814 *
8815 * Redis VM is a blocking VM (one that blocks reading swapped values from
8816 * disk into memory when a value swapped out is needed in memory) that is made
8817 * unblocking by trying to examine the command argument vector in order to
8818 * load in background values that will likely be needed in order to exec
8819 * the command. The command is executed only once all the relevant keys
8820 * are loaded into memory.
8821 *
8822 * This basically is almost as simple as a blocking VM, but almost as parallel
8823 * as a fully non-blocking VM.
8824 */
8825
2e5eb04e 8826/* Called when the user switches from "appendonly yes" to "appendonly no"
8827 * at runtime using the CONFIG command. */
8828static void stopAppendOnly(void) {
8829 flushAppendOnlyFile();
8830 fsync(server.appendfd);
8831 close(server.appendfd);
8832
8833 server.appendfd = -1;
8834 server.appendseldb = -1;
8835 server.appendonly = 0;
8836 /* rewrite operation in progress? kill it, wait child exit */
8837 if (server.bgsavechildpid != -1) {
8838 int statloc;
8839
30dd89b6 8840 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8841 wait3(&statloc,0,NULL);
2e5eb04e 8842 /* reset the buffer accumulating changes while the child saves */
8843 sdsfree(server.bgrewritebuf);
8844 server.bgrewritebuf = sdsempty();
30dd89b6 8845 server.bgsavechildpid = -1;
2e5eb04e 8846 }
8847}
8848
8849/* Called when the user switches from "appendonly no" to "appendonly yes"
8850 * at runtime using the CONFIG command. */
8851static int startAppendOnly(void) {
8852 server.appendonly = 1;
8853 server.lastfsync = time(NULL);
8854 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8855 if (server.appendfd == -1) {
8856 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8857 return REDIS_ERR;
8858 }
8859 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8860 server.appendonly = 0;
8861 close(server.appendfd);
8862 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8863 return REDIS_ERR;
8864 }
8865 return REDIS_OK;
8866}
8867
996cb5f7 8868/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 8869
75680a3c 8870static void vmInit(void) {
8871 off_t totsize;
996cb5f7 8872 int pipefds[2];
bcaa7a4f 8873 size_t stacksize;
8b5bb414 8874 struct flock fl;
75680a3c 8875
4ad37480 8876 if (server.vm_max_threads != 0)
8877 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8878
054e426d 8879 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8b5bb414 8880 /* Try to open the old swap file, otherwise create it */
6fa987e3 8881 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8882 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8883 }
75680a3c 8884 if (server.vm_fp == NULL) {
6fa987e3 8885 redisLog(REDIS_WARNING,
8b5bb414 8886 "Can't open the swap file: %s. Exiting.",
6fa987e3 8887 strerror(errno));
75680a3c 8888 exit(1);
8889 }
8890 server.vm_fd = fileno(server.vm_fp);
8b5bb414 8891 /* Lock the swap file for writing, this is useful in order to avoid
8892 * another instance to use the same swap file for a config error. */
8893 fl.l_type = F_WRLCK;
8894 fl.l_whence = SEEK_SET;
8895 fl.l_start = fl.l_len = 0;
8896 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8897 redisLog(REDIS_WARNING,
8898 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8899 exit(1);
8900 }
8901 /* Initialize */
75680a3c 8902 server.vm_next_page = 0;
8903 server.vm_near_pages = 0;
7d98e08c 8904 server.vm_stats_used_pages = 0;
8905 server.vm_stats_swapped_objects = 0;
8906 server.vm_stats_swapouts = 0;
8907 server.vm_stats_swapins = 0;
75680a3c 8908 totsize = server.vm_pages*server.vm_page_size;
8909 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8910 if (ftruncate(server.vm_fd,totsize) == -1) {
8911 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8912 strerror(errno));
8913 exit(1);
8914 } else {
8915 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8916 }
7d30035d 8917 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 8918 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 8919 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 8920 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 8921
996cb5f7 8922 /* Initialize threaded I/O (used by Virtual Memory) */
8923 server.io_newjobs = listCreate();
8924 server.io_processing = listCreate();
8925 server.io_processed = listCreate();
d5d55fc3 8926 server.io_ready_clients = listCreate();
92f8e882 8927 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 8928 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8929 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 8930 server.io_active_threads = 0;
996cb5f7 8931 if (pipe(pipefds) == -1) {
8932 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8933 ,strerror(errno));
8934 exit(1);
8935 }
8936 server.io_ready_pipe_read = pipefds[0];
8937 server.io_ready_pipe_write = pipefds[1];
8938 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 8939 /* LZF requires a lot of stack */
8940 pthread_attr_init(&server.io_threads_attr);
8941 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8942 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8943 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 8944 /* Listen for events in the threaded I/O pipe */
8945 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8946 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8947 oom("creating file event");
75680a3c 8948}
8949
06224fec 8950/* Mark the page as used */
8951static void vmMarkPageUsed(off_t page) {
8952 off_t byte = page/8;
8953 int bit = page&7;
970e10bb 8954 redisAssert(vmFreePage(page) == 1);
06224fec 8955 server.vm_bitmap[byte] |= 1<<bit;
8956}
8957
8958/* Mark N contiguous pages as used, with 'page' being the first. */
8959static void vmMarkPagesUsed(off_t page, off_t count) {
8960 off_t j;
8961
8962 for (j = 0; j < count; j++)
7d30035d 8963 vmMarkPageUsed(page+j);
7d98e08c 8964 server.vm_stats_used_pages += count;
7c775e09 8965 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8966 (long long)count, (long long)page);
06224fec 8967}
8968
8969/* Mark the page as free */
8970static void vmMarkPageFree(off_t page) {
8971 off_t byte = page/8;
8972 int bit = page&7;
970e10bb 8973 redisAssert(vmFreePage(page) == 0);
06224fec 8974 server.vm_bitmap[byte] &= ~(1<<bit);
8975}
8976
8977/* Mark N contiguous pages as free, with 'page' being the first. */
8978static void vmMarkPagesFree(off_t page, off_t count) {
8979 off_t j;
8980
8981 for (j = 0; j < count; j++)
7d30035d 8982 vmMarkPageFree(page+j);
7d98e08c 8983 server.vm_stats_used_pages -= count;
7c775e09 8984 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8985 (long long)count, (long long)page);
06224fec 8986}
8987
8988/* Test if the page is free */
8989static int vmFreePage(off_t page) {
8990 off_t byte = page/8;
8991 int bit = page&7;
7d30035d 8992 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 8993}
8994
8995/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 8996 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 8997 * REDIS_ERR is returned.
06224fec 8998 *
8999 * This function uses a simple algorithm: we try to allocate
9000 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9001 * again from the start of the swap file searching for free spaces.
9002 *
9003 * If it looks pretty clear that there are no free pages near our offset
9004 * we try to find less populated places doing a forward jump of
9005 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9006 * without hurry, and then we jump again and so forth...
e0a62c7f 9007 *
06224fec 9008 * This function can be improved using a free list to avoid to guess
9009 * too much, since we could collect data about freed pages.
9010 *
9011 * note: I implemented this function just after watching an episode of
9012 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9013 */
c7df85a4 9014static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 9015 off_t base, offset = 0, since_jump = 0, numfree = 0;
9016
9017 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9018 server.vm_near_pages = 0;
9019 server.vm_next_page = 0;
9020 }
9021 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9022 base = server.vm_next_page;
9023
9024 while(offset < server.vm_pages) {
9025 off_t this = base+offset;
9026
9027 /* If we overflow, restart from page zero */
9028 if (this >= server.vm_pages) {
9029 this -= server.vm_pages;
9030 if (this == 0) {
9031 /* Just overflowed, what we found on tail is no longer
9032 * interesting, as it's no longer contiguous. */
9033 numfree = 0;
9034 }
9035 }
9036 if (vmFreePage(this)) {
9037 /* This is a free page */
9038 numfree++;
9039 /* Already got N free pages? Return to the caller, with success */
9040 if (numfree == n) {
7d30035d 9041 *first = this-(n-1);
9042 server.vm_next_page = this+1;
7c775e09 9043 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 9044 return REDIS_OK;
06224fec 9045 }
9046 } else {
9047 /* The current one is not a free page */
9048 numfree = 0;
9049 }
9050
9051 /* Fast-forward if the current page is not free and we already
9052 * searched enough near this place. */
9053 since_jump++;
9054 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9055 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9056 since_jump = 0;
9057 /* Note that even if we rewind after the jump, we are don't need
9058 * to make sure numfree is set to zero as we only jump *if* it
9059 * is set to zero. */
9060 } else {
9061 /* Otherwise just check the next page */
9062 offset++;
9063 }
9064 }
3a66edc7 9065 return REDIS_ERR;
9066}
9067
a5819310 9068/* Write the specified object at the specified page of the swap file */
9069static int vmWriteObjectOnSwap(robj *o, off_t page) {
9070 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9071 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9072 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9073 redisLog(REDIS_WARNING,
9ebed7cf 9074 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 9075 strerror(errno));
9076 return REDIS_ERR;
9077 }
9078 rdbSaveObject(server.vm_fp,o);
ba76a8f9 9079 fflush(server.vm_fp);
a5819310 9080 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9081 return REDIS_OK;
9082}
9083
3a66edc7 9084/* Swap the 'val' object relative to 'key' into disk. Store all the information
9085 * needed to later retrieve the object into the key object.
9086 * If we can't find enough contiguous empty pages to swap the object on disk
9087 * REDIS_ERR is returned. */
a69a0c9c 9088static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 9089 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 9090 off_t page;
9091
9092 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 9093 assert(key->refcount == 1);
3a66edc7 9094 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 9095 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 9096 key->vm.page = page;
9097 key->vm.usedpages = pages;
9098 key->storage = REDIS_VM_SWAPPED;
d894161b 9099 key->vtype = val->type;
3a66edc7 9100 decrRefCount(val); /* Deallocate the object from memory. */
9101 vmMarkPagesUsed(page,pages);
7d30035d 9102 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
9103 (unsigned char*) key->ptr,
9104 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 9105 server.vm_stats_swapped_objects++;
9106 server.vm_stats_swapouts++;
3a66edc7 9107 return REDIS_OK;
9108}
9109
a5819310 9110static robj *vmReadObjectFromSwap(off_t page, int type) {
9111 robj *o;
3a66edc7 9112
a5819310 9113 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9114 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 9115 redisLog(REDIS_WARNING,
d5d55fc3 9116 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 9117 strerror(errno));
478c2c6f 9118 _exit(1);
3a66edc7 9119 }
a5819310 9120 o = rdbLoadObject(type,server.vm_fp);
9121 if (o == NULL) {
d5d55fc3 9122 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 9123 _exit(1);
3a66edc7 9124 }
a5819310 9125 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9126 return o;
9127}
9128
9129/* Load the value object relative to the 'key' object from swap to memory.
9130 * The newly allocated object is returned.
9131 *
9132 * If preview is true the unserialized object is returned to the caller but
9133 * no changes are made to the key object, nor the pages are marked as freed */
9134static robj *vmGenericLoadObject(robj *key, int preview) {
9135 robj *val;
9136
d5d55fc3 9137 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 9138 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 9139 if (!preview) {
9140 key->storage = REDIS_VM_MEMORY;
9141 key->vm.atime = server.unixtime;
9142 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9143 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
9144 (unsigned char*) key->ptr);
7d98e08c 9145 server.vm_stats_swapped_objects--;
38aba9a1 9146 } else {
9147 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
9148 (unsigned char*) key->ptr);
7e69548d 9149 }
7d98e08c 9150 server.vm_stats_swapins++;
3a66edc7 9151 return val;
06224fec 9152}
9153
7e69548d 9154/* Plain object loading, from swap to memory */
9155static robj *vmLoadObject(robj *key) {
996cb5f7 9156 /* If we are loading the object in background, stop it, we
9157 * need to load this object synchronously ASAP. */
9158 if (key->storage == REDIS_VM_LOADING)
9159 vmCancelThreadedIOJob(key);
7e69548d 9160 return vmGenericLoadObject(key,0);
9161}
9162
9163/* Just load the value on disk, without to modify the key.
9164 * This is useful when we want to perform some operation on the value
9165 * without to really bring it from swap to memory, like while saving the
9166 * dataset or rewriting the append only log. */
9167static robj *vmPreviewObject(robj *key) {
9168 return vmGenericLoadObject(key,1);
9169}
9170
4ef8de8a 9171/* How a good candidate is this object for swapping?
9172 * The better candidate it is, the greater the returned value.
9173 *
9174 * Currently we try to perform a fast estimation of the object size in
9175 * memory, and combine it with aging informations.
9176 *
9177 * Basically swappability = idle-time * log(estimated size)
9178 *
9179 * Bigger objects are preferred over smaller objects, but not
9180 * proportionally, this is why we use the logarithm. This algorithm is
9181 * just a first try and will probably be tuned later. */
9182static double computeObjectSwappability(robj *o) {
9183 time_t age = server.unixtime - o->vm.atime;
9184 long asize = 0;
9185 list *l;
9186 dict *d;
9187 struct dictEntry *de;
9188 int z;
9189
9190 if (age <= 0) return 0;
9191 switch(o->type) {
9192 case REDIS_STRING:
9193 if (o->encoding != REDIS_ENCODING_RAW) {
9194 asize = sizeof(*o);
9195 } else {
9196 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9197 }
9198 break;
9199 case REDIS_LIST:
9200 l = o->ptr;
9201 listNode *ln = listFirst(l);
9202
9203 asize = sizeof(list);
9204 if (ln) {
9205 robj *ele = ln->value;
9206 long elesize;
9207
9208 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9209 (sizeof(*o)+sdslen(ele->ptr)) :
9210 sizeof(*o);
9211 asize += (sizeof(listNode)+elesize)*listLength(l);
9212 }
9213 break;
9214 case REDIS_SET:
9215 case REDIS_ZSET:
9216 z = (o->type == REDIS_ZSET);
9217 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9218
9219 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9220 if (z) asize += sizeof(zset)-sizeof(dict);
9221 if (dictSize(d)) {
9222 long elesize;
9223 robj *ele;
9224
9225 de = dictGetRandomKey(d);
9226 ele = dictGetEntryKey(de);
9227 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9228 (sizeof(*o)+sdslen(ele->ptr)) :
9229 sizeof(*o);
9230 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9231 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9232 }
9233 break;
a97b9060 9234 case REDIS_HASH:
9235 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9236 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9237 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9238 unsigned int klen, vlen;
9239 unsigned char *key, *val;
9240
9241 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9242 klen = 0;
9243 vlen = 0;
9244 }
9245 asize = len*(klen+vlen+3);
9246 } else if (o->encoding == REDIS_ENCODING_HT) {
9247 d = o->ptr;
9248 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9249 if (dictSize(d)) {
9250 long elesize;
9251 robj *ele;
9252
9253 de = dictGetRandomKey(d);
9254 ele = dictGetEntryKey(de);
9255 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9256 (sizeof(*o)+sdslen(ele->ptr)) :
9257 sizeof(*o);
9258 ele = dictGetEntryVal(de);
9259 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9260 (sizeof(*o)+sdslen(ele->ptr)) :
9261 sizeof(*o);
9262 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9263 }
9264 }
9265 break;
4ef8de8a 9266 }
c8c72447 9267 return (double)age*log(1+asize);
4ef8de8a 9268}
9269
9270/* Try to swap an object that's a good candidate for swapping.
9271 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 9272 * to swap any object at all.
9273 *
9274 * If 'usethreaded' is true, Redis will try to swap the object in background
9275 * using I/O threads. */
9276static int vmSwapOneObject(int usethreads) {
4ef8de8a 9277 int j, i;
9278 struct dictEntry *best = NULL;
9279 double best_swappability = 0;
b9bc0eef 9280 redisDb *best_db = NULL;
4ef8de8a 9281 robj *key, *val;
9282
9283 for (j = 0; j < server.dbnum; j++) {
9284 redisDb *db = server.db+j;
b72f6a4b 9285 /* Why maxtries is set to 100?
9286 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9287 * are swappable objects */
b0d8747d 9288 int maxtries = 100;
4ef8de8a 9289
9290 if (dictSize(db->dict) == 0) continue;
9291 for (i = 0; i < 5; i++) {
9292 dictEntry *de;
9293 double swappability;
9294
e3cadb8a 9295 if (maxtries) maxtries--;
4ef8de8a 9296 de = dictGetRandomKey(db->dict);
9297 key = dictGetEntryKey(de);
9298 val = dictGetEntryVal(de);
1064ef87 9299 /* Only swap objects that are currently in memory.
9300 *
9301 * Also don't swap shared objects if threaded VM is on, as we
9302 * try to ensure that the main thread does not touch the
9303 * object while the I/O thread is using it, but we can't
9304 * control other keys without adding additional mutex. */
9305 if (key->storage != REDIS_VM_MEMORY ||
9306 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 9307 if (maxtries) i--; /* don't count this try */
9308 continue;
9309 }
4ef8de8a 9310 swappability = computeObjectSwappability(val);
9311 if (!best || swappability > best_swappability) {
9312 best = de;
9313 best_swappability = swappability;
b9bc0eef 9314 best_db = db;
4ef8de8a 9315 }
9316 }
9317 }
7c775e09 9318 if (best == NULL) return REDIS_ERR;
4ef8de8a 9319 key = dictGetEntryKey(best);
9320 val = dictGetEntryVal(best);
9321
e3cadb8a 9322 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 9323 key->ptr, best_swappability);
9324
9325 /* Unshare the key if needed */
9326 if (key->refcount > 1) {
9327 robj *newkey = dupStringObject(key);
9328 decrRefCount(key);
9329 key = dictGetEntryKey(best) = newkey;
9330 }
9331 /* Swap it */
a69a0c9c 9332 if (usethreads) {
b9bc0eef 9333 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 9334 return REDIS_OK;
9335 } else {
a69a0c9c 9336 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9337 dictGetEntryVal(best) = NULL;
9338 return REDIS_OK;
9339 } else {
9340 return REDIS_ERR;
9341 }
4ef8de8a 9342 }
9343}
9344
/* Swap out one object synchronously. Same return value as
 * vmSwapOneObject(). */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9348
/* Swap out one object using the I/O threads. Same return value as
 * vmSwapOneObject(). */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9352
7e69548d 9353/* Return true if it's safe to swap out objects in a given moment.
9354 * Basically we don't want to swap objects out while there is a BGSAVE
9355 * or a BGAEOREWRITE running in backgroud. */
9356static int vmCanSwapOut(void) {
9357 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9358}
9359
1b03836c 9360/* Delete a key if swapped. Returns 1 if the key was found, was swapped
9361 * and was deleted. Otherwise 0 is returned. */
9362static int deleteIfSwapped(redisDb *db, robj *key) {
9363 dictEntry *de;
9364 robj *foundkey;
9365
9366 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9367 foundkey = dictGetEntryKey(de);
9368 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9369 deleteKey(db,key);
9370 return 1;
9371}
9372
996cb5f7 9373/* =================== Virtual Memory - Threaded I/O ======================= */
9374
b9bc0eef 9375static void freeIOJob(iojob *j) {
d5d55fc3 9376 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9377 j->type == REDIS_IOJOB_DO_SWAP ||
9378 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 9379 decrRefCount(j->val);
78ebe4c8 9380 /* We don't decrRefCount the j->key field as we did't incremented
9381 * the count creating IO Jobs. This is because the key field here is
9382 * just used as an indentifier and if a key is removed the Job should
9383 * never be touched again. */
b9bc0eef 9384 zfree(j);
9385}
9386
/* Every time a thread finished a Job, it writes a byte into the write side
 * of an unix pipe in order to "awake" the main thread, and this function
 * is called.
 *
 * 'fd' is the read side of that pipe; 'el', 'privdata' and 'mask' are
 * unused. For every byte read, the oldest job in server.io_processed is
 * removed and post-processed according to its type:
 *
 * - LOAD: the key goes back to the in-memory state and gets the loaded
 *   value; clients blocked on the key are handled.
 * - PREPARE_SWAP: pages are reserved and the job is queued again as DO_SWAP,
 *   or the operation is undone if swapping is not possible right now.
 * - DO_SWAP: the key is flagged as swapped and the in-memory value released.
 *
 * A single call processes only a percentage
 * (REDIS_MAX_COMPLETED_JOBS_PROCESSED) of the jobs found in the processed
 * queue at the first iteration, at least one. */
static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
            int mask)
{
    char buf[1];
    /* toprocess == -1 means "not computed yet"; trytoswap is cleared once
     * we fail to find a swappable object, to stop trying during this call. */
    int retval, processed = 0, toprocess = -1, trytoswap = 1;
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(mask);
    REDIS_NOTUSED(privdata);

    /* For every byte we read in the read side of the pipe, there is one
     * I/O job completed to process. */
    while((retval = read(fd,buf,1)) == 1) {
        iojob *j;
        listNode *ln;
        robj *key;
        struct dictEntry *de;

        redisLog(REDIS_DEBUG,"Processing I/O completed job");

        /* Get the processed element (the oldest one) */
        lockThreadedIO();
        assert(listLength(server.io_processed) != 0);
        if (toprocess == -1) {
            /* First iteration: decide how many jobs to handle in this call */
            toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
            if (toprocess <= 0) toprocess = 1;
        }
        ln = listFirst(server.io_processed);
        j = ln->value;
        listDelNode(server.io_processed,ln);
        unlockThreadedIO();
        /* If this job is marked as canceled, just ignore it.
         * Note that canceled jobs are not counted in 'processed'. */
        if (j->canceled) {
            freeIOJob(j);
            continue;
        }
        /* Post process it in the main thread, as there are things we
         * can do just here to avoid race conditions and/or invasive locks */
        redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
        de = dictFind(j->db->dict,j->key);
        assert(de != NULL);
        key = dictGetEntryKey(de);
        if (j->type == REDIS_IOJOB_LOAD) {
            redisDb *db;

            /* Key loaded, bring it at home */
            key->storage = REDIS_VM_MEMORY;
            key->vm.atime = server.unixtime;
            vmMarkPagesFree(key->vm.page,key->vm.usedpages);
            redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
                (unsigned char*) key->ptr);
            server.vm_stats_swapped_objects--;
            server.vm_stats_swapins++;
            dictGetEntryVal(de) = j->val;
            /* freeIOJob() below drops the job's reference to the value:
             * take one for the dictionary entry first. */
            incrRefCount(j->val);
            /* Save the db: 'j' is not valid after freeIOJob(). */
            db = j->db;
            freeIOJob(j);
            /* Handle clients waiting for this key to be loaded. */
            handleClientsBlockedOnSwappedKey(db,key);
        } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
            /* Now we know the amount of pages required to swap this object.
             * Let's find some space for it, and queue this task again
             * rebranded as REDIS_IOJOB_DO_SWAP. */
            if (!vmCanSwapOut() ||
                vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
            {
                /* Ooops... no space or we can't swap as there is
                 * a fork()ed Redis trying to save stuff on disk. */
                freeIOJob(j);
                key->storage = REDIS_VM_MEMORY; /* undo operation */
            } else {
                /* Note that we need to mark these pages as used now,
                 * if the job will be canceled, we'll mark them as freed
                 * again. */
                vmMarkPagesUsed(j->page,j->pages);
                j->type = REDIS_IOJOB_DO_SWAP;
                lockThreadedIO();
                queueIOJob(j);
                unlockThreadedIO();
            }
        } else if (j->type == REDIS_IOJOB_DO_SWAP) {
            robj *val;

            /* Key swapped. We can finally free some memory. */
            if (key->storage != REDIS_VM_SWAPPING) {
                /* Unexpected state: dump some debugging information on
                 * standard output before the assertion below fails. */
                printf("key->storage: %d\n",key->storage);
                printf("key->name: %s\n",(char*)key->ptr);
                printf("key->refcount: %d\n",key->refcount);
                printf("val: %p\n",(void*)j->val);
                printf("val->type: %d\n",j->val->type);
                printf("val->ptr: %s\n",(char*)j->val->ptr);
            }
            redisAssert(key->storage == REDIS_VM_SWAPPING);
            val = dictGetEntryVal(de);
            key->vm.page = j->page;
            key->vm.usedpages = j->pages;
            key->storage = REDIS_VM_SWAPPED;
            key->vtype = j->val->type;
            decrRefCount(val); /* Deallocate the object from memory. */
            dictGetEntryVal(de) = NULL;
            redisLog(REDIS_DEBUG,
                "VM: object %s swapped out at %lld (%lld pages) (threaded)",
                (unsigned char*) key->ptr,
                (unsigned long long) j->page, (unsigned long long) j->pages);
            server.vm_stats_swapped_objects++;
            server.vm_stats_swapouts++;
            freeIOJob(j);
            /* Put a few more swap requests in queue if we are still
             * out of memory */
            if (trytoswap && vmCanSwapOut() &&
                zmalloc_used_memory() > server.vm_max_memory)
            {
                int more = 1;
                while(more) {
                    /* Keep going while there are fewer new jobs queued
                     * than the configured number of I/O threads. */
                    lockThreadedIO();
                    more = listLength(server.io_newjobs) <
                            (unsigned) server.vm_max_threads;
                    unlockThreadedIO();
                    /* Don't waste CPU time if swappable objects are rare. */
                    if (vmSwapOneObjectThreaded() == REDIS_ERR) {
                        trytoswap = 0;
                        break;
                    }
                }
            }
        }
        processed++;
        /* Quota reached: return without draining the rest of the pipe. */
        if (processed == toprocess) return;
    }
    /* EAGAIN is not reported as an error (the read side of the pipe is set
     * non blocking in vmInit()); anything else is logged. */
    if (retval < 0 && errno != EAGAIN) {
        redisLog(REDIS_WARNING,
            "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
            strerror(errno));
    }
}
9524
9525static void lockThreadedIO(void) {
9526 pthread_mutex_lock(&server.io_mutex);
9527}
9528
9529static void unlockThreadedIO(void) {
9530 pthread_mutex_unlock(&server.io_mutex);
9531}
9532
/* Remove the specified object from the threaded I/O queue if still not
 * processed, otherwise make sure to flag it as canceled.
 *
 * 'o' is the key object of the job to cancel: it must be in the
 * REDIS_VM_LOADING or REDIS_VM_SWAPPING state (asserted), and a non-canceled
 * job with this key is expected to exist in one of the three queues.
 *
 * On return the storage state of 'o' is reverted: LOADING becomes SWAPPED,
 * SWAPPING becomes MEMORY. If the job is being processed by a thread right
 * now, the function keeps retrying until the job has left the processing
 * queue. */
static void vmCancelThreadedIOJob(robj *o) {
    list *lists[3] = {
        server.io_newjobs,      /* 0 */
        server.io_processing,   /* 1 */
        server.io_processed     /* 2 */
    };
    int i;

    assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
again:
    /* The whole search restarts from here, with the lock acquired again,
     * every time the job is found in the processing queue. */
    lockThreadedIO();
    /* Search for a matching key in one of the queues */
    for (i = 0; i < 3; i++) {
        listNode *ln;
        listIter li;

        listRewind(lists[i],&li);
        while ((ln = listNext(&li)) != NULL) {
            iojob *job = ln->value;

            if (job->canceled) continue; /* Skip this, already canceled. */
            /* Jobs are matched by key object identity (pointer compare). */
            if (job->key == o) {
                redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
                    (void*)job, (char*)o->ptr, job->type, i);
                /* Mark the pages as free since the swap didn't happen
                 * or happened but is now discarded. This is skipped for the
                 * processing queue (i == 1), as in that case we retry
                 * later instead of canceling now. */
                if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
                    vmMarkPagesFree(job->page,job->pages);
                /* Cancel the job. It depends on the list the job is
                 * living in. */
                switch(i) {
                case 0: /* io_newjobs */
                    /* If the job was not yet processed the best thing to do
                     * is to remove it from the queue altogether */
                    freeIOJob(job);
                    listDelNode(lists[i],ln);
                    break;
                case 1: /* io_processing */
                    /* Oh Shi- the thread is messing with the Job:
                     *
                     * Probably it's accessing the object if this is a
                     * PREPARE_SWAP or DO_SWAP job.
                     * If it's a LOAD job it may be reading from disk and
                     * if we don't wait for the job to terminate before to
                     * cancel it, maybe in a few microseconds data can be
                     * corrupted in these pages. So the short story is:
                     *
                     * Better to wait for the job to move into the
                     * next queue (processed)... */

                    /* We try again and again until the job is completed. */
                    unlockThreadedIO();
                    /* But let's wait some time for the I/O thread
                     * to finish with this job. After all this condition
                     * should be very rare. */
                    usleep(1);
                    goto again;
                case 2: /* io_processed */
                    /* The job was already processed, that's easy...
                     * just mark it as canceled so that we'll ignore it
                     * when processing completed jobs. */
                    job->canceled = 1;
                    break;
                }
                /* Finally we have to adjust the storage type of the object
                 * in order to "UNDO" the operation. */
                if (o->storage == REDIS_VM_LOADING)
                    o->storage = REDIS_VM_SWAPPED;
                else if (o->storage == REDIS_VM_SWAPPING)
                    o->storage = REDIS_VM_MEMORY;
                unlockThreadedIO();
                return;
            }
        }
    }
    unlockThreadedIO();
    /* No matching job in any queue: this violates the precondition. */
    assert(1 != 1); /* We should never reach this */
}
9613
b9bc0eef 9614static void *IOThreadEntryPoint(void *arg) {
9615 iojob *j;
9616 listNode *ln;
9617 REDIS_NOTUSED(arg);
9618
9619 pthread_detach(pthread_self());
9620 while(1) {
9621 /* Get a new job to process */
9622 lockThreadedIO();
9623 if (listLength(server.io_newjobs) == 0) {
9624 /* No new jobs in queue, exit. */
9ebed7cf 9625 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9626 (long) pthread_self());
b9bc0eef 9627 server.io_active_threads--;
9628 unlockThreadedIO();
9629 return NULL;
9630 }
9631 ln = listFirst(server.io_newjobs);
9632 j = ln->value;
9633 listDelNode(server.io_newjobs,ln);
9634 /* Add the job in the processing queue */
9635 j->thread = pthread_self();
9636 listAddNodeTail(server.io_processing,j);
9637 ln = listLast(server.io_processing); /* We use ln later to remove it */
9638 unlockThreadedIO();
9ebed7cf 9639 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9640 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 9641
9642 /* Process the Job */
9643 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9644 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 9645 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9646 FILE *fp = fopen("/dev/null","w+");
9647 j->pages = rdbSavedObjectPages(j->val,fp);
9648 fclose(fp);
9649 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 9650 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9651 j->canceled = 1;
b9bc0eef 9652 }
9653
9654 /* Done: insert the job into the processed queue */
9ebed7cf 9655 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9656 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 9657 lockThreadedIO();
9658 listDelNode(server.io_processing,ln);
9659 listAddNodeTail(server.io_processed,j);
9660 unlockThreadedIO();
e0a62c7f 9661
b9bc0eef 9662 /* Signal the main thread there is new stuff to process */
9663 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9664 }
9665 return NULL; /* never reached */
9666}
9667
9668static void spawnIOThread(void) {
9669 pthread_t thread;
478c2c6f 9670 sigset_t mask, omask;
a97b9060 9671 int err;
b9bc0eef 9672
478c2c6f 9673 sigemptyset(&mask);
9674 sigaddset(&mask,SIGCHLD);
9675 sigaddset(&mask,SIGHUP);
9676 sigaddset(&mask,SIGPIPE);
9677 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 9678 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9679 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9680 strerror(err));
9681 usleep(1000000);
9682 }
478c2c6f 9683 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 9684 server.io_active_threads++;
9685}
9686
4ee9488d 9687/* We need to wait for the last thread to exit before we are able to
9688 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 9689static void waitEmptyIOJobsQueue(void) {
4ee9488d 9690 while(1) {
76b7233a 9691 int io_processed_len;
9692
4ee9488d 9693 lockThreadedIO();
054e426d 9694 if (listLength(server.io_newjobs) == 0 &&
9695 listLength(server.io_processing) == 0 &&
9696 server.io_active_threads == 0)
9697 {
4ee9488d 9698 unlockThreadedIO();
9699 return;
9700 }
76b7233a 9701 /* While waiting for empty jobs queue condition we post-process some
9702 * finshed job, as I/O threads may be hanging trying to write against
9703 * the io_ready_pipe_write FD but there are so much pending jobs that
9704 * it's blocking. */
9705 io_processed_len = listLength(server.io_processed);
4ee9488d 9706 unlockThreadedIO();
76b7233a 9707 if (io_processed_len) {
9708 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9709 usleep(1000); /* 1 millisecond */
9710 } else {
9711 usleep(10000); /* 10 milliseconds */
9712 }
4ee9488d 9713 }
9714}
9715
054e426d 9716static void vmReopenSwapFile(void) {
478c2c6f 9717 /* Note: we don't close the old one as we are in the child process
9718 * and don't want to mess at all with the original file object. */
054e426d 9719 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9720 if (server.vm_fp == NULL) {
9721 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9722 server.vm_swap_file);
478c2c6f 9723 _exit(1);
054e426d 9724 }
9725 server.vm_fd = fileno(server.vm_fp);
9726}
9727
b9bc0eef 9728/* This function must be called while with threaded IO locked */
9729static void queueIOJob(iojob *j) {
6c96ba7d 9730 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9731 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 9732 listAddNodeTail(server.io_newjobs,j);
9733 if (server.io_active_threads < server.vm_max_threads)
9734 spawnIOThread();
9735}
9736
9737static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9738 iojob *j;
e0a62c7f 9739
b9bc0eef 9740 assert(key->storage == REDIS_VM_MEMORY);
9741 assert(key->refcount == 1);
9742
9743 j = zmalloc(sizeof(*j));
9744 j->type = REDIS_IOJOB_PREPARE_SWAP;
9745 j->db = db;
78ebe4c8 9746 j->key = key;
b9bc0eef 9747 j->val = val;
9748 incrRefCount(val);
9749 j->canceled = 0;
9750 j->thread = (pthread_t) -1;
f11b8647 9751 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 9752
9753 lockThreadedIO();
9754 queueIOJob(j);
9755 unlockThreadedIO();
9756 return REDIS_OK;
9757}
9758
b0d8747d 9759/* ============ Virtual Memory - Blocking clients on missing keys =========== */
9760
d5d55fc3 9761/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9762 * If there is not already a job loading the key, it is craeted.
9763 * The key is added to the io_keys list in the client structure, and also
9764 * in the hash table mapping swapped keys to waiting clients, that is,
9765 * server.io_waited_keys. */
9766static int waitForSwappedKey(redisClient *c, robj *key) {
9767 struct dictEntry *de;
9768 robj *o;
9769 list *l;
9770
9771 /* If the key does not exist or is already in RAM we don't need to
9772 * block the client at all. */
9773 de = dictFind(c->db->dict,key);
9774 if (de == NULL) return 0;
9775 o = dictGetEntryKey(de);
9776 if (o->storage == REDIS_VM_MEMORY) {
9777 return 0;
9778 } else if (o->storage == REDIS_VM_SWAPPING) {
9779 /* We were swapping the key, undo it! */
9780 vmCancelThreadedIOJob(o);
9781 return 0;
9782 }
e0a62c7f 9783
d5d55fc3 9784 /* OK: the key is either swapped, or being loaded just now. */
9785
9786 /* Add the key to the list of keys this client is waiting for.
9787 * This maps clients to keys they are waiting for. */
9788 listAddNodeTail(c->io_keys,key);
9789 incrRefCount(key);
9790
9791 /* Add the client to the swapped keys => clients waiting map. */
9792 de = dictFind(c->db->io_keys,key);
9793 if (de == NULL) {
9794 int retval;
9795
9796 /* For every key we take a list of clients blocked for it */
9797 l = listCreate();
9798 retval = dictAdd(c->db->io_keys,key,l);
9799 incrRefCount(key);
9800 assert(retval == DICT_OK);
9801 } else {
9802 l = dictGetEntryVal(de);
9803 }
9804 listAddNodeTail(l,c);
9805
9806 /* Are we already loading the key from disk? If not create a job */
9807 if (o->storage == REDIS_VM_SWAPPED) {
9808 iojob *j;
9809
9810 o->storage = REDIS_VM_LOADING;
9811 j = zmalloc(sizeof(*j));
9812 j->type = REDIS_IOJOB_LOAD;
9813 j->db = c->db;
78ebe4c8 9814 j->key = o;
d5d55fc3 9815 j->key->vtype = o->vtype;
9816 j->page = o->vm.page;
9817 j->val = NULL;
9818 j->canceled = 0;
9819 j->thread = (pthread_t) -1;
9820 lockThreadedIO();
9821 queueIOJob(j);
9822 unlockThreadedIO();
9823 }
9824 return 1;
9825}
9826
6f078746
PN
9827/* Preload keys for any command with first, last and step values for
9828 * the command keys prototype, as defined in the command table. */
9829static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9830 int j, last;
9831 if (cmd->vm_firstkey == 0) return;
9832 last = cmd->vm_lastkey;
9833 if (last < 0) last = argc+last;
9834 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9835 redisAssert(j < argc);
9836 waitForSwappedKey(c,argv[j]);
9837 }
9838}
9839
5d373da9 9840/* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
739ba0d2
PN
9841 * Note that the number of keys to preload is user-defined, so we need to
9842 * apply a sanity check against argc. */
ca1788b5 9843static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
76583ea4 9844 int i, num;
ca1788b5 9845 REDIS_NOTUSED(cmd);
ca1788b5
PN
9846
9847 num = atoi(argv[2]->ptr);
739ba0d2 9848 if (num > (argc-3)) return;
76583ea4 9849 for (i = 0; i < num; i++) {
ca1788b5 9850 waitForSwappedKey(c,argv[3+i]);
76583ea4
PN
9851 }
9852}
9853
3805e04f
PN
9854/* Preload keys needed to execute the entire MULTI/EXEC block.
9855 *
9856 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9857 * and will block the client when any command requires a swapped out value. */
9858static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9859 int i, margc;
9860 struct redisCommand *mcmd;
9861 robj **margv;
9862 REDIS_NOTUSED(cmd);
9863 REDIS_NOTUSED(argc);
9864 REDIS_NOTUSED(argv);
9865
9866 if (!(c->flags & REDIS_MULTI)) return;
9867 for (i = 0; i < c->mstate.count; i++) {
9868 mcmd = c->mstate.commands[i].cmd;
9869 margc = c->mstate.commands[i].argc;
9870 margv = c->mstate.commands[i].argv;
9871
9872 if (mcmd->vm_preload_proc != NULL) {
9873 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9874 } else {
9875 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9876 }
76583ea4
PN
9877 }
9878}
9879
b0d8747d 9880/* Is this client attempting to run a command against swapped keys?
d5d55fc3 9881 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 9882 *
d5d55fc3 9883 * The important idea about this function is that it can fail! If keys will
9884 * still be swapped when the client is resumed, this key lookups will
9885 * just block loading keys from disk. In practical terms this should only
9886 * happen with SORT BY command or if there is a bug in this function.
9887 *
9888 * Return 1 if the client is marked as blocked, 0 if the client can
9889 * continue as the keys it is going to access appear to be in memory. */
0a6f3f0f 9890static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
76583ea4 9891 if (cmd->vm_preload_proc != NULL) {
ca1788b5 9892 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
76583ea4 9893 } else {
6f078746 9894 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
76583ea4
PN
9895 }
9896
d5d55fc3 9897 /* If the client was blocked for at least one key, mark it as blocked. */
9898 if (listLength(c->io_keys)) {
9899 c->flags |= REDIS_IO_WAIT;
9900 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9901 server.vm_blocked_clients++;
9902 return 1;
9903 } else {
9904 return 0;
9905 }
9906}
9907
9908/* Remove the 'key' from the list of blocked keys for a given client.
9909 *
9910 * The function returns 1 when there are no longer blocking keys after
9911 * the current one was removed (and the client can be unblocked). */
9912static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9913 list *l;
9914 listNode *ln;
9915 listIter li;
9916 struct dictEntry *de;
9917
9918 /* Remove the key from the list of keys this client is waiting for. */
9919 listRewind(c->io_keys,&li);
9920 while ((ln = listNext(&li)) != NULL) {
bf028098 9921 if (equalStringObjects(ln->value,key)) {
d5d55fc3 9922 listDelNode(c->io_keys,ln);
9923 break;
9924 }
9925 }
9926 assert(ln != NULL);
9927
9928 /* Remove the client form the key => waiting clients map. */
9929 de = dictFind(c->db->io_keys,key);
9930 assert(de != NULL);
9931 l = dictGetEntryVal(de);
9932 ln = listSearchKey(l,c);
9933 assert(ln != NULL);
9934 listDelNode(l,ln);
9935 if (listLength(l) == 0)
9936 dictDelete(c->db->io_keys,key);
9937
9938 return listLength(c->io_keys) == 0;
9939}
9940
9941static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9942 struct dictEntry *de;
9943 list *l;
9944 listNode *ln;
9945 int len;
9946
9947 de = dictFind(db->io_keys,key);
9948 if (!de) return;
9949
9950 l = dictGetEntryVal(de);
9951 len = listLength(l);
9952 /* Note: we can't use something like while(listLength(l)) as the list
9953 * can be freed by the calling function when we remove the last element. */
9954 while (len--) {
9955 ln = listFirst(l);
9956 redisClient *c = ln->value;
9957
9958 if (dontWaitForSwappedKey(c,key)) {
9959 /* Put the client in the list of clients ready to go as we
9960 * loaded all the keys about it. */
9961 listAddNodeTail(server.io_ready_clients,c);
9962 }
9963 }
b0d8747d 9964}
b0d8747d 9965
500ece7c 9966/* =========================== Remote Configuration ========================= */
9967
9968static void configSetCommand(redisClient *c) {
9969 robj *o = getDecodedObject(c->argv[3]);
2e5eb04e 9970 long long ll;
9971
500ece7c 9972 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9973 zfree(server.dbfilename);
9974 server.dbfilename = zstrdup(o->ptr);
9975 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9976 zfree(server.requirepass);
9977 server.requirepass = zstrdup(o->ptr);
9978 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9979 zfree(server.masterauth);
9980 server.masterauth = zstrdup(o->ptr);
9981 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
2e5eb04e 9982 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9983 ll < 0) goto badfmt;
9984 server.maxmemory = ll;
9985 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
9986 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9987 ll < 0 || ll > LONG_MAX) goto badfmt;
9988 server.maxidletime = ll;
1b677732 9989 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
9990 if (!strcasecmp(o->ptr,"no")) {
9991 server.appendfsync = APPENDFSYNC_NO;
9992 } else if (!strcasecmp(o->ptr,"everysec")) {
9993 server.appendfsync = APPENDFSYNC_EVERYSEC;
9994 } else if (!strcasecmp(o->ptr,"always")) {
9995 server.appendfsync = APPENDFSYNC_ALWAYS;
9996 } else {
9997 goto badfmt;
9998 }
2e5eb04e 9999 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10000 int old = server.appendonly;
10001 int new = yesnotoi(o->ptr);
10002
10003 if (new == -1) goto badfmt;
10004 if (old != new) {
10005 if (new == 0) {
10006 stopAppendOnly();
10007 } else {
10008 if (startAppendOnly() == REDIS_ERR) {
10009 addReplySds(c,sdscatprintf(sdsempty(),
10010 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10011 decrRefCount(o);
10012 return;
10013 }
10014 }
10015 }
a34e0a25 10016 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10017 int vlen, j;
10018 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10019
10020 /* Perform sanity check before setting the new config:
10021 * - Even number of args
10022 * - Seconds >= 1, changes >= 0 */
10023 if (vlen & 1) {
10024 sdsfreesplitres(v,vlen);
10025 goto badfmt;
10026 }
10027 for (j = 0; j < vlen; j++) {
10028 char *eptr;
10029 long val;
10030
10031 val = strtoll(v[j], &eptr, 10);
10032 if (eptr[0] != '\0' ||
10033 ((j & 1) == 0 && val < 1) ||
10034 ((j & 1) == 1 && val < 0)) {
10035 sdsfreesplitres(v,vlen);
10036 goto badfmt;
10037 }
10038 }
10039 /* Finally set the new config */
10040 resetServerSaveParams();
10041 for (j = 0; j < vlen; j += 2) {
10042 time_t seconds;
10043 int changes;
10044
10045 seconds = strtoll(v[j],NULL,10);
10046 changes = strtoll(v[j+1],NULL,10);
10047 appendServerSaveParams(seconds, changes);
10048 }
10049 sdsfreesplitres(v,vlen);
500ece7c 10050 } else {
10051 addReplySds(c,sdscatprintf(sdsempty(),
10052 "-ERR not supported CONFIG parameter %s\r\n",
10053 (char*)c->argv[2]->ptr));
10054 decrRefCount(o);
10055 return;
10056 }
10057 decrRefCount(o);
10058 addReply(c,shared.ok);
a34e0a25 10059 return;
10060
10061badfmt: /* Bad format errors */
10062 addReplySds(c,sdscatprintf(sdsempty(),
10063 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10064 (char*)o->ptr,
10065 (char*)c->argv[2]->ptr));
10066 decrRefCount(o);
500ece7c 10067}
10068
10069static void configGetCommand(redisClient *c) {
10070 robj *o = getDecodedObject(c->argv[2]);
10071 robj *lenobj = createObject(REDIS_STRING,NULL);
10072 char *pattern = o->ptr;
10073 int matches = 0;
10074
10075 addReply(c,lenobj);
10076 decrRefCount(lenobj);
10077
10078 if (stringmatch(pattern,"dbfilename",0)) {
10079 addReplyBulkCString(c,"dbfilename");
10080 addReplyBulkCString(c,server.dbfilename);
10081 matches++;
10082 }
10083 if (stringmatch(pattern,"requirepass",0)) {
10084 addReplyBulkCString(c,"requirepass");
10085 addReplyBulkCString(c,server.requirepass);
10086 matches++;
10087 }
10088 if (stringmatch(pattern,"masterauth",0)) {
10089 addReplyBulkCString(c,"masterauth");
10090 addReplyBulkCString(c,server.masterauth);
10091 matches++;
10092 }
10093 if (stringmatch(pattern,"maxmemory",0)) {
10094 char buf[128];
10095
2e5eb04e 10096 ll2string(buf,128,server.maxmemory);
500ece7c 10097 addReplyBulkCString(c,"maxmemory");
10098 addReplyBulkCString(c,buf);
10099 matches++;
10100 }
2e5eb04e 10101 if (stringmatch(pattern,"timeout",0)) {
10102 char buf[128];
10103
10104 ll2string(buf,128,server.maxidletime);
10105 addReplyBulkCString(c,"timeout");
10106 addReplyBulkCString(c,buf);
10107 matches++;
10108 }
10109 if (stringmatch(pattern,"appendonly",0)) {
10110 addReplyBulkCString(c,"appendonly");
10111 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10112 matches++;
10113 }
1b677732 10114 if (stringmatch(pattern,"appendfsync",0)) {
10115 char *policy;
10116
10117 switch(server.appendfsync) {
10118 case APPENDFSYNC_NO: policy = "no"; break;
10119 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10120 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10121 default: policy = "unknown"; break; /* too harmless to panic */
10122 }
10123 addReplyBulkCString(c,"appendfsync");
10124 addReplyBulkCString(c,policy);
10125 matches++;
10126 }
a34e0a25 10127 if (stringmatch(pattern,"save",0)) {
10128 sds buf = sdsempty();
10129 int j;
10130
10131 for (j = 0; j < server.saveparamslen; j++) {
10132 buf = sdscatprintf(buf,"%ld %d",
10133 server.saveparams[j].seconds,
10134 server.saveparams[j].changes);
10135 if (j != server.saveparamslen-1)
10136 buf = sdscatlen(buf," ",1);
10137 }
10138 addReplyBulkCString(c,"save");
10139 addReplyBulkCString(c,buf);
10140 sdsfree(buf);
10141 matches++;
10142 }
500ece7c 10143 decrRefCount(o);
10144 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10145}
10146
10147static void configCommand(redisClient *c) {
10148 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10149 if (c->argc != 4) goto badarity;
10150 configSetCommand(c);
10151 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10152 if (c->argc != 3) goto badarity;
10153 configGetCommand(c);
10154 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10155 if (c->argc != 2) goto badarity;
10156 server.stat_numcommands = 0;
10157 server.stat_numconnections = 0;
10158 server.stat_expiredkeys = 0;
10159 server.stat_starttime = time(NULL);
10160 addReply(c,shared.ok);
10161 } else {
10162 addReplySds(c,sdscatprintf(sdsempty(),
10163 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10164 }
10165 return;
10166
10167badarity:
10168 addReplySds(c,sdscatprintf(sdsempty(),
10169 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10170 (char*) c->argv[1]->ptr));
10171}
10172
befec3cd 10173/* =========================== Pubsub implementation ======================== */
10174
ffc6b7f8 10175static void freePubsubPattern(void *p) {
10176 pubsubPattern *pat = p;
10177
10178 decrRefCount(pat->pattern);
10179 zfree(pat);
10180}
10181
10182static int listMatchPubsubPattern(void *a, void *b) {
10183 pubsubPattern *pa = a, *pb = b;
10184
10185 return (pa->client == pb->client) &&
bf028098 10186 (equalStringObjects(pa->pattern,pb->pattern));
ffc6b7f8 10187}
10188
10189/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10190 * 0 if the client was already subscribed to that channel. */
10191static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 10192 struct dictEntry *de;
10193 list *clients = NULL;
10194 int retval = 0;
10195
ffc6b7f8 10196 /* Add the channel to the client -> channels hash table */
10197 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 10198 retval = 1;
ffc6b7f8 10199 incrRefCount(channel);
10200 /* Add the client to the channel -> list of clients hash table */
10201 de = dictFind(server.pubsub_channels,channel);
befec3cd 10202 if (de == NULL) {
10203 clients = listCreate();
ffc6b7f8 10204 dictAdd(server.pubsub_channels,channel,clients);
10205 incrRefCount(channel);
befec3cd 10206 } else {
10207 clients = dictGetEntryVal(de);
10208 }
10209 listAddNodeTail(clients,c);
10210 }
10211 /* Notify the client */
10212 addReply(c,shared.mbulk3);
10213 addReply(c,shared.subscribebulk);
ffc6b7f8 10214 addReplyBulk(c,channel);
482b672d 10215 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 10216 return retval;
10217}
10218
ffc6b7f8 10219/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10220 * 0 if the client was not subscribed to the specified channel. */
10221static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 10222 struct dictEntry *de;
10223 list *clients;
10224 listNode *ln;
10225 int retval = 0;
10226
ffc6b7f8 10227 /* Remove the channel from the client -> channels hash table */
10228 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 10229 we have in the hash tables. Protect it... */
ffc6b7f8 10230 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 10231 retval = 1;
ffc6b7f8 10232 /* Remove the client from the channel -> clients list hash table */
10233 de = dictFind(server.pubsub_channels,channel);
befec3cd 10234 assert(de != NULL);
10235 clients = dictGetEntryVal(de);
10236 ln = listSearchKey(clients,c);
10237 assert(ln != NULL);
10238 listDelNode(clients,ln);
ff767a75 10239 if (listLength(clients) == 0) {
10240 /* Free the list and associated hash entry at all if this was
10241 * the latest client, so that it will be possible to abuse
ffc6b7f8 10242 * Redis PUBSUB creating millions of channels. */
10243 dictDelete(server.pubsub_channels,channel);
ff767a75 10244 }
befec3cd 10245 }
10246 /* Notify the client */
10247 if (notify) {
10248 addReply(c,shared.mbulk3);
10249 addReply(c,shared.unsubscribebulk);
ffc6b7f8 10250 addReplyBulk(c,channel);
482b672d 10251 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10252 listLength(c->pubsub_patterns));
10253
10254 }
10255 decrRefCount(channel); /* it is finally safe to release it */
10256 return retval;
10257}
10258
10259/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10260static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10261 int retval = 0;
10262
10263 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10264 retval = 1;
10265 pubsubPattern *pat;
10266 listAddNodeTail(c->pubsub_patterns,pattern);
10267 incrRefCount(pattern);
10268 pat = zmalloc(sizeof(*pat));
10269 pat->pattern = getDecodedObject(pattern);
10270 pat->client = c;
10271 listAddNodeTail(server.pubsub_patterns,pat);
10272 }
10273 /* Notify the client */
10274 addReply(c,shared.mbulk3);
10275 addReply(c,shared.psubscribebulk);
10276 addReplyBulk(c,pattern);
482b672d 10277 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
ffc6b7f8 10278 return retval;
10279}
10280
10281/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10282 * 0 if the client was not subscribed to the specified channel. */
10283static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10284 listNode *ln;
10285 pubsubPattern pat;
10286 int retval = 0;
10287
10288 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10289 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10290 retval = 1;
10291 listDelNode(c->pubsub_patterns,ln);
10292 pat.client = c;
10293 pat.pattern = pattern;
10294 ln = listSearchKey(server.pubsub_patterns,&pat);
10295 listDelNode(server.pubsub_patterns,ln);
10296 }
10297 /* Notify the client */
10298 if (notify) {
10299 addReply(c,shared.mbulk3);
10300 addReply(c,shared.punsubscribebulk);
10301 addReplyBulk(c,pattern);
482b672d 10302 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10303 listLength(c->pubsub_patterns));
befec3cd 10304 }
ffc6b7f8 10305 decrRefCount(pattern);
befec3cd 10306 return retval;
10307}
10308
ffc6b7f8 10309/* Unsubscribe from all the channels. Return the number of channels the
10310 * client was subscribed from. */
10311static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10312 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 10313 dictEntry *de;
10314 int count = 0;
10315
10316 while((de = dictNext(di)) != NULL) {
ffc6b7f8 10317 robj *channel = dictGetEntryKey(de);
befec3cd 10318
ffc6b7f8 10319 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 10320 }
10321 dictReleaseIterator(di);
10322 return count;
10323}
10324
ffc6b7f8 10325/* Unsubscribe from all the patterns. Return the number of patterns the
10326 * client was subscribed from. */
10327static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10328 listNode *ln;
10329 listIter li;
10330 int count = 0;
10331
10332 listRewind(c->pubsub_patterns,&li);
10333 while ((ln = listNext(&li)) != NULL) {
10334 robj *pattern = ln->value;
10335
10336 count += pubsubUnsubscribePattern(c,pattern,notify);
10337 }
10338 return count;
10339}
10340
befec3cd 10341/* Publish a message */
ffc6b7f8 10342static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 10343 int receivers = 0;
10344 struct dictEntry *de;
ffc6b7f8 10345 listNode *ln;
10346 listIter li;
befec3cd 10347
ffc6b7f8 10348 /* Send to clients listening for that channel */
10349 de = dictFind(server.pubsub_channels,channel);
befec3cd 10350 if (de) {
10351 list *list = dictGetEntryVal(de);
10352 listNode *ln;
10353 listIter li;
10354
10355 listRewind(list,&li);
10356 while ((ln = listNext(&li)) != NULL) {
10357 redisClient *c = ln->value;
10358
10359 addReply(c,shared.mbulk3);
10360 addReply(c,shared.messagebulk);
ffc6b7f8 10361 addReplyBulk(c,channel);
befec3cd 10362 addReplyBulk(c,message);
10363 receivers++;
10364 }
10365 }
ffc6b7f8 10366 /* Send to clients listening to matching channels */
10367 if (listLength(server.pubsub_patterns)) {
10368 listRewind(server.pubsub_patterns,&li);
10369 channel = getDecodedObject(channel);
10370 while ((ln = listNext(&li)) != NULL) {
10371 pubsubPattern *pat = ln->value;
10372
10373 if (stringmatchlen((char*)pat->pattern->ptr,
10374 sdslen(pat->pattern->ptr),
10375 (char*)channel->ptr,
10376 sdslen(channel->ptr),0)) {
c8d0ea0e 10377 addReply(pat->client,shared.mbulk4);
10378 addReply(pat->client,shared.pmessagebulk);
10379 addReplyBulk(pat->client,pat->pattern);
ffc6b7f8 10380 addReplyBulk(pat->client,channel);
10381 addReplyBulk(pat->client,message);
10382 receivers++;
10383 }
10384 }
10385 decrRefCount(channel);
10386 }
befec3cd 10387 return receivers;
10388}
10389
10390static void subscribeCommand(redisClient *c) {
10391 int j;
10392
10393 for (j = 1; j < c->argc; j++)
ffc6b7f8 10394 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 10395}
10396
10397static void unsubscribeCommand(redisClient *c) {
10398 if (c->argc == 1) {
ffc6b7f8 10399 pubsubUnsubscribeAllChannels(c,1);
10400 return;
10401 } else {
10402 int j;
10403
10404 for (j = 1; j < c->argc; j++)
10405 pubsubUnsubscribeChannel(c,c->argv[j],1);
10406 }
10407}
10408
10409static void psubscribeCommand(redisClient *c) {
10410 int j;
10411
10412 for (j = 1; j < c->argc; j++)
10413 pubsubSubscribePattern(c,c->argv[j]);
10414}
10415
10416static void punsubscribeCommand(redisClient *c) {
10417 if (c->argc == 1) {
10418 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 10419 return;
10420 } else {
10421 int j;
10422
10423 for (j = 1; j < c->argc; j++)
ffc6b7f8 10424 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 10425 }
10426}
10427
10428static void publishCommand(redisClient *c) {
10429 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
482b672d 10430 addReplyLongLong(c,receivers);
befec3cd 10431}
10432
37ab76c9 10433/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10434 *
10435 * The implementation uses a per-DB hash table mapping keys to list of clients
10436 * WATCHing those keys, so that given a key that is going to be modified
10437 * we can mark all the associated clients as dirty.
10438 *
10439 * Also every client contains a list of WATCHed keys so that's possible to
10440 * un-watch such keys when the client is freed or when UNWATCH is called. */
10441
10442/* In the client->watched_keys list we need to use watchedKey structures
10443 * as in order to identify a key in Redis we need both the key name and the
10444 * DB.
 *
 * Entries are allocated by watchForKey() and released by unwatchAllKeys(). */
10445typedef struct watchedKey {
10446 robj *key; /* Watched key name; watchForKey() takes a reference on it. */
10447 redisDb *db; /* DB the key is watched in (c->db when it was watched). */
10448} watchedKey;
10449
10450/* Watch for the specified key */
10451static void watchForKey(redisClient *c, robj *key) {
10452 list *clients = NULL;
10453 listIter li;
10454 listNode *ln;
10455 watchedKey *wk;
10456
10457 /* Check if we are already watching for this key */
10458 listRewind(c->watched_keys,&li);
10459 while((ln = listNext(&li))) {
10460 wk = listNodeValue(ln);
10461 if (wk->db == c->db && equalStringObjects(key,wk->key))
10462 return; /* Key already watched */
10463 }
10464 /* This key is not already watched in this DB. Let's add it */
10465 clients = dictFetchValue(c->db->watched_keys,key);
10466 if (!clients) {
10467 clients = listCreate();
10468 dictAdd(c->db->watched_keys,key,clients);
10469 incrRefCount(key);
10470 }
10471 listAddNodeTail(clients,c);
10472 /* Add the new key to the lits of keys watched by this client */
10473 wk = zmalloc(sizeof(*wk));
10474 wk->key = key;
10475 wk->db = c->db;
10476 incrRefCount(key);
10477 listAddNodeTail(c->watched_keys,wk);
10478}
10479
10480/* Unwatch all the keys watched by this client. To clean the EXEC dirty
10481 * flag is up to the caller. */
10482static void unwatchAllKeys(redisClient *c) {
10483 listIter li;
10484 listNode *ln;
10485
10486 if (listLength(c->watched_keys) == 0) return;
10487 listRewind(c->watched_keys,&li);
10488 while((ln = listNext(&li))) {
10489 list *clients;
10490 watchedKey *wk;
10491
10492 /* Lookup the watched key -> clients list and remove the client
10493 * from the list */
10494 wk = listNodeValue(ln);
10495 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10496 assert(clients != NULL);
10497 listDelNode(clients,listSearchKey(clients,c));
10498 /* Kill the entry at all if this was the only client */
10499 if (listLength(clients) == 0)
10500 dictDelete(wk->db->watched_keys, wk->key);
10501 /* Remove this watched key from the client->watched list */
10502 listDelNode(c->watched_keys,ln);
10503 decrRefCount(wk->key);
10504 zfree(wk);
10505 }
10506}
10507
ca3f830b 10508/* "Touch" a key, so that if this key is being WATCHed by some client the
37ab76c9 10509 * next EXEC will fail. */
10510static void touchWatchedKey(redisDb *db, robj *key) {
10511 list *clients;
10512 listIter li;
10513 listNode *ln;
10514
10515 if (dictSize(db->watched_keys) == 0) return;
10516 clients = dictFetchValue(db->watched_keys, key);
10517 if (!clients) return;
10518
10519 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10520 /* Check if we are already watching for this key */
10521 listRewind(clients,&li);
10522 while((ln = listNext(&li))) {
10523 redisClient *c = listNodeValue(ln);
10524
10525 c->flags |= REDIS_DIRTY_CAS;
10526 }
10527}
10528
9b30e1a2 10529/* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10530 * flush but will be deleted as effect of the flushing operation should
10531 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10532 * a FLUSHALL operation (all the DBs flushed). */
10533static void touchWatchedKeysOnFlush(int dbid) {
10534 listIter li1, li2;
10535 listNode *ln;
10536
10537 /* For every client, check all the waited keys */
10538 listRewind(server.clients,&li1);
10539 while((ln = listNext(&li1))) {
10540 redisClient *c = listNodeValue(ln);
10541 listRewind(c->watched_keys,&li2);
10542 while((ln = listNext(&li2))) {
10543 watchedKey *wk = listNodeValue(ln);
10544
10545 /* For every watched key matching the specified DB, if the
10546 * key exists, mark the client as dirty, as the key will be
10547 * removed. */
10548 if (dbid == -1 || wk->db->id == dbid) {
10549 if (dictFind(wk->db->dict, wk->key) != NULL)
10550 c->flags |= REDIS_DIRTY_CAS;
10551 }
10552 }
10553 }
10554}
10555
37ab76c9 10556static void watchCommand(redisClient *c) {
10557 int j;
10558
6531c94d 10559 if (c->flags & REDIS_MULTI) {
10560 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
10561 return;
10562 }
37ab76c9 10563 for (j = 1; j < c->argc; j++)
10564 watchForKey(c,c->argv[j]);
10565 addReply(c,shared.ok);
10566}
10567
10568static void unwatchCommand(redisClient *c) {
10569 unwatchAllKeys(c);
10570 c->flags &= (~REDIS_DIRTY_CAS);
10571 addReply(c,shared.ok);
10572}
10573
7f957c92 10574/* ================================= Debugging ============================== */
10575
ba798261 10576/* Compute the sha1 of string at 's' with 'len' bytes long.
10577 * The SHA1 is then xored againt the string pointed by digest.
10578 * Since xor is commutative, this operation is used in order to
10579 * "add" digests relative to unordered elements.
10580 *
10581 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10582static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10583 SHA1_CTX ctx;
10584 unsigned char hash[20], *s = ptr;
10585 int j;
10586
10587 SHA1Init(&ctx);
10588 SHA1Update(&ctx,s,len);
10589 SHA1Final(hash,&ctx);
10590
10591 for (j = 0; j < 20; j++)
10592 digest[j] ^= hash[j];
10593}
10594
10595static void xorObjectDigest(unsigned char *digest, robj *o) {
10596 o = getDecodedObject(o);
10597 xorDigest(digest,o->ptr,sdslen(o->ptr));
10598 decrRefCount(o);
10599}
10600
10601/* This function instead of just computing the SHA1 and xoring it
10602 * against diget, also perform the digest of "digest" itself and
10603 * replace the old value with the new one.
10604 *
10605 * So the final digest will be:
10606 *
10607 * digest = SHA1(digest xor SHA1(data))
10608 *
10609 * This function is used every time we want to preserve the order so
10610 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10611 *
10612 * Also note that mixdigest("foo") followed by mixdigest("bar")
10613 * will lead to a different digest compared to "fo", "obar".
10614 */
10615static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10616 SHA1_CTX ctx;
10617 char *s = ptr;
10618
10619 xorDigest(digest,s,len);
10620 SHA1Init(&ctx);
10621 SHA1Update(&ctx,digest,20);
10622 SHA1Final(digest,&ctx);
10623}
10624
10625static void mixObjectDigest(unsigned char *digest, robj *o) {
10626 o = getDecodedObject(o);
10627 mixDigest(digest,o->ptr,sdslen(o->ptr));
10628 decrRefCount(o);
10629}
10630
10631/* Compute the dataset digest. Since keys, sets elements, hashes elements
10632 * are not ordered, we use a trick: every aggregate digest is the xor
10633 * of the digests of their elements. This way the order will not change
10634 * the result. For list instead we use a feedback entering the output digest
10635 * as input in order to ensure that a different ordered list will result in
10636 * a different digest. */
10637static void computeDatasetDigest(unsigned char *final) {
10638 unsigned char digest[20];
10639 char buf[128];
10640 dictIterator *di = NULL;
10641 dictEntry *de;
10642 int j;
10643 uint32_t aux;
10644
10645 memset(final,0,20); /* Start with a clean result */
10646
10647 for (j = 0; j < server.dbnum; j++) {
10648 redisDb *db = server.db+j;
10649
10650 if (dictSize(db->dict) == 0) continue;
10651 di = dictGetIterator(db->dict);
10652
10653 /* hash the DB id, so the same dataset moved in a different
10654 * DB will lead to a different digest */
10655 aux = htonl(j);
10656 mixDigest(final,&aux,sizeof(aux));
10657
10658 /* Iterate this DB writing every entry */
10659 while((de = dictNext(di)) != NULL) {
cbae1d34 10660 robj *key, *o, *kcopy;
ba798261 10661 time_t expiretime;
10662
10663 memset(digest,0,20); /* This key-val digest */
10664 key = dictGetEntryKey(de);
cbae1d34 10665
10666 if (!server.vm_enabled) {
10667 mixObjectDigest(digest,key);
ba798261 10668 o = dictGetEntryVal(de);
ba798261 10669 } else {
cbae1d34 10670 /* Don't work with the key directly as when VM is active
10671 * this is unsafe: TODO: fix decrRefCount to check if the
10672 * count really reached 0 to avoid this mess */
10673 kcopy = dupStringObject(key);
10674 mixObjectDigest(digest,kcopy);
10675 o = lookupKeyRead(db,kcopy);
10676 decrRefCount(kcopy);
ba798261 10677 }
10678 aux = htonl(o->type);
10679 mixDigest(digest,&aux,sizeof(aux));
10680 expiretime = getExpire(db,key);
10681
10682 /* Save the key and associated value */
10683 if (o->type == REDIS_STRING) {
10684 mixObjectDigest(digest,o);
10685 } else if (o->type == REDIS_LIST) {
10686 list *list = o->ptr;
10687 listNode *ln;
10688 listIter li;
10689
10690 listRewind(list,&li);
10691 while((ln = listNext(&li))) {
10692 robj *eleobj = listNodeValue(ln);
10693
10694 mixObjectDigest(digest,eleobj);
10695 }
10696 } else if (o->type == REDIS_SET) {
10697 dict *set = o->ptr;
10698 dictIterator *di = dictGetIterator(set);
10699 dictEntry *de;
10700
10701 while((de = dictNext(di)) != NULL) {
10702 robj *eleobj = dictGetEntryKey(de);
10703
10704 xorObjectDigest(digest,eleobj);
10705 }
10706 dictReleaseIterator(di);
10707 } else if (o->type == REDIS_ZSET) {
10708 zset *zs = o->ptr;
10709 dictIterator *di = dictGetIterator(zs->dict);
10710 dictEntry *de;
10711
10712 while((de = dictNext(di)) != NULL) {
10713 robj *eleobj = dictGetEntryKey(de);
10714 double *score = dictGetEntryVal(de);
10715 unsigned char eledigest[20];
10716
10717 snprintf(buf,sizeof(buf),"%.17g",*score);
10718 memset(eledigest,0,20);
10719 mixObjectDigest(eledigest,eleobj);
10720 mixDigest(eledigest,buf,strlen(buf));
10721 xorDigest(digest,eledigest,20);
10722 }
10723 dictReleaseIterator(di);
10724 } else if (o->type == REDIS_HASH) {
10725 hashIterator *hi;
10726 robj *obj;
10727
10728 hi = hashInitIterator(o);
10729 while (hashNext(hi) != REDIS_ERR) {
10730 unsigned char eledigest[20];
10731
10732 memset(eledigest,0,20);
10733 obj = hashCurrent(hi,REDIS_HASH_KEY);
10734 mixObjectDigest(eledigest,obj);
10735 decrRefCount(obj);
10736 obj = hashCurrent(hi,REDIS_HASH_VALUE);
10737 mixObjectDigest(eledigest,obj);
10738 decrRefCount(obj);
10739 xorDigest(digest,eledigest,20);
10740 }
10741 hashReleaseIterator(hi);
10742 } else {
10743 redisPanic("Unknown object type");
10744 }
ba798261 10745 /* If the key has an expire, add it to the mix */
10746 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
10747 /* We can finally xor the key-val digest to the final digest */
10748 xorDigest(final,digest,20);
10749 }
10750 dictReleaseIterator(di);
10751 }
10752}
10753
7f957c92 10754static void debugCommand(redisClient *c) {
10755 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10756 *((char*)-1) = 'x';
210e29f7 10757 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10758 if (rdbSave(server.dbfilename) != REDIS_OK) {
10759 addReply(c,shared.err);
10760 return;
10761 }
10762 emptyDb();
10763 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10764 addReply(c,shared.err);
10765 return;
10766 }
10767 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10768 addReply(c,shared.ok);
71c2b467 10769 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10770 emptyDb();
10771 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10772 addReply(c,shared.err);
10773 return;
10774 }
10775 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10776 addReply(c,shared.ok);
333298da 10777 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10778 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10779 robj *key, *val;
10780
10781 if (!de) {
10782 addReply(c,shared.nokeyerr);
10783 return;
10784 }
10785 key = dictGetEntryKey(de);
10786 val = dictGetEntryVal(de);
59146ef3 10787 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10788 key->storage == REDIS_VM_SWAPPING)) {
07efaf74 10789 char *strenc;
10790 char buf[128];
10791
10792 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10793 strenc = strencoding[val->encoding];
10794 } else {
10795 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10796 strenc = buf;
10797 }
ace06542 10798 addReplySds(c,sdscatprintf(sdsempty(),
10799 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 10800 "encoding:%s serializedlength:%lld\r\n",
682ac724 10801 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 10802 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 10803 } else {
10804 addReplySds(c,sdscatprintf(sdsempty(),
10805 "+Key at:%p refcount:%d, value swapped at: page %llu "
10806 "using %llu pages\r\n",
10807 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10808 (unsigned long long) key->vm.usedpages));
10809 }
78ebe4c8 10810 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10811 lookupKeyRead(c->db,c->argv[2]);
10812 addReply(c,shared.ok);
7d30035d 10813 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10814 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10815 robj *key, *val;
10816
10817 if (!server.vm_enabled) {
10818 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10819 return;
10820 }
10821 if (!de) {
10822 addReply(c,shared.nokeyerr);
10823 return;
10824 }
10825 key = dictGetEntryKey(de);
10826 val = dictGetEntryVal(de);
4ef8de8a 10827 /* If the key is shared we want to create a copy */
10828 if (key->refcount > 1) {
10829 robj *newkey = dupStringObject(key);
10830 decrRefCount(key);
10831 key = dictGetEntryKey(de) = newkey;
10832 }
10833 /* Swap it */
7d30035d 10834 if (key->storage != REDIS_VM_MEMORY) {
10835 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 10836 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 10837 dictGetEntryVal(de) = NULL;
10838 addReply(c,shared.ok);
10839 } else {
10840 addReply(c,shared.err);
10841 }
59305dc7 10842 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10843 long keys, j;
10844 robj *key, *val;
10845 char buf[128];
10846
10847 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10848 return;
10849 for (j = 0; j < keys; j++) {
10850 snprintf(buf,sizeof(buf),"key:%lu",j);
10851 key = createStringObject(buf,strlen(buf));
10852 if (lookupKeyRead(c->db,key) != NULL) {
10853 decrRefCount(key);
10854 continue;
10855 }
10856 snprintf(buf,sizeof(buf),"value:%lu",j);
10857 val = createStringObject(buf,strlen(buf));
10858 dictAdd(c->db->dict,key,val);
10859 }
10860 addReply(c,shared.ok);
ba798261 10861 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
10862 unsigned char digest[20];
10863 sds d = sdsnew("+");
10864 int j;
10865
10866 computeDatasetDigest(digest);
10867 for (j = 0; j < 20; j++)
10868 d = sdscatprintf(d, "%02x",digest[j]);
10869
10870 d = sdscatlen(d,"\r\n",2);
10871 addReplySds(c,d);
7f957c92 10872 } else {
333298da 10873 addReplySds(c,sdsnew(
bdcb92f2 10874 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 10875 }
10876}
56906eef 10877
6c96ba7d 10878static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 10879 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
fdfb02e7 10880 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
dfc5e96c 10881#ifdef HAVE_BACKTRACE
10882 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10883 *((char*)-1) = 'x';
10884#endif
10885}
10886
c651fd9e 10887static void _redisPanic(char *msg, char *file, int line) {
10888 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
17772754 10889 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
c651fd9e 10890#ifdef HAVE_BACKTRACE
10891 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10892 *((char*)-1) = 'x';
10893#endif
10894}
10895
bcfc686d 10896/* =================================== Main! ================================ */
56906eef 10897
bcfc686d 10898#ifdef __linux__
/* Return the integer found on the first line of
 * /proc/sys/vm/overcommit_memory, or -1 if the file can't be opened or
 * read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    if (fgets(line,sizeof(line),fp) != NULL) value = atoi(line);
    fclose(fp);
    return value;
}
10912
10913void linuxOvercommitMemoryWarning(void) {
10914 if (linuxOvercommitMemoryValue() == 0) {
7ccd2d0a 10915 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
bcfc686d 10916 }
10917}
10918#endif /* __linux__ */
10919
10920static void daemonize(void) {
10921 int fd;
10922 FILE *fp;
10923
10924 if (fork() != 0) exit(0); /* parent exits */
10925 setsid(); /* create a new session */
10926
10927 /* Every output goes to /dev/null. If Redis is daemonized but
10928 * the 'logfile' is set to 'stdout' in the configuration file
10929 * it will not log at all. */
10930 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10931 dup2(fd, STDIN_FILENO);
10932 dup2(fd, STDOUT_FILENO);
10933 dup2(fd, STDERR_FILENO);
10934 if (fd > STDERR_FILENO) close(fd);
10935 }
10936 /* Try to write the pid file */
10937 fp = fopen(server.pidfile,"w");
10938 if (fp) {
10939 fprintf(fp,"%d\n",getpid());
10940 fclose(fp);
56906eef 10941 }
56906eef 10942}
10943
42ab0172 10944static void version() {
8a3b0d2d 10945 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
10946 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
42ab0172
AO
10947 exit(0);
10948}
10949
723fb69b
AO
/* Print the command line usage on stderr and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs("       ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
10955
bcfc686d 10956int main(int argc, char **argv) {
9651a787 10957 time_t start;
10958
bcfc686d 10959 initServerConfig();
1a132bbc 10960 sortCommandTable();
bcfc686d 10961 if (argc == 2) {
44efe66e 10962 if (strcmp(argv[1], "-v") == 0 ||
10963 strcmp(argv[1], "--version") == 0) version();
10964 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 10965 resetServerSaveParams();
10966 loadServerConfig(argv[1]);
723fb69b
AO
10967 } else if ((argc > 2)) {
10968 usage();
bcfc686d 10969 } else {
10970 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10971 }
bcfc686d 10972 if (server.daemonize) daemonize();
71c54b21 10973 initServer();
bcfc686d 10974 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10975#ifdef __linux__
10976 linuxOvercommitMemoryWarning();
10977#endif
9651a787 10978 start = time(NULL);
bcfc686d 10979 if (server.appendonly) {
10980 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 10981 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 10982 } else {
10983 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 10984 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 10985 }
bcfc686d 10986 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 10987 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 10988 aeMain(server.el);
10989 aeDeleteEventLoop(server.el);
10990 return 0;
10991}
10992
10993/* ============================= Backtrace support ========================= */
10994
10995#ifdef HAVE_BACKTRACE
10996static char *findFuncName(void *pointer, unsigned long *offset);
10997
/* Return the instruction pointer saved in the signal context 'uc', or
 * NULL on platforms where the code does not know how to extract it.
 *
 * On Linux the general purpose registers are read by numeric index: the
 * instruction pointer is gregs[14] (EIP) on i386 and gregs[16] (RIP) on
 * x86_64. The two architectures use different register sets, so a single
 * REG_EIP index can't serve both. */
static void *getMcontextEip(ucontext_t *uc) {
#if defined(__FreeBSD__)
    return (void*) uc->uc_mcontext.mc_eip;
#elif defined(__dietlibc__)
    return (void*) uc->uc_mcontext.eip;
#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
  #if __x86_64__
    return (void*) uc->uc_mcontext->__ss.__rip;
  #else
    return (void*) uc->uc_mcontext->__ss.__eip;
  #endif
#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
  #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
    return (void*) uc->uc_mcontext->__ss.__rip;
  #else
    return (void*) uc->uc_mcontext->__ss.__eip;
  #endif
#elif defined(__i386__)
    return (void*) uc->uc_mcontext.gregs[14]; /* Linux 32 bit: EIP */
#elif defined(__X86_64__) || defined(__x86_64__)
    return (void*) uc->uc_mcontext.gregs[16]; /* Linux 64 bit: RIP */
#elif defined(__ia64__) /* Linux IA64 */
    return (void*) uc->uc_mcontext.sc_ip;
#else
    return NULL;
#endif
}
11023
11024static void segvHandler(int sig, siginfo_t *info, void *secret) {
11025 void *trace[100];
11026 char **messages = NULL;
11027 int i, trace_size = 0;
11028 unsigned long offset=0;
56906eef 11029 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 11030 sds infostring;
56906eef 11031 REDIS_NOTUSED(info);
11032
11033 redisLog(REDIS_WARNING,
11034 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 11035 infostring = genRedisInfoString();
11036 redisLog(REDIS_WARNING, "%s",infostring);
11037 /* It's not safe to sdsfree() the returned string under memory
11038 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 11039
56906eef 11040 trace_size = backtrace(trace, 100);
de96dbfe 11041 /* overwrite sigaction with caller's address */
b91cf5ef 11042 if (getMcontextEip(uc) != NULL) {
11043 trace[1] = getMcontextEip(uc);
11044 }
56906eef 11045 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 11046
d76412d1 11047 for (i=1; i<trace_size; ++i) {
56906eef 11048 char *fn = findFuncName(trace[i], &offset), *p;
11049
11050 p = strchr(messages[i],'+');
11051 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11052 redisLog(REDIS_WARNING,"%s", messages[i]);
11053 } else {
11054 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11055 }
11056 }
b177fd30 11057 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 11058 _exit(0);
fe3bbfbe 11059}
56906eef 11060
fab43727 11061static void sigtermHandler(int sig) {
11062 REDIS_NOTUSED(sig);
b58ba105 11063
fab43727 11064 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11065 server.shutdown_asap = 1;
b58ba105
AM
11066}
11067
56906eef 11068static void setupSigSegvAction(void) {
11069 struct sigaction act;
11070
11071 sigemptyset (&act.sa_mask);
11072 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11073 * is used. Otherwise, sa_handler is used */
11074 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11075 act.sa_sigaction = segvHandler;
11076 sigaction (SIGSEGV, &act, NULL);
11077 sigaction (SIGBUS, &act, NULL);
12fea928 11078 sigaction (SIGFPE, &act, NULL);
11079 sigaction (SIGILL, &act, NULL);
11080 sigaction (SIGBUS, &act, NULL);
b58ba105
AM
11081
11082 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
fab43727 11083 act.sa_handler = sigtermHandler;
b58ba105 11084 sigaction (SIGTERM, &act, NULL);
e65fdc78 11085 return;
56906eef 11086}
e65fdc78 11087
bcfc686d 11088#include "staticsymbols.h"
11089/* This function try to convert a pointer into a function name. It's used in
11090 * oreder to provide a backtrace under segmentation fault that's able to
11091 * display functions declared as static (otherwise the backtrace is useless). */
11092static char *findFuncName(void *pointer, unsigned long *offset){
11093 int i, ret = -1;
11094 unsigned long off, minoff = 0;
ed9b544e 11095
bcfc686d 11096 /* Try to match against the Symbol with the smallest offset */
11097 for (i=0; symsTable[i].pointer; i++) {
11098 unsigned long lp = (unsigned long) pointer;
0bc03378 11099
bcfc686d 11100 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11101 off=lp-symsTable[i].pointer;
11102 if (ret < 0 || off < minoff) {
11103 minoff=off;
11104 ret=i;
11105 }
11106 }
0bc03378 11107 }
bcfc686d 11108 if (ret == -1) return NULL;
11109 *offset = minoff;
11110 return symsTable[ret].name;
0bc03378 11111}
bcfc686d 11112#else /* HAVE_BACKTRACE */
/* Without backtrace support no signal handler is installed here. */
static void setupSigSegvAction(void) {
    /* Intentionally empty */
}
bcfc686d 11115#endif /* HAVE_BACKTRACE */
0bc03378 11116
ed9b544e 11117
ed9b544e 11118
bcfc686d 11119/* The End */
11120
11121
ed9b544e 11122