]> git.saurik.com Git - redis.git/blame - redis.c
redis.conf new features the new option, a minor typo preventing the compilation fixed
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
9005896c 30#define REDIS_VERSION "2.1.1"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
40#include <signal.h>
fbf9bcdb 41
42#ifdef HAVE_BACKTRACE
c9468bcf 43#include <execinfo.h>
44#include <ucontext.h>
fbf9bcdb 45#endif /* HAVE_BACKTRACE */
46
ed9b544e 47#include <sys/wait.h>
48#include <errno.h>
49#include <assert.h>
50#include <ctype.h>
51#include <stdarg.h>
52#include <inttypes.h>
53#include <arpa/inet.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#include <sys/time.h>
57#include <sys/resource.h>
2895e862 58#include <sys/uio.h>
f78fd11b 59#include <limits.h>
fb82e75c 60#include <float.h>
a7866db6 61#include <math.h>
92f8e882 62#include <pthread.h>
0bc1b2f6 63
64#if defined(__sun)
5043dff3 65#include "solarisfixes.h"
66#endif
ed9b544e 67
c9468bcf 68#include "redis.h"
ed9b544e 69#include "ae.h" /* Event driven programming library */
70#include "sds.h" /* Dynamic safe strings */
71#include "anet.h" /* Networking the easy way */
72#include "dict.h" /* Hash tables */
73#include "adlist.h" /* Linked lists */
74#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 75#include "lzf.h" /* LZF compression library */
76#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
ba798261 77#include "zipmap.h" /* Compact dictionary-alike data structure */
78#include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
5436146c 79#include "release.h" /* Release and/or git repository information */
ed9b544e 80
81/* Error codes */
82#define REDIS_OK 0
83#define REDIS_ERR -1
84
85/* Static server configuration */
86#define REDIS_SERVERPORT 6379 /* TCP port */
87#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 88#define REDIS_IOBUF_LEN 1024
ed9b544e 89#define REDIS_LOADBUF_LEN 1024
248ea310 90#define REDIS_STATIC_ARGS 8
ed9b544e 91#define REDIS_DEFAULT_DBNUM 16
92#define REDIS_CONFIGLINE_MAX 1024
93#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
94#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
8ca3e9d1 95#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
6f376729 96#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 97#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
98
99/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
100#define REDIS_WRITEV_THRESHOLD 3
101/* Max number of iovecs used for each writev call */
102#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 103
104/* Hash table parameters */
105#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 106
107/* Command flags */
3fd78bcd 108#define REDIS_CMD_BULK 1 /* Bulk write command */
109#define REDIS_CMD_INLINE 2 /* Inline command */
110/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
111 this flag will return an error when the 'maxmemory' option is set in the
112 config file and the server is using more than maxmemory bytes of memory.
113 In short these commands are denied on low memory conditions. */
114#define REDIS_CMD_DENYOOM 4
4005fef1 115#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 116
117/* Object types */
118#define REDIS_STRING 0
119#define REDIS_LIST 1
120#define REDIS_SET 2
1812e024 121#define REDIS_ZSET 3
122#define REDIS_HASH 4
f78fd11b 123
5234952b 124/* Objects encoding. Some kinds of objects like Strings and Hashes can be
125 * internally represented in multiple ways. The 'encoding' field of the object
126 * is set to one of these values for this object. */
942a3961 127#define REDIS_ENCODING_RAW 0 /* Raw representation */
128#define REDIS_ENCODING_INT 1 /* Encoded as integer */
5234952b 129#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
130#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
942a3961 131
/* Printable names of the object encodings, listed in the same order as the
 * REDIS_ENCODING_* values (raw=0, int=1, zipmap=2, hashtable=3). */
07efaf74 132static char* strencoding[] = {
133 "raw", "int", "zipmap", "hashtable"
134};
135
f78fd11b 136/* Object types only used for dumping to disk */
bb32ede5 137#define REDIS_EXPIRETIME 253
ed9b544e 138#define REDIS_SELECTDB 254
139#define REDIS_EOF 255
140
f78fd11b 141/* Defines related to the dump file format. To store 32 bits lengths for short
142 * keys requires a lot of space, so we check the most significant 2 bits of
143 * the first byte to interpret the length:
144 *
145 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
146 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
147 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 148 * 11|000000 this means: specially encoded object will follow. The six bits
149 * number specify the kind of object that follows.
150 * See the REDIS_RDB_ENC_* defines.
f78fd11b 151 *
10c43610 152 * Lengths up to 63 are stored using a single byte, most DB keys, and many
153 * values, will fit inside. */
f78fd11b 154#define REDIS_RDB_6BITLEN 0
155#define REDIS_RDB_14BITLEN 1
156#define REDIS_RDB_32BITLEN 2
17be1a4a 157#define REDIS_RDB_ENCVAL 3
f78fd11b 158#define REDIS_RDB_LENERR UINT_MAX
159
a4d1ba9a 160/* When a length of a string object stored on disk has the first two bits
161 * set, the remaining six bits specify a special encoding for the object
162 * according to the following defines: */
163#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
164#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
165#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 166#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
a4d1ba9a 167
75680a3c 168/* Virtual memory object->where field. */
169#define REDIS_VM_MEMORY 0 /* The object is on memory */
170#define REDIS_VM_SWAPPED 1 /* The object is on disk */
171#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
172#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
173
06224fec 174/* Virtual memory static configuration stuff.
175 * Check vmFindContiguousPages() to know more about these magic numbers. */
176#define REDIS_VM_MAX_NEAR_PAGES 65536
177#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 178#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 179#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
f6c0bba8 180/* The following is the *percentage* of completed I/O jobs to process when the
181 * handler is called. While Virtual Memory I/O operations are performed by
182 * threads, these operations must be processed by the main thread when completed
183 * in order to take effect. */
c953f24b 184#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 185
ed9b544e 186/* Client flags */
d5d55fc3 187#define REDIS_SLAVE 1 /* This client is a slave server */
188#define REDIS_MASTER 2 /* This client is a master server */
189#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
190#define REDIS_MULTI 8 /* This client is in a MULTI context */
191#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
192#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
37ab76c9 193#define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
ed9b544e 194
40d224a9 195/* Slave replication state - slave side */
ed9b544e 196#define REDIS_REPL_NONE 0 /* No active replication */
197#define REDIS_REPL_CONNECT 1 /* Must connect to master */
198#define REDIS_REPL_CONNECTED 2 /* Connected to master */
199
40d224a9 200/* Slave replication state - from the point of view of master
201 * Note that in SEND_BULK and ONLINE state the slave receives new updates
202 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
203 * to start the next background saving in order to send updates to it. */
204#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
205#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
206#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
207#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
208
ed9b544e 209/* List related stuff */
210#define REDIS_HEAD 0
211#define REDIS_TAIL 1
212
213/* Sort operations */
214#define REDIS_SORT_GET 0
443c6409 215#define REDIS_SORT_ASC 1
216#define REDIS_SORT_DESC 2
ed9b544e 217#define REDIS_SORTKEY_MAX 1024
218
219/* Log levels */
220#define REDIS_DEBUG 0
f870935d 221#define REDIS_VERBOSE 1
222#define REDIS_NOTICE 2
223#define REDIS_WARNING 3
ed9b544e 224
/* Anti-warning macro: evaluates V and discards the result so that an
 * otherwise unused parameter or variable does not trigger a compiler warning.
 * The argument is parenthesized so that a compound expression such as
 * 'a + b' is cast to void as a whole instead of only its first operand. */
#define REDIS_NOTUSED(V) ((void) (V))
227
6b47e12e 228#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
229#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 230
48f0308a 231/* Append only defines */
232#define APPENDFSYNC_NO 0
233#define APPENDFSYNC_ALWAYS 1
234#define APPENDFSYNC_EVERYSEC 2
235
cbba7dd7 236/* Hashes related defaults */
237#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
238#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
239
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): when _e is false, calls _redisAssert() with the
 * stringified expression, file and line, then terminates with _exit(1).
 * When _e is true it evaluates to (void)0.
 *
 * redisPanic(_e): unconditionally calls _redisPanic() with the stringified
 * argument, file and line, then terminates with _exit(1). Because the
 * argument is stringified with '#', passing a string literal yields a message
 * that includes the surrounding double quotes.
 *
 * Both expansions are fully parenthesized so they behave as a single
 * expression wherever they are used (e.g. as an operand of ?: or &&). */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
dfc5e96c 245
ed9b544e 246/*================================= Data types ============================== */
247
248/* A redis object, that is a type able to hold a string / list / set */
75680a3c 249
250/* The VM object structure: on-disk location (page and page count) and
 * last access time of an object. */
251struct redisObjectVM {
3a66edc7 252 off_t page; /* the page at which the object is stored on disk */
253 off_t usedpages; /* number of pages used on disk */
254 time_t atime; /* Last access time */
75680a3c 255} vm; /* NOTE(review): this also defines a file-scope variable named 'vm'; confirm it is intended. */
256
257/* The actual Redis Object */
ed9b544e 258typedef struct redisObject {
ed9b544e 259 void *ptr; /* object payload; presumably interpreted according to type/encoding */
942a3961 260 unsigned char type; /* one of the object types (REDIS_STRING, REDIS_LIST, ...) */
261 unsigned char encoding; /* one of the REDIS_ENCODING_* values */
d894161b 262 unsigned char storage; /* If this object is a key, where is the value?
263 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
264 unsigned char vtype; /* If this object is a key, and value is swapped out,
265 * this is the type of the swapped out object. */
ed9b544e 266 int refcount; /* reference count (see incrRefCount/decrRefCount) */
75680a3c 267 /* VM fields, these are only allocated if VM is active, otherwise the
268 * object allocation function will just allocate
269 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
270 * Redis without VM active will not have any overhead.
 * For this reason 'vm' must remain the last member. */
271 struct redisObjectVM vm;
ed9b544e 272} robj;
273
/* Macro used to initialize a Redis string object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is an robj lvalue expression and _ptr is the value stored in its
 * 'ptr' field. The macro sets refcount to 1, type to REDIS_STRING and
 * encoding to REDIS_ENCODING_RAW; 'storage' is set to REDIS_VM_MEMORY only
 * when server.vm_enabled is true. No other field is touched.
 *
 * Both arguments are parenthesized in the expansion so that expressions
 * such as '*objptr' expand correctly. _var is evaluated several times, so it
 * must not have side effects.
 *
 * NOTE(review): the expansion ends with "while(0);". The trailing semicolon
 * is kept because callers outside this view may rely on it, but it means the
 * macro cannot be used as the unbraced body of an 'if' that has an 'else'. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0);
285
/* A single Redis database: the keyspace dictionary plus the per-key
 * bookkeeping dictionaries (expires, blocked clients, VM I/O, WATCH). */
3305306f 286typedef struct redisDb {
4409877e 287 dict *dict; /* The keyspace for this DB */
288 dict *expires; /* Timeout of keys with a timeout set */
37ab76c9 289 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 290 dict *io_keys; /* Keys with clients waiting for VM I/O */
37ab76c9 291 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
3305306f 292 int id; /* numeric id of this DB -- presumably its index in server.db; confirm */
293} redisDb;
294
6e469882 295/* Client MULTI/EXEC state */
/* One command held in a multiState: its argument vector, argument count and
 * the command it refers to. */
296typedef struct multiCmd {
297 robj **argv;
298 int argc;
299 struct redisCommand *cmd;
300} multiCmd;
301
/* The set of commands accumulated for a client in a MULTI context
 * (stored in redisClient.mstate). */
302typedef struct multiState {
303 multiCmd *commands; /* Array of MULTI commands */
304 int count; /* Total number of MULTI commands */
305} multiState;
306
ed9b544e 307/* With multiplexing we need to take per-client state.
308 * Clients are taken in a linked list. */
309typedef struct redisClient {
310 int fd;
3305306f 311 redisDb *db;
ed9b544e 312 int dictid;
313 sds querybuf;
e8a74421 314 robj **argv, **mbargv;
315 int argc, mbargc;
40d224a9 316 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 317 int multibulk; /* multi bulk command format active */
ed9b544e 318 list *reply;
319 int sentlen;
320 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 321 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 322 int slaveseldb; /* slave selected db, if this client is a slave */
323 int authenticated; /* when requirepass is non-NULL */
324 int replstate; /* replication state if this is a slave */
325 int repldbfd; /* replication DB file descriptor */
6e469882 326 long repldboff; /* replication DB file offset */
40d224a9 327 off_t repldbsize; /* replication DB file size */
6e469882 328 multiState mstate; /* MULTI/EXEC state */
37ab76c9 329 robj **blocking_keys; /* The keys we are waiting on to terminate a blocking
4409877e 330 * operation such as BLPOP. Otherwise NULL. */
37ab76c9 331 int blocking_keys_num; /* Number of blocking keys */
4409877e 332 time_t blockingto; /* Blocking operation timeout. If UNIX current time
333 * is >= blockingto then the operation timed out. */
92f8e882 334 list *io_keys; /* Keys this client is waiting to be loaded from the
335 * swap file in order to continue. */
37ab76c9 336 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
ffc6b7f8 337 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
338 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 339} redisClient;
340
/* A (seconds, changes) pair; the server keeps an array of these in
 * server.saveparams.
 * NOTE(review): presumably "save if at least <changes> changes happened in
 * <seconds> seconds" -- confirm against the code that consumes it. */
341struct saveparam {
342 time_t seconds;
343 int changes;
344};
345
346/* Global server state structure */
347struct redisServer {
348 int port;
349 int fd;
3305306f 350 redisDb *db;
ed9b544e 351 long long dirty; /* changes to DB from the last save */
352 list *clients;
87eca727 353 list *slaves, *monitors;
ed9b544e 354 char neterr[ANET_ERR_LEN];
355 aeEventLoop *el;
356 int cronloops; /* number of times the cron function ran */
357 list *objfreelist; /* A list of freed objects to avoid malloc() */
358 time_t lastsave; /* Unix time of the last successful save */
ed9b544e 359 /* Fields used only for stats */
360 time_t stat_starttime; /* server start time */
361 long long stat_numcommands; /* number of processed commands */
362 long long stat_numconnections; /* number of connections received */
2a6a2ed1 363 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 364 /* Configuration */
365 int verbosity;
366 int glueoutputbuf;
367 int maxidletime;
368 int dbnum;
369 int daemonize;
44b38ef4 370 int appendonly;
48f0308a 371 int appendfsync; /* one of the APPENDFSYNC_* values */
38db9171 372 int no_appendfsync_on_rewrite;
fab43727 373 int shutdown_asap;
48f0308a 374 time_t lastfsync;
44b38ef4 375 int appendfd;
376 int appendseldb;
ed329fcf 377 char *pidfile;
9f3c422c 378 pid_t bgsavechildpid;
9d65a1bb 379 pid_t bgrewritechildpid;
380 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
28ed1f33 381 sds aofbuf; /* AOF buffer, written before entering the event loop */
ed9b544e 382 struct saveparam *saveparams;
383 int saveparamslen;
384 char *logfile;
385 char *bindaddr;
386 char *dbfilename;
44b38ef4 387 char *appendfilename;
abcb223e 388 char *requirepass;
121f70cf 389 int rdbcompression;
8ca3e9d1 390 int activerehashing;
ed9b544e 391 /* Replication related */
392 int isslave;
d0ccebcf 393 char *masterauth;
ed9b544e 394 char *masterhost;
395 int masterport;
40d224a9 396 redisClient *master; /* client that is master for this slave */
ed9b544e 397 int replstate;
285add55 398 unsigned int maxclients;
4ef8de8a 399 unsigned long long maxmemory;
d5d55fc3 400 unsigned int blpop_blocked_clients;
401 unsigned int vm_blocked_clients;
ed9b544e 402 /* Sort parameters - qsort_r() is only available under BSD so we
403 * have to take this state global, in order to pass it to sortCompare() */
404 int sort_desc;
405 int sort_alpha;
406 int sort_bypattern;
75680a3c 407 /* Virtual memory configuration */
408 int vm_enabled;
054e426d 409 char *vm_swap_file;
75680a3c 410 off_t vm_page_size;
411 off_t vm_pages;
4ef8de8a 412 unsigned long long vm_max_memory;
cbba7dd7 413 /* Hashes config */
414 size_t hash_max_zipmap_entries;
415 size_t hash_max_zipmap_value;
75680a3c 416 /* Virtual memory state */
417 FILE *vm_fp;
418 int vm_fd;
419 off_t vm_next_page; /* Next probably empty page */
420 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 421 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 422 time_t unixtime; /* Unix time sampled every second. */
92f8e882 423 /* Virtual memory I/O threads stuff */
92f8e882 424 /* An I/O thread processes an element taken from the io_newjobs queue and
996cb5f7 425 * puts the result of the operation in the io_processed list. While the
426 * job is being processed, it's put on the io_processing queue. */
427 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
428 list *io_processing; /* List of VM I/O jobs being processed */
429 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 430 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 431 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job
 * NOTE(review): these names do not match the list
 * fields above; presumably the three io_* job lists */
a5819310 432 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
433 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 434 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 435 int io_active_threads; /* Number of running I/O threads */
436 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 437 /* Our main thread is blocked on the event loop, looking for sockets ready
438 * to be read or written, so when a threaded I/O operation is ready to be
439 * processed by the main thread, the I/O thread will use a unix pipe to
440 * awake the main thread. The following are the two pipe FDs. */
441 int io_ready_pipe_read;
442 int io_ready_pipe_write;
7d98e08c 443 /* Virtual memory stats */
444 unsigned long long vm_stats_used_pages;
445 unsigned long long vm_stats_swapped_objects;
446 unsigned long long vm_stats_swapouts;
447 unsigned long long vm_stats_swapins;
befec3cd 448 /* Pubsub */
ffc6b7f8 449 dict *pubsub_channels; /* Map channels to list of subscribed clients */
450 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 451 /* Misc */
b9bc0eef 452 FILE *devnull;
ed9b544e 453};
454
/* Pairs a client with a pattern object. The pubsub_patterns list of the
 * server structure is described as a list of these. */
ffc6b7f8 455typedef struct pubsubPattern {
456 redisClient *client;
457 robj *pattern;
458} pubsubPattern;
459
/* Signature of a command implementation: receives the client issuing it. */
ed9b544e 460typedef void redisCommandProc(redisClient *c);
/* Signature of the optional VM preload hook stored in
 * redisCommand.vm_preload_proc. */
ca1788b5 461typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
/* Description of a command: its name, implementation, arity, flags and
 * the information used to find which arguments are keys for VM preloading. */
ed9b544e 462struct redisCommand {
463 char *name;
464 redisCommandProc *proc;
465 int arity; /* NOTE(review): exact arity convention is not visible here; confirm */
466 int flags; /* presumably a combination of the REDIS_CMD_* flags */
76583ea4
PN
467 /* Use a function to determine which keys need to be loaded
468 * in the background prior to executing this command. Takes precedence
469 * over vm_firstkey and others, ignored when NULL */
ca1788b5 470 redisVmPreloadProc *vm_preload_proc;
7c775e09 471 /* What keys should be loaded in background when calling this command? */
472 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
473 int vm_lastkey; /* The last argument that's a key */
474 int vm_keystep; /* The step between first and last key */
ed9b544e 475};
476
/* Associates a function name with an address stored as an unsigned long.
 * NOTE(review): presumably used to resolve addresses to names when printing
 * stack traces -- confirm against the code that uses it. */
de96dbfe 477struct redisFunctionSym {
478 char *name;
56906eef 479 unsigned long pointer;
de96dbfe 480};
481
/* An object paired with either a numeric score or a comparison object.
 * NOTE(review): which union member is active presumably depends on the sort
 * mode (see server.sort_alpha / sort_bypattern); confirm in the sort code. */
ed9b544e 482typedef struct _redisSortObject {
483 robj *obj;
484 union {
485 double score;
486 robj *cmpobj;
487 } u;
488} redisSortObject;
489
/* A sort operation: a type code plus a pattern object.
 * NOTE(review): 'type' is presumably one of the REDIS_SORT_* values; confirm. */
490typedef struct _redisSortOperation {
491 int type;
492 robj *pattern;
493} redisSortOperation;
494
6b47e12e 495/* ZSETs use a specialized version of Skiplists */
496
/* A skiplist node: a (score, obj) pair with forward/backward links. */
497typedef struct zskiplistNode {
498 struct zskiplistNode **forward; /* array of forward pointers -- presumably one per level; confirm */
e3870fab 499 struct zskiplistNode *backward;
912b9165 500 unsigned int *span; /* NOTE(review): presumably per-level distances used for rank; confirm */
6b47e12e 501 double score;
502 robj *obj;
503} zskiplistNode;
504
/* A skiplist: header and tail nodes, element count and level.
 * NOTE(review): 'level' is presumably the highest level currently in use,
 * bounded by ZSKIPLIST_MAXLEVEL; confirm. */
505typedef struct zskiplist {
e3870fab 506 struct zskiplistNode *header, *tail;
d13f767c 507 unsigned long length;
6b47e12e 508 int level;
509} zskiplist;
510
/* A sorted set: a dictionary together with a skiplist. */
1812e024 511typedef struct zset {
512 dict *dict;
6b47e12e 513 zskiplist *zsl;
1812e024 514} zset;
515
6b47e12e 516/* Our shared "common" objects */
517
05df7621 518#define REDIS_SHARED_INTEGERS 10000 /* number of entries in shared.integers[] */
/* Pointers to commonly used objects, collected in the single global
 * instance 'shared' defined together with the struct. */
ed9b544e 519struct sharedObjectsStruct {
c937aa89 520 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 521 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 522 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
523 *outofrangeerr, *plus,
ed9b544e 524 *select0, *select1, *select2, *select3, *select4,
befec3cd 525 *select5, *select6, *select7, *select8, *select9,
c8d0ea0e 526 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
527 *mbulk4, *psubscribebulk, *punsubscribebulk,
528 *integers[REDIS_SHARED_INTEGERS];
ed9b544e 529} shared;
530
a7866db6 531/* Global vars that are actually used as constants. The following double
532 * values are used for double on-disk serialization, and are initialized
533 * at runtime to avoid strange compiler optimizations. */
534
535static double R_Zero, R_PosInf, R_NegInf, R_Nan;
536
92f8e882 537/* VM threaded I/O request message */
b9bc0eef 538#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
539#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
540#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
/* A single VM I/O job: what to do (type), on which key/value, and where in
 * the swap area. */
d5d55fc3 541typedef struct iojob {
996cb5f7 542 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 543 redisDb *db;/* Redis database */
92f8e882 544 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 545 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
92f8e882 546 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
547 off_t page; /* Swap page where to read/write the object */
248ea310 548 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 549 int canceled; /* True if this command was canceled by blocking side of VM */
550 pthread_t thread; /* ID of the thread processing this entry */
551} iojob;
92f8e882 552
ed9b544e 553/*================================ Prototypes =============================== */
554
555static void freeStringObject(robj *o);
556static void freeListObject(robj *o);
557static void freeSetObject(robj *o);
558static void decrRefCount(void *o);
559static robj *createObject(int type, void *ptr);
560static void freeClient(redisClient *c);
f78fd11b 561static int rdbLoad(char *filename);
ed9b544e 562static void addReply(redisClient *c, robj *obj);
563static void addReplySds(redisClient *c, sds s);
564static void incrRefCount(robj *o);
f78fd11b 565static int rdbSaveBackground(char *filename);
ed9b544e 566static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 567static robj *dupStringObject(robj *o);
248ea310 568static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
dd142b9c 569static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
28ed1f33 570static void flushAppendOnlyFile(void);
44b38ef4 571static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 572static int syncWithMaster(void);
05df7621 573static robj *tryObjectEncoding(robj *o);
9d65a1bb 574static robj *getDecodedObject(robj *o);
3305306f 575static int removeExpire(redisDb *db, robj *key);
576static int expireIfNeeded(redisDb *db, robj *key);
577static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 578static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 579static int deleteKey(redisDb *db, robj *key);
bb32ede5 580static time_t getExpire(redisDb *db, robj *key);
581static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 582static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 583static void freeMemoryIfNeeded(void);
de96dbfe 584static int processCommand(redisClient *c);
56906eef 585static void setupSigSegvAction(void);
a3b21203 586static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 587static void aofRemoveTempFile(pid_t childpid);
0ea663ea 588static size_t stringObjectLen(robj *o);
638e42ac 589static void processInputBuffer(redisClient *c);
6b47e12e 590static zskiplist *zslCreate(void);
fd8ccf44 591static void zslFree(zskiplist *zsl);
2b59cfdf 592static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 593static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 594static void initClientMultiState(redisClient *c);
595static void freeClientMultiState(redisClient *c);
596static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 597static void unblockClientWaitingData(redisClient *c);
4409877e 598static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 599static void vmInit(void);
a35ddf12 600static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 601static robj *vmLoadObject(robj *key);
7e69548d 602static robj *vmPreviewObject(robj *key);
a69a0c9c 603static int vmSwapOneObjectBlocking(void);
604static int vmSwapOneObjectThreaded(void);
7e69548d 605static int vmCanSwapOut(void);
a5819310 606static int tryFreeOneObjectFromFreelist(void);
996cb5f7 607static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
608static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
609static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 610static void lockThreadedIO(void);
611static void unlockThreadedIO(void);
612static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
613static void freeIOJob(iojob *j);
614static void queueIOJob(iojob *j);
a5819310 615static int vmWriteObjectOnSwap(robj *o, off_t page);
616static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 617static void waitEmptyIOJobsQueue(void);
618static void vmReopenSwapFile(void);
970e10bb 619static int vmFreePage(off_t page);
ca1788b5 620static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
3805e04f 621static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
0a6f3f0f 622static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
d5d55fc3 623static int dontWaitForSwappedKey(redisClient *c, robj *key);
624static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
625static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
626static struct redisCommand *lookupCommand(char *name);
627static void call(redisClient *c, struct redisCommand *cmd);
628static void resetClient(redisClient *c);
ada386b2 629static void convertToRealHash(robj *o);
ffc6b7f8 630static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
631static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
632static void freePubsubPattern(void *p);
633static int listMatchPubsubPattern(void *a, void *b);
634static int compareStringObjects(robj *a, robj *b);
bf028098 635static int equalStringObjects(robj *a, robj *b);
befec3cd 636static void usage();
8f63ddca 637static int rewriteAppendOnlyFileBackground(void);
242a64f3 638static int vmSwapObjectBlocking(robj *key, robj *val);
fab43727 639static int prepareForShutdown();
37ab76c9 640static void touchWatchedKey(redisDb *db, robj *key);
9b30e1a2 641static void touchWatchedKeysOnFlush(int dbid);
37ab76c9 642static void unwatchAllKeys(redisClient *c);
ed9b544e 643
abcb223e 644static void authCommand(redisClient *c);
ed9b544e 645static void pingCommand(redisClient *c);
646static void echoCommand(redisClient *c);
647static void setCommand(redisClient *c);
648static void setnxCommand(redisClient *c);
526d00a5 649static void setexCommand(redisClient *c);
ed9b544e 650static void getCommand(redisClient *c);
651static void delCommand(redisClient *c);
652static void existsCommand(redisClient *c);
653static void incrCommand(redisClient *c);
654static void decrCommand(redisClient *c);
655static void incrbyCommand(redisClient *c);
656static void decrbyCommand(redisClient *c);
657static void selectCommand(redisClient *c);
658static void randomkeyCommand(redisClient *c);
659static void keysCommand(redisClient *c);
660static void dbsizeCommand(redisClient *c);
661static void lastsaveCommand(redisClient *c);
662static void saveCommand(redisClient *c);
663static void bgsaveCommand(redisClient *c);
9d65a1bb 664static void bgrewriteaofCommand(redisClient *c);
ed9b544e 665static void shutdownCommand(redisClient *c);
666static void moveCommand(redisClient *c);
667static void renameCommand(redisClient *c);
668static void renamenxCommand(redisClient *c);
669static void lpushCommand(redisClient *c);
670static void rpushCommand(redisClient *c);
671static void lpopCommand(redisClient *c);
672static void rpopCommand(redisClient *c);
673static void llenCommand(redisClient *c);
674static void lindexCommand(redisClient *c);
675static void lrangeCommand(redisClient *c);
676static void ltrimCommand(redisClient *c);
677static void typeCommand(redisClient *c);
678static void lsetCommand(redisClient *c);
679static void saddCommand(redisClient *c);
680static void sremCommand(redisClient *c);
a4460ef4 681static void smoveCommand(redisClient *c);
ed9b544e 682static void sismemberCommand(redisClient *c);
683static void scardCommand(redisClient *c);
12fea928 684static void spopCommand(redisClient *c);
2abb95a9 685static void srandmemberCommand(redisClient *c);
ed9b544e 686static void sinterCommand(redisClient *c);
687static void sinterstoreCommand(redisClient *c);
40d224a9 688static void sunionCommand(redisClient *c);
689static void sunionstoreCommand(redisClient *c);
f4f56e1d 690static void sdiffCommand(redisClient *c);
691static void sdiffstoreCommand(redisClient *c);
ed9b544e 692static void syncCommand(redisClient *c);
693static void flushdbCommand(redisClient *c);
694static void flushallCommand(redisClient *c);
695static void sortCommand(redisClient *c);
696static void lremCommand(redisClient *c);
0f5f7e9a 697static void rpoplpushcommand(redisClient *c);
ed9b544e 698static void infoCommand(redisClient *c);
70003d28 699static void mgetCommand(redisClient *c);
87eca727 700static void monitorCommand(redisClient *c);
3305306f 701static void expireCommand(redisClient *c);
802e8373 702static void expireatCommand(redisClient *c);
f6b141c5 703static void getsetCommand(redisClient *c);
fd88489a 704static void ttlCommand(redisClient *c);
321b0e13 705static void slaveofCommand(redisClient *c);
7f957c92 706static void debugCommand(redisClient *c);
f6b141c5 707static void msetCommand(redisClient *c);
708static void msetnxCommand(redisClient *c);
fd8ccf44 709static void zaddCommand(redisClient *c);
7db723ad 710static void zincrbyCommand(redisClient *c);
cc812361 711static void zrangeCommand(redisClient *c);
50c55df5 712static void zrangebyscoreCommand(redisClient *c);
f44dd428 713static void zcountCommand(redisClient *c);
e3870fab 714static void zrevrangeCommand(redisClient *c);
3c41331e 715static void zcardCommand(redisClient *c);
1b7106e7 716static void zremCommand(redisClient *c);
6e333bbe 717static void zscoreCommand(redisClient *c);
1807985b 718static void zremrangebyscoreCommand(redisClient *c);
6e469882 719static void multiCommand(redisClient *c);
720static void execCommand(redisClient *c);
18b6cb76 721static void discardCommand(redisClient *c);
4409877e 722static void blpopCommand(redisClient *c);
723static void brpopCommand(redisClient *c);
4b00bebd 724static void appendCommand(redisClient *c);
39191553 725static void substrCommand(redisClient *c);
69d95c3e 726static void zrankCommand(redisClient *c);
798d9e55 727static void zrevrankCommand(redisClient *c);
978c2c94 728static void hsetCommand(redisClient *c);
1f1c7695 729static void hsetnxCommand(redisClient *c);
978c2c94 730static void hgetCommand(redisClient *c);
09aeb579
PN
731static void hmsetCommand(redisClient *c);
732static void hmgetCommand(redisClient *c);
07efaf74 733static void hdelCommand(redisClient *c);
92b27fe9 734static void hlenCommand(redisClient *c);
9212eafd 735static void zremrangebyrankCommand(redisClient *c);
5d373da9 736static void zunionstoreCommand(redisClient *c);
737static void zinterstoreCommand(redisClient *c);
78409a0f 738static void hkeysCommand(redisClient *c);
739static void hvalsCommand(redisClient *c);
740static void hgetallCommand(redisClient *c);
a86f14b1 741static void hexistsCommand(redisClient *c);
500ece7c 742static void configCommand(redisClient *c);
01426b05 743static void hincrbyCommand(redisClient *c);
befec3cd 744static void subscribeCommand(redisClient *c);
745static void unsubscribeCommand(redisClient *c);
ffc6b7f8 746static void psubscribeCommand(redisClient *c);
747static void punsubscribeCommand(redisClient *c);
befec3cd 748static void publishCommand(redisClient *c);
37ab76c9 749static void watchCommand(redisClient *c);
750static void unwatchCommand(redisClient *c);
f6b141c5 751
ed9b544e 752/*================================= Globals ================================= */
753
/* Global vars */
static struct redisServer server; /* server global state */

/* Table of all the commands known to the server: one initializer per
 * command, closed by an all-NULL/zero sentinel entry.
 *
 * Each entry lists, in order: the command name, the function implementing
 * it, an integer, a bitmask of REDIS_CMD_* flags, a function pointer that is
 * NULL for every command except zunionstore, zinterstore and exec (where it
 * is a ...BlockClientOnSwappedKeys function), and three more integers.
 *
 * NOTE(review): struct redisCommand is not declared in this part of the
 * file. The first integer looks like the command arity (a negative value
 * apparently meaning "at least N arguments") and the last three look like
 * the first key / last key / key step argument positions -- confirm
 * against the struct definition before relying on this. */
static struct redisCommand cmdTable[] = {
    {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
    {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
    {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
    {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
    {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
    {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
    {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
    {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
    {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
    {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
    {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
    {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
    {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
    {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
    {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
    {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
    {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
    {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
    {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
    {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
    {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
    {NULL,NULL,0,0,NULL,0,0,0}
};
bcfc686d 867
ed9b544e 868/*============================ Utility functions ============================ */
869
/* Glob-style pattern matching over length-delimited buffers.
 *
 * Supported syntax: '*' (any run of characters, possibly empty), '?' (any
 * single character), '[...]' character classes with '^' negation, 'a-z'
 * ranges and '\' escapes, and '\' outside a class to quote the next
 * character. An unterminated class is accepted and treated as if it were
 * closed at the end of the pattern.
 *
 * pattern/patternLen: the pattern and its length in bytes.
 * string/stringLen:   the subject and its length in bytes.
 * nocase:             non-zero to compare case-insensitively via tolower().
 *
 * Returns 1 when the whole string matches the whole pattern, 0 otherwise.
 *
 * Neither buffer is read past its declared length, so callers are not
 * required to pass nul terminated data. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*' without looking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every offset. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one character: without this check
             * stringLen would drop to -1 and "[^a]*" would match "". */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back one byte so that the
                     * common advance after the switch leaves patternLen
                     * at exactly zero. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        /* tolower() is only defined for unsigned char
                         * values (and EOF). */
                        start = tolower((unsigned char)start);
                        end = tolower((unsigned char)end);
                        c = tolower((unsigned char)c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* Quote the next pattern byte; a trailing lone backslash is
             * matched literally. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match: nothing left to compare against */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing '*' may remain. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
992
/* Convenience wrapper around stringmatchlen() for nul terminated strings:
 * the lengths of both arguments are taken with strlen(). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
996
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-', a run of decimal digits and an optional
 * case insensitive unit: "b" (bytes), "k"/"m"/"g" (powers of 1000) or
 * "kb"/"mb"/"gb" (powers of 1024).
 *
 * If err is not NULL, *err is set to 1 on parsing error and to 0 otherwise.
 * Error cases:
 *  - unknown unit: the numeric part is returned as if it had no unit;
 *  - numeric part of 128 characters or more: LLONG_MAX is returned;
 *  - the value does not fit in a long long: the result is clamped to
 *    LLONG_MAX or LLONG_MIN. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long long mul; /* unit multiplier, always > 0 */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* isdigit() is only defined for unsigned char values (and EOF). */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    errno = 0;
    val = strtoll(buf,NULL,10);
    if (errno == ERANGE) {
        /* strtoll() already clamped the value to LLONG_MIN/LLONG_MAX. */
        if (err) *err = 1;
        return val;
    }
    /* Signed overflow is undefined behavior: check before multiplying. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1043
/* Convert a long long into a string.
 *
 * s:     destination buffer.
 * len:   size of the destination buffer, nul terminator included.
 * value: the number to convert.
 *
 * Returns the number of characters written to s, not counting the nul
 * terminator. When the buffer is too small the output is truncated to the
 * first len-1 characters of the representation, so the return value can be
 * shorter than the full number. When len is 0 nothing is written and 0 is
 * returned. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Negate in the unsigned domain: -value overflows for LLONG_MIN. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1067
56906eef 1068static void redisLog(int level, const char *fmt, ...) {
ed9b544e 1069 va_list ap;
1070 FILE *fp;
1071
1072 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1073 if (!fp) return;
1074
1075 va_start(ap, fmt);
1076 if (level >= server.verbosity) {
6766f45e 1077 char *c = ".-*#";
1904ecc1 1078 char buf[64];
1079 time_t now;
1080
1081 now = time(NULL);
6c9385e0 1082 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 1083 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 1084 vfprintf(fp, fmt, ap);
1085 fprintf(fp,"\n");
1086 fflush(fp);
1087 }
1088 va_end(ap);
1089
1090 if (server.logfile) fclose(fp);
1091}
1092
1093/*====================== Hash table type implementation ==================== */
1094
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1098
/* Dict destructor callback: releases val with zfree(). The private data
 * pointer is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
1104
4409877e 1105static void dictListDestructor(void *privdata, void *val)
1106{
1107 DICT_NOTUSED(privdata);
1108 listRelease((list*)val);
1109}
1110
ed9b544e 1111static int sdsDictKeyCompare(void *privdata, const void *key1,
1112 const void *key2)
1113{
1114 int l1,l2;
1115 DICT_NOTUSED(privdata);
1116
1117 l1 = sdslen((sds)key1);
1118 l2 = sdslen((sds)key2);
1119 if (l1 != l2) return 0;
1120 return memcmp(key1, key2, l1) == 0;
1121}
1122
/* Dict destructor callback for redis objects: drops one reference with
 * decrRefCount(). NULL is accepted and ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL */
    if (val != NULL) decrRefCount(val);
}
1130
942a3961 1131static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1132 const void *key2)
1133{
1134 const robj *o1 = key1, *o2 = key2;
1135 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1136}
1137
942a3961 1138static unsigned int dictObjHash(const void *key) {
ed9b544e 1139 const robj *o = key;
1140 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1141}
1142
942a3961 1143static int dictEncObjKeyCompare(void *privdata, const void *key1,
1144 const void *key2)
1145{
9d65a1bb 1146 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1147 int cmp;
942a3961 1148
2a1198b4 1149 if (o1->encoding == REDIS_ENCODING_INT &&
dc05abde 1150 o2->encoding == REDIS_ENCODING_INT)
1151 return o1->ptr == o2->ptr;
2a1198b4 1152
9d65a1bb 1153 o1 = getDecodedObject(o1);
1154 o2 = getDecodedObject(o2);
1155 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1156 decrRefCount(o1);
1157 decrRefCount(o2);
1158 return cmp;
942a3961 1159}
1160
1161static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1162 robj *o = (robj*) key;
942a3961 1163
ed9e4966 1164 if (o->encoding == REDIS_ENCODING_RAW) {
1165 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1166 } else {
1167 if (o->encoding == REDIS_ENCODING_INT) {
1168 char buf[32];
1169 int len;
1170
ee14da56 1171 len = ll2string(buf,32,(long)o->ptr);
ed9e4966 1172 return dictGenHashFunction((unsigned char*)buf, len);
1173 } else {
1174 unsigned int hash;
1175
1176 o = getDecodedObject(o);
1177 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1178 decrRefCount(o);
1179 return hash;
1180 }
1181 }
942a3961 1182}
1183
/* Sets type and expires.
 * Keys are hashed and compared with the encoding-aware callbacks and are
 * released with dictRedisObjectDestructor; no value destructor is set.
 * NOTE(review): keyptrDictType below is the one commented as "Db->expires",
 * so the "and expires" part of this comment may be stale -- confirm. */
static dictType setDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1193
/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys use the encoding-aware hash/compare callbacks; values are released
 * with dictVanillaFree, i.e. a plain zfree(). */
static dictType zsetDictType = {
    dictEncObjHash,            /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictEncObjKeyCompare,      /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    dictVanillaFree            /* val destructor of malloc(sizeof(double)) */
};
1203
/* Db->dict
 * Keys use the plain (non encoding-aware) object hash/compare callbacks.
 * Both keys and values are released with dictRedisObjectDestructor, which
 * ignores NULL values. */
static dictType dbDictType = {
    dictObjHash,                /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictObjKeyCompare,          /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictRedisObjectDestructor   /* val destructor */
};
1213
/* Db->expires
 * Same key handling as dbDictType, but no value destructor is set: the
 * values stored in this table are never freed by the dict itself. */
static dictType keyptrDictType = {
    dictObjHash,               /* hash function */
    NULL,                      /* key dup */
    NULL,                      /* val dup */
    dictObjKeyCompare,         /* key compare */
    dictRedisObjectDestructor, /* key destructor */
    NULL                       /* val destructor */
};
1223
/* Hash type hash table (note that small hashes are represented with zipmaps).
 * Keys use the encoding-aware hash/compare callbacks; both keys and values
 * are released with dictRedisObjectDestructor. */
static dictType hashDictType = {
    dictEncObjHash,             /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictEncObjKeyCompare,       /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictRedisObjectDestructor   /* val destructor */
};
1233
/* Keylist hash table type has unencoded redis objects as keys and
 * lists as values. It's used for blocking operations (BLPOP) and to
 * map swapped keys to a list of clients waiting for these keys to be
 * loaded. Values are released with dictListDestructor, i.e. listRelease(). */
static dictType keylistDictType = {
    dictObjHash,                /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictObjKeyCompare,          /* key compare */
    dictRedisObjectDestructor,  /* key destructor */
    dictListDestructor          /* val destructor */
};
1245
42ab0172
AO
1246static void version();
1247
ed9b544e 1248/* ========================= Random utility functions ======================= */
1249
1250/* Redis generally does not try to recover from out of memory conditions
1251 * when allocating objects or strings, it is not clear if it will be possible
1252 * to report this condition to the client since the networking layer itself
1253 * is based on heap allocation for send buffers, so we simply abort.
1254 * At least the code will be simpler to read... */
1255static void oom(const char *msg) {
71c54b21 1256 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1257 sleep(1);
1258 abort();
1259}
1260
1261/* ====================== Redis server networking stuff ===================== */
56906eef 1262static void closeTimedoutClients(void) {
ed9b544e 1263 redisClient *c;
ed9b544e 1264 listNode *ln;
1265 time_t now = time(NULL);
c7df85a4 1266 listIter li;
ed9b544e 1267
c7df85a4 1268 listRewind(server.clients,&li);
1269 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1270 c = listNodeValue(ln);
f86a74e9 1271 if (server.maxidletime &&
1272 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1273 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1274 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1275 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1276 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1277 {
f870935d 1278 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1279 freeClient(c);
f86a74e9 1280 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1281 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1282 addReply(c,shared.nullmultibulk);
b0d8747d 1283 unblockClientWaitingData(c);
f86a74e9 1284 }
ed9b544e 1285 }
1286 }
ed9b544e 1287}
1288
12fea928 1289static int htNeedsResize(dict *dict) {
1290 long long size, used;
1291
1292 size = dictSlots(dict);
1293 used = dictSize(dict);
1294 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1295 (used*100/size < REDIS_HT_MINFILL));
1296}
1297
0bc03378 1298/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1299 * we resize the hash table to save memory */
56906eef 1300static void tryResizeHashTables(void) {
0bc03378 1301 int j;
1302
1303 for (j = 0; j < server.dbnum; j++) {
5413c40d 1304 if (htNeedsResize(server.db[j].dict))
0bc03378 1305 dictResize(server.db[j].dict);
12fea928 1306 if (htNeedsResize(server.db[j].expires))
1307 dictResize(server.db[j].expires);
0bc03378 1308 }
1309}
1310
8ca3e9d1 1311/* Our hash table implementation performs rehashing incrementally while
1312 * we write/read from the hash table. Still if the server is idle, the hash
1313 * table will use two tables for a long time. So we try to use 1 millisecond
1314 * of CPU time at every serverCron() loop in order to rehash some key. */
1315static void incrementallyRehash(void) {
1316 int j;
1317
1318 for (j = 0; j < server.dbnum; j++) {
1319 if (dictIsRehashing(server.db[j].dict)) {
1320 dictRehashMilliseconds(server.db[j].dict,1);
1321 break; /* already used our millisecond for this loop... */
1322 }
1323 }
1324}
1325
9d65a1bb 1326/* A background saving child (BGSAVE) terminated its work. Handle this. */
1327void backgroundSaveDoneHandler(int statloc) {
1328 int exitcode = WEXITSTATUS(statloc);
1329 int bysignal = WIFSIGNALED(statloc);
1330
1331 if (!bysignal && exitcode == 0) {
1332 redisLog(REDIS_NOTICE,
1333 "Background saving terminated with success");
1334 server.dirty = 0;
1335 server.lastsave = time(NULL);
1336 } else if (!bysignal && exitcode != 0) {
1337 redisLog(REDIS_WARNING, "Background saving error");
1338 } else {
1339 redisLog(REDIS_WARNING,
454eea7c 1340 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1341 rdbRemoveTempFile(server.bgsavechildpid);
1342 }
1343 server.bgsavechildpid = -1;
1344 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1345 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1346 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1347}
1348
1349/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1350 * Handle this. */
1351void backgroundRewriteDoneHandler(int statloc) {
1352 int exitcode = WEXITSTATUS(statloc);
1353 int bysignal = WIFSIGNALED(statloc);
1354
1355 if (!bysignal && exitcode == 0) {
1356 int fd;
1357 char tmpfile[256];
1358
1359 redisLog(REDIS_NOTICE,
1360 "Background append only file rewriting terminated with success");
1361 /* Now it's time to flush the differences accumulated by the parent */
1362 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1363 fd = open(tmpfile,O_WRONLY|O_APPEND);
1364 if (fd == -1) {
1365 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1366 goto cleanup;
1367 }
1368 /* Flush our data... */
1369 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1370 (signed) sdslen(server.bgrewritebuf)) {
1371 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1372 close(fd);
1373 goto cleanup;
1374 }
b32627cd 1375 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1376 /* Now our work is to rename the temp file into the stable file. And
1377 * switch the file descriptor used by the server for append only. */
1378 if (rename(tmpfile,server.appendfilename) == -1) {
1379 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1380 close(fd);
1381 goto cleanup;
1382 }
1383 /* Mission completed... almost */
1384 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1385 if (server.appendfd != -1) {
1386 /* If append only is actually enabled... */
1387 close(server.appendfd);
1388 server.appendfd = fd;
d5d23dab 1389 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
85a83172 1390 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1391 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1392 } else {
1393 /* If append only is disabled we just generate a dump in this
1394 * format. Why not? */
1395 close(fd);
1396 }
1397 } else if (!bysignal && exitcode != 0) {
1398 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1399 } else {
1400 redisLog(REDIS_WARNING,
454eea7c 1401 "Background append only file rewriting terminated by signal %d",
1402 WTERMSIG(statloc));
9d65a1bb 1403 }
1404cleanup:
1405 sdsfree(server.bgrewritebuf);
1406 server.bgrewritebuf = sdsempty();
1407 aofRemoveTempFile(server.bgrewritechildpid);
1408 server.bgrewritechildpid = -1;
1409}
1410
884d4b39 1411/* This function is called once a background process of some kind terminates,
1412 * as we want to avoid resizing the hash tables when there is a child in order
1413 * to play well with copy-on-write (otherwise when a resize happens lots of
1414 * memory pages are copied). The goal of this function is to update the ability
1415 * for dict.c to resize the hash tables accordingly to the fact we have o not
1416 * running childs. */
1417static void updateDictResizePolicy(void) {
1418 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1419 dictEnableResize();
1420 else
1421 dictDisableResize();
1422}
1423
56906eef 1424static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1425 int j, loops = server.cronloops++;
ed9b544e 1426 REDIS_NOTUSED(eventLoop);
1427 REDIS_NOTUSED(id);
1428 REDIS_NOTUSED(clientData);
1429
3a66edc7 1430 /* We take a cached value of the unix time in the global state because
1431 * with virtual memory and aging there is to store the current time
1432 * in objects at every object access, and accuracy is not needed.
1433 * To access a global var is faster than calling time(NULL) */
1434 server.unixtime = time(NULL);
1435
fab43727 1436 /* We received a SIGTERM, shutting down here in a safe way, as it is
1437 * not ok doing so inside the signal handler. */
1438 if (server.shutdown_asap) {
1439 if (prepareForShutdown() == REDIS_OK) exit(0);
1440 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1441 }
1442
0bc03378 1443 /* Show some info about non-empty databases */
ed9b544e 1444 for (j = 0; j < server.dbnum; j++) {
dec423d9 1445 long long size, used, vkeys;
94754ccc 1446
3305306f 1447 size = dictSlots(server.db[j].dict);
1448 used = dictSize(server.db[j].dict);
94754ccc 1449 vkeys = dictSize(server.db[j].expires);
1763929f 1450 if (!(loops % 50) && (used || vkeys)) {
f870935d 1451 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1452 /* dictPrintStats(server.dict); */
ed9b544e 1453 }
ed9b544e 1454 }
1455
0bc03378 1456 /* We don't want to resize the hash tables while a bacground saving
1457 * is in progress: the saving child is created using fork() that is
1458 * implemented with a copy-on-write semantic in most modern systems, so
1459 * if we resize the HT while there is the saving child at work actually
1460 * a lot of memory movements in the parent will cause a lot of pages
1461 * copied. */
8ca3e9d1 1462 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1463 if (!(loops % 10)) tryResizeHashTables();
1464 if (server.activerehashing) incrementallyRehash();
884d4b39 1465 }
0bc03378 1466
ed9b544e 1467 /* Show information about connected clients */
1763929f 1468 if (!(loops % 50)) {
bdcb92f2 1469 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1470 listLength(server.clients)-listLength(server.slaves),
1471 listLength(server.slaves),
bdcb92f2 1472 zmalloc_used_memory());
ed9b544e 1473 }
1474
1475 /* Close connections of timedout clients */
1763929f 1476 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1477 closeTimedoutClients();
1478
9d65a1bb 1479 /* Check if a background saving or AOF rewrite in progress terminated */
1480 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1481 int statloc;
9d65a1bb 1482 pid_t pid;
1483
1484 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1485 if (pid == server.bgsavechildpid) {
1486 backgroundSaveDoneHandler(statloc);
ed9b544e 1487 } else {
9d65a1bb 1488 backgroundRewriteDoneHandler(statloc);
ed9b544e 1489 }
884d4b39 1490 updateDictResizePolicy();
ed9b544e 1491 }
1492 } else {
1493 /* If there is not a background saving in progress check if
1494 * we have to save now */
1495 time_t now = time(NULL);
1496 for (j = 0; j < server.saveparamslen; j++) {
1497 struct saveparam *sp = server.saveparams+j;
1498
1499 if (server.dirty >= sp->changes &&
1500 now-server.lastsave > sp->seconds) {
1501 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1502 sp->changes, sp->seconds);
f78fd11b 1503 rdbSaveBackground(server.dbfilename);
ed9b544e 1504 break;
1505 }
1506 }
1507 }
94754ccc 1508
f2324293 1509 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1510 * will use few CPU cycles if there are few expiring keys, otherwise
1511 * it will get more aggressive to avoid that too much memory is used by
1512 * keys that can be removed from the keyspace. */
94754ccc 1513 for (j = 0; j < server.dbnum; j++) {
f2324293 1514 int expired;
94754ccc 1515 redisDb *db = server.db+j;
94754ccc 1516
f2324293 1517 /* Continue to expire if at the end of the cycle more than 25%
1518 * of the keys were expired. */
1519 do {
4ef8de8a 1520 long num = dictSize(db->expires);
94754ccc 1521 time_t now = time(NULL);
1522
f2324293 1523 expired = 0;
94754ccc 1524 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1525 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1526 while (num--) {
1527 dictEntry *de;
1528 time_t t;
1529
1530 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1531 t = (time_t) dictGetEntryVal(de);
1532 if (now > t) {
1533 deleteKey(db,dictGetEntryKey(de));
f2324293 1534 expired++;
2a6a2ed1 1535 server.stat_expiredkeys++;
94754ccc 1536 }
1537 }
f2324293 1538 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1539 }
1540
4ef8de8a 1541 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1542 * is enabled. Try to free objects from the free list first. */
7e69548d 1543 if (vmCanSwapOut()) {
1544 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1545 server.vm_max_memory)
1546 {
72e9fd40 1547 int retval;
1548
a5819310 1549 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1550 retval = (server.vm_max_threads == 0) ?
1551 vmSwapOneObjectBlocking() :
1552 vmSwapOneObjectThreaded();
1763929f 1553 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1554 zmalloc_used_memory() >
1555 (server.vm_max_memory+server.vm_max_memory/10))
1556 {
1557 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1558 }
72e9fd40 1559 /* Note that when using threaded I/O we free just one object,
1560 * because anyway when the I/O thread in charge to swap this
1561 * object out will finish, the handler of completed jobs
1562 * will try to swap more objects if we are still out of memory. */
1563 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1564 }
1565 }
1566
ed9b544e 1567 /* Check if we should connect to a MASTER */
1763929f 1568 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1569 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1570 if (syncWithMaster() == REDIS_OK) {
1571 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
8f63ddca 1572 if (server.appendonly) rewriteAppendOnlyFileBackground();
ed9b544e 1573 }
1574 }
1763929f 1575 return 100;
ed9b544e 1576}
1577
d5d55fc3 1578/* This function gets called every time Redis is entering the
1579 * main loop of the event driven library, that is, before to sleep
1580 * for ready file descriptors. */
1581static void beforeSleep(struct aeEventLoop *eventLoop) {
1582 REDIS_NOTUSED(eventLoop);
1583
28ed1f33 1584 /* Awake clients that got all the swapped keys they requested */
d5d55fc3 1585 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1586 listIter li;
1587 listNode *ln;
1588
1589 listRewind(server.io_ready_clients,&li);
1590 while((ln = listNext(&li))) {
1591 redisClient *c = ln->value;
1592 struct redisCommand *cmd;
1593
1594 /* Resume the client. */
1595 listDelNode(server.io_ready_clients,ln);
1596 c->flags &= (~REDIS_IO_WAIT);
1597 server.vm_blocked_clients--;
1598 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1599 readQueryFromClient, c);
1600 cmd = lookupCommand(c->argv[0]->ptr);
1601 assert(cmd != NULL);
1602 call(c,cmd);
1603 resetClient(c);
1604 /* There may be more data to process in the input buffer. */
1605 if (c->querybuf && sdslen(c->querybuf) > 0)
1606 processInputBuffer(c);
1607 }
1608 }
28ed1f33 1609 /* Write the AOF buffer on disk */
1610 flushAppendOnlyFile();
d5d55fc3 1611}
1612
ed9b544e 1613static void createSharedObjects(void) {
05df7621 1614 int j;
1615
ed9b544e 1616 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1617 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1618 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1619 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1620 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1621 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1622 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1623 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1624 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1625 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1626 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1627 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1628 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1629 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1630 "-ERR no such key\r\n"));
ed9b544e 1631 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1632 "-ERR syntax error\r\n"));
c937aa89 1633 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1634 "-ERR source and destination objects are the same\r\n"));
1635 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1636 "-ERR index out of range\r\n"));
ed9b544e 1637 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1638 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1639 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1640 shared.select0 = createStringObject("select 0\r\n",10);
1641 shared.select1 = createStringObject("select 1\r\n",10);
1642 shared.select2 = createStringObject("select 2\r\n",10);
1643 shared.select3 = createStringObject("select 3\r\n",10);
1644 shared.select4 = createStringObject("select 4\r\n",10);
1645 shared.select5 = createStringObject("select 5\r\n",10);
1646 shared.select6 = createStringObject("select 6\r\n",10);
1647 shared.select7 = createStringObject("select 7\r\n",10);
1648 shared.select8 = createStringObject("select 8\r\n",10);
1649 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1650 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
c8d0ea0e 1651 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
befec3cd 1652 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1653 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1654 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1655 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1656 shared.mbulk3 = createStringObject("*3\r\n",4);
c8d0ea0e 1657 shared.mbulk4 = createStringObject("*4\r\n",4);
05df7621 1658 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1659 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1660 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1661 }
ed9b544e 1662}
1663
1664static void appendServerSaveParams(time_t seconds, int changes) {
1665 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1666 server.saveparams[server.saveparamslen].seconds = seconds;
1667 server.saveparams[server.saveparamslen].changes = changes;
1668 server.saveparamslen++;
1669}
1670
bcfc686d 1671static void resetServerSaveParams() {
ed9b544e 1672 zfree(server.saveparams);
1673 server.saveparams = NULL;
1674 server.saveparamslen = 0;
1675}
1676
1677static void initServerConfig() {
1678 server.dbnum = REDIS_DEFAULT_DBNUM;
1679 server.port = REDIS_SERVERPORT;
f870935d 1680 server.verbosity = REDIS_VERBOSE;
ed9b544e 1681 server.maxidletime = REDIS_MAXIDLETIME;
1682 server.saveparams = NULL;
1683 server.logfile = NULL; /* NULL = log on standard output */
1684 server.bindaddr = NULL;
1685 server.glueoutputbuf = 1;
1686 server.daemonize = 0;
44b38ef4 1687 server.appendonly = 0;
1b677732 1688 server.appendfsync = APPENDFSYNC_EVERYSEC;
38db9171 1689 server.no_appendfsync_on_rewrite = 0;
48f0308a 1690 server.lastfsync = time(NULL);
44b38ef4 1691 server.appendfd = -1;
1692 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1693 server.pidfile = zstrdup("/var/run/redis.pid");
1694 server.dbfilename = zstrdup("dump.rdb");
1695 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1696 server.requirepass = NULL;
b0553789 1697 server.rdbcompression = 1;
8ca3e9d1 1698 server.activerehashing = 1;
285add55 1699 server.maxclients = 0;
d5d55fc3 1700 server.blpop_blocked_clients = 0;
3fd78bcd 1701 server.maxmemory = 0;
75680a3c 1702 server.vm_enabled = 0;
054e426d 1703 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1704 server.vm_page_size = 256; /* 256 bytes per page */
1705 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1706 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1707 server.vm_max_threads = 4;
d5d55fc3 1708 server.vm_blocked_clients = 0;
cbba7dd7 1709 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1710 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
fab43727 1711 server.shutdown_asap = 0;
75680a3c 1712
bcfc686d 1713 resetServerSaveParams();
ed9b544e 1714
1715 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1716 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1717 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1718 /* Replication related */
1719 server.isslave = 0;
d0ccebcf 1720 server.masterauth = NULL;
ed9b544e 1721 server.masterhost = NULL;
1722 server.masterport = 6379;
1723 server.master = NULL;
1724 server.replstate = REDIS_REPL_NONE;
a7866db6 1725
1726 /* Double constants initialization */
1727 R_Zero = 0.0;
1728 R_PosInf = 1.0/R_Zero;
1729 R_NegInf = -1.0/R_Zero;
1730 R_Nan = R_Zero/R_Zero;
ed9b544e 1731}
1732
1733static void initServer() {
1734 int j;
1735
1736 signal(SIGHUP, SIG_IGN);
1737 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1738 setupSigSegvAction();
ed9b544e 1739
b9bc0eef 1740 server.devnull = fopen("/dev/null","w");
1741 if (server.devnull == NULL) {
1742 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1743 exit(1);
1744 }
ed9b544e 1745 server.clients = listCreate();
1746 server.slaves = listCreate();
87eca727 1747 server.monitors = listCreate();
ed9b544e 1748 server.objfreelist = listCreate();
1749 createSharedObjects();
1750 server.el = aeCreateEventLoop();
3305306f 1751 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1752 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1753 if (server.fd == -1) {
1754 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1755 exit(1);
1756 }
3305306f 1757 for (j = 0; j < server.dbnum; j++) {
5234952b 1758 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1759 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
37ab76c9 1760 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1761 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1762 if (server.vm_enabled)
1763 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1764 server.db[j].id = j;
1765 }
ffc6b7f8 1766 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1767 server.pubsub_patterns = listCreate();
1768 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1769 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1770 server.cronloops = 0;
9f3c422c 1771 server.bgsavechildpid = -1;
9d65a1bb 1772 server.bgrewritechildpid = -1;
1773 server.bgrewritebuf = sdsempty();
28ed1f33 1774 server.aofbuf = sdsempty();
ed9b544e 1775 server.lastsave = time(NULL);
1776 server.dirty = 0;
ed9b544e 1777 server.stat_numcommands = 0;
1778 server.stat_numconnections = 0;
2a6a2ed1 1779 server.stat_expiredkeys = 0;
ed9b544e 1780 server.stat_starttime = time(NULL);
3a66edc7 1781 server.unixtime = time(NULL);
d8f8b666 1782 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1783 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1784 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1785
1786 if (server.appendonly) {
3bb225d6 1787 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1788 if (server.appendfd == -1) {
1789 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1790 strerror(errno));
1791 exit(1);
1792 }
1793 }
75680a3c 1794
1795 if (server.vm_enabled) vmInit();
ed9b544e 1796}
1797
1798/* Empty the whole database */
ca37e9cd 1799static long long emptyDb() {
ed9b544e 1800 int j;
ca37e9cd 1801 long long removed = 0;
ed9b544e 1802
3305306f 1803 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1804 removed += dictSize(server.db[j].dict);
3305306f 1805 dictEmpty(server.db[j].dict);
1806 dictEmpty(server.db[j].expires);
1807 }
ca37e9cd 1808 return removed;
ed9b544e 1809}
1810
/* Convert a case insensitive "yes" / "no" string into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1816
ed9b544e 1817/* I agree, this is a very rudimental way to load a configuration...
1818 will improve later if the config gets more complex */
1819static void loadServerConfig(char *filename) {
c9a111ac 1820 FILE *fp;
ed9b544e 1821 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1822 int linenum = 0;
1823 sds line = NULL;
c9a111ac 1824
1825 if (filename[0] == '-' && filename[1] == '\0')
1826 fp = stdin;
1827 else {
1828 if ((fp = fopen(filename,"r")) == NULL) {
9a22de82 1829 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
c9a111ac 1830 exit(1);
1831 }
ed9b544e 1832 }
c9a111ac 1833
ed9b544e 1834 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1835 sds *argv;
1836 int argc, j;
1837
1838 linenum++;
1839 line = sdsnew(buf);
1840 line = sdstrim(line," \t\r\n");
1841
1842 /* Skip comments and blank lines*/
1843 if (line[0] == '#' || line[0] == '\0') {
1844 sdsfree(line);
1845 continue;
1846 }
1847
1848 /* Split into arguments */
1849 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1850 sdstolower(argv[0]);
1851
1852 /* Execute config directives */
bb0b03a3 1853 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1854 server.maxidletime = atoi(argv[1]);
0150db36 1855 if (server.maxidletime < 0) {
ed9b544e 1856 err = "Invalid timeout value"; goto loaderr;
1857 }
bb0b03a3 1858 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1859 server.port = atoi(argv[1]);
1860 if (server.port < 1 || server.port > 65535) {
1861 err = "Invalid port"; goto loaderr;
1862 }
bb0b03a3 1863 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1864 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1865 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1866 int seconds = atoi(argv[1]);
1867 int changes = atoi(argv[2]);
1868 if (seconds < 1 || changes < 0) {
1869 err = "Invalid save parameters"; goto loaderr;
1870 }
1871 appendServerSaveParams(seconds,changes);
bb0b03a3 1872 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1873 if (chdir(argv[1]) == -1) {
1874 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1875 argv[1], strerror(errno));
1876 exit(1);
1877 }
bb0b03a3 1878 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1879 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1880 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1881 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1882 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1883 else {
1884 err = "Invalid log level. Must be one of debug, notice, warning";
1885 goto loaderr;
1886 }
bb0b03a3 1887 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1888 FILE *logfp;
ed9b544e 1889
1890 server.logfile = zstrdup(argv[1]);
bb0b03a3 1891 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1892 zfree(server.logfile);
1893 server.logfile = NULL;
1894 }
1895 if (server.logfile) {
1896 /* Test if we are able to open the file. The server will not
1897 * be able to abort just for this problem later... */
c9a111ac 1898 logfp = fopen(server.logfile,"a");
1899 if (logfp == NULL) {
ed9b544e 1900 err = sdscatprintf(sdsempty(),
1901 "Can't open the log file: %s", strerror(errno));
1902 goto loaderr;
1903 }
c9a111ac 1904 fclose(logfp);
ed9b544e 1905 }
bb0b03a3 1906 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1907 server.dbnum = atoi(argv[1]);
1908 if (server.dbnum < 1) {
1909 err = "Invalid number of databases"; goto loaderr;
1910 }
b3f83f12
JZ
1911 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1912 loadServerConfig(argv[1]);
285add55 1913 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1914 server.maxclients = atoi(argv[1]);
3fd78bcd 1915 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
2b619329 1916 server.maxmemory = memtoll(argv[1],NULL);
bb0b03a3 1917 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1918 server.masterhost = sdsnew(argv[1]);
1919 server.masterport = atoi(argv[2]);
1920 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1921 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1922 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1923 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1924 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1925 err = "argument must be 'yes' or 'no'"; goto loaderr;
1926 }
121f70cf 1927 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1928 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
8ca3e9d1 1929 err = "argument must be 'yes' or 'no'"; goto loaderr;
1930 }
1931 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1932 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
121f70cf 1933 err = "argument must be 'yes' or 'no'"; goto loaderr;
1934 }
bb0b03a3 1935 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1936 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1937 err = "argument must be 'yes' or 'no'"; goto loaderr;
1938 }
44b38ef4 1939 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1940 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1941 err = "argument must be 'yes' or 'no'"; goto loaderr;
1942 }
f3b52411
PN
1943 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1944 zfree(server.appendfilename);
1945 server.appendfilename = zstrdup(argv[1]);
38db9171 1946 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
1947 && argc == 2) {
1948 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
1949 err = "argument must be 'yes' or 'no'"; goto loaderr;
1950 }
48f0308a 1951 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1952 if (!strcasecmp(argv[1],"no")) {
48f0308a 1953 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1954 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1955 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1956 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1957 server.appendfsync = APPENDFSYNC_EVERYSEC;
1958 } else {
1959 err = "argument must be 'no', 'always' or 'everysec'";
1960 goto loaderr;
1961 }
bb0b03a3 1962 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1963 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1964 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 1965 zfree(server.pidfile);
054e426d 1966 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1967 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 1968 zfree(server.dbfilename);
054e426d 1969 server.dbfilename = zstrdup(argv[1]);
75680a3c 1970 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1971 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1972 err = "argument must be 'yes' or 'no'"; goto loaderr;
1973 }
054e426d 1974 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1975 zfree(server.vm_swap_file);
054e426d 1976 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1977 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2b619329 1978 server.vm_max_memory = memtoll(argv[1],NULL);
4ef8de8a 1979 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2b619329 1980 server.vm_page_size = memtoll(argv[1], NULL);
4ef8de8a 1981 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2b619329 1982 server.vm_pages = memtoll(argv[1], NULL);
92f8e882 1983 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1984 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 1985 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2b619329 1986 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
cbba7dd7 1987 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2b619329 1988 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
ed9b544e 1989 } else {
1990 err = "Bad directive or wrong number of arguments"; goto loaderr;
1991 }
1992 for (j = 0; j < argc; j++)
1993 sdsfree(argv[j]);
1994 zfree(argv);
1995 sdsfree(line);
1996 }
c9a111ac 1997 if (fp != stdin) fclose(fp);
ed9b544e 1998 return;
1999
2000loaderr:
2001 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2002 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2003 fprintf(stderr, ">>> '%s'\n", line);
2004 fprintf(stderr, "%s\n", err);
2005 exit(1);
2006}
2007
2008static void freeClientArgv(redisClient *c) {
2009 int j;
2010
2011 for (j = 0; j < c->argc; j++)
2012 decrRefCount(c->argv[j]);
e8a74421 2013 for (j = 0; j < c->mbargc; j++)
2014 decrRefCount(c->mbargv[j]);
ed9b544e 2015 c->argc = 0;
e8a74421 2016 c->mbargc = 0;
ed9b544e 2017}
2018
2019static void freeClient(redisClient *c) {
2020 listNode *ln;
2021
4409877e 2022 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 2023 * call, we have to set querybuf to NULL *before* to call
2024 * unblockClientWaitingData() to avoid processInputBuffer() will get
2025 * called. Also it is important to remove the file events after
2026 * this, because this call adds the READABLE event. */
4409877e 2027 sdsfree(c->querybuf);
2028 c->querybuf = NULL;
2029 if (c->flags & REDIS_BLOCKED)
b0d8747d 2030 unblockClientWaitingData(c);
4409877e 2031
37ab76c9 2032 /* UNWATCH all the keys */
2033 unwatchAllKeys(c);
2034 listRelease(c->watched_keys);
ffc6b7f8 2035 /* Unsubscribe from all the pubsub channels */
2036 pubsubUnsubscribeAllChannels(c,0);
2037 pubsubUnsubscribeAllPatterns(c,0);
2038 dictRelease(c->pubsub_channels);
2039 listRelease(c->pubsub_patterns);
befec3cd 2040 /* Obvious cleanup */
ed9b544e 2041 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2042 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 2043 listRelease(c->reply);
2044 freeClientArgv(c);
2045 close(c->fd);
92f8e882 2046 /* Remove from the list of clients */
ed9b544e 2047 ln = listSearchKey(server.clients,c);
dfc5e96c 2048 redisAssert(ln != NULL);
ed9b544e 2049 listDelNode(server.clients,ln);
37ab76c9 2050 /* Remove from the list of clients that are now ready to be restarted
2051 * after waiting for swapped keys */
d5d55fc3 2052 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2053 ln = listSearchKey(server.io_ready_clients,c);
2054 if (ln) {
2055 listDelNode(server.io_ready_clients,ln);
2056 server.vm_blocked_clients--;
2057 }
2058 }
37ab76c9 2059 /* Remove from the list of clients waiting for swapped keys */
d5d55fc3 2060 while (server.vm_enabled && listLength(c->io_keys)) {
2061 ln = listFirst(c->io_keys);
2062 dontWaitForSwappedKey(c,ln->value);
92f8e882 2063 }
b3e3d0d7 2064 listRelease(c->io_keys);
befec3cd 2065 /* Master/slave cleanup */
ed9b544e 2066 if (c->flags & REDIS_SLAVE) {
6208b3a7 2067 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2068 close(c->repldbfd);
87eca727 2069 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2070 ln = listSearchKey(l,c);
dfc5e96c 2071 redisAssert(ln != NULL);
87eca727 2072 listDelNode(l,ln);
ed9b544e 2073 }
2074 if (c->flags & REDIS_MASTER) {
2075 server.master = NULL;
2076 server.replstate = REDIS_REPL_CONNECT;
2077 }
befec3cd 2078 /* Release memory */
93ea3759 2079 zfree(c->argv);
e8a74421 2080 zfree(c->mbargv);
6e469882 2081 freeClientMultiState(c);
ed9b544e 2082 zfree(c);
2083}
2084
cc30e368 2085#define GLUEREPLY_UP_TO (1024)
ed9b544e 2086static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 2087 int copylen = 0;
2088 char buf[GLUEREPLY_UP_TO];
6208b3a7 2089 listNode *ln;
c7df85a4 2090 listIter li;
ed9b544e 2091 robj *o;
2092
c7df85a4 2093 listRewind(c->reply,&li);
2094 while((ln = listNext(&li))) {
c28b42ac 2095 int objlen;
2096
ed9b544e 2097 o = ln->value;
c28b42ac 2098 objlen = sdslen(o->ptr);
2099 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2100 memcpy(buf+copylen,o->ptr,objlen);
2101 copylen += objlen;
ed9b544e 2102 listDelNode(c->reply,ln);
c28b42ac 2103 } else {
2104 if (copylen == 0) return;
2105 break;
ed9b544e 2106 }
ed9b544e 2107 }
c28b42ac 2108 /* Now the output buffer is empty, add the new single element */
2109 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2110 listAddNodeHead(c->reply,o);
ed9b544e 2111}
2112
2113static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2114 redisClient *c = privdata;
2115 int nwritten = 0, totwritten = 0, objlen;
2116 robj *o;
2117 REDIS_NOTUSED(el);
2118 REDIS_NOTUSED(mask);
2119
2895e862 2120 /* Use writev() if we have enough buffers to send */
7ea870c0 2121 if (!server.glueoutputbuf &&
e0a62c7f 2122 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 2123 !(c->flags & REDIS_MASTER))
2895e862 2124 {
2125 sendReplyToClientWritev(el, fd, privdata, mask);
2126 return;
2127 }
2895e862 2128
ed9b544e 2129 while(listLength(c->reply)) {
c28b42ac 2130 if (server.glueoutputbuf && listLength(c->reply) > 1)
2131 glueReplyBuffersIfNeeded(c);
2132
ed9b544e 2133 o = listNodeValue(listFirst(c->reply));
2134 objlen = sdslen(o->ptr);
2135
2136 if (objlen == 0) {
2137 listDelNode(c->reply,listFirst(c->reply));
2138 continue;
2139 }
2140
2141 if (c->flags & REDIS_MASTER) {
6f376729 2142 /* Don't reply to a master */
ed9b544e 2143 nwritten = objlen - c->sentlen;
2144 } else {
a4d1ba9a 2145 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2146 if (nwritten <= 0) break;
2147 }
2148 c->sentlen += nwritten;
2149 totwritten += nwritten;
2150 /* If we fully sent the object on head go to the next one */
2151 if (c->sentlen == objlen) {
2152 listDelNode(c->reply,listFirst(c->reply));
2153 c->sentlen = 0;
2154 }
6f376729 2155 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2156 * bytes, in a single threaded server it's a good idea to serve
6f376729 2157 * other clients as well, even if a very large request comes from
2158 * super fast link that is always able to accept data (in real world
12f9d551 2159 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2160 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2161 }
2162 if (nwritten == -1) {
2163 if (errno == EAGAIN) {
2164 nwritten = 0;
2165 } else {
f870935d 2166 redisLog(REDIS_VERBOSE,
ed9b544e 2167 "Error writing to client: %s", strerror(errno));
2168 freeClient(c);
2169 return;
2170 }
2171 }
2172 if (totwritten > 0) c->lastinteraction = time(NULL);
2173 if (listLength(c->reply) == 0) {
2174 c->sentlen = 0;
2175 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2176 }
2177}
2178
2895e862 2179static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2180{
2181 redisClient *c = privdata;
2182 int nwritten = 0, totwritten = 0, objlen, willwrite;
2183 robj *o;
2184 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2185 int offset, ion = 0;
2186 REDIS_NOTUSED(el);
2187 REDIS_NOTUSED(mask);
2188
2189 listNode *node;
2190 while (listLength(c->reply)) {
2191 offset = c->sentlen;
2192 ion = 0;
2193 willwrite = 0;
2194
2195 /* fill-in the iov[] array */
2196 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2197 o = listNodeValue(node);
2198 objlen = sdslen(o->ptr);
2199
e0a62c7f 2200 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2201 break;
2202
2203 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2204 break; /* no more iovecs */
2205
2206 iov[ion].iov_base = ((char*)o->ptr) + offset;
2207 iov[ion].iov_len = objlen - offset;
2208 willwrite += objlen - offset;
2209 offset = 0; /* just for the first item */
2210 ion++;
2211 }
2212
2213 if(willwrite == 0)
2214 break;
2215
2216 /* write all collected blocks at once */
2217 if((nwritten = writev(fd, iov, ion)) < 0) {
2218 if (errno != EAGAIN) {
f870935d 2219 redisLog(REDIS_VERBOSE,
2895e862 2220 "Error writing to client: %s", strerror(errno));
2221 freeClient(c);
2222 return;
2223 }
2224 break;
2225 }
2226
2227 totwritten += nwritten;
2228 offset = c->sentlen;
2229
2230 /* remove written robjs from c->reply */
2231 while (nwritten && listLength(c->reply)) {
2232 o = listNodeValue(listFirst(c->reply));
2233 objlen = sdslen(o->ptr);
2234
2235 if(nwritten >= objlen - offset) {
2236 listDelNode(c->reply, listFirst(c->reply));
2237 nwritten -= objlen - offset;
2238 c->sentlen = 0;
2239 } else {
2240 /* partial write */
2241 c->sentlen += nwritten;
2242 break;
2243 }
2244 offset = 0;
2245 }
2246 }
2247
e0a62c7f 2248 if (totwritten > 0)
2895e862 2249 c->lastinteraction = time(NULL);
2250
2251 if (listLength(c->reply) == 0) {
2252 c->sentlen = 0;
2253 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2254 }
2255}
2256
ed9b544e 2257static struct redisCommand *lookupCommand(char *name) {
2258 int j = 0;
2259 while(cmdTable[j].name != NULL) {
bb0b03a3 2260 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 2261 j++;
2262 }
2263 return NULL;
2264}
2265
2266/* resetClient prepare the client to process the next command */
2267static void resetClient(redisClient *c) {
2268 freeClientArgv(c);
2269 c->bulklen = -1;
e8a74421 2270 c->multibulk = 0;
ed9b544e 2271}
2272
6e469882 2273/* Call() is the core of Redis execution of a command */
2274static void call(redisClient *c, struct redisCommand *cmd) {
2275 long long dirty;
2276
2277 dirty = server.dirty;
2278 cmd->proc(c);
4005fef1 2279 dirty = server.dirty-dirty;
2280
2281 if (server.appendonly && dirty)
6e469882 2282 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2283 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2284 listLength(server.slaves))
248ea310 2285 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2286 if (listLength(server.monitors))
dd142b9c 2287 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2288 server.stat_numcommands++;
2289}
2290
ed9b544e 2291/* If this function gets called we already read a whole
2292 * command, arguments are in the client argv/argc fields.
2293 * processCommand() executes the command or prepares the
2294 * server for a bulk read from the client.
2295 *
2296 * If 1 is returned the client is still alive and valid and
2297 * other operations can be performed by the caller. Otherwise
2298 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2299static int processCommand(redisClient *c) {
2300 struct redisCommand *cmd;
ed9b544e 2301
3fd78bcd 2302 /* Free some memory if needed (maxmemory setting) */
2303 if (server.maxmemory) freeMemoryIfNeeded();
2304
e8a74421 2305 /* Handle the multi bulk command type. This is an alternative protocol
2306 * supported by Redis in order to receive commands that are composed of
2307 * multiple binary-safe "bulk" arguments. The latency of processing is
2308 * a bit higher but this allows things like multi-sets, so if this
2309 * protocol is used only for MSET and similar commands this is a big win. */
2310 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2311 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2312 if (c->multibulk <= 0) {
2313 resetClient(c);
2314 return 1;
2315 } else {
2316 decrRefCount(c->argv[c->argc-1]);
2317 c->argc--;
2318 return 1;
2319 }
2320 } else if (c->multibulk) {
2321 if (c->bulklen == -1) {
2322 if (((char*)c->argv[0]->ptr)[0] != '$') {
2323 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2324 resetClient(c);
2325 return 1;
2326 } else {
2327 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2328 decrRefCount(c->argv[0]);
2329 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2330 c->argc--;
2331 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2332 resetClient(c);
2333 return 1;
2334 }
2335 c->argc--;
2336 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2337 return 1;
2338 }
2339 } else {
2340 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2341 c->mbargv[c->mbargc] = c->argv[0];
2342 c->mbargc++;
2343 c->argc--;
2344 c->multibulk--;
2345 if (c->multibulk == 0) {
2346 robj **auxargv;
2347 int auxargc;
2348
2349 /* Here we need to swap the multi-bulk argc/argv with the
2350 * normal argc/argv of the client structure. */
2351 auxargv = c->argv;
2352 c->argv = c->mbargv;
2353 c->mbargv = auxargv;
2354
2355 auxargc = c->argc;
2356 c->argc = c->mbargc;
2357 c->mbargc = auxargc;
2358
2359 /* We need to set bulklen to something different than -1
2360 * in order for the code below to process the command without
2361 * to try to read the last argument of a bulk command as
2362 * a special argument. */
2363 c->bulklen = 0;
2364 /* continue below and process the command */
2365 } else {
2366 c->bulklen = -1;
2367 return 1;
2368 }
2369 }
2370 }
2371 /* -- end of multi bulk commands processing -- */
2372
ed9b544e 2373 /* The QUIT command is handled as a special case. Normal command
2374 * procs are unable to close the client connection safely */
bb0b03a3 2375 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2376 freeClient(c);
2377 return 0;
2378 }
d5d55fc3 2379
2380 /* Now lookup the command and check ASAP about trivial error conditions
2381 * such wrong arity, bad command name and so forth. */
ed9b544e 2382 cmd = lookupCommand(c->argv[0]->ptr);
2383 if (!cmd) {
2c14807b 2384 addReplySds(c,
2385 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2386 (char*)c->argv[0]->ptr));
ed9b544e 2387 resetClient(c);
2388 return 1;
2389 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2390 (c->argc < -cmd->arity)) {
454d4e43 2391 addReplySds(c,
2392 sdscatprintf(sdsempty(),
2393 "-ERR wrong number of arguments for '%s' command\r\n",
2394 cmd->name));
ed9b544e 2395 resetClient(c);
2396 return 1;
2397 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2398 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2399 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2400
2401 decrRefCount(c->argv[c->argc-1]);
2402 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2403 c->argc--;
2404 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2405 resetClient(c);
2406 return 1;
2407 }
2408 c->argc--;
2409 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2410 /* It is possible that the bulk read is already in the
8d0490e7 2411 * buffer. Check this condition and handle it accordingly.
2412 * This is just a fast path, alternative to call processInputBuffer().
2413 * It's a good idea since the code is small and this condition
2414 * happens most of the times. */
ed9b544e 2415 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2416 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2417 c->argc++;
2418 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2419 } else {
d5d55fc3 2420 /* Otherwise return... there is to read the last argument
2421 * from the socket. */
ed9b544e 2422 return 1;
2423 }
2424 }
942a3961 2425 /* Let's try to encode the bulk object to save space. */
2426 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2427 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2428
e63943a4 2429 /* Check if the user is authenticated */
2430 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2431 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2432 resetClient(c);
2433 return 1;
2434 }
2435
b61a28fe 2436 /* Handle the maxmemory directive */
2437 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2438 zmalloc_used_memory() > server.maxmemory)
2439 {
2440 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2441 resetClient(c);
2442 return 1;
2443 }
2444
d6cc8867 2445 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
e6cca5db 2446 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2447 &&
ffc6b7f8 2448 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2449 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2450 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2451 resetClient(c);
2452 return 1;
2453 }
2454
ed9b544e 2455 /* Exec the command */
6531c94d 2456 if (c->flags & REDIS_MULTI &&
2457 cmd->proc != execCommand && cmd->proc != discardCommand &&
2458 cmd->proc != multiCommand && cmd->proc != watchCommand)
2459 {
6e469882 2460 queueMultiCommand(c,cmd);
2461 addReply(c,shared.queued);
2462 } else {
d5d55fc3 2463 if (server.vm_enabled && server.vm_max_threads > 0 &&
0a6f3f0f 2464 blockClientOnSwappedKeys(c,cmd)) return 1;
6e469882 2465 call(c,cmd);
2466 }
ed9b544e 2467
2468 /* Prepare the client for the next command */
ed9b544e 2469 resetClient(c);
2470 return 1;
2471}
2472
248ea310 2473static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2474 listNode *ln;
c7df85a4 2475 listIter li;
ed9b544e 2476 int outc = 0, j;
93ea3759 2477 robj **outv;
248ea310 2478 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2479 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2480 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2481 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2482 robj *lenobj;
93ea3759 2483
2484 if (argc <= REDIS_STATIC_ARGS) {
2485 outv = static_outv;
2486 } else {
248ea310 2487 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2488 }
248ea310 2489
2490 lenobj = createObject(REDIS_STRING,
2491 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2492 lenobj->refcount = 0;
2493 outv[outc++] = lenobj;
ed9b544e 2494 for (j = 0; j < argc; j++) {
248ea310 2495 lenobj = createObject(REDIS_STRING,
2496 sdscatprintf(sdsempty(),"$%lu\r\n",
2497 (unsigned long) stringObjectLen(argv[j])));
2498 lenobj->refcount = 0;
2499 outv[outc++] = lenobj;
ed9b544e 2500 outv[outc++] = argv[j];
248ea310 2501 outv[outc++] = shared.crlf;
ed9b544e 2502 }
ed9b544e 2503
40d224a9 2504 /* Increment all the refcounts at start and decrement at end in order to
2505 * be sure to free objects if there is no slave in a replication state
2506 * able to be feed with commands */
2507 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2508 listRewind(slaves,&li);
2509 while((ln = listNext(&li))) {
ed9b544e 2510 redisClient *slave = ln->value;
40d224a9 2511
2512 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2513 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2514
2515 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2516 if (slave->slaveseldb != dictid) {
2517 robj *selectcmd;
2518
2519 switch(dictid) {
2520 case 0: selectcmd = shared.select0; break;
2521 case 1: selectcmd = shared.select1; break;
2522 case 2: selectcmd = shared.select2; break;
2523 case 3: selectcmd = shared.select3; break;
2524 case 4: selectcmd = shared.select4; break;
2525 case 5: selectcmd = shared.select5; break;
2526 case 6: selectcmd = shared.select6; break;
2527 case 7: selectcmd = shared.select7; break;
2528 case 8: selectcmd = shared.select8; break;
2529 case 9: selectcmd = shared.select9; break;
2530 default:
2531 selectcmd = createObject(REDIS_STRING,
2532 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2533 selectcmd->refcount = 0;
2534 break;
2535 }
2536 addReply(slave,selectcmd);
2537 slave->slaveseldb = dictid;
2538 }
2539 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2540 }
40d224a9 2541 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2542 if (outv != static_outv) zfree(outv);
ed9b544e 2543}
2544
dd142b9c 2545static sds sdscatrepr(sds s, char *p, size_t len) {
2546 s = sdscatlen(s,"\"",1);
2547 while(len--) {
2548 switch(*p) {
2549 case '\\':
2550 case '"':
2551 s = sdscatprintf(s,"\\%c",*p);
2552 break;
2553 case '\n': s = sdscatlen(s,"\\n",1); break;
2554 case '\r': s = sdscatlen(s,"\\r",1); break;
2555 case '\t': s = sdscatlen(s,"\\t",1); break;
2556 case '\a': s = sdscatlen(s,"\\a",1); break;
2557 case '\b': s = sdscatlen(s,"\\b",1); break;
2558 default:
2559 if (isprint(*p))
2560 s = sdscatprintf(s,"%c",*p);
2561 else
2562 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2563 break;
2564 }
2565 p++;
2566 }
2567 return sdscatlen(s,"\"",1);
2568}
2569
2570static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2571 listNode *ln;
2572 listIter li;
2573 int j;
2574 sds cmdrepr = sdsnew("+");
2575 robj *cmdobj;
2576 struct timeval tv;
2577
2578 gettimeofday(&tv,NULL);
2579 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2580 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2581
2582 for (j = 0; j < argc; j++) {
2583 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2584 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2585 } else {
2586 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2587 sdslen(argv[j]->ptr));
2588 }
2589 if (j != argc-1)
2590 cmdrepr = sdscatlen(cmdrepr," ",1);
2591 }
2592 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2593 cmdobj = createObject(REDIS_STRING,cmdrepr);
2594
2595 listRewind(monitors,&li);
2596 while((ln = listNext(&li))) {
2597 redisClient *monitor = ln->value;
2598 addReply(monitor,cmdobj);
2599 }
2600 decrRefCount(cmdobj);
2601}
2602
638e42ac 2603static void processInputBuffer(redisClient *c) {
ed9b544e 2604again:
4409877e 2605 /* Before to process the input buffer, make sure the client is not
2606 * waitig for a blocking operation such as BLPOP. Note that the first
2607 * iteration the client is never blocked, otherwise the processInputBuffer
2608 * would not be called at all, but after the execution of the first commands
2609 * in the input buffer the client may be blocked, and the "goto again"
2610 * will try to reiterate. The following line will make it return asap. */
92f8e882 2611 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2612 if (c->bulklen == -1) {
2613 /* Read the first line of the query */
2614 char *p = strchr(c->querybuf,'\n');
2615 size_t querylen;
644fafa3 2616
ed9b544e 2617 if (p) {
2618 sds query, *argv;
2619 int argc, j;
e0a62c7f 2620
ed9b544e 2621 query = c->querybuf;
2622 c->querybuf = sdsempty();
2623 querylen = 1+(p-(query));
2624 if (sdslen(query) > querylen) {
2625 /* leave data after the first line of the query in the buffer */
2626 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2627 }
2628 *p = '\0'; /* remove "\n" */
2629 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2630 sdsupdatelen(query);
2631
2632 /* Now we can split the query in arguments */
ed9b544e 2633 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2634 sdsfree(query);
2635
2636 if (c->argv) zfree(c->argv);
2637 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2638
2639 for (j = 0; j < argc; j++) {
ed9b544e 2640 if (sdslen(argv[j])) {
2641 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2642 c->argc++;
2643 } else {
2644 sdsfree(argv[j]);
2645 }
2646 }
2647 zfree(argv);
7c49733c 2648 if (c->argc) {
2649 /* Execute the command. If the client is still valid
2650 * after processCommand() return and there is something
2651 * on the query buffer try to process the next command. */
2652 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2653 } else {
2654 /* Nothing to process, argc == 0. Just process the query
2655 * buffer if it's not empty or return to the caller */
2656 if (sdslen(c->querybuf)) goto again;
2657 }
ed9b544e 2658 return;
644fafa3 2659 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2660 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2661 freeClient(c);
2662 return;
2663 }
2664 } else {
2665 /* Bulk read handling. Note that if we are at this point
2666 the client already sent a command terminated with a newline,
2667 we are reading the bulk data that is actually the last
2668 argument of the command. */
2669 int qbl = sdslen(c->querybuf);
2670
2671 if (c->bulklen <= qbl) {
2672 /* Copy everything but the final CRLF as final argument */
2673 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2674 c->argc++;
2675 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2676 /* Process the command. If the client is still valid after
2677 * the processing and there is more data in the buffer
2678 * try to parse it. */
2679 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2680 return;
2681 }
2682 }
2683}
2684
638e42ac 2685static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2686 redisClient *c = (redisClient*) privdata;
2687 char buf[REDIS_IOBUF_LEN];
2688 int nread;
2689 REDIS_NOTUSED(el);
2690 REDIS_NOTUSED(mask);
2691
2692 nread = read(fd, buf, REDIS_IOBUF_LEN);
2693 if (nread == -1) {
2694 if (errno == EAGAIN) {
2695 nread = 0;
2696 } else {
f870935d 2697 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2698 freeClient(c);
2699 return;
2700 }
2701 } else if (nread == 0) {
f870935d 2702 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2703 freeClient(c);
2704 return;
2705 }
2706 if (nread) {
2707 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2708 c->lastinteraction = time(NULL);
2709 } else {
2710 return;
2711 }
168ac5c6 2712 processInputBuffer(c);
638e42ac 2713}
2714
ed9b544e 2715static int selectDb(redisClient *c, int id) {
2716 if (id < 0 || id >= server.dbnum)
2717 return REDIS_ERR;
3305306f 2718 c->db = &server.db[id];
ed9b544e 2719 return REDIS_OK;
2720}
2721
40d224a9 2722static void *dupClientReplyValue(void *o) {
2723 incrRefCount((robj*)o);
12d090d2 2724 return o;
40d224a9 2725}
2726
/* Match method installed on the list of Pub/Sub patterns: forwards the
 * two values to equalStringObjects() and returns its result. */
static int listMatchObjects(void *a, void *b) {
    int matched = equalStringObjects(a,b);

    return matched;
}
2730
ed9b544e 2731static redisClient *createClient(int fd) {
2732 redisClient *c = zmalloc(sizeof(*c));
2733
2734 anetNonBlock(NULL,fd);
2735 anetTcpNoDelay(NULL,fd);
2736 if (!c) return NULL;
2737 selectDb(c,0);
2738 c->fd = fd;
2739 c->querybuf = sdsempty();
2740 c->argc = 0;
93ea3759 2741 c->argv = NULL;
ed9b544e 2742 c->bulklen = -1;
e8a74421 2743 c->multibulk = 0;
2744 c->mbargc = 0;
2745 c->mbargv = NULL;
ed9b544e 2746 c->sentlen = 0;
2747 c->flags = 0;
2748 c->lastinteraction = time(NULL);
abcb223e 2749 c->authenticated = 0;
40d224a9 2750 c->replstate = REDIS_REPL_NONE;
6b47e12e 2751 c->reply = listCreate();
ed9b544e 2752 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2753 listSetDupMethod(c->reply,dupClientReplyValue);
37ab76c9 2754 c->blocking_keys = NULL;
2755 c->blocking_keys_num = 0;
92f8e882 2756 c->io_keys = listCreate();
87c68815 2757 c->watched_keys = listCreate();
92f8e882 2758 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2759 c->pubsub_channels = dictCreate(&setDictType,NULL);
2760 c->pubsub_patterns = listCreate();
2761 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2762 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2763 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2764 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2765 freeClient(c);
2766 return NULL;
2767 }
6b47e12e 2768 listAddNodeTail(server.clients,c);
6e469882 2769 initClientMultiState(c);
ed9b544e 2770 return c;
2771}
2772
2773static void addReply(redisClient *c, robj *obj) {
2774 if (listLength(c->reply) == 0 &&
6208b3a7 2775 (c->replstate == REDIS_REPL_NONE ||
2776 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2777 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2778 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2779
2780 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2781 obj = dupStringObject(obj);
2782 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2783 }
9d65a1bb 2784 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2785}
2786
2787static void addReplySds(redisClient *c, sds s) {
2788 robj *o = createObject(REDIS_STRING,s);
2789 addReply(c,o);
2790 decrRefCount(o);
2791}
2792
e2665397 2793static void addReplyDouble(redisClient *c, double d) {
2794 char buf[128];
2795
2796 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2797 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2798 (unsigned long) strlen(buf),buf));
e2665397 2799}
2800
aa7c2934
PN
2801static void addReplyLongLong(redisClient *c, long long ll) {
2802 char buf[128];
2803 size_t len;
2804
2805 if (ll == 0) {
2806 addReply(c,shared.czero);
2807 return;
2808 } else if (ll == 1) {
2809 addReply(c,shared.cone);
2810 return;
2811 }
482b672d 2812 buf[0] = ':';
2813 len = ll2string(buf+1,sizeof(buf)-1,ll);
2814 buf[len+1] = '\r';
2815 buf[len+2] = '\n';
2816 addReplySds(c,sdsnewlen(buf,len+3));
aa7c2934
PN
2817}
2818
92b27fe9 2819static void addReplyUlong(redisClient *c, unsigned long ul) {
2820 char buf[128];
2821 size_t len;
2822
dd88747b 2823 if (ul == 0) {
2824 addReply(c,shared.czero);
2825 return;
2826 } else if (ul == 1) {
2827 addReply(c,shared.cone);
2828 return;
2829 }
92b27fe9 2830 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2831 addReplySds(c,sdsnewlen(buf,len));
2832}
2833
942a3961 2834static void addReplyBulkLen(redisClient *c, robj *obj) {
482b672d 2835 size_t len, intlen;
2836 char buf[128];
942a3961 2837
2838 if (obj->encoding == REDIS_ENCODING_RAW) {
2839 len = sdslen(obj->ptr);
2840 } else {
2841 long n = (long)obj->ptr;
2842
e054afda 2843 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2844 len = 1;
2845 if (n < 0) {
2846 len++;
2847 n = -n;
2848 }
2849 while((n = n/10) != 0) {
2850 len++;
2851 }
2852 }
482b672d 2853 buf[0] = '$';
2854 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2855 buf[intlen+1] = '\r';
2856 buf[intlen+2] = '\n';
2857 addReplySds(c,sdsnewlen(buf,intlen+3));
942a3961 2858}
2859
dd88747b 2860static void addReplyBulk(redisClient *c, robj *obj) {
2861 addReplyBulkLen(c,obj);
2862 addReply(c,obj);
2863 addReply(c,shared.crlf);
2864}
2865
500ece7c 2866/* In the CONFIG command we need to add vanilla C string as bulk replies */
2867static void addReplyBulkCString(redisClient *c, char *s) {
2868 if (s == NULL) {
2869 addReply(c,shared.nullbulk);
2870 } else {
2871 robj *o = createStringObject(s,strlen(s));
2872 addReplyBulk(c,o);
2873 decrRefCount(o);
2874 }
2875}
2876
ed9b544e 2877static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2878 int cport, cfd;
2879 char cip[128];
285add55 2880 redisClient *c;
ed9b544e 2881 REDIS_NOTUSED(el);
2882 REDIS_NOTUSED(mask);
2883 REDIS_NOTUSED(privdata);
2884
2885 cfd = anetAccept(server.neterr, fd, cip, &cport);
2886 if (cfd == AE_ERR) {
f870935d 2887 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2888 return;
2889 }
f870935d 2890 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2891 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2892 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2893 close(cfd); /* May be already closed, just ingore errors */
2894 return;
2895 }
285add55 2896 /* If maxclient directive is set and this is one client more... close the
2897 * connection. Note that we create the client instead to check before
2898 * for this condition, since now the socket is already set in nonblocking
2899 * mode and we can send an error for free using the Kernel I/O */
2900 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2901 char *err = "-ERR max number of clients reached\r\n";
2902
2903 /* That's a best effort error message, don't check write errors */
fee803ba 2904 if (write(c->fd,err,strlen(err)) == -1) {
2905 /* Nothing to do, Just to avoid the warning... */
2906 }
285add55 2907 freeClient(c);
2908 return;
2909 }
ed9b544e 2910 server.stat_numconnections++;
2911}
2912
2913/* ======================= Redis objects implementation ===================== */
2914
2915static robj *createObject(int type, void *ptr) {
2916 robj *o;
2917
a5819310 2918 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2919 if (listLength(server.objfreelist)) {
2920 listNode *head = listFirst(server.objfreelist);
2921 o = listNodeValue(head);
2922 listDelNode(server.objfreelist,head);
a5819310 2923 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2924 } else {
75680a3c 2925 if (server.vm_enabled) {
a5819310 2926 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2927 o = zmalloc(sizeof(*o));
2928 } else {
2929 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2930 }
ed9b544e 2931 }
ed9b544e 2932 o->type = type;
942a3961 2933 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2934 o->ptr = ptr;
2935 o->refcount = 1;
3a66edc7 2936 if (server.vm_enabled) {
1064ef87 2937 /* Note that this code may run in the context of an I/O thread
2938 * and accessing to server.unixtime in theory is an error
2939 * (no locks). But in practice this is safe, and even if we read
2940 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2941 o->vm.atime = server.unixtime;
2942 o->storage = REDIS_VM_MEMORY;
2943 }
ed9b544e 2944 return o;
2945}
2946
2947static robj *createStringObject(char *ptr, size_t len) {
2948 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2949}
2950
3f973463
PN
2951static robj *createStringObjectFromLongLong(long long value) {
2952 robj *o;
2953 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2954 incrRefCount(shared.integers[value]);
2955 o = shared.integers[value];
2956 } else {
3f973463 2957 if (value >= LONG_MIN && value <= LONG_MAX) {
10dea8dc 2958 o = createObject(REDIS_STRING, NULL);
3f973463
PN
2959 o->encoding = REDIS_ENCODING_INT;
2960 o->ptr = (void*)((long)value);
2961 } else {
ee14da56 2962 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3f973463
PN
2963 }
2964 }
2965 return o;
2966}
2967
4ef8de8a 2968static robj *dupStringObject(robj *o) {
b9bc0eef 2969 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2970 return createStringObject(o->ptr,sdslen(o->ptr));
2971}
2972
ed9b544e 2973static robj *createListObject(void) {
2974 list *l = listCreate();
2975
ed9b544e 2976 listSetFreeMethod(l,decrRefCount);
2977 return createObject(REDIS_LIST,l);
2978}
2979
2980static robj *createSetObject(void) {
2981 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2982 return createObject(REDIS_SET,d);
2983}
2984
5234952b 2985static robj *createHashObject(void) {
2986 /* All the Hashes start as zipmaps. Will be automatically converted
2987 * into hash tables if there are enough elements or big elements
2988 * inside. */
2989 unsigned char *zm = zipmapNew();
2990 robj *o = createObject(REDIS_HASH,zm);
2991 o->encoding = REDIS_ENCODING_ZIPMAP;
2992 return o;
2993}
2994
1812e024 2995static robj *createZsetObject(void) {
6b47e12e 2996 zset *zs = zmalloc(sizeof(*zs));
2997
2998 zs->dict = dictCreate(&zsetDictType,NULL);
2999 zs->zsl = zslCreate();
3000 return createObject(REDIS_ZSET,zs);
1812e024 3001}
3002
ed9b544e 3003static void freeStringObject(robj *o) {
942a3961 3004 if (o->encoding == REDIS_ENCODING_RAW) {
3005 sdsfree(o->ptr);
3006 }
ed9b544e 3007}
3008
3009static void freeListObject(robj *o) {
3010 listRelease((list*) o->ptr);
3011}
3012
3013static void freeSetObject(robj *o) {
3014 dictRelease((dict*) o->ptr);
3015}
3016
fd8ccf44 3017static void freeZsetObject(robj *o) {
3018 zset *zs = o->ptr;
3019
3020 dictRelease(zs->dict);
3021 zslFree(zs->zsl);
3022 zfree(zs);
3023}
3024
ed9b544e 3025static void freeHashObject(robj *o) {
cbba7dd7 3026 switch (o->encoding) {
3027 case REDIS_ENCODING_HT:
3028 dictRelease((dict*) o->ptr);
3029 break;
3030 case REDIS_ENCODING_ZIPMAP:
3031 zfree(o->ptr);
3032 break;
3033 default:
f83c6cb5 3034 redisPanic("Unknown hash encoding type");
cbba7dd7 3035 break;
3036 }
ed9b544e 3037}
3038
3039static void incrRefCount(robj *o) {
3040 o->refcount++;
3041}
3042
3043static void decrRefCount(void *obj) {
3044 robj *o = obj;
94754ccc 3045
c651fd9e 3046 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
970e10bb 3047 /* Object is a key of a swapped out value, or in the process of being
3048 * loaded. */
996cb5f7 3049 if (server.vm_enabled &&
3050 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3051 {
996cb5f7 3052 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 3053 redisAssert(o->type == REDIS_STRING);
a35ddf12 3054 freeStringObject(o);
3055 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 3056 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 3057 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3058 !listAddNodeHead(server.objfreelist,o))
3059 zfree(o);
a5819310 3060 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 3061 server.vm_stats_swapped_objects--;
a35ddf12 3062 return;
3063 }
996cb5f7 3064 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 3065 if (--(o->refcount) == 0) {
996cb5f7 3066 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3067 vmCancelThreadedIOJob(obj);
ed9b544e 3068 switch(o->type) {
3069 case REDIS_STRING: freeStringObject(o); break;
3070 case REDIS_LIST: freeListObject(o); break;
3071 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 3072 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 3073 case REDIS_HASH: freeHashObject(o); break;
f83c6cb5 3074 default: redisPanic("Unknown object type"); break;
ed9b544e 3075 }
a5819310 3076 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 3077 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3078 !listAddNodeHead(server.objfreelist,o))
3079 zfree(o);
a5819310 3080 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3081 }
3082}
3083
942a3961 3084static robj *lookupKey(redisDb *db, robj *key) {
3085 dictEntry *de = dictFind(db->dict,key);
3a66edc7 3086 if (de) {
55cf8433 3087 robj *key = dictGetEntryKey(de);
3088 robj *val = dictGetEntryVal(de);
3a66edc7 3089
55cf8433 3090 if (server.vm_enabled) {
996cb5f7 3091 if (key->storage == REDIS_VM_MEMORY ||
3092 key->storage == REDIS_VM_SWAPPING)
3093 {
3094 /* If we were swapping the object out, stop it, this key
3095 * was requested. */
3096 if (key->storage == REDIS_VM_SWAPPING)
3097 vmCancelThreadedIOJob(key);
55cf8433 3098 /* Update the access time of the key for the aging algorithm. */
3099 key->vm.atime = server.unixtime;
3100 } else {
d5d55fc3 3101 int notify = (key->storage == REDIS_VM_LOADING);
3102
55cf8433 3103 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 3104 redisAssert(val == NULL);
55cf8433 3105 val = vmLoadObject(key);
3106 dictGetEntryVal(de) = val;
d5d55fc3 3107
3108 /* Clients blocked by the VM subsystem may be waiting for
3109 * this key... */
3110 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 3111 }
3112 }
3113 return val;
3a66edc7 3114 } else {
3115 return NULL;
3116 }
942a3961 3117}
3118
3119static robj *lookupKeyRead(redisDb *db, robj *key) {
3120 expireIfNeeded(db,key);
3121 return lookupKey(db,key);
3122}
3123
3124static robj *lookupKeyWrite(redisDb *db, robj *key) {
3125 deleteIfVolatile(db,key);
37ab76c9 3126 touchWatchedKey(db,key);
942a3961 3127 return lookupKey(db,key);
3128}
3129
92b27fe9 3130static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3131 robj *o = lookupKeyRead(c->db, key);
3132 if (!o) addReply(c,reply);
3133 return o;
3134}
3135
3136static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3137 robj *o = lookupKeyWrite(c->db, key);
3138 if (!o) addReply(c,reply);
3139 return o;
3140}
3141
3142static int checkType(redisClient *c, robj *o, int type) {
3143 if (o->type != type) {
3144 addReply(c,shared.wrongtypeerr);
3145 return 1;
3146 }
3147 return 0;
3148}
3149
942a3961 3150static int deleteKey(redisDb *db, robj *key) {
3151 int retval;
3152
3153 /* We need to protect key from destruction: after the first dictDelete()
3154 * it may happen that 'key' is no longer valid if we don't increment
3155 * it's count. This may happen when we get the object reference directly
3156 * from the hash table with dictRandomKey() or dict iterators */
3157 incrRefCount(key);
3158 if (dictSize(db->expires)) dictDelete(db->expires,key);
3159 retval = dictDelete(db->dict,key);
3160 decrRefCount(key);
3161
3162 return retval == DICT_OK;
3163}
3164
724a51b1 3165/* Check if the nul-terminated string 's' can be represented by a long
3166 * (that is, is a number that fits into long without any other space or
3167 * character before or after the digits).
3168 *
3169 * If so, the function returns REDIS_OK and *longval is set to the value
3170 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 3171static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 3172 char buf[32], *endptr;
3173 long value;
3174 int slen;
e0a62c7f 3175
724a51b1 3176 value = strtol(s, &endptr, 10);
3177 if (endptr[0] != '\0') return REDIS_ERR;
ee14da56 3178 slen = ll2string(buf,32,value);
724a51b1 3179
3180 /* If the number converted back into a string is not identical
3181 * then it's not possible to encode the string as integer */
f69f2cba 3182 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 3183 if (longval) *longval = value;
3184 return REDIS_OK;
3185}
3186
942a3961 3187/* Try to encode a string object in order to save space */
05df7621 3188static robj *tryObjectEncoding(robj *o) {
942a3961 3189 long value;
942a3961 3190 sds s = o->ptr;
3305306f 3191
942a3961 3192 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 3193 return o; /* Already encoded */
3305306f 3194
05df7621 3195 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 3196 * everywhere in the "object space" of Redis. Encoded objects can only
3197 * appear as "values" (and not, for instance, as keys) */
05df7621 3198 if (o->refcount > 1) return o;
3305306f 3199
942a3961 3200 /* Currently we try to encode only strings */
dfc5e96c 3201 redisAssert(o->type == REDIS_STRING);
94754ccc 3202
724a51b1 3203 /* Check if we can represent this string as a long integer */
05df7621 3204 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 3205
3206 /* Ok, this object can be encoded */
05df7621 3207 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3208 decrRefCount(o);
3209 incrRefCount(shared.integers[value]);
3210 return shared.integers[value];
3211 } else {
3212 o->encoding = REDIS_ENCODING_INT;
3213 sdsfree(o->ptr);
3214 o->ptr = (void*) value;
3215 return o;
3216 }
942a3961 3217}
3218
9d65a1bb 3219/* Get a decoded version of an encoded object (returned as a new object).
3220 * If the object is already raw-encoded just increment the ref count. */
3221static robj *getDecodedObject(robj *o) {
942a3961 3222 robj *dec;
e0a62c7f 3223
9d65a1bb 3224 if (o->encoding == REDIS_ENCODING_RAW) {
3225 incrRefCount(o);
3226 return o;
3227 }
942a3961 3228 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3229 char buf[32];
3230
ee14da56 3231 ll2string(buf,32,(long)o->ptr);
942a3961 3232 dec = createStringObject(buf,strlen(buf));
3233 return dec;
3234 } else {
08ee9b57 3235 redisPanic("Unknown encoding type");
942a3961 3236 }
3305306f 3237}
3238
d7f43c08 3239/* Compare two string objects via strcmp() or alike.
3240 * Note that the objects may be integer-encoded. In such a case we
ee14da56 3241 * use ll2string() to get a string representation of the numbers on the stack
1fd9bc8a 3242 * and compare the strings, it's much faster than calling getDecodedObject().
3243 *
3244 * Important note: if objects are not integer encoded, but binary-safe strings,
3245 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3246 * binary safe. */
724a51b1 3247static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3248 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3249 char bufa[128], bufb[128], *astr, *bstr;
3250 int bothsds = 1;
724a51b1 3251
e197b441 3252 if (a == b) return 0;
d7f43c08 3253 if (a->encoding != REDIS_ENCODING_RAW) {
ee14da56 3254 ll2string(bufa,sizeof(bufa),(long) a->ptr);
d7f43c08 3255 astr = bufa;
3256 bothsds = 0;
724a51b1 3257 } else {
d7f43c08 3258 astr = a->ptr;
724a51b1 3259 }
d7f43c08 3260 if (b->encoding != REDIS_ENCODING_RAW) {
ee14da56 3261 ll2string(bufb,sizeof(bufb),(long) b->ptr);
d7f43c08 3262 bstr = bufb;
3263 bothsds = 0;
3264 } else {
3265 bstr = b->ptr;
3266 }
3267 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3268}
3269
bf028098 3270/* Equal string objects return 1 if the two objects are the same from the
3271 * point of view of a string comparison, otherwise 0 is returned. Note that
3272 * this function is faster then checking for (compareStringObject(a,b) == 0)
3273 * because it can perform some more optimization. */
3274static int equalStringObjects(robj *a, robj *b) {
3275 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3276 return a->ptr == b->ptr;
3277 } else {
3278 return compareStringObjects(a,b) == 0;
3279 }
3280}
3281
0ea663ea 3282static size_t stringObjectLen(robj *o) {
dfc5e96c 3283 redisAssert(o->type == REDIS_STRING);
0ea663ea 3284 if (o->encoding == REDIS_ENCODING_RAW) {
3285 return sdslen(o->ptr);
3286 } else {
3287 char buf[32];
3288
ee14da56 3289 return ll2string(buf,32,(long)o->ptr);
0ea663ea 3290 }
3291}
3292
bd79a6bd
PN
3293static int getDoubleFromObject(robj *o, double *target) {
3294 double value;
682c73e8 3295 char *eptr;
bbe025e0 3296
bd79a6bd
PN
3297 if (o == NULL) {
3298 value = 0;
3299 } else {
3300 redisAssert(o->type == REDIS_STRING);
3301 if (o->encoding == REDIS_ENCODING_RAW) {
3302 value = strtod(o->ptr, &eptr);
682c73e8 3303 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3304 } else if (o->encoding == REDIS_ENCODING_INT) {
3305 value = (long)o->ptr;
3306 } else {
946342c1 3307 redisPanic("Unknown string encoding");
bd79a6bd
PN
3308 }
3309 }
3310
bd79a6bd
PN
3311 *target = value;
3312 return REDIS_OK;
3313}
bbe025e0 3314
bd79a6bd
PN
3315static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3316 double value;
3317 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3318 if (msg != NULL) {
3319 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3320 } else {
3321 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3322 }
bbe025e0
AM
3323 return REDIS_ERR;
3324 }
3325
bd79a6bd 3326 *target = value;
bbe025e0
AM
3327 return REDIS_OK;
3328}
3329
bd79a6bd
PN
3330static int getLongLongFromObject(robj *o, long long *target) {
3331 long long value;
682c73e8 3332 char *eptr;
bbe025e0 3333
bd79a6bd
PN
3334 if (o == NULL) {
3335 value = 0;
3336 } else {
3337 redisAssert(o->type == REDIS_STRING);
3338 if (o->encoding == REDIS_ENCODING_RAW) {
3339 value = strtoll(o->ptr, &eptr, 10);
682c73e8 3340 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3341 } else if (o->encoding == REDIS_ENCODING_INT) {
3342 value = (long)o->ptr;
3343 } else {
946342c1 3344 redisPanic("Unknown string encoding");
bd79a6bd
PN
3345 }
3346 }
3347
bd79a6bd
PN
3348 *target = value;
3349 return REDIS_OK;
3350}
bbe025e0 3351
bd79a6bd
PN
3352static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3353 long long value;
3354 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3355 if (msg != NULL) {
3356 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3357 } else {
3358 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3359 }
bbe025e0
AM
3360 return REDIS_ERR;
3361 }
3362
bd79a6bd 3363 *target = value;
bbe025e0
AM
3364 return REDIS_OK;
3365}
3366
bd79a6bd
PN
3367static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3368 long long value;
bbe025e0 3369
bd79a6bd
PN
3370 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3371 if (value < LONG_MIN || value > LONG_MAX) {
3372 if (msg != NULL) {
3373 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3374 } else {
3375 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3376 }
bbe025e0
AM
3377 return REDIS_ERR;
3378 }
3379
bd79a6bd 3380 *target = value;
bbe025e0
AM
3381 return REDIS_OK;
3382}
3383
06233c45 3384/*============================ RDB saving/loading =========================== */
ed9b544e 3385
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 if the byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3390
/* Write the time 't' to 'fp' as a 32 bit signed integer in the byte order
 * of the host. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,4,1,fp) == 0) ? -1 : 0;
}
3396
e3566d4b 3397/* check rdbLoadLen() comments for more info */
f78fd11b 3398static int rdbSaveLen(FILE *fp, uint32_t len) {
3399 unsigned char buf[2];
3400
3401 if (len < (1<<6)) {
3402 /* Save a 6 bit len */
10c43610 3403 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3404 if (fwrite(buf,1,1,fp) == 0) return -1;
3405 } else if (len < (1<<14)) {
3406 /* Save a 14 bit len */
10c43610 3407 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3408 buf[1] = len&0xFF;
17be1a4a 3409 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3410 } else {
3411 /* Save a 32 bit len */
10c43610 3412 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3413 if (fwrite(buf,1,1,fp) == 0) return -1;
3414 len = htonl(len);
3415 if (fwrite(&len,4,1,fp) == 0) return -1;
3416 }
3417 return 0;
3418}
3419
32a66513 3420/* Encode 'value' as an integer if possible (if integer will fit the
3421 * supported range). If the function sucessful encoded the integer
3422 * then the (up to 5 bytes) encoded representation is written in the
3423 * string pointed by 'enc' and the length is returned. Otherwise
3424 * 0 is returned. */
3425static int rdbEncodeInteger(long long value, unsigned char *enc) {
e3566d4b 3426 /* Finally check if it fits in our ranges */
3427 if (value >= -(1<<7) && value <= (1<<7)-1) {
3428 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3429 enc[1] = value&0xFF;
3430 return 2;
3431 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3432 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3433 enc[1] = value&0xFF;
3434 enc[2] = (value>>8)&0xFF;
3435 return 3;
3436 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3437 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3438 enc[1] = value&0xFF;
3439 enc[2] = (value>>8)&0xFF;
3440 enc[3] = (value>>16)&0xFF;
3441 enc[4] = (value>>24)&0xFF;
3442 return 5;
3443 } else {
3444 return 0;
3445 }
3446}
3447
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes that are NOT required to be null terminated.
 * On success the encoded form is written to 'enc' and its length is
 * returned, otherwise 0 is returned and the caller has to save the string
 * in some other way. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, buf[32], str[32];

    /* strtoll() needs a null terminated string, but this function is also
     * called with pointers inside a zipmap, where fields are not
     * terminated. Work on a private terminated copy so that we never read
     * past the end of the caller's buffer. Strings that don't fit the
     * copy are far too long to be an encodable integer anyway. */
    if (len == 0 || len >= sizeof(str)) return 0;
    memcpy(str,s,len);
    str[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(str, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3466
b1befe6a 3467static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3468 size_t comprlen, outlen;
774e3047 3469 unsigned char byte;
3470 void *out;
3471
3472 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3473 if (len <= 4) return 0;
3474 outlen = len-4;
3a2694c4 3475 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3476 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3477 if (comprlen == 0) {
88e85998 3478 zfree(out);
774e3047 3479 return 0;
3480 }
3481 /* Data compressed! Let's save it on disk */
3482 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3483 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3484 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3485 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3486 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3487 zfree(out);
774e3047 3488 return comprlen;
3489
3490writeerr:
88e85998 3491 zfree(out);
774e3047 3492 return -1;
3493}
3494
e3566d4b 3495/* Save a string objet as [len][data] on disk. If the object is a string
3496 * representation of an integer value we try to safe it in a special form */
b1befe6a 3497static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3498 int enclen;
10c43610 3499
774e3047 3500 /* Try integer encoding */
e3566d4b 3501 if (len <= 11) {
3502 unsigned char buf[5];
b1befe6a 3503 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3504 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3505 return 0;
3506 }
3507 }
774e3047 3508
3509 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3510 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3511 if (server.rdbcompression && len > 20) {
774e3047 3512 int retval;
3513
b1befe6a 3514 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3515 if (retval == -1) return -1;
3516 if (retval > 0) return 0;
3517 /* retval == 0 means data can't be compressed, save the old way */
3518 }
3519
3520 /* Store verbatim */
10c43610 3521 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3522 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3523 return 0;
3524}
3525
942a3961 3526/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3527static int rdbSaveStringObject(FILE *fp, robj *obj) {
3528 int retval;
942a3961 3529
32a66513 3530 /* Avoid to decode the object, then encode it again, if the
3531 * object is alrady integer encoded. */
3532 if (obj->encoding == REDIS_ENCODING_INT) {
3533 long val = (long) obj->ptr;
3534 unsigned char buf[5];
3535 int enclen;
3536
3537 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3538 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3539 return 0;
3540 }
3541 /* otherwise... fall throught and continue with the usual
3542 * code path. */
3543 }
3544
f2d9f50f 3545 /* Avoid incr/decr ref count business when possible.
3546 * This plays well with copy-on-write given that we are probably
3547 * in a child process (BGSAVE). Also this makes sure key objects
3548 * of swapped objects are not incRefCount-ed (an assert does not allow
3549 * this in order to avoid bugs) */
3550 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3551 obj = getDecodedObject(obj);
b1befe6a 3552 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3553 decrRefCount(obj);
3554 } else {
b1befe6a 3555 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3556 }
9d65a1bb 3557 return retval;
942a3961 3558}
3559
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error.
 */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The string starts at buf+1, so only sizeof(buf)-1 bytes
             * are available to the printing function. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        /* First byte is the length of the representation that follows. */
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3602
06233c45 3603/* Save a Redis object. */
3604static int rdbSaveObject(FILE *fp, robj *o) {
3605 if (o->type == REDIS_STRING) {
3606 /* Save a string value */
3607 if (rdbSaveStringObject(fp,o) == -1) return -1;
3608 } else if (o->type == REDIS_LIST) {
3609 /* Save a list value */
3610 list *list = o->ptr;
c7df85a4 3611 listIter li;
06233c45 3612 listNode *ln;
3613
06233c45 3614 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3615 listRewind(list,&li);
3616 while((ln = listNext(&li))) {
06233c45 3617 robj *eleobj = listNodeValue(ln);
3618
3619 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3620 }
3621 } else if (o->type == REDIS_SET) {
3622 /* Save a set value */
3623 dict *set = o->ptr;
3624 dictIterator *di = dictGetIterator(set);
3625 dictEntry *de;
3626
3627 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3628 while((de = dictNext(di)) != NULL) {
3629 robj *eleobj = dictGetEntryKey(de);
3630
3631 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3632 }
3633 dictReleaseIterator(di);
3634 } else if (o->type == REDIS_ZSET) {
3635 /* Save a set value */
3636 zset *zs = o->ptr;
3637 dictIterator *di = dictGetIterator(zs->dict);
3638 dictEntry *de;
3639
3640 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3641 while((de = dictNext(di)) != NULL) {
3642 robj *eleobj = dictGetEntryKey(de);
3643 double *score = dictGetEntryVal(de);
3644
3645 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3646 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3647 }
3648 dictReleaseIterator(di);
b1befe6a 3649 } else if (o->type == REDIS_HASH) {
3650 /* Save a hash value */
3651 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3652 unsigned char *p = zipmapRewind(o->ptr);
3653 unsigned int count = zipmapLen(o->ptr);
3654 unsigned char *key, *val;
3655 unsigned int klen, vlen;
3656
3657 if (rdbSaveLen(fp,count) == -1) return -1;
3658 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3659 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3660 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3661 }
3662 } else {
3663 dictIterator *di = dictGetIterator(o->ptr);
3664 dictEntry *de;
3665
3666 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3667 while((de = dictNext(di)) != NULL) {
3668 robj *key = dictGetEntryKey(de);
3669 robj *val = dictGetEntryVal(de);
3670
3671 if (rdbSaveStringObject(fp,key) == -1) return -1;
3672 if (rdbSaveStringObject(fp,val) == -1) return -1;
3673 }
3674 dictReleaseIterator(di);
3675 }
06233c45 3676 } else {
f83c6cb5 3677 redisPanic("Unknown object type");
06233c45 3678 }
3679 return 0;
3680}
3681
3682/* Return the length the object will have on disk if saved with
3683 * the rdbSaveObject() function. Currently we use a trick to get
3684 * this length with very little changes to the code. In the future
3685 * we could switch to a faster solution. */
b9bc0eef 3686static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3687 if (fp == NULL) fp = server.devnull;
06233c45 3688 rewind(fp);
3689 assert(rdbSaveObject(fp,o) != 1);
3690 return ftello(fp);
3691}
3692
06224fec 3693/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3694static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3695 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3696
06224fec 3697 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3698}
3699
ed9b544e 3700/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3701static int rdbSave(char *filename) {
ed9b544e 3702 dictIterator *di = NULL;
3703 dictEntry *de;
ed9b544e 3704 FILE *fp;
3705 char tmpfile[256];
3706 int j;
bb32ede5 3707 time_t now = time(NULL);
ed9b544e 3708
2316bb3b 3709 /* Wait for I/O therads to terminate, just in case this is a
3710 * foreground-saving, to avoid seeking the swap file descriptor at the
3711 * same time. */
3712 if (server.vm_enabled)
3713 waitEmptyIOJobsQueue();
3714
a3b21203 3715 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3716 fp = fopen(tmpfile,"w");
3717 if (!fp) {
3718 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3719 return REDIS_ERR;
3720 }
f78fd11b 3721 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3722 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3723 redisDb *db = server.db+j;
3724 dict *d = db->dict;
3305306f 3725 if (dictSize(d) == 0) continue;
ed9b544e 3726 di = dictGetIterator(d);
3727 if (!di) {
3728 fclose(fp);
3729 return REDIS_ERR;
3730 }
3731
3732 /* Write the SELECT DB opcode */
f78fd11b 3733 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3734 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3735
3736 /* Iterate this DB writing every entry */
3737 while((de = dictNext(di)) != NULL) {
3738 robj *key = dictGetEntryKey(de);
3739 robj *o = dictGetEntryVal(de);
bb32ede5 3740 time_t expiretime = getExpire(db,key);
3741
3742 /* Save the expire time */
3743 if (expiretime != -1) {
3744 /* If this key is already expired skip it */
3745 if (expiretime < now) continue;
3746 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3747 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3748 }
7e69548d 3749 /* Save the key and associated value. This requires special
3750 * handling if the value is swapped out. */
996cb5f7 3751 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3752 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3753 /* Save type, key, value */
3754 if (rdbSaveType(fp,o->type) == -1) goto werr;
3755 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3756 if (rdbSaveObject(fp,o) == -1) goto werr;
3757 } else {
996cb5f7 3758 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3759 robj *po;
7e69548d 3760 /* Get a preview of the object in memory */
3761 po = vmPreviewObject(key);
7e69548d 3762 /* Save type, key, value */
3763 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3764 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3765 if (rdbSaveObject(fp,po) == -1) goto werr;
3766 /* Remove the loaded object from memory */
3767 decrRefCount(po);
7e69548d 3768 }
ed9b544e 3769 }
3770 dictReleaseIterator(di);
3771 }
3772 /* EOF opcode */
f78fd11b 3773 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3774
3775 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3776 fflush(fp);
3777 fsync(fileno(fp));
3778 fclose(fp);
e0a62c7f 3779
ed9b544e 3780 /* Use RENAME to make sure the DB file is changed atomically only
3781 * if the generate DB file is ok. */
3782 if (rename(tmpfile,filename) == -1) {
325d1eb4 3783 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3784 unlink(tmpfile);
3785 return REDIS_ERR;
3786 }
3787 redisLog(REDIS_NOTICE,"DB saved on disk");
3788 server.dirty = 0;
3789 server.lastsave = time(NULL);
3790 return REDIS_OK;
3791
3792werr:
3793 fclose(fp);
3794 unlink(tmpfile);
3795 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3796 if (di) dictReleaseIterator(di);
3797 return REDIS_ERR;
3798}
3799
f78fd11b 3800static int rdbSaveBackground(char *filename) {
ed9b544e 3801 pid_t childpid;
3802
9d65a1bb 3803 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3804 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3805 if ((childpid = fork()) == 0) {
3806 /* Child */
054e426d 3807 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3808 close(server.fd);
f78fd11b 3809 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3810 _exit(0);
ed9b544e 3811 } else {
478c2c6f 3812 _exit(1);
ed9b544e 3813 }
3814 } else {
3815 /* Parent */
5a7c647e 3816 if (childpid == -1) {
3817 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3818 strerror(errno));
3819 return REDIS_ERR;
3820 }
ed9b544e 3821 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3822 server.bgsavechildpid = childpid;
884d4b39 3823 updateDictResizePolicy();
ed9b544e 3824 return REDIS_OK;
3825 }
3826 return REDIS_OK; /* unreached */
3827}
3828
/* Remove the temporary file "temp-<childpid>.rdb", the name rdbSave() uses
 * for its output when running in the process with pid 'childpid'. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3835
/* Read a single byte type from 'fp'.
 * Returns the byte as a non negative value, or -1 if it can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3841
/* Read a time saved by rdbSaveTime() (a 32 bit signed integer) from 'fp'.
 * Returns the time, or -1 if it can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,4,1,fp) != 0) return (time_t) stamp;
    return -1;
}
3847
e3566d4b 3848/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3849 * of this file for a description of how this are stored on disk.
3850 *
3851 * isencoded is set to 1 if the readed length is not actually a length but
3852 * an "encoding type", check the above comments for more info */
c78a8ccc 3853static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3854 unsigned char buf[2];
3855 uint32_t len;
c78a8ccc 3856 int type;
f78fd11b 3857
e3566d4b 3858 if (isencoded) *isencoded = 0;
c78a8ccc 3859 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3860 type = (buf[0]&0xC0)>>6;
3861 if (type == REDIS_RDB_6BITLEN) {
3862 /* Read a 6 bit len */
3863 return buf[0]&0x3F;
3864 } else if (type == REDIS_RDB_ENCVAL) {
3865 /* Read a 6 bit len encoding type */
3866 if (isencoded) *isencoded = 1;
3867 return buf[0]&0x3F;
3868 } else if (type == REDIS_RDB_14BITLEN) {
3869 /* Read a 14 bit len */
3870 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3871 return ((buf[0]&0x3F)<<8)|buf[1];
3872 } else {
3873 /* Read a 32 bit len */
f78fd11b 3874 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3875 return ntohl(len);
f78fd11b 3876 }
f78fd11b 3877}
3878
ad30aa60 3879/* Load an integer-encoded object from file 'fp', with the specified
3880 * encoding type 'enctype'. If encode is true the function may return
3881 * an integer-encoded object as reply, otherwise the returned object
3882 * will always be encoded as a raw string. */
3883static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
e3566d4b 3884 unsigned char enc[4];
3885 long long val;
3886
3887 if (enctype == REDIS_RDB_ENC_INT8) {
3888 if (fread(enc,1,1,fp) == 0) return NULL;
3889 val = (signed char)enc[0];
3890 } else if (enctype == REDIS_RDB_ENC_INT16) {
3891 uint16_t v;
3892 if (fread(enc,2,1,fp) == 0) return NULL;
3893 v = enc[0]|(enc[1]<<8);
3894 val = (int16_t)v;
3895 } else if (enctype == REDIS_RDB_ENC_INT32) {
3896 uint32_t v;
3897 if (fread(enc,4,1,fp) == 0) return NULL;
3898 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3899 val = (int32_t)v;
3900 } else {
3901 val = 0; /* anti-warning */
f83c6cb5 3902 redisPanic("Unknown RDB integer encoding type");
e3566d4b 3903 }
ad30aa60 3904 if (encode)
3905 return createStringObjectFromLongLong(val);
3906 else
3907 return createObject(REDIS_STRING,sdsfromlonglong(val));
e3566d4b 3908}
3909
c78a8ccc 3910static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3911 unsigned int len, clen;
3912 unsigned char *c = NULL;
3913 sds val = NULL;
3914
c78a8ccc 3915 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3916 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3917 if ((c = zmalloc(clen)) == NULL) goto err;
3918 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3919 if (fread(c,clen,1,fp) == 0) goto err;
3920 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3921 zfree(c);
88e85998 3922 return createObject(REDIS_STRING,val);
3923err:
3924 zfree(c);
3925 sdsfree(val);
3926 return NULL;
3927}
3928
ad30aa60 3929static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
e3566d4b 3930 int isencoded;
3931 uint32_t len;
f78fd11b 3932 sds val;
3933
c78a8ccc 3934 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3935 if (isencoded) {
3936 switch(len) {
3937 case REDIS_RDB_ENC_INT8:
3938 case REDIS_RDB_ENC_INT16:
3939 case REDIS_RDB_ENC_INT32:
ad30aa60 3940 return rdbLoadIntegerObject(fp,len,encode);
88e85998 3941 case REDIS_RDB_ENC_LZF:
bdcb92f2 3942 return rdbLoadLzfStringObject(fp);
e3566d4b 3943 default:
f83c6cb5 3944 redisPanic("Unknown RDB encoding type");
e3566d4b 3945 }
3946 }
3947
f78fd11b 3948 if (len == REDIS_RDB_LENERR) return NULL;
3949 val = sdsnewlen(NULL,len);
3950 if (len && fread(val,len,1,fp) == 0) {
3951 sdsfree(val);
3952 return NULL;
3953 }
bdcb92f2 3954 return createObject(REDIS_STRING,val);
f78fd11b 3955}
3956
ad30aa60 3957static robj *rdbLoadStringObject(FILE *fp) {
3958 return rdbGenericLoadStringObject(fp,0);
3959}
3960
3961static robj *rdbLoadEncodedStringObject(FILE *fp) {
3962 return rdbGenericLoadStringObject(fp,1);
3963}
3964
a7866db6 3965/* For information about double serialization check rdbSaveDoubleValue() */
3966static int rdbLoadDoubleValue(FILE *fp, double *val) {
3967 char buf[128];
3968 unsigned char len;
3969
3970 if (fread(&len,1,1,fp) == 0) return -1;
3971 switch(len) {
3972 case 255: *val = R_NegInf; return 0;
3973 case 254: *val = R_PosInf; return 0;
3974 case 253: *val = R_Nan; return 0;
3975 default:
3976 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3977 buf[len] = '\0';
a7866db6 3978 sscanf(buf, "%lg", val);
3979 return 0;
3980 }
3981}
3982
c78a8ccc 3983/* Load a Redis object of the specified type from the specified file.
3984 * On success a newly allocated object is returned, otherwise NULL. */
3985static robj *rdbLoadObject(int type, FILE *fp) {
3986 robj *o;
3987
bcd11906 3988 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 3989 if (type == REDIS_STRING) {
3990 /* Read string value */
ad30aa60 3991 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 3992 o = tryObjectEncoding(o);
c78a8ccc 3993 } else if (type == REDIS_LIST || type == REDIS_SET) {
3994 /* Read list/set value */
3995 uint32_t listlen;
3996
3997 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3998 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 3999 /* It's faster to expand the dict to the right size asap in order
4000 * to avoid rehashing */
4001 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
4002 dictExpand(o->ptr,listlen);
c78a8ccc 4003 /* Load every single element of the list/set */
4004 while(listlen--) {
4005 robj *ele;
4006
ad30aa60 4007 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4008 ele = tryObjectEncoding(ele);
c78a8ccc 4009 if (type == REDIS_LIST) {
4010 listAddNodeTail((list*)o->ptr,ele);
4011 } else {
4012 dictAdd((dict*)o->ptr,ele,NULL);
4013 }
4014 }
4015 } else if (type == REDIS_ZSET) {
4016 /* Read list/set value */
ada386b2 4017 size_t zsetlen;
c78a8ccc 4018 zset *zs;
4019
4020 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4021 o = createZsetObject();
4022 zs = o->ptr;
4023 /* Load every single element of the list/set */
4024 while(zsetlen--) {
4025 robj *ele;
4026 double *score = zmalloc(sizeof(double));
4027
ad30aa60 4028 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4029 ele = tryObjectEncoding(ele);
c78a8ccc 4030 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4031 dictAdd(zs->dict,ele,score);
4032 zslInsert(zs->zsl,*score,ele);
4033 incrRefCount(ele); /* added to skiplist */
4034 }
ada386b2 4035 } else if (type == REDIS_HASH) {
4036 size_t hashlen;
4037
4038 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4039 o = createHashObject();
4040 /* Too many entries? Use an hash table. */
4041 if (hashlen > server.hash_max_zipmap_entries)
4042 convertToRealHash(o);
4043 /* Load every key/value, then set it into the zipmap or hash
4044 * table, as needed. */
4045 while(hashlen--) {
4046 robj *key, *val;
4047
4048 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4049 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4050 /* If we are using a zipmap and there are too big values
4051 * the object is converted to real hash table encoding. */
4052 if (o->encoding != REDIS_ENCODING_HT &&
4053 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4054 sdslen(val->ptr) > server.hash_max_zipmap_value))
4055 {
4056 convertToRealHash(o);
4057 }
4058
4059 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4060 unsigned char *zm = o->ptr;
4061
4062 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4063 val->ptr,sdslen(val->ptr),NULL);
4064 o->ptr = zm;
4065 decrRefCount(key);
4066 decrRefCount(val);
4067 } else {
05df7621 4068 key = tryObjectEncoding(key);
4069 val = tryObjectEncoding(val);
ada386b2 4070 dictAdd((dict*)o->ptr,key,val);
ada386b2 4071 }
4072 }
c78a8ccc 4073 } else {
f83c6cb5 4074 redisPanic("Unknown object type");
c78a8ccc 4075 }
4076 return o;
4077}
4078
f78fd11b 4079static int rdbLoad(char *filename) {
ed9b544e 4080 FILE *fp;
f78fd11b 4081 uint32_t dbid;
bb32ede5 4082 int type, retval, rdbver;
585af7e2 4083 int swap_all_values = 0;
3305306f 4084 dict *d = server.db[0].dict;
bb32ede5 4085 redisDb *db = server.db+0;
f78fd11b 4086 char buf[1024];
242a64f3 4087 time_t expiretime, now = time(NULL);
b492cf00 4088 long long loadedkeys = 0;
bb32ede5 4089
ed9b544e 4090 fp = fopen(filename,"r");
4091 if (!fp) return REDIS_ERR;
4092 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 4093 buf[9] = '\0';
4094 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 4095 fclose(fp);
4096 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4097 return REDIS_ERR;
4098 }
f78fd11b 4099 rdbver = atoi(buf+5);
c78a8ccc 4100 if (rdbver != 1) {
f78fd11b 4101 fclose(fp);
4102 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4103 return REDIS_ERR;
4104 }
ed9b544e 4105 while(1) {
585af7e2 4106 robj *key, *val;
ed9b544e 4107
585af7e2 4108 expiretime = -1;
ed9b544e 4109 /* Read type. */
f78fd11b 4110 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 4111 if (type == REDIS_EXPIRETIME) {
4112 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4113 /* We read the time so we need to read the object type again */
4114 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4115 }
ed9b544e 4116 if (type == REDIS_EOF) break;
4117 /* Handle SELECT DB opcode as a special case */
4118 if (type == REDIS_SELECTDB) {
c78a8ccc 4119 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 4120 goto eoferr;
ed9b544e 4121 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 4122 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 4123 exit(1);
4124 }
bb32ede5 4125 db = server.db+dbid;
4126 d = db->dict;
ed9b544e 4127 continue;
4128 }
4129 /* Read key */
585af7e2 4130 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
c78a8ccc 4131 /* Read value */
585af7e2 4132 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
89e689c5 4133 /* Check if the key already expired */
4134 if (expiretime != -1 && expiretime < now) {
4135 decrRefCount(key);
4136 decrRefCount(val);
4137 continue;
4138 }
ed9b544e 4139 /* Add the new object in the hash table */
585af7e2 4140 retval = dictAdd(d,key,val);
ed9b544e 4141 if (retval == DICT_ERR) {
585af7e2 4142 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
ed9b544e 4143 exit(1);
4144 }
242a64f3 4145 loadedkeys++;
bb32ede5 4146 /* Set the expire time if needed */
89e689c5 4147 if (expiretime != -1) setExpire(db,key,expiretime);
242a64f3 4148
b492cf00 4149 /* Handle swapping while loading big datasets when VM is on */
242a64f3 4150
4151 /* If we detecter we are hopeless about fitting something in memory
4152 * we just swap every new key on disk. Directly...
4153 * Note that's important to check for this condition before resorting
4154 * to random sampling, otherwise we may try to swap already
4155 * swapped keys. */
585af7e2 4156 if (swap_all_values) {
4157 dictEntry *de = dictFind(d,key);
242a64f3 4158
4159 /* de may be NULL since the key already expired */
4160 if (de) {
585af7e2 4161 key = dictGetEntryKey(de);
4162 val = dictGetEntryVal(de);
242a64f3 4163
585af7e2 4164 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
242a64f3 4165 dictGetEntryVal(de) = NULL;
4166 }
4167 }
4168 continue;
4169 }
4170
4171 /* If we have still some hope of having some value fitting memory
4172 * then we try random sampling. */
585af7e2 4173 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
b492cf00 4174 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 4175 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 4176 }
242a64f3 4177 if (zmalloc_used_memory() > server.vm_max_memory)
585af7e2 4178 swap_all_values = 1; /* We are already using too much mem */
b492cf00 4179 }
ed9b544e 4180 }
4181 fclose(fp);
4182 return REDIS_OK;
4183
4184eoferr: /* unexpected end of file is handled here with a fatal exit */
f80dff62 4185 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 4186 exit(1);
4187 return REDIS_ERR; /* Just to avoid warning */
4188}
4189
b58ba105 4190/*================================== Shutdown =============================== */
fab43727 4191static int prepareForShutdown() {
b58ba105
AM
4192 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4193 /* Kill the saving child if there is a background saving in progress.
4194 We want to avoid race conditions, for instance our saving child may
4195 overwrite the synchronous saving did by SHUTDOWN. */
4196 if (server.bgsavechildpid != -1) {
4197 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4198 kill(server.bgsavechildpid,SIGKILL);
4199 rdbRemoveTempFile(server.bgsavechildpid);
4200 }
4201 if (server.appendonly) {
4202 /* Append only file: fsync() the AOF and exit */
b0bd87f6 4203 aof_fsync(server.appendfd);
b58ba105 4204 if (server.vm_enabled) unlink(server.vm_swap_file);
b58ba105
AM
4205 } else {
4206 /* Snapshotting. Perform a SYNC SAVE and exit */
4207 if (rdbSave(server.dbfilename) == REDIS_OK) {
4208 if (server.daemonize)
4209 unlink(server.pidfile);
4210 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
b58ba105
AM
4211 } else {
4212 /* Ooops.. error saving! The best we can do is to continue
4213 * operating. Note that if there was a background saving process,
4214 * in the next cron() Redis will be notified that the background
4215 * saving aborted, handling special stuff like slaves pending for
4216 * synchronization... */
4217 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
fab43727 4218 return REDIS_ERR;
b58ba105
AM
4219 }
4220 }
8513a757 4221 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
fab43727 4222 return REDIS_OK;
b58ba105
AM
4223}
4224
ed9b544e 4225/*================================== Commands =============================== */
4226
abcb223e 4227static void authCommand(redisClient *c) {
2e77c2ee 4228 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
4229 c->authenticated = 1;
4230 addReply(c,shared.ok);
4231 } else {
4232 c->authenticated = 0;
fa4c0aba 4233 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
4234 }
4235}
4236
ed9b544e 4237static void pingCommand(redisClient *c) {
4238 addReply(c,shared.pong);
4239}
4240
4241static void echoCommand(redisClient *c) {
dd88747b 4242 addReplyBulk(c,c->argv[1]);
ed9b544e 4243}
4244
4245/*=================================== Strings =============================== */
4246
526d00a5 4247static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
ed9b544e 4248 int retval;
10ce1276 4249 long seconds = 0; /* initialized to avoid an harmness warning */
ed9b544e 4250
526d00a5 4251 if (expire) {
4252 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4253 return;
4254 if (seconds <= 0) {
4255 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4256 return;
4257 }
4258 }
4259
37ab76c9 4260 touchWatchedKey(c->db,key);
526d00a5 4261 if (nx) deleteIfVolatile(c->db,key);
4262 retval = dictAdd(c->db->dict,key,val);
ed9b544e 4263 if (retval == DICT_ERR) {
4264 if (!nx) {
1b03836c 4265 /* If the key is about a swapped value, we want a new key object
4266 * to overwrite the old. So we delete the old key in the database.
4267 * This will also make sure that swap pages about the old object
4268 * will be marked as free. */
526d00a5 4269 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4270 incrRefCount(key);
4271 dictReplace(c->db->dict,key,val);
4272 incrRefCount(val);
ed9b544e 4273 } else {
c937aa89 4274 addReply(c,shared.czero);
ed9b544e 4275 return;
4276 }
4277 } else {
526d00a5 4278 incrRefCount(key);
4279 incrRefCount(val);
ed9b544e 4280 }
4281 server.dirty++;
526d00a5 4282 removeExpire(c->db,key);
4283 if (expire) setExpire(c->db,key,time(NULL)+seconds);
c937aa89 4284 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 4285}
4286
4287static void setCommand(redisClient *c) {
526d00a5 4288 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
ed9b544e 4289}
4290
4291static void setnxCommand(redisClient *c) {
526d00a5 4292 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4293}
4294
4295static void setexCommand(redisClient *c) {
4296 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
ed9b544e 4297}
4298
322fc7d8 4299static int getGenericCommand(redisClient *c) {
dd88747b 4300 robj *o;
e0a62c7f 4301
dd88747b 4302 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 4303 return REDIS_OK;
dd88747b 4304
4305 if (o->type != REDIS_STRING) {
4306 addReply(c,shared.wrongtypeerr);
4307 return REDIS_ERR;
ed9b544e 4308 } else {
dd88747b 4309 addReplyBulk(c,o);
4310 return REDIS_OK;
ed9b544e 4311 }
4312}
4313
322fc7d8 4314static void getCommand(redisClient *c) {
4315 getGenericCommand(c);
4316}
4317
f6b141c5 4318static void getsetCommand(redisClient *c) {
322fc7d8 4319 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 4320 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4321 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4322 } else {
4323 incrRefCount(c->argv[1]);
4324 }
4325 incrRefCount(c->argv[2]);
4326 server.dirty++;
4327 removeExpire(c->db,c->argv[1]);
4328}
4329
70003d28 4330static void mgetCommand(redisClient *c) {
70003d28 4331 int j;
e0a62c7f 4332
c937aa89 4333 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 4334 for (j = 1; j < c->argc; j++) {
3305306f 4335 robj *o = lookupKeyRead(c->db,c->argv[j]);
4336 if (o == NULL) {
c937aa89 4337 addReply(c,shared.nullbulk);
70003d28 4338 } else {
70003d28 4339 if (o->type != REDIS_STRING) {
c937aa89 4340 addReply(c,shared.nullbulk);
70003d28 4341 } else {
dd88747b 4342 addReplyBulk(c,o);
70003d28 4343 }
4344 }
4345 }
4346}
4347
6c446631 4348static void msetGenericCommand(redisClient *c, int nx) {
906573e7 4349 int j, busykeys = 0;
6c446631 4350
4351 if ((c->argc % 2) == 0) {
454d4e43 4352 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 4353 return;
4354 }
4355 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4356 * set nothing at all if at least one already key exists. */
4357 if (nx) {
4358 for (j = 1; j < c->argc; j += 2) {
906573e7 4359 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4360 busykeys++;
6c446631 4361 }
4362 }
4363 }
906573e7 4364 if (busykeys) {
4365 addReply(c, shared.czero);
4366 return;
4367 }
6c446631 4368
4369 for (j = 1; j < c->argc; j += 2) {
4370 int retval;
4371
05df7621 4372 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
6c446631 4373 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4374 if (retval == DICT_ERR) {
4375 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4376 incrRefCount(c->argv[j+1]);
4377 } else {
4378 incrRefCount(c->argv[j]);
4379 incrRefCount(c->argv[j+1]);
4380 }
4381 removeExpire(c->db,c->argv[j]);
4382 }
4383 server.dirty += (c->argc-1)/2;
4384 addReply(c, nx ? shared.cone : shared.ok);
4385}
4386
4387static void msetCommand(redisClient *c) {
4388 msetGenericCommand(c,0);
4389}
4390
4391static void msetnxCommand(redisClient *c) {
4392 msetGenericCommand(c,1);
4393}
4394
d68ed120 4395static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4396 long long value;
4397 int retval;
4398 robj *o;
e0a62c7f 4399
3305306f 4400 o = lookupKeyWrite(c->db,c->argv[1]);
6485f293
PN
4401 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4402 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
ed9b544e 4403
4404 value += incr;
d6f4c262 4405 o = createStringObjectFromLongLong(value);
3305306f 4406 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 4407 if (retval == DICT_ERR) {
3305306f 4408 dictReplace(c->db->dict,c->argv[1],o);
4409 removeExpire(c->db,c->argv[1]);
ed9b544e 4410 } else {
4411 incrRefCount(c->argv[1]);
4412 }
4413 server.dirty++;
c937aa89 4414 addReply(c,shared.colon);
ed9b544e 4415 addReply(c,o);
4416 addReply(c,shared.crlf);
4417}
4418
4419static void incrCommand(redisClient *c) {
a4d1ba9a 4420 incrDecrCommand(c,1);
ed9b544e 4421}
4422
4423static void decrCommand(redisClient *c) {
a4d1ba9a 4424 incrDecrCommand(c,-1);
ed9b544e 4425}
4426
4427static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4428 long long incr;
4429
bd79a6bd 4430 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4431 incrDecrCommand(c,incr);
ed9b544e 4432}
4433
4434static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4435 long long incr;
4436
bd79a6bd 4437 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4438 incrDecrCommand(c,-incr);
ed9b544e 4439}
4440
4b00bebd 4441static void appendCommand(redisClient *c) {
4442 int retval;
4443 size_t totlen;
4444 robj *o;
4445
4446 o = lookupKeyWrite(c->db,c->argv[1]);
4447 if (o == NULL) {
4448 /* Create the key */
4449 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4450 incrRefCount(c->argv[1]);
4451 incrRefCount(c->argv[2]);
4452 totlen = stringObjectLen(c->argv[2]);
4453 } else {
4454 dictEntry *de;
e0a62c7f 4455
4b00bebd 4456 de = dictFind(c->db->dict,c->argv[1]);
4457 assert(de != NULL);
4458
4459 o = dictGetEntryVal(de);
4460 if (o->type != REDIS_STRING) {
4461 addReply(c,shared.wrongtypeerr);
4462 return;
4463 }
4464 /* If the object is specially encoded or shared we have to make
4465 * a copy */
4466 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4467 robj *decoded = getDecodedObject(o);
4468
4469 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4470 decrRefCount(decoded);
4471 dictReplace(c->db->dict,c->argv[1],o);
4472 }
4473 /* APPEND! */
4474 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4475 o->ptr = sdscatlen(o->ptr,
4476 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4477 } else {
4478 o->ptr = sdscatprintf(o->ptr, "%ld",
4479 (unsigned long) c->argv[2]->ptr);
4480 }
4481 totlen = sdslen(o->ptr);
4482 }
4483 server.dirty++;
4484 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4485}
4486
39191553 4487static void substrCommand(redisClient *c) {
4488 robj *o;
4489 long start = atoi(c->argv[2]->ptr);
4490 long end = atoi(c->argv[3]->ptr);
dd88747b 4491 size_t rangelen, strlen;
4492 sds range;
39191553 4493
dd88747b 4494 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4495 checkType(c,o,REDIS_STRING)) return;
39191553 4496
dd88747b 4497 o = getDecodedObject(o);
4498 strlen = sdslen(o->ptr);
8fe7fad7 4499
dd88747b 4500 /* convert negative indexes */
4501 if (start < 0) start = strlen+start;
4502 if (end < 0) end = strlen+end;
4503 if (start < 0) start = 0;
4504 if (end < 0) end = 0;
39191553 4505
dd88747b 4506 /* indexes sanity checks */
4507 if (start > end || (size_t)start >= strlen) {
4508 /* Out of range start or start > end result in null reply */
4509 addReply(c,shared.nullbulk);
4510 decrRefCount(o);
4511 return;
39191553 4512 }
dd88747b 4513 if ((size_t)end >= strlen) end = strlen-1;
4514 rangelen = (end-start)+1;
4515
4516 /* Return the result */
4517 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4518 range = sdsnewlen((char*)o->ptr+start,rangelen);
4519 addReplySds(c,range);
4520 addReply(c,shared.crlf);
4521 decrRefCount(o);
39191553 4522}
4523
ed9b544e 4524/* ========================= Type agnostic commands ========================= */
4525
4526static void delCommand(redisClient *c) {
5109cdff 4527 int deleted = 0, j;
4528
4529 for (j = 1; j < c->argc; j++) {
4530 if (deleteKey(c->db,c->argv[j])) {
37ab76c9 4531 touchWatchedKey(c->db,c->argv[j]);
5109cdff 4532 server.dirty++;
4533 deleted++;
4534 }
4535 }
482b672d 4536 addReplyLongLong(c,deleted);
ed9b544e 4537}
4538
4539static void existsCommand(redisClient *c) {
f4f06efc
PN
4540 expireIfNeeded(c->db,c->argv[1]);
4541 if (dictFind(c->db->dict,c->argv[1])) {
4542 addReply(c, shared.cone);
4543 } else {
4544 addReply(c, shared.czero);
4545 }
ed9b544e 4546}
4547
4548static void selectCommand(redisClient *c) {
4549 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4550
ed9b544e 4551 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4552 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4553 } else {
4554 addReply(c,shared.ok);
4555 }
4556}
4557
4558static void randomkeyCommand(redisClient *c) {
4559 dictEntry *de;
dc4be23e 4560 robj *key;
e0a62c7f 4561
3305306f 4562 while(1) {
4563 de = dictGetRandomKey(c->db->dict);
ce7bef07 4564 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4565 }
2b619329 4566
ed9b544e 4567 if (de == NULL) {
dc4be23e 4568 addReply(c,shared.nullbulk);
4569 return;
4570 }
4571
4572 key = dictGetEntryKey(de);
4573 if (server.vm_enabled) {
4574 key = dupStringObject(key);
4575 addReplyBulk(c,key);
4576 decrRefCount(key);
ed9b544e 4577 } else {
dc4be23e 4578 addReplyBulk(c,key);
ed9b544e 4579 }
4580}
4581
4582static void keysCommand(redisClient *c) {
4583 dictIterator *di;
4584 dictEntry *de;
4585 sds pattern = c->argv[1]->ptr;
4586 int plen = sdslen(pattern);
a3f9eec2 4587 unsigned long numkeys = 0;
ed9b544e 4588 robj *lenobj = createObject(REDIS_STRING,NULL);
4589
3305306f 4590 di = dictGetIterator(c->db->dict);
ed9b544e 4591 addReply(c,lenobj);
4592 decrRefCount(lenobj);
4593 while((de = dictNext(di)) != NULL) {
4594 robj *keyobj = dictGetEntryKey(de);
3305306f 4595
ed9b544e 4596 sds key = keyobj->ptr;
4597 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4598 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4599 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4600 addReplyBulk(c,keyobj);
3305306f 4601 numkeys++;
3305306f 4602 }
ed9b544e 4603 }
4604 }
4605 dictReleaseIterator(di);
a3f9eec2 4606 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4607}
4608
4609static void dbsizeCommand(redisClient *c) {
4610 addReplySds(c,
3305306f 4611 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4612}
4613
4614static void lastsaveCommand(redisClient *c) {
4615 addReplySds(c,
c937aa89 4616 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4617}
4618
4619static void typeCommand(redisClient *c) {
3305306f 4620 robj *o;
ed9b544e 4621 char *type;
3305306f 4622
4623 o = lookupKeyRead(c->db,c->argv[1]);
4624 if (o == NULL) {
c937aa89 4625 type = "+none";
ed9b544e 4626 } else {
ed9b544e 4627 switch(o->type) {
c937aa89 4628 case REDIS_STRING: type = "+string"; break;
4629 case REDIS_LIST: type = "+list"; break;
4630 case REDIS_SET: type = "+set"; break;
412a8bce 4631 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4632 case REDIS_HASH: type = "+hash"; break;
4633 default: type = "+unknown"; break;
ed9b544e 4634 }
4635 }
4636 addReplySds(c,sdsnew(type));
4637 addReply(c,shared.crlf);
4638}
4639
4640static void saveCommand(redisClient *c) {
9d65a1bb 4641 if (server.bgsavechildpid != -1) {
05557f6d 4642 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4643 return;
4644 }
f78fd11b 4645 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4646 addReply(c,shared.ok);
4647 } else {
4648 addReply(c,shared.err);
4649 }
4650}
4651
4652static void bgsaveCommand(redisClient *c) {
9d65a1bb 4653 if (server.bgsavechildpid != -1) {
ed9b544e 4654 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4655 return;
4656 }
f78fd11b 4657 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4658 char *status = "+Background saving started\r\n";
4659 addReplySds(c,sdsnew(status));
ed9b544e 4660 } else {
4661 addReply(c,shared.err);
4662 }
4663}
4664
4665static void shutdownCommand(redisClient *c) {
fab43727 4666 if (prepareForShutdown() == REDIS_OK)
4667 exit(0);
4668 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
ed9b544e 4669}
4670
4671static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4672 robj *o;
4673
4674 /* To use the same key as src and dst is probably an error */
4675 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4676 addReply(c,shared.sameobjecterr);
ed9b544e 4677 return;
4678 }
4679
dd88747b 4680 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4681 return;
dd88747b 4682
ed9b544e 4683 incrRefCount(o);
3305306f 4684 deleteIfVolatile(c->db,c->argv[2]);
4685 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4686 if (nx) {
4687 decrRefCount(o);
c937aa89 4688 addReply(c,shared.czero);
ed9b544e 4689 return;
4690 }
3305306f 4691 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4692 } else {
4693 incrRefCount(c->argv[2]);
4694 }
3305306f 4695 deleteKey(c->db,c->argv[1]);
b167f877 4696 touchWatchedKey(c->db,c->argv[2]);
ed9b544e 4697 server.dirty++;
c937aa89 4698 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4699}
4700
4701static void renameCommand(redisClient *c) {
4702 renameGenericCommand(c,0);
4703}
4704
4705static void renamenxCommand(redisClient *c) {
4706 renameGenericCommand(c,1);
4707}
4708
4709static void moveCommand(redisClient *c) {
3305306f 4710 robj *o;
4711 redisDb *src, *dst;
ed9b544e 4712 int srcid;
4713
4714 /* Obtain source and target DB pointers */
3305306f 4715 src = c->db;
4716 srcid = c->db->id;
ed9b544e 4717 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4718 addReply(c,shared.outofrangeerr);
ed9b544e 4719 return;
4720 }
3305306f 4721 dst = c->db;
4722 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4723
4724 /* If the user is moving using as target the same
4725 * DB as the source DB it is probably an error. */
4726 if (src == dst) {
c937aa89 4727 addReply(c,shared.sameobjecterr);
ed9b544e 4728 return;
4729 }
4730
4731 /* Check if the element exists and get a reference */
3305306f 4732 o = lookupKeyWrite(c->db,c->argv[1]);
4733 if (!o) {
c937aa89 4734 addReply(c,shared.czero);
ed9b544e 4735 return;
4736 }
4737
4738 /* Try to add the element to the target DB */
3305306f 4739 deleteIfVolatile(dst,c->argv[1]);
4740 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4741 addReply(c,shared.czero);
ed9b544e 4742 return;
4743 }
3305306f 4744 incrRefCount(c->argv[1]);
ed9b544e 4745 incrRefCount(o);
4746
4747 /* OK! key moved, free the entry in the source DB */
3305306f 4748 deleteKey(src,c->argv[1]);
ed9b544e 4749 server.dirty++;
c937aa89 4750 addReply(c,shared.cone);
ed9b544e 4751}
4752
4753/* =================================== Lists ================================ */
4754static void pushGenericCommand(redisClient *c, int where) {
4755 robj *lobj;
ed9b544e 4756 list *list;
3305306f 4757
4758 lobj = lookupKeyWrite(c->db,c->argv[1]);
4759 if (lobj == NULL) {
95242ab5 4760 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4761 addReply(c,shared.cone);
95242ab5 4762 return;
4763 }
ed9b544e 4764 lobj = createListObject();
4765 list = lobj->ptr;
4766 if (where == REDIS_HEAD) {
6b47e12e 4767 listAddNodeHead(list,c->argv[2]);
ed9b544e 4768 } else {
6b47e12e 4769 listAddNodeTail(list,c->argv[2]);
ed9b544e 4770 }
3305306f 4771 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4772 incrRefCount(c->argv[1]);
4773 incrRefCount(c->argv[2]);
4774 } else {
ed9b544e 4775 if (lobj->type != REDIS_LIST) {
4776 addReply(c,shared.wrongtypeerr);
4777 return;
4778 }
95242ab5 4779 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4780 addReply(c,shared.cone);
95242ab5 4781 return;
4782 }
ed9b544e 4783 list = lobj->ptr;
4784 if (where == REDIS_HEAD) {
6b47e12e 4785 listAddNodeHead(list,c->argv[2]);
ed9b544e 4786 } else {
6b47e12e 4787 listAddNodeTail(list,c->argv[2]);
ed9b544e 4788 }
4789 incrRefCount(c->argv[2]);
4790 }
4791 server.dirty++;
482b672d 4792 addReplyLongLong(c,listLength(list));
ed9b544e 4793}
4794
4795static void lpushCommand(redisClient *c) {
4796 pushGenericCommand(c,REDIS_HEAD);
4797}
4798
4799static void rpushCommand(redisClient *c) {
4800 pushGenericCommand(c,REDIS_TAIL);
4801}
4802
4803static void llenCommand(redisClient *c) {
3305306f 4804 robj *o;
ed9b544e 4805 list *l;
dd88747b 4806
4807 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4808 checkType(c,o,REDIS_LIST)) return;
e0a62c7f 4809
dd88747b 4810 l = o->ptr;
4811 addReplyUlong(c,listLength(l));
ed9b544e 4812}
4813
4814static void lindexCommand(redisClient *c) {
3305306f 4815 robj *o;
ed9b544e 4816 int index = atoi(c->argv[2]->ptr);
dd88747b 4817 list *list;
4818 listNode *ln;
4819
4820 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4821 checkType(c,o,REDIS_LIST)) return;
4822 list = o->ptr;
4823
4824 ln = listIndex(list, index);
4825 if (ln == NULL) {
c937aa89 4826 addReply(c,shared.nullbulk);
ed9b544e 4827 } else {
dd88747b 4828 robj *ele = listNodeValue(ln);
4829 addReplyBulk(c,ele);
ed9b544e 4830 }
4831}
4832
4833static void lsetCommand(redisClient *c) {
3305306f 4834 robj *o;
ed9b544e 4835 int index = atoi(c->argv[2]->ptr);
dd88747b 4836 list *list;
4837 listNode *ln;
4838
4839 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4840 checkType(c,o,REDIS_LIST)) return;
4841 list = o->ptr;
4842
4843 ln = listIndex(list, index);
4844 if (ln == NULL) {
4845 addReply(c,shared.outofrangeerr);
ed9b544e 4846 } else {
dd88747b 4847 robj *ele = listNodeValue(ln);
ed9b544e 4848
dd88747b 4849 decrRefCount(ele);
4850 listNodeValue(ln) = c->argv[3];
4851 incrRefCount(c->argv[3]);
4852 addReply(c,shared.ok);
4853 server.dirty++;
ed9b544e 4854 }
4855}
4856
4857static void popGenericCommand(redisClient *c, int where) {
3305306f 4858 robj *o;
dd88747b 4859 list *list;
4860 listNode *ln;
3305306f 4861
dd88747b 4862 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4863 checkType(c,o,REDIS_LIST)) return;
4864 list = o->ptr;
ed9b544e 4865
dd88747b 4866 if (where == REDIS_HEAD)
4867 ln = listFirst(list);
4868 else
4869 ln = listLast(list);
ed9b544e 4870
dd88747b 4871 if (ln == NULL) {
4872 addReply(c,shared.nullbulk);
4873 } else {
4874 robj *ele = listNodeValue(ln);
4875 addReplyBulk(c,ele);
4876 listDelNode(list,ln);
3ea27d37 4877 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4878 server.dirty++;
ed9b544e 4879 }
4880}
4881
4882static void lpopCommand(redisClient *c) {
4883 popGenericCommand(c,REDIS_HEAD);
4884}
4885
4886static void rpopCommand(redisClient *c) {
4887 popGenericCommand(c,REDIS_TAIL);
4888}
4889
4890static void lrangeCommand(redisClient *c) {
3305306f 4891 robj *o;
ed9b544e 4892 int start = atoi(c->argv[2]->ptr);
4893 int end = atoi(c->argv[3]->ptr);
dd88747b 4894 int llen;
4895 int rangelen, j;
4896 list *list;
4897 listNode *ln;
4898 robj *ele;
4899
4e27f268 4900 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4901 || checkType(c,o,REDIS_LIST)) return;
dd88747b 4902 list = o->ptr;
4903 llen = listLength(list);
4904
4905 /* convert negative indexes */
4906 if (start < 0) start = llen+start;
4907 if (end < 0) end = llen+end;
4908 if (start < 0) start = 0;
4909 if (end < 0) end = 0;
4910
4911 /* indexes sanity checks */
4912 if (start > end || start >= llen) {
4913 /* Out of range start or start > end result in empty list */
4914 addReply(c,shared.emptymultibulk);
4915 return;
4916 }
4917 if (end >= llen) end = llen-1;
4918 rangelen = (end-start)+1;
3305306f 4919
dd88747b 4920 /* Return the result in form of a multi-bulk reply */
4921 ln = listIndex(list, start);
4922 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4923 for (j = 0; j < rangelen; j++) {
4924 ele = listNodeValue(ln);
4925 addReplyBulk(c,ele);
4926 ln = ln->next;
ed9b544e 4927 }
4928}
4929
4930static void ltrimCommand(redisClient *c) {
3305306f 4931 robj *o;
ed9b544e 4932 int start = atoi(c->argv[2]->ptr);
4933 int end = atoi(c->argv[3]->ptr);
dd88747b 4934 int llen;
4935 int j, ltrim, rtrim;
4936 list *list;
4937 listNode *ln;
4938
4939 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4940 checkType(c,o,REDIS_LIST)) return;
4941 list = o->ptr;
4942 llen = listLength(list);
4943
4944 /* convert negative indexes */
4945 if (start < 0) start = llen+start;
4946 if (end < 0) end = llen+end;
4947 if (start < 0) start = 0;
4948 if (end < 0) end = 0;
4949
4950 /* indexes sanity checks */
4951 if (start > end || start >= llen) {
4952 /* Out of range start or start > end result in empty list */
4953 ltrim = llen;
4954 rtrim = 0;
ed9b544e 4955 } else {
dd88747b 4956 if (end >= llen) end = llen-1;
4957 ltrim = start;
4958 rtrim = llen-end-1;
4959 }
ed9b544e 4960
dd88747b 4961 /* Remove list elements to perform the trim */
4962 for (j = 0; j < ltrim; j++) {
4963 ln = listFirst(list);
4964 listDelNode(list,ln);
4965 }
4966 for (j = 0; j < rtrim; j++) {
4967 ln = listLast(list);
4968 listDelNode(list,ln);
ed9b544e 4969 }
3ea27d37 4970 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4971 server.dirty++;
4972 addReply(c,shared.ok);
ed9b544e 4973}
4974
4975static void lremCommand(redisClient *c) {
3305306f 4976 robj *o;
dd88747b 4977 list *list;
4978 listNode *ln, *next;
4979 int toremove = atoi(c->argv[2]->ptr);
4980 int removed = 0;
4981 int fromtail = 0;
a4d1ba9a 4982
dd88747b 4983 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4984 checkType(c,o,REDIS_LIST)) return;
4985 list = o->ptr;
4986
4987 if (toremove < 0) {
4988 toremove = -toremove;
4989 fromtail = 1;
4990 }
4991 ln = fromtail ? list->tail : list->head;
4992 while (ln) {
4993 robj *ele = listNodeValue(ln);
4994
4995 next = fromtail ? ln->prev : ln->next;
bf028098 4996 if (equalStringObjects(ele,c->argv[3])) {
dd88747b 4997 listDelNode(list,ln);
4998 server.dirty++;
4999 removed++;
5000 if (toremove && removed == toremove) break;
ed9b544e 5001 }
dd88747b 5002 ln = next;
ed9b544e 5003 }
3ea27d37 5004 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5005 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 5006}
5007
12f9d551 5008/* This is the semantic of this command:
0f5f7e9a 5009 * RPOPLPUSH srclist dstlist:
12f9d551 5010 * IF LLEN(srclist) > 0
5011 * element = RPOP srclist
5012 * LPUSH dstlist element
5013 * RETURN element
5014 * ELSE
5015 * RETURN nil
5016 * END
5017 * END
5018 *
5019 * The idea is to be able to get an element from a list in a reliable way
5020 * since the element is not just returned but pushed against another list
5021 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5022 */
0f5f7e9a 5023static void rpoplpushcommand(redisClient *c) {
12f9d551 5024 robj *sobj;
dd88747b 5025 list *srclist;
5026 listNode *ln;
5027
5028 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5029 checkType(c,sobj,REDIS_LIST)) return;
5030 srclist = sobj->ptr;
5031 ln = listLast(srclist);
12f9d551 5032
dd88747b 5033 if (ln == NULL) {
12f9d551 5034 addReply(c,shared.nullbulk);
5035 } else {
dd88747b 5036 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5037 robj *ele = listNodeValue(ln);
5038 list *dstlist;
e20fb74f 5039
dd88747b 5040 if (dobj && dobj->type != REDIS_LIST) {
5041 addReply(c,shared.wrongtypeerr);
5042 return;
5043 }
12f9d551 5044
dd88747b 5045 /* Add the element to the target list (unless it's directly
5046 * passed to some BLPOP-ing client */
5047 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
5048 if (dobj == NULL) {
5049 /* Create the list if the key does not exist */
5050 dobj = createListObject();
5051 dictAdd(c->db->dict,c->argv[2],dobj);
5052 incrRefCount(c->argv[2]);
12f9d551 5053 }
dd88747b 5054 dstlist = dobj->ptr;
5055 listAddNodeHead(dstlist,ele);
5056 incrRefCount(ele);
12f9d551 5057 }
dd88747b 5058
5059 /* Send the element to the client as reply as well */
5060 addReplyBulk(c,ele);
5061
5062 /* Finally remove the element from the source list */
5063 listDelNode(srclist,ln);
3ea27d37 5064 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5065 server.dirty++;
12f9d551 5066 }
5067}
5068
ed9b544e 5069/* ==================================== Sets ================================ */
5070
5071static void saddCommand(redisClient *c) {
ed9b544e 5072 robj *set;
5073
3305306f 5074 set = lookupKeyWrite(c->db,c->argv[1]);
5075 if (set == NULL) {
ed9b544e 5076 set = createSetObject();
3305306f 5077 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 5078 incrRefCount(c->argv[1]);
5079 } else {
ed9b544e 5080 if (set->type != REDIS_SET) {
c937aa89 5081 addReply(c,shared.wrongtypeerr);
ed9b544e 5082 return;
5083 }
5084 }
5085 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5086 incrRefCount(c->argv[2]);
5087 server.dirty++;
c937aa89 5088 addReply(c,shared.cone);
ed9b544e 5089 } else {
c937aa89 5090 addReply(c,shared.czero);
ed9b544e 5091 }
5092}
5093
5094static void sremCommand(redisClient *c) {
3305306f 5095 robj *set;
ed9b544e 5096
dd88747b 5097 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5098 checkType(c,set,REDIS_SET)) return;
5099
5100 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5101 server.dirty++;
5102 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5103 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5104 addReply(c,shared.cone);
ed9b544e 5105 } else {
dd88747b 5106 addReply(c,shared.czero);
ed9b544e 5107 }
5108}
5109
a4460ef4 5110static void smoveCommand(redisClient *c) {
5111 robj *srcset, *dstset;
5112
5113 srcset = lookupKeyWrite(c->db,c->argv[1]);
5114 dstset = lookupKeyWrite(c->db,c->argv[2]);
5115
5116 /* If the source key does not exist return 0, if it's of the wrong type
5117 * raise an error */
5118 if (srcset == NULL || srcset->type != REDIS_SET) {
5119 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5120 return;
5121 }
5122 /* Error if the destination key is not a set as well */
5123 if (dstset && dstset->type != REDIS_SET) {
5124 addReply(c,shared.wrongtypeerr);
5125 return;
5126 }
5127 /* Remove the element from the source set */
5128 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5129 /* Key not found in the src set! return zero */
5130 addReply(c,shared.czero);
5131 return;
5132 }
3ea27d37 5133 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5134 deleteKey(c->db,c->argv[1]);
a4460ef4 5135 server.dirty++;
5136 /* Add the element to the destination set */
5137 if (!dstset) {
5138 dstset = createSetObject();
5139 dictAdd(c->db->dict,c->argv[2],dstset);
5140 incrRefCount(c->argv[2]);
5141 }
5142 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5143 incrRefCount(c->argv[3]);
5144 addReply(c,shared.cone);
5145}
5146
ed9b544e 5147static void sismemberCommand(redisClient *c) {
3305306f 5148 robj *set;
ed9b544e 5149
dd88747b 5150 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5151 checkType(c,set,REDIS_SET)) return;
5152
5153 if (dictFind(set->ptr,c->argv[2]))
5154 addReply(c,shared.cone);
5155 else
c937aa89 5156 addReply(c,shared.czero);
ed9b544e 5157}
5158
5159static void scardCommand(redisClient *c) {
3305306f 5160 robj *o;
ed9b544e 5161 dict *s;
dd88747b 5162
5163 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5164 checkType(c,o,REDIS_SET)) return;
e0a62c7f 5165
dd88747b 5166 s = o->ptr;
5167 addReplyUlong(c,dictSize(s));
ed9b544e 5168}
5169
12fea928 5170static void spopCommand(redisClient *c) {
5171 robj *set;
5172 dictEntry *de;
5173
dd88747b 5174 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5175 checkType(c,set,REDIS_SET)) return;
5176
5177 de = dictGetRandomKey(set->ptr);
5178 if (de == NULL) {
12fea928 5179 addReply(c,shared.nullbulk);
5180 } else {
dd88747b 5181 robj *ele = dictGetEntryKey(de);
12fea928 5182
dd88747b 5183 addReplyBulk(c,ele);
5184 dictDelete(set->ptr,ele);
5185 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5186 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5187 server.dirty++;
12fea928 5188 }
5189}
5190
2abb95a9 5191static void srandmemberCommand(redisClient *c) {
5192 robj *set;
5193 dictEntry *de;
5194
dd88747b 5195 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5196 checkType(c,set,REDIS_SET)) return;
5197
5198 de = dictGetRandomKey(set->ptr);
5199 if (de == NULL) {
2abb95a9 5200 addReply(c,shared.nullbulk);
5201 } else {
dd88747b 5202 robj *ele = dictGetEntryKey(de);
2abb95a9 5203
dd88747b 5204 addReplyBulk(c,ele);
2abb95a9 5205 }
5206}
5207
ed9b544e 5208static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5209 dict **d1 = (void*) s1, **d2 = (void*) s2;
5210
3305306f 5211 return dictSize(*d1)-dictSize(*d2);
ed9b544e 5212}
5213
682ac724 5214static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 5215 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5216 dictIterator *di;
5217 dictEntry *de;
5218 robj *lenobj = NULL, *dstset = NULL;
682ac724 5219 unsigned long j, cardinality = 0;
ed9b544e 5220
ed9b544e 5221 for (j = 0; j < setsnum; j++) {
5222 robj *setobj;
3305306f 5223
5224 setobj = dstkey ?
5225 lookupKeyWrite(c->db,setskeys[j]) :
5226 lookupKeyRead(c->db,setskeys[j]);
5227 if (!setobj) {
ed9b544e 5228 zfree(dv);
5faa6025 5229 if (dstkey) {
fdcaae84 5230 if (deleteKey(c->db,dstkey))
5231 server.dirty++;
0d36ded0 5232 addReply(c,shared.czero);
5faa6025 5233 } else {
4e27f268 5234 addReply(c,shared.emptymultibulk);
5faa6025 5235 }
ed9b544e 5236 return;
5237 }
ed9b544e 5238 if (setobj->type != REDIS_SET) {
5239 zfree(dv);
c937aa89 5240 addReply(c,shared.wrongtypeerr);
ed9b544e 5241 return;
5242 }
5243 dv[j] = setobj->ptr;
5244 }
5245 /* Sort sets from the smallest to largest, this will improve our
5246 * algorithm's performace */
5247 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5248
5249 /* The first thing we should output is the total number of elements...
5250 * since this is a multi-bulk write, but at this stage we don't know
5251 * the intersection set size, so we use a trick, append an empty object
5252 * to the output list and save the pointer to later modify it with the
5253 * right length */
5254 if (!dstkey) {
5255 lenobj = createObject(REDIS_STRING,NULL);
5256 addReply(c,lenobj);
5257 decrRefCount(lenobj);
5258 } else {
5259 /* If we have a target key where to store the resulting set
5260 * create this key with an empty set inside */
5261 dstset = createSetObject();
ed9b544e 5262 }
5263
5264 /* Iterate all the elements of the first (smallest) set, and test
5265 * the element against all the other sets, if at least one set does
5266 * not include the element it is discarded */
5267 di = dictGetIterator(dv[0]);
ed9b544e 5268
5269 while((de = dictNext(di)) != NULL) {
5270 robj *ele;
5271
5272 for (j = 1; j < setsnum; j++)
5273 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5274 if (j != setsnum)
5275 continue; /* at least one set does not contain the member */
5276 ele = dictGetEntryKey(de);
5277 if (!dstkey) {
dd88747b 5278 addReplyBulk(c,ele);
ed9b544e 5279 cardinality++;
5280 } else {
5281 dictAdd(dstset->ptr,ele,NULL);
5282 incrRefCount(ele);
5283 }
5284 }
5285 dictReleaseIterator(di);
5286
83cdfe18 5287 if (dstkey) {
3ea27d37 5288 /* Store the resulting set into the target, if the intersection
5289 * is not an empty set. */
83cdfe18 5290 deleteKey(c->db,dstkey);
3ea27d37 5291 if (dictSize((dict*)dstset->ptr) > 0) {
5292 dictAdd(c->db->dict,dstkey,dstset);
5293 incrRefCount(dstkey);
482b672d 5294 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5295 } else {
5296 decrRefCount(dstset);
d36c4e97 5297 addReply(c,shared.czero);
3ea27d37 5298 }
40d224a9 5299 server.dirty++;
d36c4e97 5300 } else {
5301 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 5302 }
ed9b544e 5303 zfree(dv);
5304}
5305
5306static void sinterCommand(redisClient *c) {
5307 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5308}
5309
5310static void sinterstoreCommand(redisClient *c) {
5311 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5312}
5313
f4f56e1d 5314#define REDIS_OP_UNION 0
5315#define REDIS_OP_DIFF 1
2830ca53 5316#define REDIS_OP_INTER 2
f4f56e1d 5317
5318static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 5319 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5320 dictIterator *di;
5321 dictEntry *de;
f4f56e1d 5322 robj *dstset = NULL;
40d224a9 5323 int j, cardinality = 0;
5324
40d224a9 5325 for (j = 0; j < setsnum; j++) {
5326 robj *setobj;
5327
5328 setobj = dstkey ?
5329 lookupKeyWrite(c->db,setskeys[j]) :
5330 lookupKeyRead(c->db,setskeys[j]);
5331 if (!setobj) {
5332 dv[j] = NULL;
5333 continue;
5334 }
5335 if (setobj->type != REDIS_SET) {
5336 zfree(dv);
5337 addReply(c,shared.wrongtypeerr);
5338 return;
5339 }
5340 dv[j] = setobj->ptr;
5341 }
5342
5343 /* We need a temp set object to store our union. If the dstkey
5344 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5345 * this set object will be the resulting object to set into the target key*/
5346 dstset = createSetObject();
5347
40d224a9 5348 /* Iterate all the elements of all the sets, add every element a single
5349 * time to the result set */
5350 for (j = 0; j < setsnum; j++) {
51829ed3 5351 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 5352 if (!dv[j]) continue; /* non existing keys are like empty sets */
5353
5354 di = dictGetIterator(dv[j]);
40d224a9 5355
5356 while((de = dictNext(di)) != NULL) {
5357 robj *ele;
5358
5359 /* dictAdd will not add the same element multiple times */
5360 ele = dictGetEntryKey(de);
f4f56e1d 5361 if (op == REDIS_OP_UNION || j == 0) {
5362 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5363 incrRefCount(ele);
40d224a9 5364 cardinality++;
5365 }
f4f56e1d 5366 } else if (op == REDIS_OP_DIFF) {
5367 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5368 cardinality--;
5369 }
40d224a9 5370 }
5371 }
5372 dictReleaseIterator(di);
51829ed3 5373
d36c4e97 5374 /* result set is empty? Exit asap. */
5375 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5376 }
5377
f4f56e1d 5378 /* Output the content of the resulting set, if not in STORE mode */
5379 if (!dstkey) {
5380 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5381 di = dictGetIterator(dstset->ptr);
f4f56e1d 5382 while((de = dictNext(di)) != NULL) {
5383 robj *ele;
5384
5385 ele = dictGetEntryKey(de);
dd88747b 5386 addReplyBulk(c,ele);
f4f56e1d 5387 }
5388 dictReleaseIterator(di);
d36c4e97 5389 decrRefCount(dstset);
83cdfe18
AG
5390 } else {
5391 /* If we have a target key where to store the resulting set
5392 * create this key with the result set inside */
5393 deleteKey(c->db,dstkey);
3ea27d37 5394 if (dictSize((dict*)dstset->ptr) > 0) {
5395 dictAdd(c->db->dict,dstkey,dstset);
5396 incrRefCount(dstkey);
482b672d 5397 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5398 } else {
5399 decrRefCount(dstset);
d36c4e97 5400 addReply(c,shared.czero);
3ea27d37 5401 }
40d224a9 5402 server.dirty++;
5403 }
5404 zfree(dv);
5405}
5406
5407static void sunionCommand(redisClient *c) {
f4f56e1d 5408 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 5409}
5410
5411static void sunionstoreCommand(redisClient *c) {
f4f56e1d 5412 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5413}
5414
5415static void sdiffCommand(redisClient *c) {
5416 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5417}
5418
5419static void sdiffstoreCommand(redisClient *c) {
5420 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 5421}
5422
6b47e12e 5423/* ==================================== ZSets =============================== */
5424
5425/* ZSETs are ordered sets using two data structures to hold the same elements
5426 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5427 * data structure.
5428 *
5429 * The elements are added to a hash table mapping Redis objects to scores.
5430 * At the same time the elements are added to a skip list mapping scores
5431 * to Redis objects (so objects are sorted by scores in this "view"). */
5432
5433/* This skiplist implementation is almost a C translation of the original
5434 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5435 * Alternative to Balanced Trees", modified in three ways:
5436 * a) this implementation allows for repeated values.
5437 * b) the comparison is not just by key (our 'score') but by satellite data.
5438 * c) there is a back pointer, so it's a doubly linked list with the back
5439 * pointers being only at "level 1". This allows to traverse the list
5440 * from tail to head, useful for ZREVRANGE. */
5441
5442static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5443 zskiplistNode *zn = zmalloc(sizeof(*zn));
5444
5445 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2f4dd7e0 5446 if (level > 1)
2b37892e 5447 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
2f4dd7e0 5448 else
5449 zn->span = NULL;
6b47e12e 5450 zn->score = score;
5451 zn->obj = obj;
5452 return zn;
5453}
5454
5455static zskiplist *zslCreate(void) {
5456 int j;
5457 zskiplist *zsl;
e0a62c7f 5458
6b47e12e 5459 zsl = zmalloc(sizeof(*zsl));
5460 zsl->level = 1;
cc812361 5461 zsl->length = 0;
6b47e12e 5462 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5463 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5464 zsl->header->forward[j] = NULL;
94e543b5 5465
5466 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5467 if (j < ZSKIPLIST_MAXLEVEL-1)
5468 zsl->header->span[j] = 0;
69d95c3e 5469 }
e3870fab 5470 zsl->header->backward = NULL;
5471 zsl->tail = NULL;
6b47e12e 5472 return zsl;
5473}
5474
fd8ccf44 5475static void zslFreeNode(zskiplistNode *node) {
5476 decrRefCount(node->obj);
ad807e6f 5477 zfree(node->forward);
69d95c3e 5478 zfree(node->span);
fd8ccf44 5479 zfree(node);
5480}
5481
5482static void zslFree(zskiplist *zsl) {
ad807e6f 5483 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5484
ad807e6f 5485 zfree(zsl->header->forward);
69d95c3e 5486 zfree(zsl->header->span);
ad807e6f 5487 zfree(zsl->header);
fd8ccf44 5488 while(node) {
599379dd 5489 next = node->forward[0];
fd8ccf44 5490 zslFreeNode(node);
5491 node = next;
5492 }
ad807e6f 5493 zfree(zsl);
fd8ccf44 5494}
5495
6b47e12e 5496static int zslRandomLevel(void) {
5497 int level = 1;
5498 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5499 level += 1;
10c2baa5 5500 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5501}
5502
5503static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5504 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5505 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5506 int i, level;
5507
5508 x = zsl->header;
5509 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5510 /* store rank that is crossed to reach the insert position */
5511 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5512
9d60e6e4 5513 while (x->forward[i] &&
5514 (x->forward[i]->score < score ||
5515 (x->forward[i]->score == score &&
69d95c3e 5516 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5517 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5518 x = x->forward[i];
69d95c3e 5519 }
6b47e12e 5520 update[i] = x;
5521 }
6b47e12e 5522 /* we assume the key is not already inside, since we allow duplicated
5523 * scores, and the re-insertion of score and redis object should never
5524 * happpen since the caller of zslInsert() should test in the hash table
5525 * if the element is already inside or not. */
5526 level = zslRandomLevel();
5527 if (level > zsl->level) {
69d95c3e 5528 for (i = zsl->level; i < level; i++) {
2b37892e 5529 rank[i] = 0;
6b47e12e 5530 update[i] = zsl->header;
2b37892e 5531 update[i]->span[i-1] = zsl->length;
69d95c3e 5532 }
6b47e12e 5533 zsl->level = level;
5534 }
5535 x = zslCreateNode(level,score,obj);
5536 for (i = 0; i < level; i++) {
5537 x->forward[i] = update[i]->forward[i];
5538 update[i]->forward[i] = x;
69d95c3e
PN
5539
5540 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5541 if (i > 0) {
5542 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5543 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5544 }
6b47e12e 5545 }
69d95c3e
PN
5546
5547 /* increment span for untouched levels */
5548 for (i = level; i < zsl->level; i++) {
2b37892e 5549 update[i]->span[i-1]++;
69d95c3e
PN
5550 }
5551
bb975144 5552 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5553 if (x->forward[0])
5554 x->forward[0]->backward = x;
5555 else
5556 zsl->tail = x;
cc812361 5557 zsl->length++;
6b47e12e 5558}
5559
84105336
PN
5560/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5561void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5562 int i;
5563 for (i = 0; i < zsl->level; i++) {
5564 if (update[i]->forward[i] == x) {
5565 if (i > 0) {
5566 update[i]->span[i-1] += x->span[i-1] - 1;
5567 }
5568 update[i]->forward[i] = x->forward[i];
5569 } else {
5570 /* invariant: i > 0, because update[0]->forward[0]
5571 * is always equal to x */
5572 update[i]->span[i-1] -= 1;
5573 }
5574 }
5575 if (x->forward[0]) {
5576 x->forward[0]->backward = x->backward;
5577 } else {
5578 zsl->tail = x->backward;
5579 }
5580 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5581 zsl->level--;
5582 zsl->length--;
5583}
5584
50c55df5 5585/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5586static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5587 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5588 int i;
5589
5590 x = zsl->header;
5591 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5592 while (x->forward[i] &&
5593 (x->forward[i]->score < score ||
5594 (x->forward[i]->score == score &&
5595 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5596 x = x->forward[i];
5597 update[i] = x;
5598 }
5599 /* We may have multiple elements with the same score, what we need
5600 * is to find the element with both the right score and object. */
5601 x = x->forward[0];
bf028098 5602 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
84105336 5603 zslDeleteNode(zsl, x, update);
9d60e6e4 5604 zslFreeNode(x);
9d60e6e4 5605 return 1;
5606 } else {
5607 return 0; /* not found */
e197b441 5608 }
5609 return 0; /* not found */
fd8ccf44 5610}
5611
1807985b 5612/* Delete all the elements with score between min and max from the skiplist.
5613 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5614 * Note that this function takes the reference to the hash table view of the
5615 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5616static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5617 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5618 unsigned long removed = 0;
5619 int i;
5620
5621 x = zsl->header;
5622 for (i = zsl->level-1; i >= 0; i--) {
5623 while (x->forward[i] && x->forward[i]->score < min)
5624 x = x->forward[i];
5625 update[i] = x;
5626 }
5627 /* We may have multiple elements with the same score, what we need
5628 * is to find the element with both the right score and object. */
5629 x = x->forward[0];
5630 while (x && x->score <= max) {
84105336
PN
5631 zskiplistNode *next = x->forward[0];
5632 zslDeleteNode(zsl, x, update);
1807985b 5633 dictDelete(dict,x->obj);
5634 zslFreeNode(x);
1807985b 5635 removed++;
5636 x = next;
5637 }
5638 return removed; /* not found */
5639}
1807985b 5640
9212eafd 5641/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5642 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5643static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5644 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5645 unsigned long traversed = 0, removed = 0;
5646 int i;
5647
9212eafd
PN
5648 x = zsl->header;
5649 for (i = zsl->level-1; i >= 0; i--) {
5650 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5651 traversed += i > 0 ? x->span[i-1] : 1;
5652 x = x->forward[i];
1807985b 5653 }
9212eafd
PN
5654 update[i] = x;
5655 }
5656
5657 traversed++;
5658 x = x->forward[0];
5659 while (x && traversed <= end) {
84105336
PN
5660 zskiplistNode *next = x->forward[0];
5661 zslDeleteNode(zsl, x, update);
1807985b 5662 dictDelete(dict,x->obj);
5663 zslFreeNode(x);
1807985b 5664 removed++;
9212eafd 5665 traversed++;
1807985b 5666 x = next;
5667 }
9212eafd 5668 return removed;
1807985b 5669}
5670
50c55df5 5671/* Find the first node having a score equal or greater than the specified one.
5672 * Returns NULL if there is no match. */
5673static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5674 zskiplistNode *x;
5675 int i;
5676
5677 x = zsl->header;
5678 for (i = zsl->level-1; i >= 0; i--) {
5679 while (x->forward[i] && x->forward[i]->score < score)
5680 x = x->forward[i];
5681 }
5682 /* We may have multiple elements with the same score, what we need
5683 * is to find the element with both the right score and object. */
5684 return x->forward[0];
5685}
5686
27b0ccca
PN
5687/* Find the rank for an element by both score and key.
5688 * Returns 0 when the element cannot be found, rank otherwise.
5689 * Note that the rank is 1-based due to the span of zsl->header to the
5690 * first element. */
5691static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5692 zskiplistNode *x;
5693 unsigned long rank = 0;
5694 int i;
5695
5696 x = zsl->header;
5697 for (i = zsl->level-1; i >= 0; i--) {
5698 while (x->forward[i] &&
5699 (x->forward[i]->score < score ||
5700 (x->forward[i]->score == score &&
5701 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5702 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5703 x = x->forward[i];
5704 }
5705
5706 /* x might be equal to zsl->header, so test if obj is non-NULL */
bf028098 5707 if (x->obj && equalStringObjects(x->obj,o)) {
27b0ccca
PN
5708 return rank;
5709 }
5710 }
5711 return 0;
5712}
5713
e74825c2
PN
5714/* Finds an element by its rank. The rank argument needs to be 1-based. */
5715zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5716 zskiplistNode *x;
5717 unsigned long traversed = 0;
5718 int i;
5719
5720 x = zsl->header;
5721 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 5722 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5723 {
a50ea45c 5724 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5725 x = x->forward[i];
5726 }
e74825c2
PN
5727 if (traversed == rank) {
5728 return x;
5729 }
5730 }
5731 return NULL;
5732}
5733
fd8ccf44 5734/* The actual Z-commands implementations */
5735
7db723ad 5736/* This generic command implements both ZADD and ZINCRBY.
e2665397 5737 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5738 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5739static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5740 robj *zsetobj;
5741 zset *zs;
5742 double *score;
5743
e2665397 5744 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5745 if (zsetobj == NULL) {
5746 zsetobj = createZsetObject();
e2665397 5747 dictAdd(c->db->dict,key,zsetobj);
5748 incrRefCount(key);
fd8ccf44 5749 } else {
5750 if (zsetobj->type != REDIS_ZSET) {
5751 addReply(c,shared.wrongtypeerr);
5752 return;
5753 }
5754 }
fd8ccf44 5755 zs = zsetobj->ptr;
e2665397 5756
7db723ad 5757 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5758 * needs to handle the two different conditions. It's all about setting
5759 * '*score', that is, the new score to set, to the right value. */
5760 score = zmalloc(sizeof(double));
5761 if (doincrement) {
5762 dictEntry *de;
5763
5764 /* Read the old score. If the element was not present starts from 0 */
5765 de = dictFind(zs->dict,ele);
5766 if (de) {
5767 double *oldscore = dictGetEntryVal(de);
5768 *score = *oldscore + scoreval;
5769 } else {
5770 *score = scoreval;
5771 }
5772 } else {
5773 *score = scoreval;
5774 }
5775
5776 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5777 * to both ZADD and ZINCRBY... */
e2665397 5778 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5779 /* case 1: New element */
e2665397 5780 incrRefCount(ele); /* added to hash */
5781 zslInsert(zs->zsl,*score,ele);
5782 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5783 server.dirty++;
e2665397 5784 if (doincrement)
e2665397 5785 addReplyDouble(c,*score);
91d71bfc 5786 else
5787 addReply(c,shared.cone);
fd8ccf44 5788 } else {
5789 dictEntry *de;
5790 double *oldscore;
e0a62c7f 5791
fd8ccf44 5792 /* case 2: Score update operation */
e2665397 5793 de = dictFind(zs->dict,ele);
dfc5e96c 5794 redisAssert(de != NULL);
fd8ccf44 5795 oldscore = dictGetEntryVal(de);
5796 if (*score != *oldscore) {
5797 int deleted;
5798
e2665397 5799 /* Remove and insert the element in the skip list with new score */
5800 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5801 redisAssert(deleted != 0);
e2665397 5802 zslInsert(zs->zsl,*score,ele);
5803 incrRefCount(ele);
5804 /* Update the score in the hash table */
5805 dictReplace(zs->dict,ele,score);
fd8ccf44 5806 server.dirty++;
2161a965 5807 } else {
5808 zfree(score);
fd8ccf44 5809 }
e2665397 5810 if (doincrement)
5811 addReplyDouble(c,*score);
5812 else
5813 addReply(c,shared.czero);
fd8ccf44 5814 }
5815}
5816
e2665397 5817static void zaddCommand(redisClient *c) {
5818 double scoreval;
5819
bd79a6bd 5820 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5821 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5822}
5823
7db723ad 5824static void zincrbyCommand(redisClient *c) {
e2665397 5825 double scoreval;
5826
bd79a6bd 5827 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5828 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5829}
5830
1b7106e7 5831static void zremCommand(redisClient *c) {
5832 robj *zsetobj;
5833 zset *zs;
dd88747b 5834 dictEntry *de;
5835 double *oldscore;
5836 int deleted;
1b7106e7 5837
dd88747b 5838 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5839 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 5840
dd88747b 5841 zs = zsetobj->ptr;
5842 de = dictFind(zs->dict,c->argv[2]);
5843 if (de == NULL) {
5844 addReply(c,shared.czero);
5845 return;
1b7106e7 5846 }
dd88747b 5847 /* Delete from the skiplist */
5848 oldscore = dictGetEntryVal(de);
5849 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5850 redisAssert(deleted != 0);
5851
5852 /* Delete from the hash table */
5853 dictDelete(zs->dict,c->argv[2]);
5854 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5855 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5856 server.dirty++;
5857 addReply(c,shared.cone);
1b7106e7 5858}
5859
1807985b 5860static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
5861 double min;
5862 double max;
dd88747b 5863 long deleted;
1807985b 5864 robj *zsetobj;
5865 zset *zs;
5866
bd79a6bd
PN
5867 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5868 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
bbe025e0 5869
dd88747b 5870 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5871 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 5872
dd88747b 5873 zs = zsetobj->ptr;
5874 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5875 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5876 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5877 server.dirty += deleted;
482b672d 5878 addReplyLongLong(c,deleted);
1807985b 5879}
5880
9212eafd 5881static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
5882 long start;
5883 long end;
dd88747b 5884 int llen;
5885 long deleted;
9212eafd
PN
5886 robj *zsetobj;
5887 zset *zs;
5888
bd79a6bd
PN
5889 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5890 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 5891
dd88747b 5892 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5893 checkType(c,zsetobj,REDIS_ZSET)) return;
5894 zs = zsetobj->ptr;
5895 llen = zs->zsl->length;
9212eafd 5896
dd88747b 5897 /* convert negative indexes */
5898 if (start < 0) start = llen+start;
5899 if (end < 0) end = llen+end;
5900 if (start < 0) start = 0;
5901 if (end < 0) end = 0;
9212eafd 5902
dd88747b 5903 /* indexes sanity checks */
5904 if (start > end || start >= llen) {
5905 addReply(c,shared.czero);
5906 return;
9212eafd 5907 }
dd88747b 5908 if (end >= llen) end = llen-1;
5909
5910 /* increment start and end because zsl*Rank functions
5911 * use 1-based rank */
5912 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5913 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5914 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5915 server.dirty += deleted;
482b672d 5916 addReplyLongLong(c, deleted);
9212eafd
PN
5917}
5918
8f92e768
PN
5919typedef struct {
5920 dict *dict;
5921 double weight;
5922} zsetopsrc;
5923
5924static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5925 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5926 unsigned long size1, size2;
5927 size1 = d1->dict ? dictSize(d1->dict) : 0;
5928 size2 = d2->dict ? dictSize(d2->dict) : 0;
5929 return size1 - size2;
5930}
5931
d2764cd6
PN
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))

/* Fold 'val' into '*target' according to 'aggregate': REDIS_AGGR_SUM adds
 * it, REDIS_AGGR_MIN / REDIS_AGGR_MAX keep the smaller / larger of the two.
 * Any other aggregate value triggers redisPanic(). */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
    }
}
5949
2830ca53 5950static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
bc000c1d 5951 int i, j, setnum;
d2764cd6 5952 int aggregate = REDIS_AGGR_SUM;
8f92e768 5953 zsetopsrc *src;
2830ca53
PN
5954 robj *dstobj;
5955 zset *dstzset;
b287c9bb
PN
5956 dictIterator *di;
5957 dictEntry *de;
5958
bc000c1d
JC
5959 /* expect setnum input keys to be given */
5960 setnum = atoi(c->argv[2]->ptr);
5961 if (setnum < 1) {
5d373da9 5962 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
2830ca53 5963 return;
b287c9bb 5964 }
2830ca53
PN
5965
5966 /* test if the expected number of keys would overflow */
bc000c1d 5967 if (3+setnum > c->argc) {
b287c9bb
PN
5968 addReply(c,shared.syntaxerr);
5969 return;
5970 }
5971
2830ca53 5972 /* read keys to be used for input */
bc000c1d
JC
5973 src = zmalloc(sizeof(zsetopsrc) * setnum);
5974 for (i = 0, j = 3; i < setnum; i++, j++) {
5975 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
5976 if (!obj) {
8f92e768 5977 src[i].dict = NULL;
b287c9bb 5978 } else {
bc000c1d
JC
5979 if (obj->type == REDIS_ZSET) {
5980 src[i].dict = ((zset*)obj->ptr)->dict;
5981 } else if (obj->type == REDIS_SET) {
5982 src[i].dict = (obj->ptr);
5983 } else {
8f92e768 5984 zfree(src);
b287c9bb
PN
5985 addReply(c,shared.wrongtypeerr);
5986 return;
5987 }
b287c9bb 5988 }
2830ca53
PN
5989
5990 /* default all weights to 1 */
8f92e768 5991 src[i].weight = 1.0;
b287c9bb
PN
5992 }
5993
2830ca53
PN
5994 /* parse optional extra arguments */
5995 if (j < c->argc) {
d2764cd6 5996 int remaining = c->argc - j;
b287c9bb 5997
2830ca53 5998 while (remaining) {
bc000c1d 5999 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 6000 j++; remaining--;
bc000c1d 6001 for (i = 0; i < setnum; i++, j++, remaining--) {
bd79a6bd 6002 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
bbe025e0 6003 return;
2830ca53 6004 }
d2764cd6
PN
6005 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6006 j++; remaining--;
6007 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6008 aggregate = REDIS_AGGR_SUM;
6009 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6010 aggregate = REDIS_AGGR_MIN;
6011 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6012 aggregate = REDIS_AGGR_MAX;
6013 } else {
6014 zfree(src);
6015 addReply(c,shared.syntaxerr);
6016 return;
6017 }
6018 j++; remaining--;
2830ca53 6019 } else {
8f92e768 6020 zfree(src);
2830ca53
PN
6021 addReply(c,shared.syntaxerr);
6022 return;
6023 }
6024 }
6025 }
b287c9bb 6026
d2764cd6
PN
6027 /* sort sets from the smallest to largest, this will improve our
6028 * algorithm's performance */
bc000c1d 6029 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
d2764cd6 6030
2830ca53
PN
6031 dstobj = createZsetObject();
6032 dstzset = dstobj->ptr;
6033
6034 if (op == REDIS_OP_INTER) {
8f92e768
PN
6035 /* skip going over all entries if the smallest zset is NULL or empty */
6036 if (src[0].dict && dictSize(src[0].dict) > 0) {
6037 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6038 * from small to large, all src[i > 0].dict are non-empty too */
6039 di = dictGetIterator(src[0].dict);
2830ca53 6040 while((de = dictNext(di)) != NULL) {
d2764cd6 6041 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6042 *score = src[0].weight * zunionInterDictValue(de);
2830ca53 6043
bc000c1d 6044 for (j = 1; j < setnum; j++) {
d2764cd6 6045 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6046 if (other) {
bc000c1d 6047 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6048 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6049 } else {
6050 break;
6051 }
6052 }
b287c9bb 6053
2830ca53 6054 /* skip entry when not present in every source dict */
bc000c1d 6055 if (j != setnum) {
2830ca53
PN
6056 zfree(score);
6057 } else {
6058 robj *o = dictGetEntryKey(de);
6059 dictAdd(dstzset->dict,o,score);
6060 incrRefCount(o); /* added to dictionary */
6061 zslInsert(dstzset->zsl,*score,o);
6062 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
6063 }
6064 }
2830ca53
PN
6065 dictReleaseIterator(di);
6066 }
6067 } else if (op == REDIS_OP_UNION) {
bc000c1d 6068 for (i = 0; i < setnum; i++) {
8f92e768 6069 if (!src[i].dict) continue;
2830ca53 6070
8f92e768 6071 di = dictGetIterator(src[i].dict);
2830ca53
PN
6072 while((de = dictNext(di)) != NULL) {
6073 /* skip key when already processed */
6074 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6075
d2764cd6 6076 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6077 *score = src[i].weight * zunionInterDictValue(de);
2830ca53 6078
d2764cd6
PN
6079 /* because the zsets are sorted by size, its only possible
6080 * for sets at larger indices to hold this entry */
bc000c1d 6081 for (j = (i+1); j < setnum; j++) {
d2764cd6 6082 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6083 if (other) {
bc000c1d 6084 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6085 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6086 }
6087 }
b287c9bb 6088
2830ca53
PN
6089 robj *o = dictGetEntryKey(de);
6090 dictAdd(dstzset->dict,o,score);
6091 incrRefCount(o); /* added to dictionary */
6092 zslInsert(dstzset->zsl,*score,o);
6093 incrRefCount(o); /* added to skiplist */
6094 }
6095 dictReleaseIterator(di);
b287c9bb 6096 }
2830ca53
PN
6097 } else {
6098 /* unknown operator */
6099 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
6100 }
6101
6102 deleteKey(c->db,dstkey);
3ea27d37 6103 if (dstzset->zsl->length) {
6104 dictAdd(c->db->dict,dstkey,dstobj);
6105 incrRefCount(dstkey);
482b672d 6106 addReplyLongLong(c, dstzset->zsl->length);
3ea27d37 6107 server.dirty++;
6108 } else {
8bca8773 6109 decrRefCount(dstobj);
3ea27d37 6110 addReply(c, shared.czero);
6111 }
8f92e768 6112 zfree(src);
b287c9bb
PN
6113}
6114
5d373da9 6115static void zunionstoreCommand(redisClient *c) {
2830ca53 6116 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
6117}
6118
5d373da9 6119static void zinterstoreCommand(redisClient *c) {
2830ca53 6120 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
6121}
6122
e3870fab 6123static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 6124 robj *o;
bbe025e0
AM
6125 long start;
6126 long end;
752da584 6127 int withscores = 0;
dd88747b 6128 int llen;
6129 int rangelen, j;
6130 zset *zsetobj;
6131 zskiplist *zsl;
6132 zskiplistNode *ln;
6133 robj *ele;
752da584 6134
bd79a6bd
PN
6135 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6136 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6137
752da584 6138 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6139 withscores = 1;
6140 } else if (c->argc >= 5) {
6141 addReply(c,shared.syntaxerr);
6142 return;
6143 }
cc812361 6144
4e27f268 6145 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6146 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 6147 zsetobj = o->ptr;
6148 zsl = zsetobj->zsl;
6149 llen = zsl->length;
cc812361 6150
dd88747b 6151 /* convert negative indexes */
6152 if (start < 0) start = llen+start;
6153 if (end < 0) end = llen+end;
6154 if (start < 0) start = 0;
6155 if (end < 0) end = 0;
cc812361 6156
dd88747b 6157 /* indexes sanity checks */
6158 if (start > end || start >= llen) {
6159 /* Out of range start or start > end result in empty list */
6160 addReply(c,shared.emptymultibulk);
6161 return;
6162 }
6163 if (end >= llen) end = llen-1;
6164 rangelen = (end-start)+1;
cc812361 6165
dd88747b 6166 /* check if starting point is trivial, before searching
6167 * the element in log(N) time */
6168 if (reverse) {
6169 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6170 } else {
6171 ln = start == 0 ?
6172 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6173 }
cc812361 6174
dd88747b 6175 /* Return the result in form of a multi-bulk reply */
6176 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6177 withscores ? (rangelen*2) : rangelen));
6178 for (j = 0; j < rangelen; j++) {
6179 ele = ln->obj;
6180 addReplyBulk(c,ele);
6181 if (withscores)
6182 addReplyDouble(c,ln->score);
6183 ln = reverse ? ln->backward : ln->forward[0];
cc812361 6184 }
6185}
6186
e3870fab 6187static void zrangeCommand(redisClient *c) {
6188 zrangeGenericCommand(c,0);
6189}
6190
6191static void zrevrangeCommand(redisClient *c) {
6192 zrangeGenericCommand(c,1);
6193}
6194
f44dd428 6195/* This command implements both ZRANGEBYSCORE and ZCOUNT.
6196 * If justcount is non-zero, just the count is returned. */
6197static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 6198 robj *o;
f44dd428 6199 double min, max;
6200 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 6201 int offset = 0, limit = -1;
0500ef27
SH
6202 int withscores = 0;
6203 int badsyntax = 0;
6204
f44dd428 6205 /* Parse the min-max interval. If one of the values is prefixed
6206 * by the "(" character, it's considered "open". For instance
6207 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6208 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6209 if (((char*)c->argv[2]->ptr)[0] == '(') {
6210 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6211 minex = 1;
6212 } else {
6213 min = strtod(c->argv[2]->ptr,NULL);
6214 }
6215 if (((char*)c->argv[3]->ptr)[0] == '(') {
6216 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6217 maxex = 1;
6218 } else {
6219 max = strtod(c->argv[3]->ptr,NULL);
6220 }
6221
6222 /* Parse "WITHSCORES": note that if the command was called with
6223 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6224 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 6225 if (c->argc == 5 || c->argc == 8) {
3a3978b1 6226 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6227 withscores = 1;
6228 else
6229 badsyntax = 1;
0500ef27 6230 }
3a3978b1 6231 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 6232 badsyntax = 1;
0500ef27 6233 if (badsyntax) {
454d4e43 6234 addReplySds(c,
6235 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 6236 return;
0500ef27
SH
6237 }
6238
f44dd428 6239 /* Parse "LIMIT" */
0500ef27 6240 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 6241 addReply(c,shared.syntaxerr);
6242 return;
0500ef27 6243 } else if (c->argc == (7 + withscores)) {
80181f78 6244 offset = atoi(c->argv[5]->ptr);
6245 limit = atoi(c->argv[6]->ptr);
0b13687c 6246 if (offset < 0) offset = 0;
80181f78 6247 }
50c55df5 6248
f44dd428 6249 /* Ok, lookup the key and get the range */
50c55df5 6250 o = lookupKeyRead(c->db,c->argv[1]);
6251 if (o == NULL) {
4e27f268 6252 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6253 } else {
6254 if (o->type != REDIS_ZSET) {
6255 addReply(c,shared.wrongtypeerr);
6256 } else {
6257 zset *zsetobj = o->ptr;
6258 zskiplist *zsl = zsetobj->zsl;
6259 zskiplistNode *ln;
f44dd428 6260 robj *ele, *lenobj = NULL;
6261 unsigned long rangelen = 0;
50c55df5 6262
f44dd428 6263 /* Get the first node with the score >= min, or with
6264 * score > min if 'minex' is true. */
50c55df5 6265 ln = zslFirstWithScore(zsl,min);
f44dd428 6266 while (minex && ln && ln->score == min) ln = ln->forward[0];
6267
50c55df5 6268 if (ln == NULL) {
6269 /* No element matching the speciifed interval */
f44dd428 6270 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6271 return;
6272 }
6273
6274 /* We don't know in advance how many matching elements there
6275 * are in the list, so we push this object that will represent
6276 * the multi-bulk length in the output buffer, and will "fix"
6277 * it later */
f44dd428 6278 if (!justcount) {
6279 lenobj = createObject(REDIS_STRING,NULL);
6280 addReply(c,lenobj);
6281 decrRefCount(lenobj);
6282 }
50c55df5 6283
f44dd428 6284 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 6285 if (offset) {
6286 offset--;
6287 ln = ln->forward[0];
6288 continue;
6289 }
6290 if (limit == 0) break;
f44dd428 6291 if (!justcount) {
6292 ele = ln->obj;
dd88747b 6293 addReplyBulk(c,ele);
f44dd428 6294 if (withscores)
6295 addReplyDouble(c,ln->score);
6296 }
50c55df5 6297 ln = ln->forward[0];
6298 rangelen++;
80181f78 6299 if (limit > 0) limit--;
50c55df5 6300 }
f44dd428 6301 if (justcount) {
482b672d 6302 addReplyLongLong(c,(long)rangelen);
f44dd428 6303 } else {
6304 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6305 withscores ? (rangelen*2) : rangelen);
6306 }
50c55df5 6307 }
6308 }
6309}
6310
f44dd428 6311static void zrangebyscoreCommand(redisClient *c) {
6312 genericZrangebyscoreCommand(c,0);
6313}
6314
6315static void zcountCommand(redisClient *c) {
6316 genericZrangebyscoreCommand(c,1);
6317}
6318
3c41331e 6319static void zcardCommand(redisClient *c) {
e197b441 6320 robj *o;
6321 zset *zs;
dd88747b 6322
6323 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6324 checkType(c,o,REDIS_ZSET)) return;
6325
6326 zs = o->ptr;
6327 addReplyUlong(c,zs->zsl->length);
e197b441 6328}
6329
6e333bbe 6330static void zscoreCommand(redisClient *c) {
6331 robj *o;
6332 zset *zs;
dd88747b 6333 dictEntry *de;
6334
6335 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6336 checkType(c,o,REDIS_ZSET)) return;
6337
6338 zs = o->ptr;
6339 de = dictFind(zs->dict,c->argv[2]);
6340 if (!de) {
96d8b4ee 6341 addReply(c,shared.nullbulk);
6e333bbe 6342 } else {
dd88747b 6343 double *score = dictGetEntryVal(de);
6e333bbe 6344
dd88747b 6345 addReplyDouble(c,*score);
6e333bbe 6346 }
6347}
6348
798d9e55 6349static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 6350 robj *o;
dd88747b 6351 zset *zs;
6352 zskiplist *zsl;
6353 dictEntry *de;
6354 unsigned long rank;
6355 double *score;
6356
6357 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6358 checkType(c,o,REDIS_ZSET)) return;
6359
6360 zs = o->ptr;
6361 zsl = zs->zsl;
6362 de = dictFind(zs->dict,c->argv[2]);
6363 if (!de) {
69d95c3e
PN
6364 addReply(c,shared.nullbulk);
6365 return;
6366 }
69d95c3e 6367
dd88747b 6368 score = dictGetEntryVal(de);
6369 rank = zslGetRank(zsl, *score, c->argv[2]);
6370 if (rank) {
6371 if (reverse) {
482b672d 6372 addReplyLongLong(c, zsl->length - rank);
27b0ccca 6373 } else {
482b672d 6374 addReplyLongLong(c, rank-1);
69d95c3e 6375 }
dd88747b 6376 } else {
6377 addReply(c,shared.nullbulk);
978c2c94 6378 }
6379}
6380
798d9e55
PN
6381static void zrankCommand(redisClient *c) {
6382 zrankGenericCommand(c, 0);
6383}
6384
6385static void zrevrankCommand(redisClient *c) {
6386 zrankGenericCommand(c, 1);
6387}
6388
7fb16bac
PN
/* ========================= Hashes utility functions ======================= */

/* Bit flags selecting which part of a hash entry is wanted. They are
 * tested with '&' and may be OR-ed together to request both parts. */
#define REDIS_HASH_KEY 1
#define REDIS_HASH_VALUE 2
978c2c94 6392
7fb16bac
PN
6393/* Check the length of a number of objects to see if we need to convert a
6394 * zipmap to a real hash. Note that we only check string encoded objects
6395 * as their string length can be queried in constant time. */
6396static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6397 int i;
6398 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
978c2c94 6399
7fb16bac
PN
6400 for (i = start; i <= end; i++) {
6401 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6402 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6403 {
6404 convertToRealHash(subject);
978c2c94 6405 return;
6406 }
6407 }
7fb16bac 6408}
bae2c7ec 6409
97224de7
PN
6410/* Encode given objects in-place when the hash uses a dict. */
6411static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6412 if (subject->encoding == REDIS_ENCODING_HT) {
3f973463
PN
6413 if (o1) *o1 = tryObjectEncoding(*o1);
6414 if (o2) *o2 = tryObjectEncoding(*o2);
97224de7
PN
6415 }
6416}
6417
7fb16bac 6418/* Get the value from a hash identified by key. Returns either a string
a3f3af86
PN
6419 * object or NULL if the value cannot be found. The refcount of the object
6420 * is always increased by 1 when the value was found. */
7fb16bac
PN
6421static robj *hashGet(robj *o, robj *key) {
6422 robj *value = NULL;
978c2c94 6423 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7fb16bac
PN
6424 unsigned char *v;
6425 unsigned int vlen;
6426 key = getDecodedObject(key);
6427 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6428 value = createStringObject((char*)v,vlen);
6429 }
6430 decrRefCount(key);
6431 } else {
6432 dictEntry *de = dictFind(o->ptr,key);
6433 if (de != NULL) {
6434 value = dictGetEntryVal(de);
a3f3af86 6435 incrRefCount(value);
7fb16bac
PN
6436 }
6437 }
6438 return value;
6439}
978c2c94 6440
7fb16bac
PN
6441/* Test if the key exists in the given hash. Returns 1 if the key
6442 * exists and 0 when it doesn't. */
6443static int hashExists(robj *o, robj *key) {
6444 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6445 key = getDecodedObject(key);
6446 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6447 decrRefCount(key);
6448 return 1;
6449 }
6450 decrRefCount(key);
6451 } else {
6452 if (dictFind(o->ptr,key) != NULL) {
6453 return 1;
6454 }
6455 }
6456 return 0;
6457}
bae2c7ec 6458
7fb16bac
PN
6459/* Add an element, discard the old if the key already exists.
6460 * Return 0 on insert and 1 on update. */
feb8d7e6 6461static int hashSet(robj *o, robj *key, robj *value) {
7fb16bac
PN
6462 int update = 0;
6463 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6464 key = getDecodedObject(key);
6465 value = getDecodedObject(value);
6466 o->ptr = zipmapSet(o->ptr,
6467 key->ptr,sdslen(key->ptr),
6468 value->ptr,sdslen(value->ptr), &update);
6469 decrRefCount(key);
6470 decrRefCount(value);
6471
6472 /* Check if the zipmap needs to be upgraded to a real hash table */
6473 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
bae2c7ec 6474 convertToRealHash(o);
978c2c94 6475 } else {
7fb16bac
PN
6476 if (dictReplace(o->ptr,key,value)) {
6477 /* Insert */
6478 incrRefCount(key);
978c2c94 6479 } else {
7fb16bac 6480 /* Update */
978c2c94 6481 update = 1;
6482 }
7fb16bac 6483 incrRefCount(value);
978c2c94 6484 }
7fb16bac 6485 return update;
978c2c94 6486}
6487
7fb16bac
PN
6488/* Delete an element from a hash.
6489 * Return 1 on deleted and 0 on not found. */
6490static int hashDelete(robj *o, robj *key) {
6491 int deleted = 0;
6492 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6493 key = getDecodedObject(key);
6494 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6495 decrRefCount(key);
6496 } else {
6497 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6498 /* Always check if the dictionary needs a resize after a delete. */
6499 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
d33278d1 6500 }
7fb16bac
PN
6501 return deleted;
6502}
d33278d1 6503
7fb16bac 6504/* Return the number of elements in a hash. */
c811bb38 6505static unsigned long hashLength(robj *o) {
7fb16bac
PN
6506 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6507 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6508}
6509
6510/* Structure to hold hash iteration abstration. Note that iteration over
6511 * hashes involves both fields and values. Because it is possible that
6512 * not both are required, store pointers in the iterator to avoid
6513 * unnecessary memory allocation for fields/values. */
6514typedef struct {
6515 int encoding;
6516 unsigned char *zi;
6517 unsigned char *zk, *zv;
6518 unsigned int zklen, zvlen;
6519
6520 dictIterator *di;
6521 dictEntry *de;
6522} hashIterator;
6523
c44d3b56
PN
6524static hashIterator *hashInitIterator(robj *subject) {
6525 hashIterator *hi = zmalloc(sizeof(hashIterator));
7fb16bac
PN
6526 hi->encoding = subject->encoding;
6527 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6528 hi->zi = zipmapRewind(subject->ptr);
6529 } else if (hi->encoding == REDIS_ENCODING_HT) {
6530 hi->di = dictGetIterator(subject->ptr);
d33278d1 6531 } else {
7fb16bac 6532 redisAssert(NULL);
d33278d1 6533 }
c44d3b56 6534 return hi;
7fb16bac 6535}
d33278d1 6536
7fb16bac
PN
6537static void hashReleaseIterator(hashIterator *hi) {
6538 if (hi->encoding == REDIS_ENCODING_HT) {
6539 dictReleaseIterator(hi->di);
d33278d1 6540 }
c44d3b56 6541 zfree(hi);
7fb16bac 6542}
d33278d1 6543
7fb16bac
PN
6544/* Move to the next entry in the hash. Return REDIS_OK when the next entry
6545 * could be found and REDIS_ERR when the iterator reaches the end. */
c811bb38 6546static int hashNext(hashIterator *hi) {
7fb16bac
PN
6547 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6548 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6549 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6550 } else {
6551 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6552 }
6553 return REDIS_OK;
6554}
d33278d1 6555
0c390abc 6556/* Get key or value object at current iteration position.
a3f3af86 6557 * This increases the refcount of the field object by 1. */
c811bb38 6558static robj *hashCurrent(hashIterator *hi, int what) {
7fb16bac
PN
6559 robj *o;
6560 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6561 if (what & REDIS_HASH_KEY) {
6562 o = createStringObject((char*)hi->zk,hi->zklen);
6563 } else {
6564 o = createStringObject((char*)hi->zv,hi->zvlen);
d33278d1 6565 }
d33278d1 6566 } else {
7fb16bac
PN
6567 if (what & REDIS_HASH_KEY) {
6568 o = dictGetEntryKey(hi->de);
6569 } else {
6570 o = dictGetEntryVal(hi->de);
d33278d1 6571 }
a3f3af86 6572 incrRefCount(o);
d33278d1 6573 }
7fb16bac 6574 return o;
d33278d1
PN
6575}
6576
7fb16bac
PN
6577static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6578 robj *o = lookupKeyWrite(c->db,key);
01426b05
PN
6579 if (o == NULL) {
6580 o = createHashObject();
7fb16bac
PN
6581 dictAdd(c->db->dict,key,o);
6582 incrRefCount(key);
01426b05
PN
6583 } else {
6584 if (o->type != REDIS_HASH) {
6585 addReply(c,shared.wrongtypeerr);
7fb16bac 6586 return NULL;
01426b05
PN
6587 }
6588 }
7fb16bac
PN
6589 return o;
6590}
01426b05 6591
7fb16bac
PN
6592/* ============================= Hash commands ============================== */
6593static void hsetCommand(redisClient *c) {
6e9e463f 6594 int update;
7fb16bac 6595 robj *o;
bbe025e0 6596
7fb16bac
PN
6597 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6598 hashTryConversion(o,c->argv,2,3);
97224de7 6599 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6600 update = hashSet(o,c->argv[2],c->argv[3]);
6e9e463f 6601 addReply(c, update ? shared.czero : shared.cone);
7fb16bac
PN
6602 server.dirty++;
6603}
01426b05 6604
1f1c7695
PN
6605static void hsetnxCommand(redisClient *c) {
6606 robj *o;
6607 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6608 hashTryConversion(o,c->argv,2,3);
6609
6610 if (hashExists(o, c->argv[2])) {
6611 addReply(c, shared.czero);
01426b05 6612 } else {
97224de7 6613 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6614 hashSet(o,c->argv[2],c->argv[3]);
1f1c7695
PN
6615 addReply(c, shared.cone);
6616 server.dirty++;
6617 }
6618}
01426b05 6619
7fb16bac
PN
6620static void hmsetCommand(redisClient *c) {
6621 int i;
6622 robj *o;
01426b05 6623
7fb16bac
PN
6624 if ((c->argc % 2) == 1) {
6625 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6626 return;
6627 }
01426b05 6628
7fb16bac
PN
6629 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6630 hashTryConversion(o,c->argv,2,c->argc-1);
6631 for (i = 2; i < c->argc; i += 2) {
97224de7 6632 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
feb8d7e6 6633 hashSet(o,c->argv[i],c->argv[i+1]);
7fb16bac
PN
6634 }
6635 addReply(c, shared.ok);
edc2f63a 6636 server.dirty++;
7fb16bac
PN
6637}
6638
6639static void hincrbyCommand(redisClient *c) {
6640 long long value, incr;
6641 robj *o, *current, *new;
6642
bd79a6bd 6643 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7fb16bac
PN
6644 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6645 if ((current = hashGet(o,c->argv[2])) != NULL) {
946342c1
PN
6646 if (getLongLongFromObjectOrReply(c,current,&value,
6647 "hash value is not an integer") != REDIS_OK) {
6648 decrRefCount(current);
6649 return;
6650 }
a3f3af86 6651 decrRefCount(current);
7fb16bac
PN
6652 } else {
6653 value = 0;
01426b05
PN
6654 }
6655
7fb16bac 6656 value += incr;
3f973463
PN
6657 new = createStringObjectFromLongLong(value);
6658 hashTryObjectEncoding(o,&c->argv[2],NULL);
feb8d7e6 6659 hashSet(o,c->argv[2],new);
7fb16bac
PN
6660 decrRefCount(new);
6661 addReplyLongLong(c,value);
01426b05 6662 server.dirty++;
01426b05
PN
6663}
6664
978c2c94 6665static void hgetCommand(redisClient *c) {
7fb16bac 6666 robj *o, *value;
dd88747b 6667 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6668 checkType(c,o,REDIS_HASH)) return;
6669
7fb16bac
PN
6670 if ((value = hashGet(o,c->argv[2])) != NULL) {
6671 addReplyBulk(c,value);
a3f3af86 6672 decrRefCount(value);
dd88747b 6673 } else {
7fb16bac 6674 addReply(c,shared.nullbulk);
69d95c3e 6675 }
69d95c3e
PN
6676}
6677
09aeb579
PN
6678static void hmgetCommand(redisClient *c) {
6679 int i;
7fb16bac
PN
6680 robj *o, *value;
6681 o = lookupKeyRead(c->db,c->argv[1]);
6682 if (o != NULL && o->type != REDIS_HASH) {
6683 addReply(c,shared.wrongtypeerr);
09aeb579
PN
6684 }
6685
7fb16bac
PN
6686 /* Note the check for o != NULL happens inside the loop. This is
6687 * done because objects that cannot be found are considered to be
6688 * an empty hash. The reply should then be a series of NULLs. */
09aeb579 6689 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7fb16bac
PN
6690 for (i = 2; i < c->argc; i++) {
6691 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6692 addReplyBulk(c,value);
a3f3af86 6693 decrRefCount(value);
7fb16bac
PN
6694 } else {
6695 addReply(c,shared.nullbulk);
09aeb579
PN
6696 }
6697 }
6698}
6699
07efaf74 6700static void hdelCommand(redisClient *c) {
dd88747b 6701 robj *o;
dd88747b 6702 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6703 checkType(c,o,REDIS_HASH)) return;
07efaf74 6704
7fb16bac
PN
6705 if (hashDelete(o,c->argv[2])) {
6706 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6707 addReply(c,shared.cone);
6708 server.dirty++;
dd88747b 6709 } else {
7fb16bac 6710 addReply(c,shared.czero);
07efaf74 6711 }
6712}
6713
92b27fe9 6714static void hlenCommand(redisClient *c) {
6715 robj *o;
dd88747b 6716 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 6717 checkType(c,o,REDIS_HASH)) return;
6718
7fb16bac 6719 addReplyUlong(c,hashLength(o));
92b27fe9 6720}
6721
78409a0f 6722static void genericHgetallCommand(redisClient *c, int flags) {
7fb16bac 6723 robj *o, *lenobj, *obj;
78409a0f 6724 unsigned long count = 0;
c44d3b56 6725 hashIterator *hi;
78409a0f 6726
4e27f268 6727 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 6728 || checkType(c,o,REDIS_HASH)) return;
6729
6730 lenobj = createObject(REDIS_STRING,NULL);
6731 addReply(c,lenobj);
6732 decrRefCount(lenobj);
6733
c44d3b56
PN
6734 hi = hashInitIterator(o);
6735 while (hashNext(hi) != REDIS_ERR) {
7fb16bac 6736 if (flags & REDIS_HASH_KEY) {
c44d3b56 6737 obj = hashCurrent(hi,REDIS_HASH_KEY);
7fb16bac 6738 addReplyBulk(c,obj);
a3f3af86 6739 decrRefCount(obj);
7fb16bac 6740 count++;
78409a0f 6741 }
7fb16bac 6742 if (flags & REDIS_HASH_VALUE) {
c44d3b56 6743 obj = hashCurrent(hi,REDIS_HASH_VALUE);
7fb16bac 6744 addReplyBulk(c,obj);
a3f3af86 6745 decrRefCount(obj);
7fb16bac 6746 count++;
78409a0f 6747 }
78409a0f 6748 }
c44d3b56 6749 hashReleaseIterator(hi);
7fb16bac 6750
78409a0f 6751 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6752}
6753
6754static void hkeysCommand(redisClient *c) {
7fb16bac 6755 genericHgetallCommand(c,REDIS_HASH_KEY);
78409a0f 6756}
6757
6758static void hvalsCommand(redisClient *c) {
7fb16bac 6759 genericHgetallCommand(c,REDIS_HASH_VALUE);
78409a0f 6760}
6761
6762static void hgetallCommand(redisClient *c) {
7fb16bac 6763 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
78409a0f 6764}
6765
a86f14b1 6766static void hexistsCommand(redisClient *c) {
6767 robj *o;
a86f14b1 6768 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6769 checkType(c,o,REDIS_HASH)) return;
6770
7fb16bac 6771 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
a86f14b1 6772}
6773
ada386b2 6774static void convertToRealHash(robj *o) {
6775 unsigned char *key, *val, *p, *zm = o->ptr;
6776 unsigned int klen, vlen;
6777 dict *dict = dictCreate(&hashDictType,NULL);
6778
6779 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6780 p = zipmapRewind(zm);
6781 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6782 robj *keyobj, *valobj;
6783
6784 keyobj = createStringObject((char*)key,klen);
6785 valobj = createStringObject((char*)val,vlen);
05df7621 6786 keyobj = tryObjectEncoding(keyobj);
6787 valobj = tryObjectEncoding(valobj);
ada386b2 6788 dictAdd(dict,keyobj,valobj);
6789 }
6790 o->encoding = REDIS_ENCODING_HT;
6791 o->ptr = dict;
6792 zfree(zm);
6793}
6794
6b47e12e 6795/* ========================= Non type-specific commands ==================== */
6796
ed9b544e 6797static void flushdbCommand(redisClient *c) {
ca37e9cd 6798 server.dirty += dictSize(c->db->dict);
9b30e1a2 6799 touchWatchedKeysOnFlush(c->db->id);
3305306f 6800 dictEmpty(c->db->dict);
6801 dictEmpty(c->db->expires);
ed9b544e 6802 addReply(c,shared.ok);
ed9b544e 6803}
6804
6805static void flushallCommand(redisClient *c) {
9b30e1a2 6806 touchWatchedKeysOnFlush(-1);
ca37e9cd 6807 server.dirty += emptyDb();
ed9b544e 6808 addReply(c,shared.ok);
500ece7c 6809 if (server.bgsavechildpid != -1) {
6810 kill(server.bgsavechildpid,SIGKILL);
6811 rdbRemoveTempFile(server.bgsavechildpid);
6812 }
f78fd11b 6813 rdbSave(server.dbfilename);
ca37e9cd 6814 server.dirty++;
ed9b544e 6815}
6816
56906eef 6817static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 6818 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 6819 so->type = type;
6820 so->pattern = pattern;
6821 return so;
6822}
6823
6824/* Return the value associated to the key with a name obtained
55017f9d
PN
6825 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6826 * The returned object will always have its refcount increased by 1
6827 * when it is non-NULL. */
56906eef 6828static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6d7d1370 6829 char *p, *f;
ed9b544e 6830 sds spat, ssub;
6d7d1370
PN
6831 robj keyobj, fieldobj, *o;
6832 int prefixlen, sublen, postfixlen, fieldlen;
ed9b544e 6833 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6834 struct {
f1017b3f 6835 long len;
6836 long free;
ed9b544e 6837 char buf[REDIS_SORTKEY_MAX+1];
6d7d1370 6838 } keyname, fieldname;
ed9b544e 6839
28173a49 6840 /* If the pattern is "#" return the substitution object itself in order
6841 * to implement the "SORT ... GET #" feature. */
6842 spat = pattern->ptr;
6843 if (spat[0] == '#' && spat[1] == '\0') {
55017f9d 6844 incrRefCount(subst);
28173a49 6845 return subst;
6846 }
6847
6848 /* The substitution object may be specially encoded. If so we create
9d65a1bb 6849 * a decoded object on the fly. Otherwise getDecodedObject will just
6850 * increment the ref count, that we'll decrement later. */
6851 subst = getDecodedObject(subst);
942a3961 6852
ed9b544e 6853 ssub = subst->ptr;
6854 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6855 p = strchr(spat,'*');
ed5a857a 6856 if (!p) {
6857 decrRefCount(subst);
6858 return NULL;
6859 }
ed9b544e 6860
6d7d1370
PN
6861 /* Find out if we're dealing with a hash dereference. */
6862 if ((f = strstr(p+1, "->")) != NULL) {
6863 fieldlen = sdslen(spat)-(f-spat);
6864 /* this also copies \0 character */
6865 memcpy(fieldname.buf,f+2,fieldlen-1);
6866 fieldname.len = fieldlen-2;
6867 } else {
6868 fieldlen = 0;
6869 }
6870
ed9b544e 6871 prefixlen = p-spat;
6872 sublen = sdslen(ssub);
6d7d1370 6873 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
ed9b544e 6874 memcpy(keyname.buf,spat,prefixlen);
6875 memcpy(keyname.buf+prefixlen,ssub,sublen);
6876 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6877 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6878 keyname.len = prefixlen+sublen+postfixlen;
942a3961 6879 decrRefCount(subst);
6880
6d7d1370
PN
6881 /* Lookup substituted key */
6882 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6883 o = lookupKeyRead(db,&keyobj);
55017f9d
PN
6884 if (o == NULL) return NULL;
6885
6886 if (fieldlen > 0) {
6887 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6d7d1370 6888
705dad38
PN
6889 /* Retrieve value from hash by the field name. This operation
6890 * already increases the refcount of the returned object. */
6d7d1370
PN
6891 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6892 o = hashGet(o, &fieldobj);
705dad38 6893 } else {
55017f9d 6894 if (o->type != REDIS_STRING) return NULL;
b6f07345 6895
705dad38
PN
6896 /* Every object that this function returns needs to have its refcount
6897 * increased. sortCommand decreases it again. */
6898 incrRefCount(o);
6d7d1370
PN
6899 }
6900
6901 return o;
ed9b544e 6902}
6903
6904/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6905 * the additional parameter is not standard but a BSD-specific we have to
6906 * pass sorting parameters via the global 'server' structure */
6907static int sortCompare(const void *s1, const void *s2) {
6908 const redisSortObject *so1 = s1, *so2 = s2;
6909 int cmp;
6910
6911 if (!server.sort_alpha) {
6912 /* Numeric sorting. Here it's trivial as we precomputed scores */
6913 if (so1->u.score > so2->u.score) {
6914 cmp = 1;
6915 } else if (so1->u.score < so2->u.score) {
6916 cmp = -1;
6917 } else {
6918 cmp = 0;
6919 }
6920 } else {
6921 /* Alphanumeric sorting */
6922 if (server.sort_bypattern) {
6923 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6924 /* At least one compare object is NULL */
6925 if (so1->u.cmpobj == so2->u.cmpobj)
6926 cmp = 0;
6927 else if (so1->u.cmpobj == NULL)
6928 cmp = -1;
6929 else
6930 cmp = 1;
6931 } else {
6932 /* We have both the objects, use strcoll */
6933 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6934 }
6935 } else {
08ee9b57 6936 /* Compare elements directly. */
6937 cmp = compareStringObjects(so1->obj,so2->obj);
ed9b544e 6938 }
6939 }
6940 return server.sort_desc ? -cmp : cmp;
6941}
6942
6943/* The SORT command is the most complex command in Redis. Warning: this code
6944 * is optimized for speed and a bit less for readability */
6945static void sortCommand(redisClient *c) {
ed9b544e 6946 list *operations;
6947 int outputlen = 0;
6948 int desc = 0, alpha = 0;
6949 int limit_start = 0, limit_count = -1, start, end;
6950 int j, dontsort = 0, vectorlen;
6951 int getop = 0; /* GET operation counter */
443c6409 6952 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 6953 redisSortObject *vector; /* Resulting vector to sort */
6954
6955 /* Lookup the key to sort. It must be of the right types */
3305306f 6956 sortval = lookupKeyRead(c->db,c->argv[1]);
6957 if (sortval == NULL) {
4e27f268 6958 addReply(c,shared.emptymultibulk);
ed9b544e 6959 return;
6960 }
a5eb649b 6961 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6962 sortval->type != REDIS_ZSET)
6963 {
c937aa89 6964 addReply(c,shared.wrongtypeerr);
ed9b544e 6965 return;
6966 }
6967
6968 /* Create a list of operations to perform for every sorted element.
6969 * Operations can be GET/DEL/INCR/DECR */
6970 operations = listCreate();
092dac2a 6971 listSetFreeMethod(operations,zfree);
ed9b544e 6972 j = 2;
6973
6974 /* Now we need to protect sortval incrementing its count, in the future
6975 * SORT may have options able to overwrite/delete keys during the sorting
6976 * and the sorted key itself may get destroied */
6977 incrRefCount(sortval);
6978
6979 /* The SORT command has an SQL-alike syntax, parse it */
6980 while(j < c->argc) {
6981 int leftargs = c->argc-j-1;
6982 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6983 desc = 0;
6984 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6985 desc = 1;
6986 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6987 alpha = 1;
6988 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6989 limit_start = atoi(c->argv[j+1]->ptr);
6990 limit_count = atoi(c->argv[j+2]->ptr);
6991 j+=2;
443c6409 6992 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6993 storekey = c->argv[j+1];
6994 j++;
ed9b544e 6995 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6996 sortby = c->argv[j+1];
6997 /* If the BY pattern does not contain '*', i.e. it is constant,
6998 * we don't need to sort nor to lookup the weight keys. */
6999 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7000 j++;
7001 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7002 listAddNodeTail(operations,createSortOperation(
7003 REDIS_SORT_GET,c->argv[j+1]));
7004 getop++;
7005 j++;
ed9b544e 7006 } else {
7007 decrRefCount(sortval);
7008 listRelease(operations);
c937aa89 7009 addReply(c,shared.syntaxerr);
ed9b544e 7010 return;
7011 }
7012 j++;
7013 }
7014
7015 /* Load the sorting vector with all the objects to sort */
a5eb649b 7016 switch(sortval->type) {
7017 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
7018 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7019 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
f83c6cb5 7020 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
a5eb649b 7021 }
ed9b544e 7022 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 7023 j = 0;
a5eb649b 7024
ed9b544e 7025 if (sortval->type == REDIS_LIST) {
7026 list *list = sortval->ptr;
6208b3a7 7027 listNode *ln;
c7df85a4 7028 listIter li;
6208b3a7 7029
c7df85a4 7030 listRewind(list,&li);
7031 while((ln = listNext(&li))) {
ed9b544e 7032 robj *ele = ln->value;
7033 vector[j].obj = ele;
7034 vector[j].u.score = 0;
7035 vector[j].u.cmpobj = NULL;
ed9b544e 7036 j++;
7037 }
7038 } else {
a5eb649b 7039 dict *set;
ed9b544e 7040 dictIterator *di;
7041 dictEntry *setele;
7042
a5eb649b 7043 if (sortval->type == REDIS_SET) {
7044 set = sortval->ptr;
7045 } else {
7046 zset *zs = sortval->ptr;
7047 set = zs->dict;
7048 }
7049
ed9b544e 7050 di = dictGetIterator(set);
ed9b544e 7051 while((setele = dictNext(di)) != NULL) {
7052 vector[j].obj = dictGetEntryKey(setele);
7053 vector[j].u.score = 0;
7054 vector[j].u.cmpobj = NULL;
7055 j++;
7056 }
7057 dictReleaseIterator(di);
7058 }
dfc5e96c 7059 redisAssert(j == vectorlen);
ed9b544e 7060
7061 /* Now it's time to load the right scores in the sorting vector */
7062 if (dontsort == 0) {
7063 for (j = 0; j < vectorlen; j++) {
6d7d1370 7064 robj *byval;
ed9b544e 7065 if (sortby) {
6d7d1370 7066 /* lookup value to sort by */
3305306f 7067 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
705dad38 7068 if (!byval) continue;
ed9b544e 7069 } else {
6d7d1370
PN
7070 /* use object itself to sort by */
7071 byval = vector[j].obj;
7072 }
7073
7074 if (alpha) {
08ee9b57 7075 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6d7d1370
PN
7076 } else {
7077 if (byval->encoding == REDIS_ENCODING_RAW) {
7078 vector[j].u.score = strtod(byval->ptr,NULL);
16fa22f1 7079 } else if (byval->encoding == REDIS_ENCODING_INT) {
6d7d1370
PN
7080 /* Don't need to decode the object if it's
7081 * integer-encoded (the only encoding supported) so
7082 * far. We can just cast it */
16fa22f1
PN
7083 vector[j].u.score = (long)byval->ptr;
7084 } else {
7085 redisAssert(1 != 1);
942a3961 7086 }
ed9b544e 7087 }
6d7d1370 7088
705dad38
PN
7089 /* when the object was retrieved using lookupKeyByPattern,
7090 * its refcount needs to be decreased. */
7091 if (sortby) {
7092 decrRefCount(byval);
ed9b544e 7093 }
7094 }
7095 }
7096
7097 /* We are ready to sort the vector... perform a bit of sanity check
7098 * on the LIMIT option too. We'll use a partial version of quicksort. */
7099 start = (limit_start < 0) ? 0 : limit_start;
7100 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7101 if (start >= vectorlen) {
7102 start = vectorlen-1;
7103 end = vectorlen-2;
7104 }
7105 if (end >= vectorlen) end = vectorlen-1;
7106
7107 if (dontsort == 0) {
7108 server.sort_desc = desc;
7109 server.sort_alpha = alpha;
7110 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 7111 if (sortby && (start != 0 || end != vectorlen-1))
7112 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7113 else
7114 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 7115 }
7116
7117 /* Send command output to the output buffer, performing the specified
7118 * GET/DEL/INCR/DECR operations if any. */
7119 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 7120 if (storekey == NULL) {
7121 /* STORE option not specified, sent the sorting result to client */
7122 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7123 for (j = start; j <= end; j++) {
7124 listNode *ln;
c7df85a4 7125 listIter li;
7126
dd88747b 7127 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 7128 listRewind(operations,&li);
7129 while((ln = listNext(&li))) {
443c6409 7130 redisSortOperation *sop = ln->value;
7131 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7132 vector[j].obj);
7133
7134 if (sop->type == REDIS_SORT_GET) {
55017f9d 7135 if (!val) {
443c6409 7136 addReply(c,shared.nullbulk);
7137 } else {
dd88747b 7138 addReplyBulk(c,val);
55017f9d 7139 decrRefCount(val);
443c6409 7140 }
7141 } else {
dfc5e96c 7142 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 7143 }
7144 }
ed9b544e 7145 }
443c6409 7146 } else {
7147 robj *listObject = createListObject();
7148 list *listPtr = (list*) listObject->ptr;
7149
7150 /* STORE option specified, set the sorting result as a List object */
7151 for (j = start; j <= end; j++) {
7152 listNode *ln;
c7df85a4 7153 listIter li;
7154
443c6409 7155 if (!getop) {
7156 listAddNodeTail(listPtr,vector[j].obj);
7157 incrRefCount(vector[j].obj);
7158 }
c7df85a4 7159 listRewind(operations,&li);
7160 while((ln = listNext(&li))) {
443c6409 7161 redisSortOperation *sop = ln->value;
7162 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7163 vector[j].obj);
7164
7165 if (sop->type == REDIS_SORT_GET) {
55017f9d 7166 if (!val) {
443c6409 7167 listAddNodeTail(listPtr,createStringObject("",0));
7168 } else {
55017f9d
PN
7169 /* We should do a incrRefCount on val because it is
7170 * added to the list, but also a decrRefCount because
7171 * it is returned by lookupKeyByPattern. This results
7172 * in doing nothing at all. */
443c6409 7173 listAddNodeTail(listPtr,val);
443c6409 7174 }
ed9b544e 7175 } else {
dfc5e96c 7176 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 7177 }
ed9b544e 7178 }
ed9b544e 7179 }
121796f7 7180 if (dictReplace(c->db->dict,storekey,listObject)) {
7181 incrRefCount(storekey);
7182 }
443c6409 7183 /* Note: we add 1 because the DB is dirty anyway since even if the
7184 * SORT result is empty a new key is set and maybe the old content
7185 * replaced. */
7186 server.dirty += 1+outputlen;
7187 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 7188 }
7189
7190 /* Cleanup */
7191 decrRefCount(sortval);
7192 listRelease(operations);
7193 for (j = 0; j < vectorlen; j++) {
16fa22f1 7194 if (alpha && vector[j].u.cmpobj)
ed9b544e 7195 decrRefCount(vector[j].u.cmpobj);
7196 }
7197 zfree(vector);
7198}
7199
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer and must be large enough for the result
 * (the longest output is shorter than 16 bytes, terminator included).
 * 'n' is the number of bytes to describe.
 *
 * Every value of 'n' writes a string: amounts of 1 TiB and above are
 * reported with the T and P suffixes instead of leaving 's' untouched. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024ULL*1024*1024*1024*1024)) {
        d = (double)n/(1024ULL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else {
        /* Anything larger still fits: 2^64 bytes is 16384P. */
        d = (double)n/(1024ULL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    }
}
7220
1c85b79f 7221/* Create the string returned by the INFO command. This is decoupled
7222 * by the INFO command itself as we need to report the same information
7223 * on memory corruption problems. */
7224static sds genRedisInfoString(void) {
ed9b544e 7225 sds info;
7226 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 7227 int j;
ec6c7a1d 7228 char hmem[64];
55a8298f 7229
b72f6a4b 7230 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 7231 info = sdscatprintf(sdsempty(),
7232 "redis_version:%s\r\n"
5436146c
PN
7233 "redis_git_sha1:%s\r\n"
7234 "redis_git_dirty:%d\r\n"
f1017b3f 7235 "arch_bits:%s\r\n"
7a932b74 7236 "multiplexing_api:%s\r\n"
0d7170a4 7237 "process_id:%ld\r\n"
682ac724 7238 "uptime_in_seconds:%ld\r\n"
7239 "uptime_in_days:%ld\r\n"
ed9b544e 7240 "connected_clients:%d\r\n"
7241 "connected_slaves:%d\r\n"
f86a74e9 7242 "blocked_clients:%d\r\n"
5fba9f71 7243 "used_memory:%zu\r\n"
ec6c7a1d 7244 "used_memory_human:%s\r\n"
ed9b544e 7245 "changes_since_last_save:%lld\r\n"
be2bb6b0 7246 "bgsave_in_progress:%d\r\n"
682ac724 7247 "last_save_time:%ld\r\n"
b3fad521 7248 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 7249 "total_connections_received:%lld\r\n"
7250 "total_commands_processed:%lld\r\n"
2a6a2ed1 7251 "expired_keys:%lld\r\n"
3be2c9d7 7252 "hash_max_zipmap_entries:%zu\r\n"
7253 "hash_max_zipmap_value:%zu\r\n"
ffc6b7f8 7254 "pubsub_channels:%ld\r\n"
7255 "pubsub_patterns:%u\r\n"
7d98e08c 7256 "vm_enabled:%d\r\n"
a0f643ea 7257 "role:%s\r\n"
ed9b544e 7258 ,REDIS_VERSION,
5436146c 7259 REDIS_GIT_SHA1,
274e45e3 7260 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
f1017b3f 7261 (sizeof(long) == 8) ? "64" : "32",
7a932b74 7262 aeGetApiName(),
0d7170a4 7263 (long) getpid(),
a0f643ea 7264 uptime,
7265 uptime/(3600*24),
ed9b544e 7266 listLength(server.clients)-listLength(server.slaves),
7267 listLength(server.slaves),
d5d55fc3 7268 server.blpop_blocked_clients,
b72f6a4b 7269 zmalloc_used_memory(),
ec6c7a1d 7270 hmem,
ed9b544e 7271 server.dirty,
9d65a1bb 7272 server.bgsavechildpid != -1,
ed9b544e 7273 server.lastsave,
b3fad521 7274 server.bgrewritechildpid != -1,
ed9b544e 7275 server.stat_numconnections,
7276 server.stat_numcommands,
2a6a2ed1 7277 server.stat_expiredkeys,
55a8298f 7278 server.hash_max_zipmap_entries,
7279 server.hash_max_zipmap_value,
ffc6b7f8 7280 dictSize(server.pubsub_channels),
7281 listLength(server.pubsub_patterns),
7d98e08c 7282 server.vm_enabled != 0,
a0f643ea 7283 server.masterhost == NULL ? "master" : "slave"
ed9b544e 7284 );
a0f643ea 7285 if (server.masterhost) {
7286 info = sdscatprintf(info,
7287 "master_host:%s\r\n"
7288 "master_port:%d\r\n"
7289 "master_link_status:%s\r\n"
7290 "master_last_io_seconds_ago:%d\r\n"
7291 ,server.masterhost,
7292 server.masterport,
7293 (server.replstate == REDIS_REPL_CONNECTED) ?
7294 "up" : "down",
f72b934d 7295 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 7296 );
7297 }
7d98e08c 7298 if (server.vm_enabled) {
1064ef87 7299 lockThreadedIO();
7d98e08c 7300 info = sdscatprintf(info,
7301 "vm_conf_max_memory:%llu\r\n"
7302 "vm_conf_page_size:%llu\r\n"
7303 "vm_conf_pages:%llu\r\n"
7304 "vm_stats_used_pages:%llu\r\n"
7305 "vm_stats_swapped_objects:%llu\r\n"
7306 "vm_stats_swappin_count:%llu\r\n"
7307 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 7308 "vm_stats_io_newjobs_len:%lu\r\n"
7309 "vm_stats_io_processing_len:%lu\r\n"
7310 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 7311 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 7312 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 7313 ,(unsigned long long) server.vm_max_memory,
7314 (unsigned long long) server.vm_page_size,
7315 (unsigned long long) server.vm_pages,
7316 (unsigned long long) server.vm_stats_used_pages,
7317 (unsigned long long) server.vm_stats_swapped_objects,
7318 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 7319 (unsigned long long) server.vm_stats_swapouts,
7320 (unsigned long) listLength(server.io_newjobs),
7321 (unsigned long) listLength(server.io_processing),
7322 (unsigned long) listLength(server.io_processed),
d5d55fc3 7323 (unsigned long) server.io_active_threads,
7324 (unsigned long) server.vm_blocked_clients
7d98e08c 7325 );
1064ef87 7326 unlockThreadedIO();
7d98e08c 7327 }
c3cb078d 7328 for (j = 0; j < server.dbnum; j++) {
7329 long long keys, vkeys;
7330
7331 keys = dictSize(server.db[j].dict);
7332 vkeys = dictSize(server.db[j].expires);
7333 if (keys || vkeys) {
9d65a1bb 7334 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 7335 j, keys, vkeys);
7336 }
7337 }
1c85b79f 7338 return info;
7339}
7340
7341static void infoCommand(redisClient *c) {
7342 sds info = genRedisInfoString();
83c6a618 7343 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7344 (unsigned long)sdslen(info)));
ed9b544e 7345 addReplySds(c,info);
70003d28 7346 addReply(c,shared.crlf);
ed9b544e 7347}
7348
3305306f 7349static void monitorCommand(redisClient *c) {
7350 /* ignore MONITOR if aleady slave or in monitor mode */
7351 if (c->flags & REDIS_SLAVE) return;
7352
7353 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7354 c->slaveseldb = 0;
6b47e12e 7355 listAddNodeTail(server.monitors,c);
3305306f 7356 addReply(c,shared.ok);
7357}
7358
7359/* ================================= Expire ================================= */
7360static int removeExpire(redisDb *db, robj *key) {
7361 if (dictDelete(db->expires,key) == DICT_OK) {
7362 return 1;
7363 } else {
7364 return 0;
7365 }
7366}
7367
7368static int setExpire(redisDb *db, robj *key, time_t when) {
7369 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7370 return 0;
7371 } else {
7372 incrRefCount(key);
7373 return 1;
7374 }
7375}
7376
bb32ede5 7377/* Return the expire time of the specified key, or -1 if no expire
7378 * is associated with this key (i.e. the key is non volatile) */
7379static time_t getExpire(redisDb *db, robj *key) {
7380 dictEntry *de;
7381
7382 /* No expire? return ASAP */
7383 if (dictSize(db->expires) == 0 ||
7384 (de = dictFind(db->expires,key)) == NULL) return -1;
7385
7386 return (time_t) dictGetEntryVal(de);
7387}
7388
3305306f 7389static int expireIfNeeded(redisDb *db, robj *key) {
7390 time_t when;
7391 dictEntry *de;
7392
7393 /* No expire? return ASAP */
7394 if (dictSize(db->expires) == 0 ||
7395 (de = dictFind(db->expires,key)) == NULL) return 0;
7396
7397 /* Lookup the expire */
7398 when = (time_t) dictGetEntryVal(de);
7399 if (time(NULL) <= when) return 0;
7400
7401 /* Delete the key */
7402 dictDelete(db->expires,key);
2a6a2ed1 7403 server.stat_expiredkeys++;
3305306f 7404 return dictDelete(db->dict,key) == DICT_OK;
7405}
7406
7407static int deleteIfVolatile(redisDb *db, robj *key) {
7408 dictEntry *de;
7409
7410 /* No expire? return ASAP */
7411 if (dictSize(db->expires) == 0 ||
7412 (de = dictFind(db->expires,key)) == NULL) return 0;
7413
7414 /* Delete the key */
0c66a471 7415 server.dirty++;
2a6a2ed1 7416 server.stat_expiredkeys++;
3305306f 7417 dictDelete(db->expires,key);
7418 return dictDelete(db->dict,key) == DICT_OK;
7419}
7420
bbe025e0 7421static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 7422 dictEntry *de;
bbe025e0
AM
7423 time_t seconds;
7424
bd79a6bd 7425 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
bbe025e0
AM
7426
7427 seconds -= offset;
3305306f 7428
802e8373 7429 de = dictFind(c->db->dict,key);
3305306f 7430 if (de == NULL) {
7431 addReply(c,shared.czero);
7432 return;
7433 }
d4dd6556 7434 if (seconds <= 0) {
43e5ccdf 7435 if (deleteKey(c->db,key)) server.dirty++;
7436 addReply(c, shared.cone);
3305306f 7437 return;
7438 } else {
7439 time_t when = time(NULL)+seconds;
802e8373 7440 if (setExpire(c->db,key,when)) {
3305306f 7441 addReply(c,shared.cone);
77423026 7442 server.dirty++;
7443 } else {
3305306f 7444 addReply(c,shared.czero);
77423026 7445 }
3305306f 7446 return;
7447 }
7448}
7449
802e8373 7450static void expireCommand(redisClient *c) {
bbe025e0 7451 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 7452}
7453
7454static void expireatCommand(redisClient *c) {
bbe025e0 7455 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 7456}
7457
fd88489a 7458static void ttlCommand(redisClient *c) {
7459 time_t expire;
7460 int ttl = -1;
7461
7462 expire = getExpire(c->db,c->argv[1]);
7463 if (expire != -1) {
7464 ttl = (int) (expire-time(NULL));
7465 if (ttl < 0) ttl = -1;
7466 }
7467 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7468}
7469
6e469882 7470/* ================================ MULTI/EXEC ============================== */
7471
7472/* Client state initialization for MULTI/EXEC */
7473static void initClientMultiState(redisClient *c) {
7474 c->mstate.commands = NULL;
7475 c->mstate.count = 0;
7476}
7477
7478/* Release all the resources associated with MULTI/EXEC state */
7479static void freeClientMultiState(redisClient *c) {
7480 int j;
7481
7482 for (j = 0; j < c->mstate.count; j++) {
7483 int i;
7484 multiCmd *mc = c->mstate.commands+j;
7485
7486 for (i = 0; i < mc->argc; i++)
7487 decrRefCount(mc->argv[i]);
7488 zfree(mc->argv);
7489 }
7490 zfree(c->mstate.commands);
7491}
7492
7493/* Add a new command into the MULTI commands queue */
7494static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7495 multiCmd *mc;
7496 int j;
7497
7498 c->mstate.commands = zrealloc(c->mstate.commands,
7499 sizeof(multiCmd)*(c->mstate.count+1));
7500 mc = c->mstate.commands+c->mstate.count;
7501 mc->cmd = cmd;
7502 mc->argc = c->argc;
7503 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7504 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7505 for (j = 0; j < c->argc; j++)
7506 incrRefCount(mc->argv[j]);
7507 c->mstate.count++;
7508}
7509
7510static void multiCommand(redisClient *c) {
6531c94d 7511 if (c->flags & REDIS_MULTI) {
7512 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7513 return;
7514 }
6e469882 7515 c->flags |= REDIS_MULTI;
36c548f0 7516 addReply(c,shared.ok);
6e469882 7517}
7518
18b6cb76
DJ
7519static void discardCommand(redisClient *c) {
7520 if (!(c->flags & REDIS_MULTI)) {
7521 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7522 return;
7523 }
7524
7525 freeClientMultiState(c);
7526 initClientMultiState(c);
7527 c->flags &= (~REDIS_MULTI);
7528 addReply(c,shared.ok);
7529}
7530
66c8853f 7531/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7532 * implememntation for more information. */
7533static void execCommandReplicateMulti(redisClient *c) {
7534 struct redisCommand *cmd;
7535 robj *multistring = createStringObject("MULTI",5);
7536
7537 cmd = lookupCommand("multi");
7538 if (server.appendonly)
7539 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7540 if (listLength(server.slaves))
7541 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7542 decrRefCount(multistring);
7543}
7544
6e469882 7545static void execCommand(redisClient *c) {
7546 int j;
7547 robj **orig_argv;
7548 int orig_argc;
7549
7550 if (!(c->flags & REDIS_MULTI)) {
7551 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7552 return;
7553 }
7554
37ab76c9 7555 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7556 * A failed EXEC will return a multi bulk nil object. */
7557 if (c->flags & REDIS_DIRTY_CAS) {
7558 freeClientMultiState(c);
7559 initClientMultiState(c);
7560 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7561 unwatchAllKeys(c);
7562 addReply(c,shared.nullmultibulk);
7563 return;
7564 }
7565
66c8853f 7566 /* Replicate a MULTI request now that we are sure the block is executed.
7567 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7568 * both the AOF and the replication link will have the same consistency
7569 * and atomicity guarantees. */
7570 execCommandReplicateMulti(c);
7571
7572 /* Exec all the queued commands */
1ad4d316 7573 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
6e469882 7574 orig_argv = c->argv;
7575 orig_argc = c->argc;
7576 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7577 for (j = 0; j < c->mstate.count; j++) {
7578 c->argc = c->mstate.commands[j].argc;
7579 c->argv = c->mstate.commands[j].argv;
7580 call(c,c->mstate.commands[j].cmd);
7581 }
7582 c->argv = orig_argv;
7583 c->argc = orig_argc;
7584 freeClientMultiState(c);
7585 initClientMultiState(c);
1ad4d316 7586 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
66c8853f 7587 /* Make sure the EXEC command is always replicated / AOF, since we
7588 * always send the MULTI command (we can't know beforehand if the
7589 * next operations will contain at least a modification to the DB). */
7590 server.dirty++;
6e469882 7591}
7592
4409877e 7593/* =========================== Blocking Operations ========================= */
7594
7595/* Currently Redis blocking operations support is limited to list POP ops,
7596 * so the current implementation is not fully generic, but it is also not
7597 * completely specific so it will not require a rewrite to support new
7598 * kind of blocking operations in the future.
7599 *
7600 * Still it's important to note that list blocking operations can be already
7601 * used as a notification mechanism in order to implement other blocking
7602 * operations at application level, so there must be a very strong evidence
7603 * of usefulness and generality before new blocking operations are implemented.
7604 *
7605 * This is how the current blocking POP works, we use BLPOP as example:
7606 * - If the user calls BLPOP and the key exists and contains a non empty list
7607 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7608 * if there is no need to block.
7609 * - If instead BLPOP is called and the key does not exist or the list is
7610 * empty we need to block. In order to do so we remove the notification for
7611 * new data to read in the client socket (so that we'll not serve new
7612 * requests if the blocking request is not served). Also we put the client
37ab76c9 7613 * in a dictionary (db->blocking_keys) mapping keys to a list of clients
4409877e 7614 * blocking for these keys.
7615 * - If a PUSH operation against a key with blocked clients waiting is
7616 * performed, we serve the first in the list: basically instead to push
7617 * the new element inside the list we return it to the (first / oldest)
7618 * blocking client, unblock the client, and remove it from the list.
7619 *
7620 * The above comment and the source code should be enough in order to understand
7621 * the implementation and modify / fix it later.
7622 */
7623
7624/* Set a client in blocking mode for the specified key, with the specified
7625 * timeout */
b177fd30 7626static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 7627 dictEntry *de;
7628 list *l;
b177fd30 7629 int j;
4409877e 7630
37ab76c9 7631 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7632 c->blocking_keys_num = numkeys;
4409877e 7633 c->blockingto = timeout;
b177fd30 7634 for (j = 0; j < numkeys; j++) {
7635 /* Add the key in the client structure, to map clients -> keys */
37ab76c9 7636 c->blocking_keys[j] = keys[j];
b177fd30 7637 incrRefCount(keys[j]);
4409877e 7638
b177fd30 7639 /* And in the other "side", to map keys -> clients */
37ab76c9 7640 de = dictFind(c->db->blocking_keys,keys[j]);
b177fd30 7641 if (de == NULL) {
7642 int retval;
7643
7644 /* For every key we take a list of clients blocked for it */
7645 l = listCreate();
37ab76c9 7646 retval = dictAdd(c->db->blocking_keys,keys[j],l);
b177fd30 7647 incrRefCount(keys[j]);
7648 assert(retval == DICT_OK);
7649 } else {
7650 l = dictGetEntryVal(de);
7651 }
7652 listAddNodeTail(l,c);
4409877e 7653 }
b177fd30 7654 /* Mark the client as a blocked client */
4409877e 7655 c->flags |= REDIS_BLOCKED;
d5d55fc3 7656 server.blpop_blocked_clients++;
4409877e 7657}
7658
7659/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 7660static void unblockClientWaitingData(redisClient *c) {
4409877e 7661 dictEntry *de;
7662 list *l;
b177fd30 7663 int j;
4409877e 7664
37ab76c9 7665 assert(c->blocking_keys != NULL);
b177fd30 7666 /* The client may wait for multiple keys, so unblock it for every key. */
37ab76c9 7667 for (j = 0; j < c->blocking_keys_num; j++) {
b177fd30 7668 /* Remove this client from the list of clients waiting for this key. */
37ab76c9 7669 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
b177fd30 7670 assert(de != NULL);
7671 l = dictGetEntryVal(de);
7672 listDelNode(l,listSearchKey(l,c));
7673 /* If the list is empty we need to remove it to avoid wasting memory */
7674 if (listLength(l) == 0)
37ab76c9 7675 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
7676 decrRefCount(c->blocking_keys[j]);
b177fd30 7677 }
7678 /* Cleanup the client structure */
37ab76c9 7679 zfree(c->blocking_keys);
7680 c->blocking_keys = NULL;
4409877e 7681 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 7682 server.blpop_blocked_clients--;
5921aa36 7683 /* We want to process data if there is some command waiting
b0d8747d 7684 * in the input buffer. Note that this is safe even if
7685 * unblockClientWaitingData() gets called from freeClient() because
7686 * freeClient() will be smart enough to call this function
7687 * *after* c->querybuf was set to NULL. */
4409877e 7688 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7689}
7690
7691/* This should be called from any function PUSHing into lists.
7692 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7693 * 'ele' is the element pushed.
7694 *
7695 * If the function returns 0 there was no client waiting for a list push
7696 * against this key.
7697 *
7698 * If the function returns 1 there was a client waiting for a list push
7699 * against this key, the element was passed to this client thus it's not
7700 * needed to actually add it to the list and the caller should return asap. */
7701static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7702 struct dictEntry *de;
7703 redisClient *receiver;
7704 list *l;
7705 listNode *ln;
7706
37ab76c9 7707 de = dictFind(c->db->blocking_keys,key);
4409877e 7708 if (de == NULL) return 0;
7709 l = dictGetEntryVal(de);
7710 ln = listFirst(l);
7711 assert(ln != NULL);
7712 receiver = ln->value;
4409877e 7713
b177fd30 7714 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 7715 addReplyBulk(receiver,key);
7716 addReplyBulk(receiver,ele);
b0d8747d 7717 unblockClientWaitingData(receiver);
4409877e 7718 return 1;
7719}
7720
7721/* Blocking RPOP/LPOP */
7722static void blockingPopGenericCommand(redisClient *c, int where) {
7723 robj *o;
7724 time_t timeout;
b177fd30 7725 int j;
4409877e 7726
b177fd30 7727 for (j = 1; j < c->argc-1; j++) {
7728 o = lookupKeyWrite(c->db,c->argv[j]);
7729 if (o != NULL) {
7730 if (o->type != REDIS_LIST) {
7731 addReply(c,shared.wrongtypeerr);
4409877e 7732 return;
b177fd30 7733 } else {
7734 list *list = o->ptr;
7735 if (listLength(list) != 0) {
7736 /* If the list contains elements fall back to the usual
7737 * non-blocking POP operation */
7738 robj *argv[2], **orig_argv;
7739 int orig_argc;
e0a62c7f 7740
b177fd30 7741 /* We need to alter the command arguments before to call
7742 * popGenericCommand() as the command takes a single key. */
7743 orig_argv = c->argv;
7744 orig_argc = c->argc;
7745 argv[1] = c->argv[j];
7746 c->argv = argv;
7747 c->argc = 2;
7748
7749 /* Also the return value is different, we need to output
7750 * the multi bulk reply header and the key name. The
7751 * "real" command will add the last element (the value)
7752 * for us. If this souds like an hack to you it's just
7753 * because it is... */
7754 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 7755 addReplyBulk(c,argv[1]);
b177fd30 7756 popGenericCommand(c,where);
7757
7758 /* Fix the client structure with the original stuff */
7759 c->argv = orig_argv;
7760 c->argc = orig_argc;
7761 return;
7762 }
4409877e 7763 }
7764 }
7765 }
7766 /* If the list is empty or the key does not exists we must block */
b177fd30 7767 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 7768 if (timeout > 0) timeout += time(NULL);
b177fd30 7769 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 7770}
7771
7772static void blpopCommand(redisClient *c) {
7773 blockingPopGenericCommand(c,REDIS_HEAD);
7774}
7775
7776static void brpopCommand(redisClient *c) {
7777 blockingPopGenericCommand(c,REDIS_TAIL);
7778}
7779
ed9b544e 7780/* =============================== Replication ============================= */
7781
a4d1ba9a 7782static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7783 ssize_t nwritten, ret = size;
7784 time_t start = time(NULL);
7785
7786 timeout++;
7787 while(size) {
7788 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7789 nwritten = write(fd,ptr,size);
7790 if (nwritten == -1) return -1;
7791 ptr += nwritten;
7792 size -= nwritten;
7793 }
7794 if ((time(NULL)-start) > timeout) {
7795 errno = ETIMEDOUT;
7796 return -1;
7797 }
7798 }
7799 return ret;
7800}
7801
a4d1ba9a 7802static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7803 ssize_t nread, totread = 0;
7804 time_t start = time(NULL);
7805
7806 timeout++;
7807 while(size) {
7808 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7809 nread = read(fd,ptr,size);
7810 if (nread == -1) return -1;
7811 ptr += nread;
7812 size -= nread;
7813 totread += nread;
7814 }
7815 if ((time(NULL)-start) > timeout) {
7816 errno = ETIMEDOUT;
7817 return -1;
7818 }
7819 }
7820 return totread;
7821}
7822
/* Read a line from 'fd' one byte at a time into 'ptr', a buffer of 'size'
 * bytes. 'timeout' is applied by syncRead() to every single byte.
 *
 * The terminating "\n" (and a preceding "\r", if any) is not stored and
 * the result is always nul terminated. Returns the number of bytes stored
 * before the terminator was seen (a trailing "\r" is counted), or -1 if
 * syncRead() fails.
 *
 * At most size-1 bytes are stored: when the line is longer, reading stops
 * there and the truncated content is returned, leaving the rest of the
 * line unread. The room left is tracked on every stored byte, so a long
 * line can no longer write past the end of the buffer. A non-positive
 * 'size' stores nothing and returns 0. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0;
    *ptr = '\0';
    size--; /* reserve room for the nul terminator */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            size--;
        }
    }
    return nread;
}
7843
7844static void syncCommand(redisClient *c) {
40d224a9 7845 /* ignore SYNC if aleady slave or in monitor mode */
7846 if (c->flags & REDIS_SLAVE) return;
7847
7848 /* SYNC can't be issued when the server has pending data to send to
7849 * the client about already issued commands. We need a fresh reply
7850 * buffer registering the differences between the BGSAVE and the current
7851 * dataset, so that we can copy to other slaves if needed. */
7852 if (listLength(c->reply) != 0) {
7853 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7854 return;
7855 }
7856
7857 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7858 /* Here we need to check if there is a background saving operation
7859 * in progress, or if it is required to start one */
9d65a1bb 7860 if (server.bgsavechildpid != -1) {
40d224a9 7861 /* Ok a background save is in progress. Let's check if it is a good
7862 * one for replication, i.e. if there is another slave that is
7863 * registering differences since the server forked to save */
7864 redisClient *slave;
7865 listNode *ln;
c7df85a4 7866 listIter li;
40d224a9 7867
c7df85a4 7868 listRewind(server.slaves,&li);
7869 while((ln = listNext(&li))) {
40d224a9 7870 slave = ln->value;
7871 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 7872 }
7873 if (ln) {
7874 /* Perfect, the server is already registering differences for
7875 * another slave. Set the right state, and copy the buffer. */
7876 listRelease(c->reply);
7877 c->reply = listDup(slave->reply);
40d224a9 7878 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7879 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7880 } else {
7881 /* No way, we need to wait for the next BGSAVE in order to
7882 * register differences */
7883 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7884 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7885 }
7886 } else {
7887 /* Ok we don't have a BGSAVE in progress, let's start one */
7888 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7889 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7890 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7891 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7892 return;
7893 }
7894 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7895 }
6208b3a7 7896 c->repldbfd = -1;
40d224a9 7897 c->flags |= REDIS_SLAVE;
7898 c->slaveseldb = 0;
6b47e12e 7899 listAddNodeTail(server.slaves,c);
40d224a9 7900 return;
7901}
7902
6208b3a7 7903static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7904 redisClient *slave = privdata;
7905 REDIS_NOTUSED(el);
7906 REDIS_NOTUSED(mask);
7907 char buf[REDIS_IOBUF_LEN];
7908 ssize_t nwritten, buflen;
7909
7910 if (slave->repldboff == 0) {
7911 /* Write the bulk write count before to transfer the DB. In theory here
7912 * we don't know how much room there is in the output buffer of the
7913 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7914 * operations) will never be smaller than the few bytes we need. */
7915 sds bulkcount;
7916
7917 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7918 slave->repldbsize);
7919 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7920 {
7921 sdsfree(bulkcount);
7922 freeClient(slave);
7923 return;
7924 }
7925 sdsfree(bulkcount);
7926 }
7927 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7928 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7929 if (buflen <= 0) {
7930 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7931 (buflen == 0) ? "premature EOF" : strerror(errno));
7932 freeClient(slave);
7933 return;
7934 }
7935 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 7936 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 7937 strerror(errno));
7938 freeClient(slave);
7939 return;
7940 }
7941 slave->repldboff += nwritten;
7942 if (slave->repldboff == slave->repldbsize) {
7943 close(slave->repldbfd);
7944 slave->repldbfd = -1;
7945 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7946 slave->replstate = REDIS_REPL_ONLINE;
7947 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 7948 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 7949 freeClient(slave);
7950 return;
7951 }
7952 addReplySds(slave,sdsempty());
7953 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7954 }
7955}
ed9b544e 7956
a3b21203 7957/* This function is called at the end of every backgrond saving.
7958 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7959 * otherwise REDIS_ERR is passed to the function.
7960 *
7961 * The goal of this function is to handle slaves waiting for a successful
7962 * background saving in order to perform non-blocking synchronization. */
7963static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 7964 listNode *ln;
7965 int startbgsave = 0;
c7df85a4 7966 listIter li;
ed9b544e 7967
c7df85a4 7968 listRewind(server.slaves,&li);
7969 while((ln = listNext(&li))) {
6208b3a7 7970 redisClient *slave = ln->value;
ed9b544e 7971
6208b3a7 7972 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7973 startbgsave = 1;
7974 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7975 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 7976 struct redis_stat buf;
e0a62c7f 7977
6208b3a7 7978 if (bgsaveerr != REDIS_OK) {
7979 freeClient(slave);
7980 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7981 continue;
7982 }
7983 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 7984 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 7985 freeClient(slave);
7986 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7987 continue;
7988 }
7989 slave->repldboff = 0;
7990 slave->repldbsize = buf.st_size;
7991 slave->replstate = REDIS_REPL_SEND_BULK;
7992 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 7993 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 7994 freeClient(slave);
7995 continue;
7996 }
7997 }
ed9b544e 7998 }
6208b3a7 7999 if (startbgsave) {
8000 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 8001 listIter li;
8002
8003 listRewind(server.slaves,&li);
6208b3a7 8004 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 8005 while((ln = listNext(&li))) {
6208b3a7 8006 redisClient *slave = ln->value;
ed9b544e 8007
6208b3a7 8008 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8009 freeClient(slave);
8010 }
8011 }
8012 }
ed9b544e 8013}
8014
8015static int syncWithMaster(void) {
d0ccebcf 8016 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 8017 long dumpsize;
ed9b544e 8018 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 8019 int dfd, maxtries = 5;
ed9b544e 8020
8021 if (fd == -1) {
8022 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8023 strerror(errno));
8024 return REDIS_ERR;
8025 }
d0ccebcf 8026
8027 /* AUTH with the master if required. */
8028 if(server.masterauth) {
8029 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8030 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8031 close(fd);
8032 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8033 strerror(errno));
8034 return REDIS_ERR;
8035 }
8036 /* Read the AUTH result. */
8037 if (syncReadLine(fd,buf,1024,3600) == -1) {
8038 close(fd);
8039 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8040 strerror(errno));
8041 return REDIS_ERR;
8042 }
8043 if (buf[0] != '+') {
8044 close(fd);
8045 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8046 return REDIS_ERR;
8047 }
8048 }
8049
ed9b544e 8050 /* Issue the SYNC command */
8051 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8052 close(fd);
8053 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8054 strerror(errno));
8055 return REDIS_ERR;
8056 }
8057 /* Read the bulk write count */
8c4d91fc 8058 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 8059 close(fd);
8060 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8061 strerror(errno));
8062 return REDIS_ERR;
8063 }
4aa701c1 8064 if (buf[0] != '$') {
8065 close(fd);
8066 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8067 return REDIS_ERR;
8068 }
18e61fa2 8069 dumpsize = strtol(buf+1,NULL,10);
8070 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 8071 /* Read the bulk write data on a temp file */
8c5abee8 8072 while(maxtries--) {
8073 snprintf(tmpfile,256,
8074 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8075 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8076 if (dfd != -1) break;
5de9ad7c 8077 sleep(1);
8c5abee8 8078 }
ed9b544e 8079 if (dfd == -1) {
8080 close(fd);
8081 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8082 return REDIS_ERR;
8083 }
8084 while(dumpsize) {
8085 int nread, nwritten;
8086
8087 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8088 if (nread == -1) {
8089 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8090 strerror(errno));
8091 close(fd);
8092 close(dfd);
8093 return REDIS_ERR;
8094 }
8095 nwritten = write(dfd,buf,nread);
8096 if (nwritten == -1) {
8097 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8098 close(fd);
8099 close(dfd);
8100 return REDIS_ERR;
8101 }
8102 dumpsize -= nread;
8103 }
8104 close(dfd);
8105 if (rename(tmpfile,server.dbfilename) == -1) {
8106 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8107 unlink(tmpfile);
8108 close(fd);
8109 return REDIS_ERR;
8110 }
8111 emptyDb();
f78fd11b 8112 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 8113 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8114 close(fd);
8115 return REDIS_ERR;
8116 }
8117 server.master = createClient(fd);
8118 server.master->flags |= REDIS_MASTER;
179b3952 8119 server.master->authenticated = 1;
ed9b544e 8120 server.replstate = REDIS_REPL_CONNECTED;
8121 return REDIS_OK;
8122}
8123
321b0e13 8124static void slaveofCommand(redisClient *c) {
8125 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8126 !strcasecmp(c->argv[2]->ptr,"one")) {
8127 if (server.masterhost) {
8128 sdsfree(server.masterhost);
8129 server.masterhost = NULL;
8130 if (server.master) freeClient(server.master);
8131 server.replstate = REDIS_REPL_NONE;
8132 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8133 }
8134 } else {
8135 sdsfree(server.masterhost);
8136 server.masterhost = sdsdup(c->argv[1]->ptr);
8137 server.masterport = atoi(c->argv[2]->ptr);
8138 if (server.master) freeClient(server.master);
8139 server.replstate = REDIS_REPL_CONNECT;
8140 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8141 server.masterhost, server.masterport);
8142 }
8143 addReply(c,shared.ok);
8144}
8145
3fd78bcd 8146/* ============================ Maxmemory directive ======================== */
8147
a5819310 8148/* Try to free one object form the pre-allocated objects free list.
8149 * This is useful under low mem conditions as by default we take 1 million
8150 * free objects allocated. On success REDIS_OK is returned, otherwise
8151 * REDIS_ERR. */
8152static int tryFreeOneObjectFromFreelist(void) {
f870935d 8153 robj *o;
8154
a5819310 8155 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8156 if (listLength(server.objfreelist)) {
8157 listNode *head = listFirst(server.objfreelist);
8158 o = listNodeValue(head);
8159 listDelNode(server.objfreelist,head);
8160 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8161 zfree(o);
8162 return REDIS_OK;
8163 } else {
8164 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8165 return REDIS_ERR;
8166 }
f870935d 8167}
8168
3fd78bcd 8169/* This function gets called when 'maxmemory' is set on the config file to limit
8170 * the max memory used by the server, and we are out of memory.
8171 * This function will try to, in order:
8172 *
8173 * - Free objects from the free list
8174 * - Try to remove keys with an EXPIRE set
8175 *
8176 * It is not possible to free enough memory to reach used-memory < maxmemory
8177 * the server will start refusing commands that will enlarge even more the
8178 * memory usage.
8179 */
8180static void freeMemoryIfNeeded(void) {
8181 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 8182 int j, k, freed = 0;
8183
8184 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8185 for (j = 0; j < server.dbnum; j++) {
8186 int minttl = -1;
8187 robj *minkey = NULL;
8188 struct dictEntry *de;
8189
8190 if (dictSize(server.db[j].expires)) {
8191 freed = 1;
8192 /* From a sample of three keys drop the one nearest to
8193 * the natural expire */
8194 for (k = 0; k < 3; k++) {
8195 time_t t;
8196
8197 de = dictGetRandomKey(server.db[j].expires);
8198 t = (time_t) dictGetEntryVal(de);
8199 if (minttl == -1 || t < minttl) {
8200 minkey = dictGetEntryKey(de);
8201 minttl = t;
3fd78bcd 8202 }
3fd78bcd 8203 }
a5819310 8204 deleteKey(server.db+j,minkey);
3fd78bcd 8205 }
3fd78bcd 8206 }
a5819310 8207 if (!freed) return; /* nothing to free... */
3fd78bcd 8208 }
8209}
8210
f80dff62 8211/* ============================== Append Only file ========================== */
8212
28ed1f33 8213/* Write the append only file buffer on disk.
8214 *
8215 * Since we are required to write the AOF before replying to the client,
8216 * and the only way the client socket can get a write is entering when the
8217 * the event loop, we accumulate all the AOF writes in a memory
8218 * buffer and write it on disk using this function just before entering
8219 * the event loop again. */
8220static void flushAppendOnlyFile(void) {
8221 time_t now;
8222 ssize_t nwritten;
8223
8224 if (sdslen(server.aofbuf) == 0) return;
8225
8226 /* We want to perform a single write. This should be guaranteed atomic
8227 * at least if the filesystem we are writing is a real physical one.
8228 * While this will save us against the server being killed I don't think
8229 * there is much to do about the whole server stopping for power problems
8230 * or alike */
8231 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8232 if (nwritten != (signed)sdslen(server.aofbuf)) {
8233 /* Ooops, we are in troubles. The best thing to do for now is
8234 * aborting instead of giving the illusion that everything is
8235 * working as expected. */
8236 if (nwritten == -1) {
8237 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8238 } else {
8239 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8240 }
8241 exit(1);
8242 }
8243 sdsfree(server.aofbuf);
8244 server.aofbuf = sdsempty();
8245
38db9171 8246 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8247 * childs performing heavy I/O on disk. */
8248 if (server.no_appendfsync_on_rewrite &&
8249 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8250 return;
28ed1f33 8251 /* Fsync if needed */
8252 now = time(NULL);
8253 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8254 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8255 now-server.lastfsync > 1))
8256 {
8257 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8258 * flushing metadata. */
8259 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8260 server.lastfsync = now;
8261 }
8262}
8263
9376e434
PN
8264static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8265 int j;
8266 buf = sdscatprintf(buf,"*%d\r\n",argc);
8267 for (j = 0; j < argc; j++) {
8268 robj *o = getDecodedObject(argv[j]);
8269 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8270 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8271 buf = sdscatlen(buf,"\r\n",2);
8272 decrRefCount(o);
8273 }
8274 return buf;
8275}
8276
8277static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8278 int argc = 3;
8279 long when;
8280 robj *argv[3];
8281
8282 /* Make sure we can use strtol */
8283 seconds = getDecodedObject(seconds);
8284 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8285 decrRefCount(seconds);
8286
8287 argv[0] = createStringObject("EXPIREAT",8);
8288 argv[1] = key;
8289 argv[2] = createObject(REDIS_STRING,
8290 sdscatprintf(sdsempty(),"%ld",when));
8291 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8292 decrRefCount(argv[0]);
8293 decrRefCount(argv[2]);
8294 return buf;
8295}
8296
f80dff62 8297static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8298 sds buf = sdsempty();
f80dff62 8299 robj *tmpargv[3];
8300
8301 /* The DB this command was targetting is not the same as the last command
8302 * we appendend. To issue a SELECT command is needed. */
8303 if (dictid != server.appendseldb) {
8304 char seldb[64];
8305
8306 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 8307 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 8308 (unsigned long)strlen(seldb),seldb);
f80dff62 8309 server.appendseldb = dictid;
8310 }
8311
f80dff62 8312 if (cmd->proc == expireCommand) {
9376e434
PN
8313 /* Translate EXPIRE into EXPIREAT */
8314 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8315 } else if (cmd->proc == setexCommand) {
8316 /* Translate SETEX to SET and EXPIREAT */
8317 tmpargv[0] = createStringObject("SET",3);
f80dff62 8318 tmpargv[1] = argv[1];
9376e434
PN
8319 tmpargv[2] = argv[3];
8320 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8321 decrRefCount(tmpargv[0]);
8322 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8323 } else {
8324 buf = catAppendOnlyGenericCommand(buf,argc,argv);
f80dff62 8325 }
8326
28ed1f33 8327 /* Append to the AOF buffer. This will be flushed on disk just before
8328 * of re-entering the event loop, so before the client will get a
8329 * positive reply about the operation performed. */
8330 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8331
85a83172 8332 /* If a background append only file rewriting is in progress we want to
8333 * accumulate the differences between the child DB and the current one
8334 * in a buffer, so that when the child process will do its work we
8335 * can append the differences to the new append only file. */
8336 if (server.bgrewritechildpid != -1)
8337 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8338
8339 sdsfree(buf);
f80dff62 8340}
8341
8342/* In Redis commands are always executed in the context of a client, so in
8343 * order to load the append only file we need to create a fake client. */
8344static struct redisClient *createFakeClient(void) {
8345 struct redisClient *c = zmalloc(sizeof(*c));
8346
8347 selectDb(c,0);
8348 c->fd = -1;
8349 c->querybuf = sdsempty();
8350 c->argc = 0;
8351 c->argv = NULL;
8352 c->flags = 0;
9387d17d 8353 /* We set the fake client as a slave waiting for the synchronization
8354 * so that Redis will not try to send replies to this client. */
8355 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 8356 c->reply = listCreate();
8357 listSetFreeMethod(c->reply,decrRefCount);
8358 listSetDupMethod(c->reply,dupClientReplyValue);
4132ad8d 8359 initClientMultiState(c);
f80dff62 8360 return c;
8361}
8362
8363static void freeFakeClient(struct redisClient *c) {
8364 sdsfree(c->querybuf);
8365 listRelease(c->reply);
4132ad8d 8366 freeClientMultiState(c);
f80dff62 8367 zfree(c);
8368}
8369
8370/* Replay the append log file. On error REDIS_OK is returned. On non fatal
8371 * error (the append only file is zero-length) REDIS_ERR is returned. On
8372 * fatal error an error message is logged and the program exists. */
8373int loadAppendOnlyFile(char *filename) {
8374 struct redisClient *fakeClient;
8375 FILE *fp = fopen(filename,"r");
8376 struct redis_stat sb;
b492cf00 8377 unsigned long long loadedkeys = 0;
4132ad8d 8378 int appendonly = server.appendonly;
f80dff62 8379
8380 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8381 return REDIS_ERR;
8382
8383 if (fp == NULL) {
8384 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8385 exit(1);
8386 }
8387
4132ad8d
PN
8388 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8389 * to the same file we're about to read. */
8390 server.appendonly = 0;
8391
f80dff62 8392 fakeClient = createFakeClient();
8393 while(1) {
8394 int argc, j;
8395 unsigned long len;
8396 robj **argv;
8397 char buf[128];
8398 sds argsds;
8399 struct redisCommand *cmd;
8400
8401 if (fgets(buf,sizeof(buf),fp) == NULL) {
8402 if (feof(fp))
8403 break;
8404 else
8405 goto readerr;
8406 }
8407 if (buf[0] != '*') goto fmterr;
8408 argc = atoi(buf+1);
8409 argv = zmalloc(sizeof(robj*)*argc);
8410 for (j = 0; j < argc; j++) {
8411 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8412 if (buf[0] != '$') goto fmterr;
8413 len = strtol(buf+1,NULL,10);
8414 argsds = sdsnewlen(NULL,len);
0f151ef1 8415 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 8416 argv[j] = createObject(REDIS_STRING,argsds);
8417 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8418 }
8419
8420 /* Command lookup */
8421 cmd = lookupCommand(argv[0]->ptr);
8422 if (!cmd) {
8423 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8424 exit(1);
8425 }
bdcb92f2 8426 /* Try object encoding */
f80dff62 8427 if (cmd->flags & REDIS_CMD_BULK)
05df7621 8428 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 8429 /* Run the command in the context of a fake client */
8430 fakeClient->argc = argc;
8431 fakeClient->argv = argv;
8432 cmd->proc(fakeClient);
8433 /* Discard the reply objects list from the fake client */
8434 while(listLength(fakeClient->reply))
8435 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8436 /* Clean up, ready for the next command */
8437 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8438 zfree(argv);
b492cf00 8439 /* Handle swapping while loading big datasets when VM is on */
8440 loadedkeys++;
8441 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8442 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 8443 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 8444 }
8445 }
f80dff62 8446 }
4132ad8d
PN
8447
8448 /* This point can only be reached when EOF is reached without errors.
8449 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8450 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8451
f80dff62 8452 fclose(fp);
8453 freeFakeClient(fakeClient);
4132ad8d 8454 server.appendonly = appendonly;
f80dff62 8455 return REDIS_OK;
8456
8457readerr:
8458 if (feof(fp)) {
8459 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8460 } else {
8461 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8462 }
8463 exit(1);
8464fmterr:
8465 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8466 exit(1);
8467}
8468
9d65a1bb 8469/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
9c8e3cee 8470static int fwriteBulkObject(FILE *fp, robj *obj) {
9d65a1bb 8471 char buf[128];
b9bc0eef 8472 int decrrc = 0;
8473
f2d9f50f 8474 /* Avoid the incr/decr ref count business if possible to help
8475 * copy-on-write (we are often in a child process when this function
8476 * is called).
8477 * Also makes sure that key objects don't get incrRefCount-ed when VM
8478 * is enabled */
8479 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 8480 obj = getDecodedObject(obj);
8481 decrrc = 1;
8482 }
9d65a1bb 8483 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8484 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 8485 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8486 goto err;
9d65a1bb 8487 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 8488 if (decrrc) decrRefCount(obj);
9d65a1bb 8489 return 1;
8490err:
b9bc0eef 8491 if (decrrc) decrRefCount(obj);
9d65a1bb 8492 return 0;
8493}
8494
/* Write the binary-safe string 's' of 'len' bytes into a file in the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* The length is unsigned: print it with the matching %lu conversion. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() returns 0 for a zero-sized item even on success, so the
     * payload is written only when it is not empty. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8506
/* Write the double 'd', formatted with "%.17g", in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t plen;

    /* The payload is built with its trailing CRLF already in place, so the
     * announced count is the payload length minus those two bytes. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,plen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8517
/* Write the long 'l', in its decimal representation, in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t plen;

    /* The payload is built with its trailing CRLF already in place, so the
     * announced count is the payload length minus those two bytes. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,plen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8528
8529/* Write a sequence of commands able to fully rebuild the dataset into
8530 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8531static int rewriteAppendOnlyFile(char *filename) {
8532 dictIterator *di = NULL;
8533 dictEntry *de;
8534 FILE *fp;
8535 char tmpfile[256];
8536 int j;
8537 time_t now = time(NULL);
8538
8539 /* Note that we have to use a different temp name here compared to the
8540 * one used by rewriteAppendOnlyFileBackground() function. */
8541 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8542 fp = fopen(tmpfile,"w");
8543 if (!fp) {
8544 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8545 return REDIS_ERR;
8546 }
8547 for (j = 0; j < server.dbnum; j++) {
8548 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8549 redisDb *db = server.db+j;
8550 dict *d = db->dict;
8551 if (dictSize(d) == 0) continue;
8552 di = dictGetIterator(d);
8553 if (!di) {
8554 fclose(fp);
8555 return REDIS_ERR;
8556 }
8557
8558 /* SELECT the new DB */
8559 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 8560 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 8561
8562 /* Iterate this DB writing every entry */
8563 while((de = dictNext(di)) != NULL) {
e7546c63 8564 robj *key, *o;
8565 time_t expiretime;
8566 int swapped;
8567
8568 key = dictGetEntryKey(de);
b9bc0eef 8569 /* If the value for this key is swapped, load a preview in memory.
8570 * We use a "swapped" flag to remember if we need to free the
8571 * value object instead to just increment the ref count anyway
8572 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 8573 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8574 key->storage == REDIS_VM_SWAPPING) {
e7546c63 8575 o = dictGetEntryVal(de);
8576 swapped = 0;
8577 } else {
8578 o = vmPreviewObject(key);
e7546c63 8579 swapped = 1;
8580 }
8581 expiretime = getExpire(db,key);
9d65a1bb 8582
8583 /* Save the key and associated value */
9d65a1bb 8584 if (o->type == REDIS_STRING) {
8585 /* Emit a SET command */
8586 char cmd[]="*3\r\n$3\r\nSET\r\n";
8587 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8588 /* Key and value */
9c8e3cee 8589 if (fwriteBulkObject(fp,key) == 0) goto werr;
8590 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 8591 } else if (o->type == REDIS_LIST) {
8592 /* Emit the RPUSHes needed to rebuild the list */
8593 list *list = o->ptr;
8594 listNode *ln;
c7df85a4 8595 listIter li;
9d65a1bb 8596
c7df85a4 8597 listRewind(list,&li);
8598 while((ln = listNext(&li))) {
9d65a1bb 8599 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8600 robj *eleobj = listNodeValue(ln);
8601
8602 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8603 if (fwriteBulkObject(fp,key) == 0) goto werr;
8604 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8605 }
8606 } else if (o->type == REDIS_SET) {
8607 /* Emit the SADDs needed to rebuild the set */
8608 dict *set = o->ptr;
8609 dictIterator *di = dictGetIterator(set);
8610 dictEntry *de;
8611
8612 while((de = dictNext(di)) != NULL) {
8613 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8614 robj *eleobj = dictGetEntryKey(de);
8615
8616 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8617 if (fwriteBulkObject(fp,key) == 0) goto werr;
8618 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8619 }
8620 dictReleaseIterator(di);
8621 } else if (o->type == REDIS_ZSET) {
8622 /* Emit the ZADDs needed to rebuild the sorted set */
8623 zset *zs = o->ptr;
8624 dictIterator *di = dictGetIterator(zs->dict);
8625 dictEntry *de;
8626
8627 while((de = dictNext(di)) != NULL) {
8628 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8629 robj *eleobj = dictGetEntryKey(de);
8630 double *score = dictGetEntryVal(de);
8631
8632 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8633 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8634 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 8635 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8636 }
8637 dictReleaseIterator(di);
9c8e3cee 8638 } else if (o->type == REDIS_HASH) {
8639 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8640
8641 /* Emit the HSETs needed to rebuild the hash */
8642 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8643 unsigned char *p = zipmapRewind(o->ptr);
8644 unsigned char *field, *val;
8645 unsigned int flen, vlen;
8646
8647 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8648 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8649 if (fwriteBulkObject(fp,key) == 0) goto werr;
8650 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8651 return -1;
8652 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8653 return -1;
8654 }
8655 } else {
8656 dictIterator *di = dictGetIterator(o->ptr);
8657 dictEntry *de;
8658
8659 while((de = dictNext(di)) != NULL) {
8660 robj *field = dictGetEntryKey(de);
8661 robj *val = dictGetEntryVal(de);
8662
8663 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8664 if (fwriteBulkObject(fp,key) == 0) goto werr;
8665 if (fwriteBulkObject(fp,field) == -1) return -1;
8666 if (fwriteBulkObject(fp,val) == -1) return -1;
8667 }
8668 dictReleaseIterator(di);
8669 }
9d65a1bb 8670 } else {
f83c6cb5 8671 redisPanic("Unknown object type");
9d65a1bb 8672 }
8673 /* Save the expire time */
8674 if (expiretime != -1) {
e96e4fbf 8675 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 8676 /* If this key is already expired skip it */
8677 if (expiretime < now) continue;
8678 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8679 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8680 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8681 }
b9bc0eef 8682 if (swapped) decrRefCount(o);
9d65a1bb 8683 }
8684 dictReleaseIterator(di);
8685 }
8686
8687 /* Make sure data will not remain on the OS's output buffers */
8688 fflush(fp);
b0bd87f6 8689 aof_fsync(fileno(fp));
9d65a1bb 8690 fclose(fp);
e0a62c7f 8691
9d65a1bb 8692 /* Use RENAME to make sure the DB file is changed atomically only
8693 * if the generate DB file is ok. */
8694 if (rename(tmpfile,filename) == -1) {
8695 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8696 unlink(tmpfile);
8697 return REDIS_ERR;
8698 }
8699 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8700 return REDIS_OK;
8701
8702werr:
8703 fclose(fp);
8704 unlink(tmpfile);
e96e4fbf 8705 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 8706 if (di) dictReleaseIterator(di);
8707 return REDIS_ERR;
8708}
8709
8710/* This is how rewriting of the append only file in background works:
8711 *
8712 * 1) The user calls BGREWRITEAOF
8713 * 2) Redis calls this function, that forks():
8714 * 2a) the child rewrite the append only file in a temp file.
8715 * 2b) the parent accumulates differences in server.bgrewritebuf.
8716 * 3) When the child finished '2a' exists.
8717 * 4) The parent will trap the exit code, if it's OK, will append the
8718 * data accumulated into server.bgrewritebuf into the temp file, and
8719 * finally will rename(2) the temp file in the actual file name.
8720 * The the new file is reopened as the new append only file. Profit!
8721 */
8722static int rewriteAppendOnlyFileBackground(void) {
8723 pid_t childpid;
8724
8725 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 8726 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 8727 if ((childpid = fork()) == 0) {
8728 /* Child */
8729 char tmpfile[256];
9d65a1bb 8730
054e426d 8731 if (server.vm_enabled) vmReopenSwapFile();
8732 close(server.fd);
9d65a1bb 8733 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8734 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 8735 _exit(0);
9d65a1bb 8736 } else {
478c2c6f 8737 _exit(1);
9d65a1bb 8738 }
8739 } else {
8740 /* Parent */
8741 if (childpid == -1) {
8742 redisLog(REDIS_WARNING,
8743 "Can't rewrite append only file in background: fork: %s",
8744 strerror(errno));
8745 return REDIS_ERR;
8746 }
8747 redisLog(REDIS_NOTICE,
8748 "Background append only file rewriting started by pid %d",childpid);
8749 server.bgrewritechildpid = childpid;
884d4b39 8750 updateDictResizePolicy();
85a83172 8751 /* We set appendseldb to -1 in order to force the next call to the
8752 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8753 * accumulated by the parent into server.bgrewritebuf will start
8754 * with a SELECT statement and it will be safe to merge. */
8755 server.appendseldb = -1;
9d65a1bb 8756 return REDIS_OK;
8757 }
8758 return REDIS_OK; /* unreached */
8759}
8760
8761static void bgrewriteaofCommand(redisClient *c) {
8762 if (server.bgrewritechildpid != -1) {
8763 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8764 return;
8765 }
8766 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 8767 char *status = "+Background append only file rewriting started\r\n";
8768 addReplySds(c,sdsnew(status));
9d65a1bb 8769 } else {
8770 addReply(c,shared.err);
8771 }
8772}
8773
/* Unlink the temp file named after 'childpid', using the same
 * "temp-rewriteaof-bg-<pid>.aof" pattern the rewrite child uses. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];
    int pid = (int) childpid;

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",pid);
    unlink(path);
}
8780
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make it clearer which functionality belongs to the blocking VM and which
 * to the threaded (non-blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * non-blocking by trying to examine the command argument vector in order to
 * load in background the values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
8801
2e5eb04e 8802/* Called when the user switches from "appendonly yes" to "appendonly no"
8803 * at runtime using the CONFIG command. */
8804static void stopAppendOnly(void) {
8805 flushAppendOnlyFile();
b0bd87f6 8806 aof_fsync(server.appendfd);
2e5eb04e 8807 close(server.appendfd);
8808
8809 server.appendfd = -1;
8810 server.appendseldb = -1;
8811 server.appendonly = 0;
8812 /* rewrite operation in progress? kill it, wait child exit */
8813 if (server.bgsavechildpid != -1) {
8814 int statloc;
8815
30dd89b6 8816 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8817 wait3(&statloc,0,NULL);
2e5eb04e 8818 /* reset the buffer accumulating changes while the child saves */
8819 sdsfree(server.bgrewritebuf);
8820 server.bgrewritebuf = sdsempty();
30dd89b6 8821 server.bgsavechildpid = -1;
2e5eb04e 8822 }
8823}
8824
8825/* Called when the user switches from "appendonly no" to "appendonly yes"
8826 * at runtime using the CONFIG command. */
8827static int startAppendOnly(void) {
8828 server.appendonly = 1;
8829 server.lastfsync = time(NULL);
8830 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8831 if (server.appendfd == -1) {
8832 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8833 return REDIS_ERR;
8834 }
8835 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8836 server.appendonly = 0;
8837 close(server.appendfd);
8838 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8839 return REDIS_ERR;
8840 }
8841 return REDIS_OK;
8842}
8843
996cb5f7 8844/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 8845
75680a3c 8846static void vmInit(void) {
8847 off_t totsize;
996cb5f7 8848 int pipefds[2];
bcaa7a4f 8849 size_t stacksize;
8b5bb414 8850 struct flock fl;
75680a3c 8851
4ad37480 8852 if (server.vm_max_threads != 0)
8853 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8854
054e426d 8855 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8b5bb414 8856 /* Try to open the old swap file, otherwise create it */
6fa987e3 8857 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8858 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8859 }
75680a3c 8860 if (server.vm_fp == NULL) {
6fa987e3 8861 redisLog(REDIS_WARNING,
8b5bb414 8862 "Can't open the swap file: %s. Exiting.",
6fa987e3 8863 strerror(errno));
75680a3c 8864 exit(1);
8865 }
8866 server.vm_fd = fileno(server.vm_fp);
8b5bb414 8867 /* Lock the swap file for writing, this is useful in order to avoid
8868 * another instance to use the same swap file for a config error. */
8869 fl.l_type = F_WRLCK;
8870 fl.l_whence = SEEK_SET;
8871 fl.l_start = fl.l_len = 0;
8872 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8873 redisLog(REDIS_WARNING,
8874 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8875 exit(1);
8876 }
8877 /* Initialize */
75680a3c 8878 server.vm_next_page = 0;
8879 server.vm_near_pages = 0;
7d98e08c 8880 server.vm_stats_used_pages = 0;
8881 server.vm_stats_swapped_objects = 0;
8882 server.vm_stats_swapouts = 0;
8883 server.vm_stats_swapins = 0;
75680a3c 8884 totsize = server.vm_pages*server.vm_page_size;
8885 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8886 if (ftruncate(server.vm_fd,totsize) == -1) {
8887 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8888 strerror(errno));
8889 exit(1);
8890 } else {
8891 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8892 }
7d30035d 8893 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 8894 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 8895 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 8896 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 8897
996cb5f7 8898 /* Initialize threaded I/O (used by Virtual Memory) */
8899 server.io_newjobs = listCreate();
8900 server.io_processing = listCreate();
8901 server.io_processed = listCreate();
d5d55fc3 8902 server.io_ready_clients = listCreate();
92f8e882 8903 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 8904 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8905 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 8906 server.io_active_threads = 0;
996cb5f7 8907 if (pipe(pipefds) == -1) {
8908 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8909 ,strerror(errno));
8910 exit(1);
8911 }
8912 server.io_ready_pipe_read = pipefds[0];
8913 server.io_ready_pipe_write = pipefds[1];
8914 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 8915 /* LZF requires a lot of stack */
8916 pthread_attr_init(&server.io_threads_attr);
8917 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8918 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8919 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 8920 /* Listen for events in the threaded I/O pipe */
8921 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8922 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8923 oom("creating file event");
75680a3c 8924}
8925
06224fec 8926/* Mark the page as used */
8927static void vmMarkPageUsed(off_t page) {
8928 off_t byte = page/8;
8929 int bit = page&7;
970e10bb 8930 redisAssert(vmFreePage(page) == 1);
06224fec 8931 server.vm_bitmap[byte] |= 1<<bit;
8932}
8933
8934/* Mark N contiguous pages as used, with 'page' being the first. */
8935static void vmMarkPagesUsed(off_t page, off_t count) {
8936 off_t j;
8937
8938 for (j = 0; j < count; j++)
7d30035d 8939 vmMarkPageUsed(page+j);
7d98e08c 8940 server.vm_stats_used_pages += count;
7c775e09 8941 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8942 (long long)count, (long long)page);
06224fec 8943}
8944
8945/* Mark the page as free */
8946static void vmMarkPageFree(off_t page) {
8947 off_t byte = page/8;
8948 int bit = page&7;
970e10bb 8949 redisAssert(vmFreePage(page) == 0);
06224fec 8950 server.vm_bitmap[byte] &= ~(1<<bit);
8951}
8952
8953/* Mark N contiguous pages as free, with 'page' being the first. */
8954static void vmMarkPagesFree(off_t page, off_t count) {
8955 off_t j;
8956
8957 for (j = 0; j < count; j++)
7d30035d 8958 vmMarkPageFree(page+j);
7d98e08c 8959 server.vm_stats_used_pages -= count;
7c775e09 8960 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8961 (long long)count, (long long)page);
06224fec 8962}
8963
8964/* Test if the page is free */
8965static int vmFreePage(off_t page) {
8966 off_t byte = page/8;
8967 int bit = page&7;
7d30035d 8968 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 8969}
8970
8971/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 8972 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 8973 * REDIS_ERR is returned.
06224fec 8974 *
8975 * This function uses a simple algorithm: we try to allocate
8976 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8977 * again from the start of the swap file searching for free spaces.
8978 *
8979 * If it looks pretty clear that there are no free pages near our offset
8980 * we try to find less populated places doing a forward jump of
8981 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8982 * without hurry, and then we jump again and so forth...
e0a62c7f 8983 *
06224fec 8984 * This function can be improved using a free list to avoid to guess
8985 * too much, since we could collect data about freed pages.
8986 *
8987 * note: I implemented this function just after watching an episode of
8988 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8989 */
c7df85a4 8990static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 8991 off_t base, offset = 0, since_jump = 0, numfree = 0;
8992
8993 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8994 server.vm_near_pages = 0;
8995 server.vm_next_page = 0;
8996 }
8997 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8998 base = server.vm_next_page;
8999
9000 while(offset < server.vm_pages) {
9001 off_t this = base+offset;
9002
9003 /* If we overflow, restart from page zero */
9004 if (this >= server.vm_pages) {
9005 this -= server.vm_pages;
9006 if (this == 0) {
9007 /* Just overflowed, what we found on tail is no longer
9008 * interesting, as it's no longer contiguous. */
9009 numfree = 0;
9010 }
9011 }
9012 if (vmFreePage(this)) {
9013 /* This is a free page */
9014 numfree++;
9015 /* Already got N free pages? Return to the caller, with success */
9016 if (numfree == n) {
7d30035d 9017 *first = this-(n-1);
9018 server.vm_next_page = this+1;
7c775e09 9019 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 9020 return REDIS_OK;
06224fec 9021 }
9022 } else {
9023 /* The current one is not a free page */
9024 numfree = 0;
9025 }
9026
9027 /* Fast-forward if the current page is not free and we already
9028 * searched enough near this place. */
9029 since_jump++;
9030 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9031 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9032 since_jump = 0;
9033 /* Note that even if we rewind after the jump, we are don't need
9034 * to make sure numfree is set to zero as we only jump *if* it
9035 * is set to zero. */
9036 } else {
9037 /* Otherwise just check the next page */
9038 offset++;
9039 }
9040 }
3a66edc7 9041 return REDIS_ERR;
9042}
9043
a5819310 9044/* Write the specified object at the specified page of the swap file */
9045static int vmWriteObjectOnSwap(robj *o, off_t page) {
9046 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9047 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9048 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9049 redisLog(REDIS_WARNING,
9ebed7cf 9050 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 9051 strerror(errno));
9052 return REDIS_ERR;
9053 }
9054 rdbSaveObject(server.vm_fp,o);
ba76a8f9 9055 fflush(server.vm_fp);
a5819310 9056 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9057 return REDIS_OK;
9058}
9059
3a66edc7 9060/* Swap the 'val' object relative to 'key' into disk. Store all the information
9061 * needed to later retrieve the object into the key object.
9062 * If we can't find enough contiguous empty pages to swap the object on disk
9063 * REDIS_ERR is returned. */
a69a0c9c 9064static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 9065 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 9066 off_t page;
9067
9068 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 9069 assert(key->refcount == 1);
3a66edc7 9070 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 9071 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 9072 key->vm.page = page;
9073 key->vm.usedpages = pages;
9074 key->storage = REDIS_VM_SWAPPED;
d894161b 9075 key->vtype = val->type;
3a66edc7 9076 decrRefCount(val); /* Deallocate the object from memory. */
9077 vmMarkPagesUsed(page,pages);
7d30035d 9078 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
9079 (unsigned char*) key->ptr,
9080 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 9081 server.vm_stats_swapped_objects++;
9082 server.vm_stats_swapouts++;
3a66edc7 9083 return REDIS_OK;
9084}
9085
a5819310 9086static robj *vmReadObjectFromSwap(off_t page, int type) {
9087 robj *o;
3a66edc7 9088
a5819310 9089 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9090 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 9091 redisLog(REDIS_WARNING,
d5d55fc3 9092 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 9093 strerror(errno));
478c2c6f 9094 _exit(1);
3a66edc7 9095 }
a5819310 9096 o = rdbLoadObject(type,server.vm_fp);
9097 if (o == NULL) {
d5d55fc3 9098 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 9099 _exit(1);
3a66edc7 9100 }
a5819310 9101 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9102 return o;
9103}
9104
9105/* Load the value object relative to the 'key' object from swap to memory.
9106 * The newly allocated object is returned.
9107 *
9108 * If preview is true the unserialized object is returned to the caller but
9109 * no changes are made to the key object, nor the pages are marked as freed */
9110static robj *vmGenericLoadObject(robj *key, int preview) {
9111 robj *val;
9112
d5d55fc3 9113 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 9114 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 9115 if (!preview) {
9116 key->storage = REDIS_VM_MEMORY;
9117 key->vm.atime = server.unixtime;
9118 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9119 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
9120 (unsigned char*) key->ptr);
7d98e08c 9121 server.vm_stats_swapped_objects--;
38aba9a1 9122 } else {
9123 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
9124 (unsigned char*) key->ptr);
7e69548d 9125 }
7d98e08c 9126 server.vm_stats_swapins++;
3a66edc7 9127 return val;
06224fec 9128}
9129
7e69548d 9130/* Plain object loading, from swap to memory */
9131static robj *vmLoadObject(robj *key) {
996cb5f7 9132 /* If we are loading the object in background, stop it, we
9133 * need to load this object synchronously ASAP. */
9134 if (key->storage == REDIS_VM_LOADING)
9135 vmCancelThreadedIOJob(key);
7e69548d 9136 return vmGenericLoadObject(key,0);
9137}
9138
9139/* Just load the value on disk, without to modify the key.
9140 * This is useful when we want to perform some operation on the value
9141 * without to really bring it from swap to memory, like while saving the
9142 * dataset or rewriting the append only log. */
9143static robj *vmPreviewObject(robj *key) {
9144 return vmGenericLoadObject(key,1);
9145}
9146
4ef8de8a 9147/* How a good candidate is this object for swapping?
9148 * The better candidate it is, the greater the returned value.
9149 *
9150 * Currently we try to perform a fast estimation of the object size in
9151 * memory, and combine it with aging informations.
9152 *
9153 * Basically swappability = idle-time * log(estimated size)
9154 *
9155 * Bigger objects are preferred over smaller objects, but not
9156 * proportionally, this is why we use the logarithm. This algorithm is
9157 * just a first try and will probably be tuned later. */
9158static double computeObjectSwappability(robj *o) {
9159 time_t age = server.unixtime - o->vm.atime;
9160 long asize = 0;
9161 list *l;
9162 dict *d;
9163 struct dictEntry *de;
9164 int z;
9165
9166 if (age <= 0) return 0;
9167 switch(o->type) {
9168 case REDIS_STRING:
9169 if (o->encoding != REDIS_ENCODING_RAW) {
9170 asize = sizeof(*o);
9171 } else {
9172 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9173 }
9174 break;
9175 case REDIS_LIST:
9176 l = o->ptr;
9177 listNode *ln = listFirst(l);
9178
9179 asize = sizeof(list);
9180 if (ln) {
9181 robj *ele = ln->value;
9182 long elesize;
9183
9184 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9185 (sizeof(*o)+sdslen(ele->ptr)) :
9186 sizeof(*o);
9187 asize += (sizeof(listNode)+elesize)*listLength(l);
9188 }
9189 break;
9190 case REDIS_SET:
9191 case REDIS_ZSET:
9192 z = (o->type == REDIS_ZSET);
9193 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9194
9195 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9196 if (z) asize += sizeof(zset)-sizeof(dict);
9197 if (dictSize(d)) {
9198 long elesize;
9199 robj *ele;
9200
9201 de = dictGetRandomKey(d);
9202 ele = dictGetEntryKey(de);
9203 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9204 (sizeof(*o)+sdslen(ele->ptr)) :
9205 sizeof(*o);
9206 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9207 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9208 }
9209 break;
a97b9060 9210 case REDIS_HASH:
9211 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9212 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9213 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9214 unsigned int klen, vlen;
9215 unsigned char *key, *val;
9216
9217 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9218 klen = 0;
9219 vlen = 0;
9220 }
9221 asize = len*(klen+vlen+3);
9222 } else if (o->encoding == REDIS_ENCODING_HT) {
9223 d = o->ptr;
9224 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9225 if (dictSize(d)) {
9226 long elesize;
9227 robj *ele;
9228
9229 de = dictGetRandomKey(d);
9230 ele = dictGetEntryKey(de);
9231 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9232 (sizeof(*o)+sdslen(ele->ptr)) :
9233 sizeof(*o);
9234 ele = dictGetEntryVal(de);
9235 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9236 (sizeof(*o)+sdslen(ele->ptr)) :
9237 sizeof(*o);
9238 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9239 }
9240 }
9241 break;
4ef8de8a 9242 }
c8c72447 9243 return (double)age*log(1+asize);
4ef8de8a 9244}
9245
9246/* Try to swap an object that's a good candidate for swapping.
9247 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 9248 * to swap any object at all.
9249 *
9250 * If 'usethreaded' is true, Redis will try to swap the object in background
9251 * using I/O threads. */
9252static int vmSwapOneObject(int usethreads) {
4ef8de8a 9253 int j, i;
9254 struct dictEntry *best = NULL;
9255 double best_swappability = 0;
b9bc0eef 9256 redisDb *best_db = NULL;
4ef8de8a 9257 robj *key, *val;
9258
9259 for (j = 0; j < server.dbnum; j++) {
9260 redisDb *db = server.db+j;
b72f6a4b 9261 /* Why maxtries is set to 100?
9262 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9263 * are swappable objects */
b0d8747d 9264 int maxtries = 100;
4ef8de8a 9265
9266 if (dictSize(db->dict) == 0) continue;
9267 for (i = 0; i < 5; i++) {
9268 dictEntry *de;
9269 double swappability;
9270
e3cadb8a 9271 if (maxtries) maxtries--;
4ef8de8a 9272 de = dictGetRandomKey(db->dict);
9273 key = dictGetEntryKey(de);
9274 val = dictGetEntryVal(de);
1064ef87 9275 /* Only swap objects that are currently in memory.
9276 *
9277 * Also don't swap shared objects if threaded VM is on, as we
9278 * try to ensure that the main thread does not touch the
9279 * object while the I/O thread is using it, but we can't
9280 * control other keys without adding additional mutex. */
9281 if (key->storage != REDIS_VM_MEMORY ||
9282 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 9283 if (maxtries) i--; /* don't count this try */
9284 continue;
9285 }
4ef8de8a 9286 swappability = computeObjectSwappability(val);
9287 if (!best || swappability > best_swappability) {
9288 best = de;
9289 best_swappability = swappability;
b9bc0eef 9290 best_db = db;
4ef8de8a 9291 }
9292 }
9293 }
7c775e09 9294 if (best == NULL) return REDIS_ERR;
4ef8de8a 9295 key = dictGetEntryKey(best);
9296 val = dictGetEntryVal(best);
9297
e3cadb8a 9298 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 9299 key->ptr, best_swappability);
9300
9301 /* Unshare the key if needed */
9302 if (key->refcount > 1) {
9303 robj *newkey = dupStringObject(key);
9304 decrRefCount(key);
9305 key = dictGetEntryKey(best) = newkey;
9306 }
9307 /* Swap it */
a69a0c9c 9308 if (usethreads) {
b9bc0eef 9309 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 9310 return REDIS_OK;
9311 } else {
a69a0c9c 9312 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9313 dictGetEntryVal(best) = NULL;
9314 return REDIS_OK;
9315 } else {
9316 return REDIS_ERR;
9317 }
4ef8de8a 9318 }
9319}
9320
/* Swap out one object synchronously. Same return value as
 * vmSwapOneObject(). Declared with (void) so the definition is a real
 * prototype that rejects stray arguments. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9324
/* Swap out one object using the I/O threads. Same return value as
 * vmSwapOneObject(). Declared with (void) so the definition is a real
 * prototype that rejects stray arguments. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9328
7e69548d 9329/* Return true if it's safe to swap out objects in a given moment.
9330 * Basically we don't want to swap objects out while there is a BGSAVE
9331 * or a BGAEOREWRITE running in backgroud. */
9332static int vmCanSwapOut(void) {
9333 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9334}
9335
1b03836c 9336/* Delete a key if swapped. Returns 1 if the key was found, was swapped
9337 * and was deleted. Otherwise 0 is returned. */
9338static int deleteIfSwapped(redisDb *db, robj *key) {
9339 dictEntry *de;
9340 robj *foundkey;
9341
9342 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9343 foundkey = dictGetEntryKey(de);
9344 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9345 deleteKey(db,key);
9346 return 1;
9347}
9348
996cb5f7 9349/* =================== Virtual Memory - Threaded I/O ======================= */
9350
b9bc0eef 9351static void freeIOJob(iojob *j) {
d5d55fc3 9352 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9353 j->type == REDIS_IOJOB_DO_SWAP ||
9354 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 9355 decrRefCount(j->val);
78ebe4c8 9356 /* We don't decrRefCount the j->key field as we did't incremented
9357 * the count creating IO Jobs. This is because the key field here is
9358 * just used as an indentifier and if a key is removed the Job should
9359 * never be touched again. */
b9bc0eef 9360 zfree(j);
9361}
9362
996cb5f7 9363/* Every time a thread finished a Job, it writes a byte into the write side
9364 * of an unix pipe in order to "awake" the main thread, and this function
9365 * is called. */
9366static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9367 int mask)
9368{
9369 char buf[1];
b0d8747d 9370 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 9371 REDIS_NOTUSED(el);
9372 REDIS_NOTUSED(mask);
9373 REDIS_NOTUSED(privdata);
9374
9375 /* For every byte we read in the read side of the pipe, there is one
9376 * I/O job completed to process. */
9377 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 9378 iojob *j;
9379 listNode *ln;
9380 robj *key;
9381 struct dictEntry *de;
9382
996cb5f7 9383 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 9384
9385 /* Get the processed element (the oldest one) */
9386 lockThreadedIO();
1064ef87 9387 assert(listLength(server.io_processed) != 0);
f6c0bba8 9388 if (toprocess == -1) {
9389 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9390 if (toprocess <= 0) toprocess = 1;
9391 }
b9bc0eef 9392 ln = listFirst(server.io_processed);
9393 j = ln->value;
9394 listDelNode(server.io_processed,ln);
9395 unlockThreadedIO();
9396 /* If this job is marked as canceled, just ignore it */
9397 if (j->canceled) {
9398 freeIOJob(j);
9399 continue;
9400 }
9401 /* Post process it in the main thread, as there are things we
9402 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 9403 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 9404 de = dictFind(j->db->dict,j->key);
9405 assert(de != NULL);
9406 key = dictGetEntryKey(de);
9407 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9408 redisDb *db;
9409
b9bc0eef 9410 /* Key loaded, bring it at home */
9411 key->storage = REDIS_VM_MEMORY;
9412 key->vm.atime = server.unixtime;
9413 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9414 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9415 (unsigned char*) key->ptr);
9416 server.vm_stats_swapped_objects--;
9417 server.vm_stats_swapins++;
d5d55fc3 9418 dictGetEntryVal(de) = j->val;
9419 incrRefCount(j->val);
9420 db = j->db;
b9bc0eef 9421 freeIOJob(j);
d5d55fc3 9422 /* Handle clients waiting for this key to be loaded. */
9423 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 9424 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9425 /* Now we know the amount of pages required to swap this object.
9426 * Let's find some space for it, and queue this task again
9427 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 9428 if (!vmCanSwapOut() ||
9429 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9430 {
9431 /* Ooops... no space or we can't swap as there is
9432 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 9433 freeIOJob(j);
054e426d 9434 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 9435 } else {
c7df85a4 9436 /* Note that we need to mark this pages as used now,
9437 * if the job will be canceled, we'll mark them as freed
9438 * again. */
9439 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 9440 j->type = REDIS_IOJOB_DO_SWAP;
9441 lockThreadedIO();
9442 queueIOJob(j);
9443 unlockThreadedIO();
9444 }
9445 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9446 robj *val;
9447
9448 /* Key swapped. We can finally free some memory. */
6c96ba7d 9449 if (key->storage != REDIS_VM_SWAPPING) {
9450 printf("key->storage: %d\n",key->storage);
9451 printf("key->name: %s\n",(char*)key->ptr);
9452 printf("key->refcount: %d\n",key->refcount);
9453 printf("val: %p\n",(void*)j->val);
9454 printf("val->type: %d\n",j->val->type);
9455 printf("val->ptr: %s\n",(char*)j->val->ptr);
9456 }
9457 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 9458 val = dictGetEntryVal(de);
9459 key->vm.page = j->page;
9460 key->vm.usedpages = j->pages;
9461 key->storage = REDIS_VM_SWAPPED;
9462 key->vtype = j->val->type;
9463 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 9464 dictGetEntryVal(de) = NULL;
b9bc0eef 9465 redisLog(REDIS_DEBUG,
9466 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9467 (unsigned char*) key->ptr,
9468 (unsigned long long) j->page, (unsigned long long) j->pages);
9469 server.vm_stats_swapped_objects++;
9470 server.vm_stats_swapouts++;
9471 freeIOJob(j);
f11b8647 9472 /* Put a few more swap requests in queue if we are still
9473 * out of memory */
b0d8747d 9474 if (trytoswap && vmCanSwapOut() &&
9475 zmalloc_used_memory() > server.vm_max_memory)
9476 {
f11b8647 9477 int more = 1;
9478 while(more) {
9479 lockThreadedIO();
9480 more = listLength(server.io_newjobs) <
9481 (unsigned) server.vm_max_threads;
9482 unlockThreadedIO();
9483 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 9484 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9485 trytoswap = 0;
9486 break;
9487 }
f11b8647 9488 }
9489 }
b9bc0eef 9490 }
c953f24b 9491 processed++;
f6c0bba8 9492 if (processed == toprocess) return;
996cb5f7 9493 }
9494 if (retval < 0 && errno != EAGAIN) {
9495 redisLog(REDIS_WARNING,
9496 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9497 strerror(errno));
9498 }
9499}
9500
9501static void lockThreadedIO(void) {
9502 pthread_mutex_lock(&server.io_mutex);
9503}
9504
9505static void unlockThreadedIO(void) {
9506 pthread_mutex_unlock(&server.io_mutex);
9507}
9508
9509/* Remove the specified object from the threaded I/O queue if still not
9510 * processed, otherwise make sure to flag it as canceled. */
/* Details: 'o' is the key object of the job to cancel; jobs are matched by
 * pointer identity (job->key == o). The three queues are scanned, with the
 * threaded I/O lock held, in the order newjobs, processing, processed.
 * On success the storage state of 'o' is rolled back (LOADING -> SWAPPED,
 * SWAPPING -> MEMORY). Not finding any matching, non-canceled job is
 * treated as a programming error (the final assert always fails). */
9511static void vmCancelThreadedIOJob(robj *o) {
9512 list *lists[3] = {
6c96ba7d 9513 server.io_newjobs, /* 0 */
9514 server.io_processing, /* 1 */
9515 server.io_processed /* 2 */
996cb5f7 9516 };
9517 int i;
9518
9519 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
/* The whole scan restarts from this label every time the job is found in
 * the processing queue: the lock is re-acquired and all the queues are
 * walked again from the first one. */
2e111efe 9520again:
996cb5f7 9521 lockThreadedIO();
9522 /* Search for a matching key in one of the queues */
9523 for (i = 0; i < 3; i++) {
9524 listNode *ln;
c7df85a4 9525 listIter li;
996cb5f7 9526
c7df85a4 9527 listRewind(lists[i],&li);
9528 while ((ln = listNext(&li)) != NULL) {
996cb5f7 9529 iojob *job = ln->value;
9530
6c96ba7d 9531 if (job->canceled) continue; /* Skip this, already canceled. */
78ebe4c8 9532 if (job->key == o) {
970e10bb 9533 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9534 (void*)job, (char*)o->ptr, job->type, i);
427a2153 9535 /* Mark the pages as free since the swap didn't happened
9536 * or happened but is now discarded. */
/* The pages are not released while the job is in the processing queue
 * (i == 1): that case retries below, and the release happens on a later
 * pass once the job is found in another queue. */
970e10bb 9537 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 9538 vmMarkPagesFree(job->page,job->pages);
9539 /* Cancel the job. It depends on the list the job is
9540 * living in. */
996cb5f7 9541 switch(i) {
9542 case 0: /* io_newjobs */
6c96ba7d 9543 /* If the job was yet not processed the best thing to do
996cb5f7 9544 * is to remove it from the queue at all */
6c96ba7d 9545 freeIOJob(job);
996cb5f7 9546 listDelNode(lists[i],ln);
9547 break;
9548 case 1: /* io_processing */
d5d55fc3 9549 /* Oh Shi- the thread is messing with the Job:
9550 *
9551 * Probably it's accessing the object if this is a
9552 * PREPARE_SWAP or DO_SWAP job.
9553 * If it's a LOAD job it may be reading from disk and
9554 * if we don't wait for the job to terminate before to
9555 * cancel it, maybe in a few microseconds data can be
9556 * corrupted in this pages. So the short story is:
9557 *
9558 * Better to wait for the job to move into the
9559 * next queue (processed)... */
9560
9561 /* We try again and again until the job is completed. */
/* The lock must be dropped before sleeping, otherwise the I/O thread could
 * never move the job to the processed queue. */
9562 unlockThreadedIO();
9563 /* But let's wait some time for the I/O thread
9564 * to finish with this job. After all this condition
9565 * should be very rare. */
9566 usleep(1);
9567 goto again;
996cb5f7 9568 case 2: /* io_processed */
2e111efe 9569 /* The job was already processed, that's easy...
9570 * just mark it as canceled so that we'll ignore it
9571 * when processing completed jobs. */
996cb5f7 9572 job->canceled = 1;
9573 break;
9574 }
c7df85a4 9575 /* Finally we have to adjust the storage type of the object
9576 * in order to "UNDO" the operaiton. */
996cb5f7 9577 if (o->storage == REDIS_VM_LOADING)
9578 o->storage = REDIS_VM_SWAPPED;
9579 else if (o->storage == REDIS_VM_SWAPPING)
9580 o->storage = REDIS_VM_MEMORY;
9581 unlockThreadedIO();
9582 return;
9583 }
9584 }
9585 }
9586 unlockThreadedIO();
9587 assert(1 != 1); /* We should never reach this */
9588}
9589
b9bc0eef 9590static void *IOThreadEntryPoint(void *arg) {
9591 iojob *j;
9592 listNode *ln;
9593 REDIS_NOTUSED(arg);
9594
9595 pthread_detach(pthread_self());
9596 while(1) {
9597 /* Get a new job to process */
9598 lockThreadedIO();
9599 if (listLength(server.io_newjobs) == 0) {
9600 /* No new jobs in queue, exit. */
9ebed7cf 9601 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9602 (long) pthread_self());
b9bc0eef 9603 server.io_active_threads--;
9604 unlockThreadedIO();
9605 return NULL;
9606 }
9607 ln = listFirst(server.io_newjobs);
9608 j = ln->value;
9609 listDelNode(server.io_newjobs,ln);
9610 /* Add the job in the processing queue */
9611 j->thread = pthread_self();
9612 listAddNodeTail(server.io_processing,j);
9613 ln = listLast(server.io_processing); /* We use ln later to remove it */
9614 unlockThreadedIO();
9ebed7cf 9615 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9616 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 9617
9618 /* Process the Job */
9619 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9620 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 9621 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9622 FILE *fp = fopen("/dev/null","w+");
9623 j->pages = rdbSavedObjectPages(j->val,fp);
9624 fclose(fp);
9625 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 9626 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9627 j->canceled = 1;
b9bc0eef 9628 }
9629
9630 /* Done: insert the job into the processed queue */
9ebed7cf 9631 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9632 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 9633 lockThreadedIO();
9634 listDelNode(server.io_processing,ln);
9635 listAddNodeTail(server.io_processed,j);
9636 unlockThreadedIO();
e0a62c7f 9637
b9bc0eef 9638 /* Signal the main thread there is new stuff to process */
9639 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9640 }
9641 return NULL; /* never reached */
9642}
9643
9644static void spawnIOThread(void) {
9645 pthread_t thread;
478c2c6f 9646 sigset_t mask, omask;
a97b9060 9647 int err;
b9bc0eef 9648
478c2c6f 9649 sigemptyset(&mask);
9650 sigaddset(&mask,SIGCHLD);
9651 sigaddset(&mask,SIGHUP);
9652 sigaddset(&mask,SIGPIPE);
9653 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 9654 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9655 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9656 strerror(err));
9657 usleep(1000000);
9658 }
478c2c6f 9659 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 9660 server.io_active_threads++;
9661}
9662
4ee9488d 9663/* We need to wait for the last thread to exit before we are able to
9664 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 9665static void waitEmptyIOJobsQueue(void) {
4ee9488d 9666 while(1) {
76b7233a 9667 int io_processed_len;
9668
4ee9488d 9669 lockThreadedIO();
054e426d 9670 if (listLength(server.io_newjobs) == 0 &&
9671 listLength(server.io_processing) == 0 &&
9672 server.io_active_threads == 0)
9673 {
4ee9488d 9674 unlockThreadedIO();
9675 return;
9676 }
76b7233a 9677 /* While waiting for empty jobs queue condition we post-process some
9678 * finshed job, as I/O threads may be hanging trying to write against
9679 * the io_ready_pipe_write FD but there are so much pending jobs that
9680 * it's blocking. */
9681 io_processed_len = listLength(server.io_processed);
4ee9488d 9682 unlockThreadedIO();
76b7233a 9683 if (io_processed_len) {
9684 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9685 usleep(1000); /* 1 millisecond */
9686 } else {
9687 usleep(10000); /* 10 milliseconds */
9688 }
4ee9488d 9689 }
9690}
9691
054e426d 9692static void vmReopenSwapFile(void) {
478c2c6f 9693 /* Note: we don't close the old one as we are in the child process
9694 * and don't want to mess at all with the original file object. */
054e426d 9695 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9696 if (server.vm_fp == NULL) {
9697 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9698 server.vm_swap_file);
478c2c6f 9699 _exit(1);
054e426d 9700 }
9701 server.vm_fd = fileno(server.vm_fp);
9702}
9703
b9bc0eef 9704/* This function must be called while with threaded IO locked */
9705static void queueIOJob(iojob *j) {
6c96ba7d 9706 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9707 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 9708 listAddNodeTail(server.io_newjobs,j);
9709 if (server.io_active_threads < server.vm_max_threads)
9710 spawnIOThread();
9711}
9712
9713static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9714 iojob *j;
e0a62c7f 9715
b9bc0eef 9716 assert(key->storage == REDIS_VM_MEMORY);
9717 assert(key->refcount == 1);
9718
9719 j = zmalloc(sizeof(*j));
9720 j->type = REDIS_IOJOB_PREPARE_SWAP;
9721 j->db = db;
78ebe4c8 9722 j->key = key;
b9bc0eef 9723 j->val = val;
9724 incrRefCount(val);
9725 j->canceled = 0;
9726 j->thread = (pthread_t) -1;
f11b8647 9727 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 9728
9729 lockThreadedIO();
9730 queueIOJob(j);
9731 unlockThreadedIO();
9732 return REDIS_OK;
9733}
9734
b0d8747d 9735/* ============ Virtual Memory - Blocking clients on missing keys =========== */
9736
d5d55fc3 9737/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9738 * If there is not already a job loading the key, it is craeted.
9739 * The key is added to the io_keys list in the client structure, and also
9740 * in the hash table mapping swapped keys to waiting clients, that is,
9741 * server.io_waited_keys. */
9742static int waitForSwappedKey(redisClient *c, robj *key) {
9743 struct dictEntry *de;
9744 robj *o;
9745 list *l;
9746
9747 /* If the key does not exist or is already in RAM we don't need to
9748 * block the client at all. */
9749 de = dictFind(c->db->dict,key);
9750 if (de == NULL) return 0;
9751 o = dictGetEntryKey(de);
9752 if (o->storage == REDIS_VM_MEMORY) {
9753 return 0;
9754 } else if (o->storage == REDIS_VM_SWAPPING) {
9755 /* We were swapping the key, undo it! */
9756 vmCancelThreadedIOJob(o);
9757 return 0;
9758 }
e0a62c7f 9759
d5d55fc3 9760 /* OK: the key is either swapped, or being loaded just now. */
9761
9762 /* Add the key to the list of keys this client is waiting for.
9763 * This maps clients to keys they are waiting for. */
9764 listAddNodeTail(c->io_keys,key);
9765 incrRefCount(key);
9766
9767 /* Add the client to the swapped keys => clients waiting map. */
9768 de = dictFind(c->db->io_keys,key);
9769 if (de == NULL) {
9770 int retval;
9771
9772 /* For every key we take a list of clients blocked for it */
9773 l = listCreate();
9774 retval = dictAdd(c->db->io_keys,key,l);
9775 incrRefCount(key);
9776 assert(retval == DICT_OK);
9777 } else {
9778 l = dictGetEntryVal(de);
9779 }
9780 listAddNodeTail(l,c);
9781
9782 /* Are we already loading the key from disk? If not create a job */
9783 if (o->storage == REDIS_VM_SWAPPED) {
9784 iojob *j;
9785
9786 o->storage = REDIS_VM_LOADING;
9787 j = zmalloc(sizeof(*j));
9788 j->type = REDIS_IOJOB_LOAD;
9789 j->db = c->db;
78ebe4c8 9790 j->key = o;
d5d55fc3 9791 j->key->vtype = o->vtype;
9792 j->page = o->vm.page;
9793 j->val = NULL;
9794 j->canceled = 0;
9795 j->thread = (pthread_t) -1;
9796 lockThreadedIO();
9797 queueIOJob(j);
9798 unlockThreadedIO();
9799 }
9800 return 1;
9801}
9802
6f078746
PN
9803/* Preload keys for any command with first, last and step values for
9804 * the command keys prototype, as defined in the command table. */
9805static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9806 int j, last;
9807 if (cmd->vm_firstkey == 0) return;
9808 last = cmd->vm_lastkey;
9809 if (last < 0) last = argc+last;
9810 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9811 redisAssert(j < argc);
9812 waitForSwappedKey(c,argv[j]);
9813 }
9814}
9815
5d373da9 9816/* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
739ba0d2
PN
9817 * Note that the number of keys to preload is user-defined, so we need to
9818 * apply a sanity check against argc. */
ca1788b5 9819static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
76583ea4 9820 int i, num;
ca1788b5 9821 REDIS_NOTUSED(cmd);
ca1788b5
PN
9822
9823 num = atoi(argv[2]->ptr);
739ba0d2 9824 if (num > (argc-3)) return;
76583ea4 9825 for (i = 0; i < num; i++) {
ca1788b5 9826 waitForSwappedKey(c,argv[3+i]);
76583ea4
PN
9827 }
9828}
9829
3805e04f
PN
9830/* Preload keys needed to execute the entire MULTI/EXEC block.
9831 *
9832 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9833 * and will block the client when any command requires a swapped out value. */
9834static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9835 int i, margc;
9836 struct redisCommand *mcmd;
9837 robj **margv;
9838 REDIS_NOTUSED(cmd);
9839 REDIS_NOTUSED(argc);
9840 REDIS_NOTUSED(argv);
9841
9842 if (!(c->flags & REDIS_MULTI)) return;
9843 for (i = 0; i < c->mstate.count; i++) {
9844 mcmd = c->mstate.commands[i].cmd;
9845 margc = c->mstate.commands[i].argc;
9846 margv = c->mstate.commands[i].argv;
9847
9848 if (mcmd->vm_preload_proc != NULL) {
9849 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9850 } else {
9851 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9852 }
76583ea4
PN
9853 }
9854}
9855
b0d8747d 9856/* Is this client attempting to run a command against swapped keys?
d5d55fc3 9857 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 9858 *
d5d55fc3 9859 * The important idea about this function is that it can fail! If keys will
9860 * still be swapped when the client is resumed, this key lookups will
9861 * just block loading keys from disk. In practical terms this should only
9862 * happen with SORT BY command or if there is a bug in this function.
9863 *
9864 * Return 1 if the client is marked as blocked, 0 if the client can
9865 * continue as the keys it is going to access appear to be in memory. */
0a6f3f0f 9866static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
76583ea4 9867 if (cmd->vm_preload_proc != NULL) {
ca1788b5 9868 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
76583ea4 9869 } else {
6f078746 9870 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
76583ea4
PN
9871 }
9872
d5d55fc3 9873 /* If the client was blocked for at least one key, mark it as blocked. */
9874 if (listLength(c->io_keys)) {
9875 c->flags |= REDIS_IO_WAIT;
9876 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9877 server.vm_blocked_clients++;
9878 return 1;
9879 } else {
9880 return 0;
9881 }
9882}
9883
9884/* Remove the 'key' from the list of blocked keys for a given client.
9885 *
9886 * The function returns 1 when there are no longer blocking keys after
9887 * the current one was removed (and the client can be unblocked). */
9888static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9889 list *l;
9890 listNode *ln;
9891 listIter li;
9892 struct dictEntry *de;
9893
9894 /* Remove the key from the list of keys this client is waiting for. */
9895 listRewind(c->io_keys,&li);
9896 while ((ln = listNext(&li)) != NULL) {
bf028098 9897 if (equalStringObjects(ln->value,key)) {
d5d55fc3 9898 listDelNode(c->io_keys,ln);
9899 break;
9900 }
9901 }
9902 assert(ln != NULL);
9903
9904 /* Remove the client form the key => waiting clients map. */
9905 de = dictFind(c->db->io_keys,key);
9906 assert(de != NULL);
9907 l = dictGetEntryVal(de);
9908 ln = listSearchKey(l,c);
9909 assert(ln != NULL);
9910 listDelNode(l,ln);
9911 if (listLength(l) == 0)
9912 dictDelete(c->db->io_keys,key);
9913
9914 return listLength(c->io_keys) == 0;
9915}
9916
9917static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9918 struct dictEntry *de;
9919 list *l;
9920 listNode *ln;
9921 int len;
9922
9923 de = dictFind(db->io_keys,key);
9924 if (!de) return;
9925
9926 l = dictGetEntryVal(de);
9927 len = listLength(l);
9928 /* Note: we can't use something like while(listLength(l)) as the list
9929 * can be freed by the calling function when we remove the last element. */
9930 while (len--) {
9931 ln = listFirst(l);
9932 redisClient *c = ln->value;
9933
9934 if (dontWaitForSwappedKey(c,key)) {
9935 /* Put the client in the list of clients ready to go as we
9936 * loaded all the keys about it. */
9937 listAddNodeTail(server.io_ready_clients,c);
9938 }
9939 }
b0d8747d 9940}
b0d8747d 9941
500ece7c 9942/* =========================== Remote Configuration ========================= */
9943
9944static void configSetCommand(redisClient *c) {
9945 robj *o = getDecodedObject(c->argv[3]);
2e5eb04e 9946 long long ll;
9947
500ece7c 9948 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9949 zfree(server.dbfilename);
9950 server.dbfilename = zstrdup(o->ptr);
9951 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9952 zfree(server.requirepass);
9953 server.requirepass = zstrdup(o->ptr);
9954 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9955 zfree(server.masterauth);
9956 server.masterauth = zstrdup(o->ptr);
9957 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
2e5eb04e 9958 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9959 ll < 0) goto badfmt;
9960 server.maxmemory = ll;
9961 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
9962 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9963 ll < 0 || ll > LONG_MAX) goto badfmt;
9964 server.maxidletime = ll;
1b677732 9965 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
9966 if (!strcasecmp(o->ptr,"no")) {
9967 server.appendfsync = APPENDFSYNC_NO;
9968 } else if (!strcasecmp(o->ptr,"everysec")) {
9969 server.appendfsync = APPENDFSYNC_EVERYSEC;
9970 } else if (!strcasecmp(o->ptr,"always")) {
9971 server.appendfsync = APPENDFSYNC_ALWAYS;
9972 } else {
9973 goto badfmt;
9974 }
38db9171 9975 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
9976 int yn = yesnotoi(o->ptr);
9977
9978 if (yn == -1) goto badfmt;
9979 server.no_appendfsync_on_rewrite = yn;
2e5eb04e 9980 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
9981 int old = server.appendonly;
9982 int new = yesnotoi(o->ptr);
9983
9984 if (new == -1) goto badfmt;
9985 if (old != new) {
9986 if (new == 0) {
9987 stopAppendOnly();
9988 } else {
9989 if (startAppendOnly() == REDIS_ERR) {
9990 addReplySds(c,sdscatprintf(sdsempty(),
9991 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
9992 decrRefCount(o);
9993 return;
9994 }
9995 }
9996 }
a34e0a25 9997 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
9998 int vlen, j;
9999 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10000
10001 /* Perform sanity check before setting the new config:
10002 * - Even number of args
10003 * - Seconds >= 1, changes >= 0 */
10004 if (vlen & 1) {
10005 sdsfreesplitres(v,vlen);
10006 goto badfmt;
10007 }
10008 for (j = 0; j < vlen; j++) {
10009 char *eptr;
10010 long val;
10011
10012 val = strtoll(v[j], &eptr, 10);
10013 if (eptr[0] != '\0' ||
10014 ((j & 1) == 0 && val < 1) ||
10015 ((j & 1) == 1 && val < 0)) {
10016 sdsfreesplitres(v,vlen);
10017 goto badfmt;
10018 }
10019 }
10020 /* Finally set the new config */
10021 resetServerSaveParams();
10022 for (j = 0; j < vlen; j += 2) {
10023 time_t seconds;
10024 int changes;
10025
10026 seconds = strtoll(v[j],NULL,10);
10027 changes = strtoll(v[j+1],NULL,10);
10028 appendServerSaveParams(seconds, changes);
10029 }
10030 sdsfreesplitres(v,vlen);
500ece7c 10031 } else {
10032 addReplySds(c,sdscatprintf(sdsempty(),
10033 "-ERR not supported CONFIG parameter %s\r\n",
10034 (char*)c->argv[2]->ptr));
10035 decrRefCount(o);
10036 return;
10037 }
10038 decrRefCount(o);
10039 addReply(c,shared.ok);
a34e0a25 10040 return;
10041
10042badfmt: /* Bad format errors */
10043 addReplySds(c,sdscatprintf(sdsempty(),
10044 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10045 (char*)o->ptr,
10046 (char*)c->argv[2]->ptr));
10047 decrRefCount(o);
500ece7c 10048}
10049
10050static void configGetCommand(redisClient *c) {
10051 robj *o = getDecodedObject(c->argv[2]);
10052 robj *lenobj = createObject(REDIS_STRING,NULL);
10053 char *pattern = o->ptr;
10054 int matches = 0;
10055
10056 addReply(c,lenobj);
10057 decrRefCount(lenobj);
10058
10059 if (stringmatch(pattern,"dbfilename",0)) {
10060 addReplyBulkCString(c,"dbfilename");
10061 addReplyBulkCString(c,server.dbfilename);
10062 matches++;
10063 }
10064 if (stringmatch(pattern,"requirepass",0)) {
10065 addReplyBulkCString(c,"requirepass");
10066 addReplyBulkCString(c,server.requirepass);
10067 matches++;
10068 }
10069 if (stringmatch(pattern,"masterauth",0)) {
10070 addReplyBulkCString(c,"masterauth");
10071 addReplyBulkCString(c,server.masterauth);
10072 matches++;
10073 }
10074 if (stringmatch(pattern,"maxmemory",0)) {
10075 char buf[128];
10076
2e5eb04e 10077 ll2string(buf,128,server.maxmemory);
500ece7c 10078 addReplyBulkCString(c,"maxmemory");
10079 addReplyBulkCString(c,buf);
10080 matches++;
10081 }
2e5eb04e 10082 if (stringmatch(pattern,"timeout",0)) {
10083 char buf[128];
10084
10085 ll2string(buf,128,server.maxidletime);
10086 addReplyBulkCString(c,"timeout");
10087 addReplyBulkCString(c,buf);
10088 matches++;
10089 }
10090 if (stringmatch(pattern,"appendonly",0)) {
10091 addReplyBulkCString(c,"appendonly");
10092 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10093 matches++;
10094 }
38db9171 10095 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10096 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10097 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10098 matches++;
10099 }
1b677732 10100 if (stringmatch(pattern,"appendfsync",0)) {
10101 char *policy;
10102
10103 switch(server.appendfsync) {
10104 case APPENDFSYNC_NO: policy = "no"; break;
10105 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10106 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10107 default: policy = "unknown"; break; /* too harmless to panic */
10108 }
10109 addReplyBulkCString(c,"appendfsync");
10110 addReplyBulkCString(c,policy);
10111 matches++;
10112 }
a34e0a25 10113 if (stringmatch(pattern,"save",0)) {
10114 sds buf = sdsempty();
10115 int j;
10116
10117 for (j = 0; j < server.saveparamslen; j++) {
10118 buf = sdscatprintf(buf,"%ld %d",
10119 server.saveparams[j].seconds,
10120 server.saveparams[j].changes);
10121 if (j != server.saveparamslen-1)
10122 buf = sdscatlen(buf," ",1);
10123 }
10124 addReplyBulkCString(c,"save");
10125 addReplyBulkCString(c,buf);
10126 sdsfree(buf);
10127 matches++;
10128 }
500ece7c 10129 decrRefCount(o);
10130 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10131}
10132
10133static void configCommand(redisClient *c) {
10134 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10135 if (c->argc != 4) goto badarity;
10136 configSetCommand(c);
10137 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10138 if (c->argc != 3) goto badarity;
10139 configGetCommand(c);
10140 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10141 if (c->argc != 2) goto badarity;
10142 server.stat_numcommands = 0;
10143 server.stat_numconnections = 0;
10144 server.stat_expiredkeys = 0;
10145 server.stat_starttime = time(NULL);
10146 addReply(c,shared.ok);
10147 } else {
10148 addReplySds(c,sdscatprintf(sdsempty(),
10149 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10150 }
10151 return;
10152
10153badarity:
10154 addReplySds(c,sdscatprintf(sdsempty(),
10155 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10156 (char*) c->argv[1]->ptr));
10157}
10158
befec3cd 10159/* =========================== Pubsub implementation ======================== */
10160
ffc6b7f8 10161static void freePubsubPattern(void *p) {
10162 pubsubPattern *pat = p;
10163
10164 decrRefCount(pat->pattern);
10165 zfree(pat);
10166}
10167
10168static int listMatchPubsubPattern(void *a, void *b) {
10169 pubsubPattern *pa = a, *pb = b;
10170
10171 return (pa->client == pb->client) &&
bf028098 10172 (equalStringObjects(pa->pattern,pb->pattern));
ffc6b7f8 10173}
10174
10175/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10176 * 0 if the client was already subscribed to that channel. */
10177static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 10178 struct dictEntry *de;
10179 list *clients = NULL;
10180 int retval = 0;
10181
ffc6b7f8 10182 /* Add the channel to the client -> channels hash table */
10183 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 10184 retval = 1;
ffc6b7f8 10185 incrRefCount(channel);
10186 /* Add the client to the channel -> list of clients hash table */
10187 de = dictFind(server.pubsub_channels,channel);
befec3cd 10188 if (de == NULL) {
10189 clients = listCreate();
ffc6b7f8 10190 dictAdd(server.pubsub_channels,channel,clients);
10191 incrRefCount(channel);
befec3cd 10192 } else {
10193 clients = dictGetEntryVal(de);
10194 }
10195 listAddNodeTail(clients,c);
10196 }
10197 /* Notify the client */
10198 addReply(c,shared.mbulk3);
10199 addReply(c,shared.subscribebulk);
ffc6b7f8 10200 addReplyBulk(c,channel);
482b672d 10201 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 10202 return retval;
10203}
10204
ffc6b7f8 10205/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10206 * 0 if the client was not subscribed to the specified channel. */
10207static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 10208 struct dictEntry *de;
10209 list *clients;
10210 listNode *ln;
10211 int retval = 0;
10212
ffc6b7f8 10213 /* Remove the channel from the client -> channels hash table */
10214 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 10215 we have in the hash tables. Protect it... */
ffc6b7f8 10216 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 10217 retval = 1;
ffc6b7f8 10218 /* Remove the client from the channel -> clients list hash table */
10219 de = dictFind(server.pubsub_channels,channel);
befec3cd 10220 assert(de != NULL);
10221 clients = dictGetEntryVal(de);
10222 ln = listSearchKey(clients,c);
10223 assert(ln != NULL);
10224 listDelNode(clients,ln);
ff767a75 10225 if (listLength(clients) == 0) {
10226 /* Free the list and associated hash entry at all if this was
10227 * the latest client, so that it will be possible to abuse
ffc6b7f8 10228 * Redis PUBSUB creating millions of channels. */
10229 dictDelete(server.pubsub_channels,channel);
ff767a75 10230 }
befec3cd 10231 }
10232 /* Notify the client */
10233 if (notify) {
10234 addReply(c,shared.mbulk3);
10235 addReply(c,shared.unsubscribebulk);
ffc6b7f8 10236 addReplyBulk(c,channel);
482b672d 10237 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10238 listLength(c->pubsub_patterns));
10239
10240 }
10241 decrRefCount(channel); /* it is finally safe to release it */
10242 return retval;
10243}
10244
10245/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10246static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10247 int retval = 0;
10248
10249 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10250 retval = 1;
10251 pubsubPattern *pat;
10252 listAddNodeTail(c->pubsub_patterns,pattern);
10253 incrRefCount(pattern);
10254 pat = zmalloc(sizeof(*pat));
10255 pat->pattern = getDecodedObject(pattern);
10256 pat->client = c;
10257 listAddNodeTail(server.pubsub_patterns,pat);
10258 }
10259 /* Notify the client */
10260 addReply(c,shared.mbulk3);
10261 addReply(c,shared.psubscribebulk);
10262 addReplyBulk(c,pattern);
482b672d 10263 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
ffc6b7f8 10264 return retval;
10265}
10266
10267/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10268 * 0 if the client was not subscribed to the specified channel. */
10269static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10270 listNode *ln;
10271 pubsubPattern pat;
10272 int retval = 0;
10273
10274 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10275 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10276 retval = 1;
10277 listDelNode(c->pubsub_patterns,ln);
10278 pat.client = c;
10279 pat.pattern = pattern;
10280 ln = listSearchKey(server.pubsub_patterns,&pat);
10281 listDelNode(server.pubsub_patterns,ln);
10282 }
10283 /* Notify the client */
10284 if (notify) {
10285 addReply(c,shared.mbulk3);
10286 addReply(c,shared.punsubscribebulk);
10287 addReplyBulk(c,pattern);
482b672d 10288 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10289 listLength(c->pubsub_patterns));
befec3cd 10290 }
ffc6b7f8 10291 decrRefCount(pattern);
befec3cd 10292 return retval;
10293}
10294
ffc6b7f8 10295/* Unsubscribe from all the channels. Return the number of channels the
10296 * client was subscribed from. */
10297static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10298 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 10299 dictEntry *de;
10300 int count = 0;
10301
10302 while((de = dictNext(di)) != NULL) {
ffc6b7f8 10303 robj *channel = dictGetEntryKey(de);
befec3cd 10304
ffc6b7f8 10305 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 10306 }
10307 dictReleaseIterator(di);
10308 return count;
10309}
10310
ffc6b7f8 10311/* Unsubscribe from all the patterns. Return the number of patterns the
10312 * client was subscribed from. */
10313static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10314 listNode *ln;
10315 listIter li;
10316 int count = 0;
10317
10318 listRewind(c->pubsub_patterns,&li);
10319 while ((ln = listNext(&li)) != NULL) {
10320 robj *pattern = ln->value;
10321
10322 count += pubsubUnsubscribePattern(c,pattern,notify);
10323 }
10324 return count;
10325}
10326
befec3cd 10327/* Publish a message */
ffc6b7f8 10328static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 10329 int receivers = 0;
10330 struct dictEntry *de;
ffc6b7f8 10331 listNode *ln;
10332 listIter li;
befec3cd 10333
ffc6b7f8 10334 /* Send to clients listening for that channel */
10335 de = dictFind(server.pubsub_channels,channel);
befec3cd 10336 if (de) {
10337 list *list = dictGetEntryVal(de);
10338 listNode *ln;
10339 listIter li;
10340
10341 listRewind(list,&li);
10342 while ((ln = listNext(&li)) != NULL) {
10343 redisClient *c = ln->value;
10344
10345 addReply(c,shared.mbulk3);
10346 addReply(c,shared.messagebulk);
ffc6b7f8 10347 addReplyBulk(c,channel);
befec3cd 10348 addReplyBulk(c,message);
10349 receivers++;
10350 }
10351 }
ffc6b7f8 10352 /* Send to clients listening to matching channels */
10353 if (listLength(server.pubsub_patterns)) {
10354 listRewind(server.pubsub_patterns,&li);
10355 channel = getDecodedObject(channel);
10356 while ((ln = listNext(&li)) != NULL) {
10357 pubsubPattern *pat = ln->value;
10358
10359 if (stringmatchlen((char*)pat->pattern->ptr,
10360 sdslen(pat->pattern->ptr),
10361 (char*)channel->ptr,
10362 sdslen(channel->ptr),0)) {
c8d0ea0e 10363 addReply(pat->client,shared.mbulk4);
10364 addReply(pat->client,shared.pmessagebulk);
10365 addReplyBulk(pat->client,pat->pattern);
ffc6b7f8 10366 addReplyBulk(pat->client,channel);
10367 addReplyBulk(pat->client,message);
10368 receivers++;
10369 }
10370 }
10371 decrRefCount(channel);
10372 }
befec3cd 10373 return receivers;
10374}
10375
10376static void subscribeCommand(redisClient *c) {
10377 int j;
10378
10379 for (j = 1; j < c->argc; j++)
ffc6b7f8 10380 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 10381}
10382
10383static void unsubscribeCommand(redisClient *c) {
10384 if (c->argc == 1) {
ffc6b7f8 10385 pubsubUnsubscribeAllChannels(c,1);
10386 return;
10387 } else {
10388 int j;
10389
10390 for (j = 1; j < c->argc; j++)
10391 pubsubUnsubscribeChannel(c,c->argv[j],1);
10392 }
10393}
10394
10395static void psubscribeCommand(redisClient *c) {
10396 int j;
10397
10398 for (j = 1; j < c->argc; j++)
10399 pubsubSubscribePattern(c,c->argv[j]);
10400}
10401
10402static void punsubscribeCommand(redisClient *c) {
10403 if (c->argc == 1) {
10404 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 10405 return;
10406 } else {
10407 int j;
10408
10409 for (j = 1; j < c->argc; j++)
ffc6b7f8 10410 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 10411 }
10412}
10413
10414static void publishCommand(redisClient *c) {
10415 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
482b672d 10416 addReplyLongLong(c,receivers);
befec3cd 10417}
10418
37ab76c9 10419/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10420 *
10421 * The implementation uses a per-DB hash table mapping keys to list of clients
10422 * WATCHing those keys, so that given a key that is going to be modified
10423 * we can mark all the associated clients as dirty.
10424 *
10425 * Also every client contains a list of WATCHed keys so that's possible to
10426 * un-watch such keys when the client is freed or when UNWATCH is called. */
10427
10428/* In the client->watched_keys list we need to use watchedKey structures
10429 * as in order to identify a key in Redis we need both the key name and the
10430 * DB */
/* Element of the client->watched_keys list: a (key name, DB) pair. */
10431typedef struct watchedKey {
/* Name of the watched key. */
10432 robj *key;
/* Database the key is watched in. */
10433 redisDb *db;
10434} watchedKey;
10435
10436/* Watch for the specified key */
10437static void watchForKey(redisClient *c, robj *key) {
10438 list *clients = NULL;
10439 listIter li;
10440 listNode *ln;
10441 watchedKey *wk;
10442
10443 /* Check if we are already watching for this key */
10444 listRewind(c->watched_keys,&li);
10445 while((ln = listNext(&li))) {
10446 wk = listNodeValue(ln);
10447 if (wk->db == c->db && equalStringObjects(key,wk->key))
10448 return; /* Key already watched */
10449 }
10450 /* This key is not already watched in this DB. Let's add it */
10451 clients = dictFetchValue(c->db->watched_keys,key);
10452 if (!clients) {
10453 clients = listCreate();
10454 dictAdd(c->db->watched_keys,key,clients);
10455 incrRefCount(key);
10456 }
10457 listAddNodeTail(clients,c);
10458 /* Add the new key to the lits of keys watched by this client */
10459 wk = zmalloc(sizeof(*wk));
10460 wk->key = key;
10461 wk->db = c->db;
10462 incrRefCount(key);
10463 listAddNodeTail(c->watched_keys,wk);
10464}
10465
10466/* Unwatch all the keys watched by this client. To clean the EXEC dirty
10467 * flag is up to the caller. */
10468static void unwatchAllKeys(redisClient *c) {
10469 listIter li;
10470 listNode *ln;
10471
10472 if (listLength(c->watched_keys) == 0) return;
10473 listRewind(c->watched_keys,&li);
10474 while((ln = listNext(&li))) {
10475 list *clients;
10476 watchedKey *wk;
10477
10478 /* Lookup the watched key -> clients list and remove the client
10479 * from the list */
10480 wk = listNodeValue(ln);
10481 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10482 assert(clients != NULL);
10483 listDelNode(clients,listSearchKey(clients,c));
10484 /* Kill the entry at all if this was the only client */
10485 if (listLength(clients) == 0)
10486 dictDelete(wk->db->watched_keys, wk->key);
10487 /* Remove this watched key from the client->watched list */
10488 listDelNode(c->watched_keys,ln);
10489 decrRefCount(wk->key);
10490 zfree(wk);
10491 }
10492}
10493
ca3f830b 10494/* "Touch" a key, so that if this key is being WATCHed by some client the
37ab76c9 10495 * next EXEC will fail. */
10496static void touchWatchedKey(redisDb *db, robj *key) {
10497 list *clients;
10498 listIter li;
10499 listNode *ln;
10500
10501 if (dictSize(db->watched_keys) == 0) return;
10502 clients = dictFetchValue(db->watched_keys, key);
10503 if (!clients) return;
10504
10505 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10506 /* Check if we are already watching for this key */
10507 listRewind(clients,&li);
10508 while((ln = listNext(&li))) {
10509 redisClient *c = listNodeValue(ln);
10510
10511 c->flags |= REDIS_DIRTY_CAS;
10512 }
10513}
10514
9b30e1a2 10515/* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10516 * flush but will be deleted as effect of the flushing operation should
10517 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10518 * a FLUSHALL operation (all the DBs flushed). */
10519static void touchWatchedKeysOnFlush(int dbid) {
10520 listIter li1, li2;
10521 listNode *ln;
10522
10523 /* For every client, check all the waited keys */
10524 listRewind(server.clients,&li1);
10525 while((ln = listNext(&li1))) {
10526 redisClient *c = listNodeValue(ln);
10527 listRewind(c->watched_keys,&li2);
10528 while((ln = listNext(&li2))) {
10529 watchedKey *wk = listNodeValue(ln);
10530
10531 /* For every watched key matching the specified DB, if the
10532 * key exists, mark the client as dirty, as the key will be
10533 * removed. */
10534 if (dbid == -1 || wk->db->id == dbid) {
10535 if (dictFind(wk->db->dict, wk->key) != NULL)
10536 c->flags |= REDIS_DIRTY_CAS;
10537 }
10538 }
10539 }
10540}
10541
37ab76c9 10542static void watchCommand(redisClient *c) {
10543 int j;
10544
6531c94d 10545 if (c->flags & REDIS_MULTI) {
10546 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
10547 return;
10548 }
37ab76c9 10549 for (j = 1; j < c->argc; j++)
10550 watchForKey(c,c->argv[j]);
10551 addReply(c,shared.ok);
10552}
10553
10554static void unwatchCommand(redisClient *c) {
10555 unwatchAllKeys(c);
10556 c->flags &= (~REDIS_DIRTY_CAS);
10557 addReply(c,shared.ok);
10558}
10559
7f957c92 10560/* ================================= Debugging ============================== */
10561
ba798261 10562/* Compute the sha1 of string at 's' with 'len' bytes long.
10563 * The SHA1 is then xored againt the string pointed by digest.
10564 * Since xor is commutative, this operation is used in order to
10565 * "add" digests relative to unordered elements.
10566 *
10567 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10568static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10569 SHA1_CTX ctx;
10570 unsigned char hash[20], *s = ptr;
10571 int j;
10572
10573 SHA1Init(&ctx);
10574 SHA1Update(&ctx,s,len);
10575 SHA1Final(hash,&ctx);
10576
10577 for (j = 0; j < 20; j++)
10578 digest[j] ^= hash[j];
10579}
10580
10581static void xorObjectDigest(unsigned char *digest, robj *o) {
10582 o = getDecodedObject(o);
10583 xorDigest(digest,o->ptr,sdslen(o->ptr));
10584 decrRefCount(o);
10585}
10586
10587/* This function instead of just computing the SHA1 and xoring it
10588 * against diget, also perform the digest of "digest" itself and
10589 * replace the old value with the new one.
10590 *
10591 * So the final digest will be:
10592 *
10593 * digest = SHA1(digest xor SHA1(data))
10594 *
10595 * This function is used every time we want to preserve the order so
10596 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10597 *
10598 * Also note that mixdigest("foo") followed by mixdigest("bar")
10599 * will lead to a different digest compared to "fo", "obar".
10600 */
10601static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10602 SHA1_CTX ctx;
10603 char *s = ptr;
10604
10605 xorDigest(digest,s,len);
10606 SHA1Init(&ctx);
10607 SHA1Update(&ctx,digest,20);
10608 SHA1Final(digest,&ctx);
10609}
10610
10611static void mixObjectDigest(unsigned char *digest, robj *o) {
10612 o = getDecodedObject(o);
10613 mixDigest(digest,o->ptr,sdslen(o->ptr));
10614 decrRefCount(o);
10615}
10616
10617/* Compute the dataset digest. Since keys, sets elements, hashes elements
10618 * are not ordered, we use a trick: every aggregate digest is the xor
10619 * of the digests of their elements. This way the order will not change
10620 * the result. For list instead we use a feedback entering the output digest
10621 * as input in order to ensure that a different ordered list will result in
10622 * a different digest. */
10623static void computeDatasetDigest(unsigned char *final) {
10624 unsigned char digest[20];
10625 char buf[128];
10626 dictIterator *di = NULL;
10627 dictEntry *de;
10628 int j;
10629 uint32_t aux;
10630
10631 memset(final,0,20); /* Start with a clean result */
10632
10633 for (j = 0; j < server.dbnum; j++) {
10634 redisDb *db = server.db+j;
10635
10636 if (dictSize(db->dict) == 0) continue;
10637 di = dictGetIterator(db->dict);
10638
10639 /* hash the DB id, so the same dataset moved in a different
10640 * DB will lead to a different digest */
10641 aux = htonl(j);
10642 mixDigest(final,&aux,sizeof(aux));
10643
10644 /* Iterate this DB writing every entry */
10645 while((de = dictNext(di)) != NULL) {
cbae1d34 10646 robj *key, *o, *kcopy;
ba798261 10647 time_t expiretime;
10648
10649 memset(digest,0,20); /* This key-val digest */
10650 key = dictGetEntryKey(de);
cbae1d34 10651
10652 if (!server.vm_enabled) {
10653 mixObjectDigest(digest,key);
ba798261 10654 o = dictGetEntryVal(de);
ba798261 10655 } else {
cbae1d34 10656 /* Don't work with the key directly as when VM is active
10657 * this is unsafe: TODO: fix decrRefCount to check if the
10658 * count really reached 0 to avoid this mess */
10659 kcopy = dupStringObject(key);
10660 mixObjectDigest(digest,kcopy);
10661 o = lookupKeyRead(db,kcopy);
10662 decrRefCount(kcopy);
ba798261 10663 }
10664 aux = htonl(o->type);
10665 mixDigest(digest,&aux,sizeof(aux));
10666 expiretime = getExpire(db,key);
10667
10668 /* Save the key and associated value */
10669 if (o->type == REDIS_STRING) {
10670 mixObjectDigest(digest,o);
10671 } else if (o->type == REDIS_LIST) {
10672 list *list = o->ptr;
10673 listNode *ln;
10674 listIter li;
10675
10676 listRewind(list,&li);
10677 while((ln = listNext(&li))) {
10678 robj *eleobj = listNodeValue(ln);
10679
10680 mixObjectDigest(digest,eleobj);
10681 }
10682 } else if (o->type == REDIS_SET) {
10683 dict *set = o->ptr;
10684 dictIterator *di = dictGetIterator(set);
10685 dictEntry *de;
10686
10687 while((de = dictNext(di)) != NULL) {
10688 robj *eleobj = dictGetEntryKey(de);
10689
10690 xorObjectDigest(digest,eleobj);
10691 }
10692 dictReleaseIterator(di);
10693 } else if (o->type == REDIS_ZSET) {
10694 zset *zs = o->ptr;
10695 dictIterator *di = dictGetIterator(zs->dict);
10696 dictEntry *de;
10697
10698 while((de = dictNext(di)) != NULL) {
10699 robj *eleobj = dictGetEntryKey(de);
10700 double *score = dictGetEntryVal(de);
10701 unsigned char eledigest[20];
10702
10703 snprintf(buf,sizeof(buf),"%.17g",*score);
10704 memset(eledigest,0,20);
10705 mixObjectDigest(eledigest,eleobj);
10706 mixDigest(eledigest,buf,strlen(buf));
10707 xorDigest(digest,eledigest,20);
10708 }
10709 dictReleaseIterator(di);
10710 } else if (o->type == REDIS_HASH) {
10711 hashIterator *hi;
10712 robj *obj;
10713
10714 hi = hashInitIterator(o);
10715 while (hashNext(hi) != REDIS_ERR) {
10716 unsigned char eledigest[20];
10717
10718 memset(eledigest,0,20);
10719 obj = hashCurrent(hi,REDIS_HASH_KEY);
10720 mixObjectDigest(eledigest,obj);
10721 decrRefCount(obj);
10722 obj = hashCurrent(hi,REDIS_HASH_VALUE);
10723 mixObjectDigest(eledigest,obj);
10724 decrRefCount(obj);
10725 xorDigest(digest,eledigest,20);
10726 }
10727 hashReleaseIterator(hi);
10728 } else {
10729 redisPanic("Unknown object type");
10730 }
ba798261 10731 /* If the key has an expire, add it to the mix */
10732 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
10733 /* We can finally xor the key-val digest to the final digest */
10734 xorDigest(final,digest,20);
10735 }
10736 dictReleaseIterator(di);
10737 }
10738}
10739
7f957c92 10740static void debugCommand(redisClient *c) {
10741 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10742 *((char*)-1) = 'x';
210e29f7 10743 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10744 if (rdbSave(server.dbfilename) != REDIS_OK) {
10745 addReply(c,shared.err);
10746 return;
10747 }
10748 emptyDb();
10749 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10750 addReply(c,shared.err);
10751 return;
10752 }
10753 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10754 addReply(c,shared.ok);
71c2b467 10755 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10756 emptyDb();
10757 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10758 addReply(c,shared.err);
10759 return;
10760 }
10761 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10762 addReply(c,shared.ok);
333298da 10763 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10764 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10765 robj *key, *val;
10766
10767 if (!de) {
10768 addReply(c,shared.nokeyerr);
10769 return;
10770 }
10771 key = dictGetEntryKey(de);
10772 val = dictGetEntryVal(de);
59146ef3 10773 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10774 key->storage == REDIS_VM_SWAPPING)) {
07efaf74 10775 char *strenc;
10776 char buf[128];
10777
10778 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10779 strenc = strencoding[val->encoding];
10780 } else {
10781 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10782 strenc = buf;
10783 }
ace06542 10784 addReplySds(c,sdscatprintf(sdsempty(),
10785 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 10786 "encoding:%s serializedlength:%lld\r\n",
682ac724 10787 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 10788 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 10789 } else {
10790 addReplySds(c,sdscatprintf(sdsempty(),
10791 "+Key at:%p refcount:%d, value swapped at: page %llu "
10792 "using %llu pages\r\n",
10793 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10794 (unsigned long long) key->vm.usedpages));
10795 }
78ebe4c8 10796 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10797 lookupKeyRead(c->db,c->argv[2]);
10798 addReply(c,shared.ok);
7d30035d 10799 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10800 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10801 robj *key, *val;
10802
10803 if (!server.vm_enabled) {
10804 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10805 return;
10806 }
10807 if (!de) {
10808 addReply(c,shared.nokeyerr);
10809 return;
10810 }
10811 key = dictGetEntryKey(de);
10812 val = dictGetEntryVal(de);
4ef8de8a 10813 /* If the key is shared we want to create a copy */
10814 if (key->refcount > 1) {
10815 robj *newkey = dupStringObject(key);
10816 decrRefCount(key);
10817 key = dictGetEntryKey(de) = newkey;
10818 }
10819 /* Swap it */
7d30035d 10820 if (key->storage != REDIS_VM_MEMORY) {
10821 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 10822 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 10823 dictGetEntryVal(de) = NULL;
10824 addReply(c,shared.ok);
10825 } else {
10826 addReply(c,shared.err);
10827 }
59305dc7 10828 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10829 long keys, j;
10830 robj *key, *val;
10831 char buf[128];
10832
10833 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10834 return;
10835 for (j = 0; j < keys; j++) {
10836 snprintf(buf,sizeof(buf),"key:%lu",j);
10837 key = createStringObject(buf,strlen(buf));
10838 if (lookupKeyRead(c->db,key) != NULL) {
10839 decrRefCount(key);
10840 continue;
10841 }
10842 snprintf(buf,sizeof(buf),"value:%lu",j);
10843 val = createStringObject(buf,strlen(buf));
10844 dictAdd(c->db->dict,key,val);
10845 }
10846 addReply(c,shared.ok);
ba798261 10847 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
10848 unsigned char digest[20];
10849 sds d = sdsnew("+");
10850 int j;
10851
10852 computeDatasetDigest(digest);
10853 for (j = 0; j < 20; j++)
10854 d = sdscatprintf(d, "%02x",digest[j]);
10855
10856 d = sdscatlen(d,"\r\n",2);
10857 addReplySds(c,d);
7f957c92 10858 } else {
333298da 10859 addReplySds(c,sdsnew(
bdcb92f2 10860 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 10861 }
10862}
56906eef 10863
6c96ba7d 10864static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 10865 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
fdfb02e7 10866 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
dfc5e96c 10867#ifdef HAVE_BACKTRACE
10868 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10869 *((char*)-1) = 'x';
10870#endif
10871}
10872
c651fd9e 10873static void _redisPanic(char *msg, char *file, int line) {
10874 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
17772754 10875 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
c651fd9e 10876#ifdef HAVE_BACKTRACE
10877 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10878 *((char*)-1) = 'x';
10879#endif
10880}
10881
bcfc686d 10882/* =================================== Main! ================================ */
56906eef 10883
bcfc686d 10884#ifdef __linux__
/* Read the current overcommit policy from /proc/sys/vm/overcommit_memory.
 *
 * Returns the integer found in the file, or -1 if the file can't be
 * opened, can't be read, or does not start with a number. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *end;
    long value;

    if (!fp) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    /* atoi() would turn unparsable content into 0, that is exactly the
     * value callers treat as "overcommit disabled": report an error
     * instead when no digit was consumed. */
    value = strtol(buf,&end,10);
    if (end == buf) return -1;
    return (int)value;
}
10898
10899void linuxOvercommitMemoryWarning(void) {
10900 if (linuxOvercommitMemoryValue() == 0) {
7ccd2d0a 10901 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
bcfc686d 10902 }
10903}
10904#endif /* __linux__ */
10905
10906static void daemonize(void) {
10907 int fd;
10908 FILE *fp;
10909
10910 if (fork() != 0) exit(0); /* parent exits */
10911 setsid(); /* create a new session */
10912
10913 /* Every output goes to /dev/null. If Redis is daemonized but
10914 * the 'logfile' is set to 'stdout' in the configuration file
10915 * it will not log at all. */
10916 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10917 dup2(fd, STDIN_FILENO);
10918 dup2(fd, STDOUT_FILENO);
10919 dup2(fd, STDERR_FILENO);
10920 if (fd > STDERR_FILENO) close(fd);
10921 }
10922 /* Try to write the pid file */
10923 fp = fopen(server.pidfile,"w");
10924 if (fp) {
10925 fprintf(fp,"%d\n",getpid());
10926 fclose(fp);
56906eef 10927 }
56906eef 10928}
10929
42ab0172 10930static void version() {
8a3b0d2d 10931 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
10932 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
42ab0172
AO
10933 exit(0);
10934}
10935
723fb69b
AO
/* Print the command line synopsis on stderr and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
10941
bcfc686d 10942int main(int argc, char **argv) {
9651a787 10943 time_t start;
10944
bcfc686d 10945 initServerConfig();
10946 if (argc == 2) {
44efe66e 10947 if (strcmp(argv[1], "-v") == 0 ||
10948 strcmp(argv[1], "--version") == 0) version();
10949 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 10950 resetServerSaveParams();
10951 loadServerConfig(argv[1]);
723fb69b
AO
10952 } else if ((argc > 2)) {
10953 usage();
bcfc686d 10954 } else {
10955 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10956 }
bcfc686d 10957 if (server.daemonize) daemonize();
71c54b21 10958 initServer();
bcfc686d 10959 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10960#ifdef __linux__
10961 linuxOvercommitMemoryWarning();
10962#endif
9651a787 10963 start = time(NULL);
bcfc686d 10964 if (server.appendonly) {
10965 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 10966 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 10967 } else {
10968 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 10969 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 10970 }
bcfc686d 10971 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 10972 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 10973 aeMain(server.el);
10974 aeDeleteEventLoop(server.el);
10975 return 0;
10976}
10977
10978/* ============================= Backtrace support ========================= */
10979
10980#ifdef HAVE_BACKTRACE
10981static char *findFuncName(void *pointer, unsigned long *offset);
10982
56906eef 10983static void *getMcontextEip(ucontext_t *uc) {
10984#if defined(__FreeBSD__)
10985 return (void*) uc->uc_mcontext.mc_eip;
10986#elif defined(__dietlibc__)
10987 return (void*) uc->uc_mcontext.eip;
06db1f50 10988#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 10989 #if __x86_64__
10990 return (void*) uc->uc_mcontext->__ss.__rip;
10991 #else
56906eef 10992 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 10993 #endif
06db1f50 10994#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 10995 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 10996 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 10997 #else
10998 return (void*) uc->uc_mcontext->__ss.__eip;
e0a62c7f 10999 #endif
54bac49d 11000#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 11001 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 11002#elif defined(__ia64__) /* Linux IA64 */
11003 return (void*) uc->uc_mcontext.sc_ip;
11004#else
11005 return NULL;
56906eef 11006#endif
11007}
11008
11009static void segvHandler(int sig, siginfo_t *info, void *secret) {
11010 void *trace[100];
11011 char **messages = NULL;
11012 int i, trace_size = 0;
11013 unsigned long offset=0;
56906eef 11014 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 11015 sds infostring;
56906eef 11016 REDIS_NOTUSED(info);
11017
11018 redisLog(REDIS_WARNING,
11019 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 11020 infostring = genRedisInfoString();
11021 redisLog(REDIS_WARNING, "%s",infostring);
11022 /* It's not safe to sdsfree() the returned string under memory
11023 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 11024
56906eef 11025 trace_size = backtrace(trace, 100);
de96dbfe 11026 /* overwrite sigaction with caller's address */
b91cf5ef 11027 if (getMcontextEip(uc) != NULL) {
11028 trace[1] = getMcontextEip(uc);
11029 }
56906eef 11030 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 11031
d76412d1 11032 for (i=1; i<trace_size; ++i) {
56906eef 11033 char *fn = findFuncName(trace[i], &offset), *p;
11034
11035 p = strchr(messages[i],'+');
11036 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11037 redisLog(REDIS_WARNING,"%s", messages[i]);
11038 } else {
11039 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11040 }
11041 }
b177fd30 11042 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 11043 _exit(0);
fe3bbfbe 11044}
56906eef 11045
fab43727 11046static void sigtermHandler(int sig) {
11047 REDIS_NOTUSED(sig);
b58ba105 11048
fab43727 11049 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11050 server.shutdown_asap = 1;
b58ba105
AM
11051}
11052
56906eef 11053static void setupSigSegvAction(void) {
11054 struct sigaction act;
11055
11056 sigemptyset (&act.sa_mask);
11057 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11058 * is used. Otherwise, sa_handler is used */
11059 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11060 act.sa_sigaction = segvHandler;
11061 sigaction (SIGSEGV, &act, NULL);
11062 sigaction (SIGBUS, &act, NULL);
12fea928 11063 sigaction (SIGFPE, &act, NULL);
11064 sigaction (SIGILL, &act, NULL);
11065 sigaction (SIGBUS, &act, NULL);
b58ba105
AM
11066
11067 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
fab43727 11068 act.sa_handler = sigtermHandler;
b58ba105 11069 sigaction (SIGTERM, &act, NULL);
e65fdc78 11070 return;
56906eef 11071}
e65fdc78 11072
bcfc686d 11073#include "staticsymbols.h"
11074/* This function try to convert a pointer into a function name. It's used in
11075 * oreder to provide a backtrace under segmentation fault that's able to
11076 * display functions declared as static (otherwise the backtrace is useless). */
11077static char *findFuncName(void *pointer, unsigned long *offset){
11078 int i, ret = -1;
11079 unsigned long off, minoff = 0;
ed9b544e 11080
bcfc686d 11081 /* Try to match against the Symbol with the smallest offset */
11082 for (i=0; symsTable[i].pointer; i++) {
11083 unsigned long lp = (unsigned long) pointer;
0bc03378 11084
bcfc686d 11085 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11086 off=lp-symsTable[i].pointer;
11087 if (ret < 0 || off < minoff) {
11088 minoff=off;
11089 ret=i;
11090 }
11091 }
0bc03378 11092 }
bcfc686d 11093 if (ret == -1) return NULL;
11094 *offset = minoff;
11095 return symsTable[ret].name;
0bc03378 11096}
bcfc686d 11097#else /* HAVE_BACKTRACE */
/* Variant used when HAVE_BACKTRACE is not defined: no signal handler
 * is installed. */
static void setupSigSegvAction(void) {
    /* Nothing to do */
}
bcfc686d 11100#endif /* HAVE_BACKTRACE */
0bc03378 11101
ed9b544e 11102
ed9b544e 11103
bcfc686d 11104/* The End */
11105
11106
ed9b544e 11107