]> git.saurik.com Git - redis.git/blame - redis.c
Merge branch 'master' into nested-multi
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
9005896c 30#define REDIS_VERSION "2.1.1"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
40#include <signal.h>
fbf9bcdb 41
42#ifdef HAVE_BACKTRACE
c9468bcf 43#include <execinfo.h>
44#include <ucontext.h>
fbf9bcdb 45#endif /* HAVE_BACKTRACE */
46
ed9b544e 47#include <sys/wait.h>
48#include <errno.h>
49#include <assert.h>
50#include <ctype.h>
51#include <stdarg.h>
52#include <inttypes.h>
53#include <arpa/inet.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#include <sys/time.h>
57#include <sys/resource.h>
2895e862 58#include <sys/uio.h>
f78fd11b 59#include <limits.h>
fb82e75c 60#include <float.h>
a7866db6 61#include <math.h>
92f8e882 62#include <pthread.h>
0bc1b2f6 63
64#if defined(__sun)
5043dff3 65#include "solarisfixes.h"
66#endif
ed9b544e 67
c9468bcf 68#include "redis.h"
ed9b544e 69#include "ae.h" /* Event driven programming library */
70#include "sds.h" /* Dynamic safe strings */
71#include "anet.h" /* Networking the easy way */
72#include "dict.h" /* Hash tables */
73#include "adlist.h" /* Linked lists */
74#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 75#include "lzf.h" /* LZF compression library */
76#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
ba798261 77#include "zipmap.h" /* Compact dictionary-alike data structure */
78#include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
5436146c 79#include "release.h" /* Release and/or git repository information */
ed9b544e 80
81/* Error codes */
82#define REDIS_OK 0
83#define REDIS_ERR -1
84
85/* Static server configuration */
86#define REDIS_SERVERPORT 6379 /* TCP port */
87#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 88#define REDIS_IOBUF_LEN 1024
ed9b544e 89#define REDIS_LOADBUF_LEN 1024
248ea310 90#define REDIS_STATIC_ARGS 8
ed9b544e 91#define REDIS_DEFAULT_DBNUM 16
92#define REDIS_CONFIGLINE_MAX 1024
93#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
94#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
8ca3e9d1 95#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
6f376729 96#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 97#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
98
99/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
100#define REDIS_WRITEV_THRESHOLD 3
101/* Max number of iovecs used for each writev call */
102#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 103
104/* Hash table parameters */
105#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 106
107/* Command flags */
3fd78bcd 108#define REDIS_CMD_BULK 1 /* Bulk write command */
109#define REDIS_CMD_INLINE 2 /* Inline command */
110/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
111 this flag will return an error when the 'maxmemory' option is set in the
112 config file and the server is using more than maxmemory bytes of memory.
113 In short these commands are denied on low memory conditions. */
114#define REDIS_CMD_DENYOOM 4
4005fef1 115#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 116
117/* Object types */
118#define REDIS_STRING 0
119#define REDIS_LIST 1
120#define REDIS_SET 2
1812e024 121#define REDIS_ZSET 3
122#define REDIS_HASH 4
f78fd11b 123
5234952b 124/* Objects encoding. Some kind of objects like Strings and Hashes can be
125 * internally represented in multiple ways. The 'encoding' field of the object
126 * is set to one of these values for this object. */
942a3961 127#define REDIS_ENCODING_RAW 0 /* Raw representation */
128#define REDIS_ENCODING_INT 1 /* Encoded as integer */
5234952b 129#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
130#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
942a3961 131
07efaf74 132static char* strencoding[] = {
133 "raw", "int", "zipmap", "hashtable"
134};
135
f78fd11b 136/* Object types only used for dumping to disk */
bb32ede5 137#define REDIS_EXPIRETIME 253
ed9b544e 138#define REDIS_SELECTDB 254
139#define REDIS_EOF 255
140
f78fd11b 141/* Defines related to the dump file format. To store 32 bits lengths for short
142 * keys requires a lot of space, so we check the most significant 2 bits of
143 * the first byte to interpret the length:
144 *
145 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
146 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
147 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 148 * 11|000000 this means: specially encoded object will follow. The six bits
149 * number specify the kind of object that follows.
150 * See the REDIS_RDB_ENC_* defines.
f78fd11b 151 *
10c43610 152 * Lengths up to 63 are stored using a single byte, most DB keys, and many
153 * values, will fit inside. */
f78fd11b 154#define REDIS_RDB_6BITLEN 0
155#define REDIS_RDB_14BITLEN 1
156#define REDIS_RDB_32BITLEN 2
17be1a4a 157#define REDIS_RDB_ENCVAL 3
f78fd11b 158#define REDIS_RDB_LENERR UINT_MAX
159
a4d1ba9a 160/* When a length of a string object stored on disk has the first two bits
161 * set, the remaining six bits specify a special encoding for the object
162 * according to the following defines: */
163#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
164#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
165#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 166#define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF */
a4d1ba9a 167
75680a3c 168/* Virtual memory object->where field. */
169#define REDIS_VM_MEMORY 0 /* The object is on memory */
170#define REDIS_VM_SWAPPED 1 /* The object is on disk */
171#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
172#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
173
06224fec 174/* Virtual memory static configuration stuff.
175 * Check vmFindContiguousPages() to know more about this magic numbers. */
176#define REDIS_VM_MAX_NEAR_PAGES 65536
177#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 178#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 179#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
f6c0bba8 180/* The following is the *percentage* of completed I/O jobs to process when the
181 * handler is called. While Virtual Memory I/O operations are performed by
182 * threads, these operations must be processed by the main thread when completed
183 * in order to take effect. */
c953f24b 184#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 185
ed9b544e 186/* Client flags */
d5d55fc3 187#define REDIS_SLAVE 1 /* This client is a slave server */
188#define REDIS_MASTER 2 /* This client is a master server */
189#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
190#define REDIS_MULTI 8 /* This client is in a MULTI context */
191#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
192#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
37ab76c9 193#define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
ed9b544e 194
40d224a9 195/* Slave replication state - slave side */
ed9b544e 196#define REDIS_REPL_NONE 0 /* No active replication */
197#define REDIS_REPL_CONNECT 1 /* Must connect to master */
198#define REDIS_REPL_CONNECTED 2 /* Connected to master */
199
40d224a9 200/* Slave replication state - from the point of view of master
201 * Note that in SEND_BULK and ONLINE state the slave receives new updates
202 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
203 * to start the next background saving in order to send updates to it. */
204#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
205#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
206#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
207#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
208
ed9b544e 209/* List related stuff */
210#define REDIS_HEAD 0
211#define REDIS_TAIL 1
212
213/* Sort operations */
214#define REDIS_SORT_GET 0
443c6409 215#define REDIS_SORT_ASC 1
216#define REDIS_SORT_DESC 2
ed9b544e 217#define REDIS_SORTKEY_MAX 1024
218
219/* Log levels */
220#define REDIS_DEBUG 0
f870935d 221#define REDIS_VERBOSE 1
222#define REDIS_NOTICE 2
223#define REDIS_WARNING 3
ed9b544e 224
/* Anti-warning macro: evaluates V and discards the result so the compiler
 * does not complain about an unused variable/parameter. The argument is
 * parenthesized so that any expression (not just a plain identifier) can be
 * passed safely. */
#define REDIS_NOTUSED(V) ((void)(V))
227
6b47e12e 228#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
229#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 230
48f0308a 231/* Append only defines */
232#define APPENDFSYNC_NO 0
233#define APPENDFSYNC_ALWAYS 1
234#define APPENDFSYNC_EVERYSEC 2
235
cbba7dd7 236/* Hashes related defaults */
237#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
238#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
239
/* We can print the stacktrace, so our assert is defined this way:
 *
 * redisAssert(_e): when _e is false, calls _redisAssert() with the
 *                  stringified expression, file and line, then _exit(1).
 * redisPanic(_e):  unconditionally calls _redisPanic() with the stringified
 *                  argument, file and line, then _exit(1).
 *
 * Both macros expand to a single fully parenthesized void expression, so
 * they keep their meaning when used inside a larger expression. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
dfc5e96c 245
ed9b544e 246/*================================= Data types ============================== */
247
248/* A redis object, that is a type able to hold a string / list / set */
75680a3c 249
250/* The VM object structure */
251struct redisObjectVM {
3a66edc7 252 off_t page; /* the page at which the object is stored on disk */
253 off_t usedpages; /* number of pages used on disk */
254 time_t atime; /* Last access time */
75680a3c 255} vm; /* NOTE(review): this declarator also defines a file-scope variable named 'vm' - confirm it is intentional */
256
257/* The actual Redis Object */
ed9b544e 258typedef struct redisObject {
ed9b544e 259 void *ptr;
942a3961 260 unsigned char type;
261 unsigned char encoding;
d894161b 262 unsigned char storage; /* If this object is a key, where is the value?
263 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
264 unsigned char vtype; /* If this object is a key, and value is swapped out,
265 * this is the type of the swapped out object. */
ed9b544e 266 int refcount;
75680a3c 267 /* VM fields, these are only allocated if VM is active, otherwise the
268 * object allocation function will just allocate
269 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
270 * Redis without VM active will not have any overhead. */
271 struct redisObjectVM vm;
ed9b544e 272} robj;
273
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is taken near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the object itself (not a pointer to it), _ptr is stored in its
 * 'ptr' field. The 'storage' field is only set when server.vm_enabled is
 * true.
 *
 * The definition intentionally does NOT end with a semicolon: the caller
 * supplies it, so the macro behaves like a single statement and can be used
 * as the body of an un-braced if/else. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
285
3305306f 286typedef struct redisDb {
4409877e 287 dict *dict; /* The keyspace for this DB */
288 dict *expires; /* Timeout of keys with a timeout set */
37ab76c9 289 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 290 dict *io_keys; /* Keys with clients waiting for VM I/O */
37ab76c9 291 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
3305306f 292 int id;
293} redisDb;
294
6e469882 295/* Client MULTI/EXEC state */
296typedef struct multiCmd {
297 robj **argv;
298 int argc;
299 struct redisCommand *cmd;
300} multiCmd;
301
302typedef struct multiState {
303 multiCmd *commands; /* Array of MULTI commands */
304 int count; /* Total number of MULTI commands */
305} multiState;
306
ed9b544e 307/* With multiplexing we need to take per-client state.
308 * Clients are taken in a linked list. */
309typedef struct redisClient {
310 int fd;
3305306f 311 redisDb *db;
ed9b544e 312 int dictid;
313 sds querybuf;
e8a74421 314 robj **argv, **mbargv;
315 int argc, mbargc;
40d224a9 316 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 317 int multibulk; /* multi bulk command format active */
ed9b544e 318 list *reply;
319 int sentlen;
320 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 321 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 322 int slaveseldb; /* slave selected db, if this client is a slave */
323 int authenticated; /* when requirepass is non-NULL */
324 int replstate; /* replication state if this is a slave */
325 int repldbfd; /* replication DB file descriptor */
6e469882 326 long repldboff; /* replication DB file offset */
40d224a9 327 off_t repldbsize; /* replication DB file size */
6e469882 328 multiState mstate; /* MULTI/EXEC state */
37ab76c9 329 robj **blocking_keys; /* The key we are waiting to terminate a blocking
4409877e 330 * operation such as BLPOP. Otherwise NULL. */
37ab76c9 331 int blocking_keys_num; /* Number of blocking keys */
4409877e 332 time_t blockingto; /* Blocking operation timeout. If UNIX current time
333 * is >= blockingto then the operation timed out. */
92f8e882 334 list *io_keys; /* Keys this client is waiting to be loaded from the
335 * swap file in order to continue. */
37ab76c9 336 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
ffc6b7f8 337 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
338 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 339} redisClient;
340
341struct saveparam {
342 time_t seconds;
343 int changes;
344};
345
346/* Global server state structure */
347struct redisServer {
348 int port;
349 int fd;
3305306f 350 redisDb *db;
ed9b544e 351 long long dirty; /* changes to DB from the last save */
352 list *clients;
87eca727 353 list *slaves, *monitors;
ed9b544e 354 char neterr[ANET_ERR_LEN];
355 aeEventLoop *el;
356 int cronloops; /* number of times the cron function run */
357 list *objfreelist; /* A list of freed objects to avoid malloc() */
358 time_t lastsave; /* Unix time of last save succeeded */
ed9b544e 359 /* Fields used only for stats */
360 time_t stat_starttime; /* server start time */
361 long long stat_numcommands; /* number of processed commands */
362 long long stat_numconnections; /* number of connections received */
2a6a2ed1 363 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 364 /* Configuration */
365 int verbosity;
366 int glueoutputbuf;
367 int maxidletime;
368 int dbnum;
369 int daemonize;
44b38ef4 370 int appendonly;
48f0308a 371 int appendfsync;
fab43727 372 int shutdown_asap;
48f0308a 373 time_t lastfsync;
44b38ef4 374 int appendfd;
375 int appendseldb;
ed329fcf 376 char *pidfile;
9f3c422c 377 pid_t bgsavechildpid;
9d65a1bb 378 pid_t bgrewritechildpid;
379 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
28ed1f33 380 sds aofbuf; /* AOF buffer, written before entering the event loop */
ed9b544e 381 struct saveparam *saveparams;
382 int saveparamslen;
383 char *logfile;
384 char *bindaddr;
385 char *dbfilename;
44b38ef4 386 char *appendfilename;
abcb223e 387 char *requirepass;
121f70cf 388 int rdbcompression;
8ca3e9d1 389 int activerehashing;
ed9b544e 390 /* Replication related */
391 int isslave;
d0ccebcf 392 char *masterauth;
ed9b544e 393 char *masterhost;
394 int masterport;
40d224a9 395 redisClient *master; /* client that is master for this slave */
ed9b544e 396 int replstate;
285add55 397 unsigned int maxclients;
4ef8de8a 398 unsigned long long maxmemory;
d5d55fc3 399 unsigned int blpop_blocked_clients;
400 unsigned int vm_blocked_clients;
ed9b544e 401 /* Sort parameters - qsort_r() is only available under BSD so we
402 * have to take this state global, in order to pass it to sortCompare() */
403 int sort_desc;
404 int sort_alpha;
405 int sort_bypattern;
75680a3c 406 /* Virtual memory configuration */
407 int vm_enabled;
054e426d 408 char *vm_swap_file;
75680a3c 409 off_t vm_page_size;
410 off_t vm_pages;
4ef8de8a 411 unsigned long long vm_max_memory;
cbba7dd7 412 /* Hashes config */
413 size_t hash_max_zipmap_entries;
414 size_t hash_max_zipmap_value;
75680a3c 415 /* Virtual memory state */
416 FILE *vm_fp;
417 int vm_fd;
418 off_t vm_next_page; /* Next probably empty page */
419 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 420 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 421 time_t unixtime; /* Unix time sampled every second. */
92f8e882 422 /* Virtual memory I/O threads stuff */
92f8e882 423 /* An I/O thread processes an element taken from the io_newjobs queue and
996cb5f7 424 * puts the result of the operation in the io_processed list. While the
425 * job is being processed, it's put on io_processing queue. */
426 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
427 list *io_processing; /* List of VM I/O jobs being processed */
428 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 429 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 430 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 431 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
432 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 433 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 434 int io_active_threads; /* Number of running I/O threads */
435 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 436 /* Our main thread is blocked on the event loop, looking for sockets ready
437 * to be read or written, so when a threaded I/O operation is ready to be
438 * processed by the main thread, the I/O thread will use a unix pipe to
439 * awake the main thread. The following are the two pipe FDs. */
440 int io_ready_pipe_read;
441 int io_ready_pipe_write;
7d98e08c 442 /* Virtual memory stats */
443 unsigned long long vm_stats_used_pages;
444 unsigned long long vm_stats_swapped_objects;
445 unsigned long long vm_stats_swapouts;
446 unsigned long long vm_stats_swapins;
befec3cd 447 /* Pubsub */
ffc6b7f8 448 dict *pubsub_channels; /* Map channels to list of subscribed clients */
449 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 450 /* Misc */
b9bc0eef 451 FILE *devnull;
ed9b544e 452};
453
ffc6b7f8 454typedef struct pubsubPattern {
455 redisClient *client;
456 robj *pattern;
457} pubsubPattern;
458
ed9b544e 459typedef void redisCommandProc(redisClient *c);
ca1788b5 460typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
ed9b544e 461struct redisCommand {
462 char *name;
463 redisCommandProc *proc;
464 int arity;
465 int flags;
76583ea4
PN
466 /* Use a function to determine which keys need to be loaded
467 * in the background prior to executing this command. Takes precedence
468 * over vm_firstkey and others, ignored when NULL */
ca1788b5 469 redisVmPreloadProc *vm_preload_proc;
7c775e09 470 /* What keys should be loaded in background when calling this command? */
471 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
472 int vm_lastkey; /* The last argument that's a key */
473 int vm_keystep; /* The step between first and last key */
ed9b544e 474};
475
de96dbfe 476struct redisFunctionSym {
477 char *name;
56906eef 478 unsigned long pointer;
de96dbfe 479};
480
ed9b544e 481typedef struct _redisSortObject {
482 robj *obj;
483 union {
484 double score;
485 robj *cmpobj;
486 } u;
487} redisSortObject;
488
489typedef struct _redisSortOperation {
490 int type;
491 robj *pattern;
492} redisSortOperation;
493
6b47e12e 494/* ZSETs use a specialized version of Skiplists */
495
496typedef struct zskiplistNode {
497 struct zskiplistNode **forward;
e3870fab 498 struct zskiplistNode *backward;
912b9165 499 unsigned int *span;
6b47e12e 500 double score;
501 robj *obj;
502} zskiplistNode;
503
504typedef struct zskiplist {
e3870fab 505 struct zskiplistNode *header, *tail;
d13f767c 506 unsigned long length;
6b47e12e 507 int level;
508} zskiplist;
509
1812e024 510typedef struct zset {
511 dict *dict;
6b47e12e 512 zskiplist *zsl;
1812e024 513} zset;
514
6b47e12e 515/* Our shared "common" objects */
516
05df7621 517#define REDIS_SHARED_INTEGERS 10000
ed9b544e 518struct sharedObjectsStruct {
c937aa89 519 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 520 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 521 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
522 *outofrangeerr, *plus,
ed9b544e 523 *select0, *select1, *select2, *select3, *select4,
befec3cd 524 *select5, *select6, *select7, *select8, *select9,
c8d0ea0e 525 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
526 *mbulk4, *psubscribebulk, *punsubscribebulk,
527 *integers[REDIS_SHARED_INTEGERS];
ed9b544e 528} shared;
529
a7866db6 530/* Global vars that are actually used as constants. The following double
531 * values are used for double on-disk serialization, and are initialized
532 * at runtime to avoid strange compiler optimizations. */
533
534static double R_Zero, R_PosInf, R_NegInf, R_Nan;
535
92f8e882 536/* VM threaded I/O request message */
b9bc0eef 537#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
538#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
539#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 540typedef struct iojob {
996cb5f7 541 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 542 redisDb *db;/* Redis database */
92f8e882 543 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 544 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
92f8e882 545 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
546 off_t page; /* Swap page where to read/write the object */
248ea310 547 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 548 int canceled; /* True if this command was canceled by blocking side of VM */
549 pthread_t thread; /* ID of the thread processing this entry */
550} iojob;
92f8e882 551
ed9b544e 552/*================================ Prototypes =============================== */
553
554static void freeStringObject(robj *o);
555static void freeListObject(robj *o);
556static void freeSetObject(robj *o);
557static void decrRefCount(void *o);
558static robj *createObject(int type, void *ptr);
559static void freeClient(redisClient *c);
f78fd11b 560static int rdbLoad(char *filename);
ed9b544e 561static void addReply(redisClient *c, robj *obj);
562static void addReplySds(redisClient *c, sds s);
563static void incrRefCount(robj *o);
f78fd11b 564static int rdbSaveBackground(char *filename);
ed9b544e 565static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 566static robj *dupStringObject(robj *o);
248ea310 567static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
dd142b9c 568static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
28ed1f33 569static void flushAppendOnlyFile(void);
44b38ef4 570static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 571static int syncWithMaster(void);
05df7621 572static robj *tryObjectEncoding(robj *o);
9d65a1bb 573static robj *getDecodedObject(robj *o);
3305306f 574static int removeExpire(redisDb *db, robj *key);
575static int expireIfNeeded(redisDb *db, robj *key);
576static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 577static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 578static int deleteKey(redisDb *db, robj *key);
bb32ede5 579static time_t getExpire(redisDb *db, robj *key);
580static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 581static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 582static void freeMemoryIfNeeded(void);
de96dbfe 583static int processCommand(redisClient *c);
56906eef 584static void setupSigSegvAction(void);
a3b21203 585static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 586static void aofRemoveTempFile(pid_t childpid);
0ea663ea 587static size_t stringObjectLen(robj *o);
638e42ac 588static void processInputBuffer(redisClient *c);
6b47e12e 589static zskiplist *zslCreate(void);
fd8ccf44 590static void zslFree(zskiplist *zsl);
2b59cfdf 591static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 592static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 593static void initClientMultiState(redisClient *c);
594static void freeClientMultiState(redisClient *c);
595static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 596static void unblockClientWaitingData(redisClient *c);
4409877e 597static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 598static void vmInit(void);
a35ddf12 599static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 600static robj *vmLoadObject(robj *key);
7e69548d 601static robj *vmPreviewObject(robj *key);
a69a0c9c 602static int vmSwapOneObjectBlocking(void);
603static int vmSwapOneObjectThreaded(void);
7e69548d 604static int vmCanSwapOut(void);
a5819310 605static int tryFreeOneObjectFromFreelist(void);
996cb5f7 606static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
607static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
608static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 609static void lockThreadedIO(void);
610static void unlockThreadedIO(void);
611static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
612static void freeIOJob(iojob *j);
613static void queueIOJob(iojob *j);
a5819310 614static int vmWriteObjectOnSwap(robj *o, off_t page);
615static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 616static void waitEmptyIOJobsQueue(void);
617static void vmReopenSwapFile(void);
970e10bb 618static int vmFreePage(off_t page);
ca1788b5 619static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
3805e04f 620static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
0a6f3f0f 621static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
d5d55fc3 622static int dontWaitForSwappedKey(redisClient *c, robj *key);
623static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
624static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
625static struct redisCommand *lookupCommand(char *name);
626static void call(redisClient *c, struct redisCommand *cmd);
627static void resetClient(redisClient *c);
ada386b2 628static void convertToRealHash(robj *o);
ffc6b7f8 629static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
630static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
631static void freePubsubPattern(void *p);
632static int listMatchPubsubPattern(void *a, void *b);
633static int compareStringObjects(robj *a, robj *b);
bf028098 634static int equalStringObjects(robj *a, robj *b);
befec3cd 635static void usage();
8f63ddca 636static int rewriteAppendOnlyFileBackground(void);
242a64f3 637static int vmSwapObjectBlocking(robj *key, robj *val);
fab43727 638static int prepareForShutdown();
37ab76c9 639static void touchWatchedKey(redisDb *db, robj *key);
9b30e1a2 640static void touchWatchedKeysOnFlush(int dbid);
37ab76c9 641static void unwatchAllKeys(redisClient *c);
ed9b544e 642
abcb223e 643static void authCommand(redisClient *c);
ed9b544e 644static void pingCommand(redisClient *c);
645static void echoCommand(redisClient *c);
646static void setCommand(redisClient *c);
647static void setnxCommand(redisClient *c);
526d00a5 648static void setexCommand(redisClient *c);
ed9b544e 649static void getCommand(redisClient *c);
650static void delCommand(redisClient *c);
651static void existsCommand(redisClient *c);
652static void incrCommand(redisClient *c);
653static void decrCommand(redisClient *c);
654static void incrbyCommand(redisClient *c);
655static void decrbyCommand(redisClient *c);
656static void selectCommand(redisClient *c);
657static void randomkeyCommand(redisClient *c);
658static void keysCommand(redisClient *c);
659static void dbsizeCommand(redisClient *c);
660static void lastsaveCommand(redisClient *c);
661static void saveCommand(redisClient *c);
662static void bgsaveCommand(redisClient *c);
9d65a1bb 663static void bgrewriteaofCommand(redisClient *c);
ed9b544e 664static void shutdownCommand(redisClient *c);
665static void moveCommand(redisClient *c);
666static void renameCommand(redisClient *c);
667static void renamenxCommand(redisClient *c);
668static void lpushCommand(redisClient *c);
669static void rpushCommand(redisClient *c);
670static void lpopCommand(redisClient *c);
671static void rpopCommand(redisClient *c);
672static void llenCommand(redisClient *c);
673static void lindexCommand(redisClient *c);
674static void lrangeCommand(redisClient *c);
675static void ltrimCommand(redisClient *c);
676static void typeCommand(redisClient *c);
677static void lsetCommand(redisClient *c);
678static void saddCommand(redisClient *c);
679static void sremCommand(redisClient *c);
a4460ef4 680static void smoveCommand(redisClient *c);
ed9b544e 681static void sismemberCommand(redisClient *c);
682static void scardCommand(redisClient *c);
12fea928 683static void spopCommand(redisClient *c);
2abb95a9 684static void srandmemberCommand(redisClient *c);
ed9b544e 685static void sinterCommand(redisClient *c);
686static void sinterstoreCommand(redisClient *c);
40d224a9 687static void sunionCommand(redisClient *c);
688static void sunionstoreCommand(redisClient *c);
f4f56e1d 689static void sdiffCommand(redisClient *c);
690static void sdiffstoreCommand(redisClient *c);
ed9b544e 691static void syncCommand(redisClient *c);
692static void flushdbCommand(redisClient *c);
693static void flushallCommand(redisClient *c);
694static void sortCommand(redisClient *c);
695static void lremCommand(redisClient *c);
0f5f7e9a 696static void rpoplpushcommand(redisClient *c);
ed9b544e 697static void infoCommand(redisClient *c);
70003d28 698static void mgetCommand(redisClient *c);
87eca727 699static void monitorCommand(redisClient *c);
3305306f 700static void expireCommand(redisClient *c);
802e8373 701static void expireatCommand(redisClient *c);
f6b141c5 702static void getsetCommand(redisClient *c);
fd88489a 703static void ttlCommand(redisClient *c);
321b0e13 704static void slaveofCommand(redisClient *c);
7f957c92 705static void debugCommand(redisClient *c);
f6b141c5 706static void msetCommand(redisClient *c);
707static void msetnxCommand(redisClient *c);
fd8ccf44 708static void zaddCommand(redisClient *c);
7db723ad 709static void zincrbyCommand(redisClient *c);
cc812361 710static void zrangeCommand(redisClient *c);
50c55df5 711static void zrangebyscoreCommand(redisClient *c);
f44dd428 712static void zcountCommand(redisClient *c);
e3870fab 713static void zrevrangeCommand(redisClient *c);
3c41331e 714static void zcardCommand(redisClient *c);
1b7106e7 715static void zremCommand(redisClient *c);
6e333bbe 716static void zscoreCommand(redisClient *c);
1807985b 717static void zremrangebyscoreCommand(redisClient *c);
6e469882 718static void multiCommand(redisClient *c);
719static void execCommand(redisClient *c);
18b6cb76 720static void discardCommand(redisClient *c);
4409877e 721static void blpopCommand(redisClient *c);
722static void brpopCommand(redisClient *c);
4b00bebd 723static void appendCommand(redisClient *c);
39191553 724static void substrCommand(redisClient *c);
69d95c3e 725static void zrankCommand(redisClient *c);
798d9e55 726static void zrevrankCommand(redisClient *c);
978c2c94 727static void hsetCommand(redisClient *c);
1f1c7695 728static void hsetnxCommand(redisClient *c);
978c2c94 729static void hgetCommand(redisClient *c);
09aeb579
PN
730static void hmsetCommand(redisClient *c);
731static void hmgetCommand(redisClient *c);
07efaf74 732static void hdelCommand(redisClient *c);
92b27fe9 733static void hlenCommand(redisClient *c);
9212eafd 734static void zremrangebyrankCommand(redisClient *c);
5d373da9 735static void zunionstoreCommand(redisClient *c);
736static void zinterstoreCommand(redisClient *c);
78409a0f 737static void hkeysCommand(redisClient *c);
738static void hvalsCommand(redisClient *c);
739static void hgetallCommand(redisClient *c);
a86f14b1 740static void hexistsCommand(redisClient *c);
500ece7c 741static void configCommand(redisClient *c);
01426b05 742static void hincrbyCommand(redisClient *c);
befec3cd 743static void subscribeCommand(redisClient *c);
744static void unsubscribeCommand(redisClient *c);
ffc6b7f8 745static void psubscribeCommand(redisClient *c);
746static void punsubscribeCommand(redisClient *c);
befec3cd 747static void publishCommand(redisClient *c);
37ab76c9 748static void watchCommand(redisClient *c);
749static void unwatchCommand(redisClient *c);
f6b141c5 750
ed9b544e 751/*================================= Globals ================================= */
752
753/* Global vars */
754static struct redisServer server; /* server global state */
755static struct redisCommand cmdTable[] = {
76583ea4
PN
756 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
757 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
758 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
526d00a5 759 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
760 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
761 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
762 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
763 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
765 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
766 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
767 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
768 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
769 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
770 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
771 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
772 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
776 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
779 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
780 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
781 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
782 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
783 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
784 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
785 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
786 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
788 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
789 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
790 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
791 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
792 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
793 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
794 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
795 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
797 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
5d373da9 799 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
800 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
801 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
802 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
806 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
807 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
808 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
809 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
1f1c7695 810 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 811 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
d33278d1 812 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 813 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
01426b05 814 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
815 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
816 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
817 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
818 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
819 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 820 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
821 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
822 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
823 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
824 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
825 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
826 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
829 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
830 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
831 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
837 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
838 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
840 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
842 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
843 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
844 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
3805e04f 845 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
846 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
847 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
848 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
849 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
850 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
851 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
852 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
853 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
854 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
855 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 856 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 857 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
858 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 859 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
860 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 861 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
37ab76c9 862 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
863 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
76583ea4 864 {NULL,NULL,0,0,NULL,0,0,0}
ed9b544e 865};
bcfc686d 866
ed9b544e 867/*============================ Utility functions ============================ */
868
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *        any sequence of bytes, including the empty one
 *   ?        exactly one byte
 *   [abc]    one byte out of the listed ones; a leading '^' negates the
 *            class, "a-z" is an inclusive range, and a backslash takes the
 *            next byte literally
 *   \x       the byte x taken literally
 *
 * If 'nocase' is non-zero literals and ranges are compared after tolower().
 * Escaped bytes inside a class are always compared exactly.
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise.
 *
 * The function never reads past pattern[patternLen-1] or
 * string[stringLen-1]: neither buffer needs to be nul terminated, and an
 * empty string is never matched by '?', a class or a literal. Bytes are
 * handled as unsigned char so ranges and tolower() behave the same whatever
 * the signedness of plain char is. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen) {
        switch (pattern[0]) {
        case '*':
            /* Collapse a run of stars into the last one. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing star matches everything left */
            /* Try to match the rest of the pattern at every offset. */
            while (stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            if (stringLen == 0)
                return 0; /* a class always consumes one byte */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while (1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back one byte so that the
                     * common "advance the pattern" code after the switch
                     * leaves us exactly at the end of the pattern. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = (unsigned char)pattern[0];
                    int end = (unsigned char)pattern[2];
                    int c = (unsigned char)string[0];

                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing stars can still match. */
            while (patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
991
/* Convenience wrapper around stringmatchlen() for nul terminated pattern
 * and string: the lengths are taken with strlen(). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
995
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-', a run of decimal digits and an optional
 * unit, matched case insensitively:
 *
 *   (none), b   1
 *   k           1000            kb   1024
 *   m           1000*1000       mb   1024*1024
 *   g           1000*1000*1000  gb   1024*1024*1024
 *
 * If 'err' is not NULL, *err is set to 0 on success and to 1 on error.
 * Errors are:
 *   - unknown unit: the number is returned as if no unit was given;
 *   - numeric part of 128 characters or more: LLONG_MAX is returned;
 *   - result not representable as long long: LLONG_MAX or LLONG_MIN is
 *     returned instead of overflowing. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long long mul; /* unit multiplier */
    long long val;
    size_t digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = (size_t)(u-p);
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    /* mul is always >= 1: refuse to multiply when it would overflow. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1042
/* Convert a long long into a string.
 *
 * At most 'len' bytes of 's' are written, nul terminator included. If the
 * buffer is too small the output is truncated, keeping the leading
 * characters of the number. Returns the number of characters stored in
 * 's', not counting the nul terminator; 0 is returned, and nothing is
 * written, when 'len' is 0. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Negate in the unsigned domain: -value overflows for LLONG_MIN. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1066
56906eef 1067static void redisLog(int level, const char *fmt, ...) {
ed9b544e 1068 va_list ap;
1069 FILE *fp;
1070
1071 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1072 if (!fp) return;
1073
1074 va_start(ap, fmt);
1075 if (level >= server.verbosity) {
6766f45e 1076 char *c = ".-*#";
1904ecc1 1077 char buf[64];
1078 time_t now;
1079
1080 now = time(NULL);
6c9385e0 1081 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 1082 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 1083 vfprintf(fp, fmt, ap);
1084 fprintf(fp,"\n");
1085 fflush(fp);
1086 }
1087 va_end(ap);
1088
1089 if (server.logfile) fclose(fp);
1090}
1091
1092/*====================== Hash table type implementation ==================== */
1093
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1097
/* Dict destructor callback: release 'val' with zfree(). The private data
 * pointer is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
1103
4409877e 1104static void dictListDestructor(void *privdata, void *val)
1105{
1106 DICT_NOTUSED(privdata);
1107 listRelease((list*)val);
1108}
1109
ed9b544e 1110static int sdsDictKeyCompare(void *privdata, const void *key1,
1111 const void *key2)
1112{
1113 int l1,l2;
1114 DICT_NOTUSED(privdata);
1115
1116 l1 = sdslen((sds)key1);
1117 l2 = sdslen((sds)key2);
1118 if (l1 != l2) return 0;
1119 return memcmp(key1, key2, l1) == 0;
1120}
1121
/* Dict destructor callback for redis objects: drop one reference.
 * The private data pointer is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
1129
942a3961 1130static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1131 const void *key2)
1132{
1133 const robj *o1 = key1, *o2 = key2;
1134 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1135}
1136
942a3961 1137static unsigned int dictObjHash(const void *key) {
ed9b544e 1138 const robj *o = key;
1139 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1140}
1141
942a3961 1142static int dictEncObjKeyCompare(void *privdata, const void *key1,
1143 const void *key2)
1144{
9d65a1bb 1145 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1146 int cmp;
942a3961 1147
2a1198b4 1148 if (o1->encoding == REDIS_ENCODING_INT &&
dc05abde 1149 o2->encoding == REDIS_ENCODING_INT)
1150 return o1->ptr == o2->ptr;
2a1198b4 1151
9d65a1bb 1152 o1 = getDecodedObject(o1);
1153 o2 = getDecodedObject(o2);
1154 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1155 decrRefCount(o1);
1156 decrRefCount(o2);
1157 return cmp;
942a3961 1158}
1159
1160static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1161 robj *o = (robj*) key;
942a3961 1162
ed9e4966 1163 if (o->encoding == REDIS_ENCODING_RAW) {
1164 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1165 } else {
1166 if (o->encoding == REDIS_ENCODING_INT) {
1167 char buf[32];
1168 int len;
1169
ee14da56 1170 len = ll2string(buf,32,(long)o->ptr);
ed9e4966 1171 return dictGenHashFunction((unsigned char*)buf, len);
1172 } else {
1173 unsigned int hash;
1174
1175 o = getDecodedObject(o);
1176 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1177 decrRefCount(o);
1178 return hash;
1179 }
1180 }
942a3961 1181}
1182
f2d9f50f 1183/* Sets type and expires */
ed9b544e 1184static dictType setDictType = {
942a3961 1185 dictEncObjHash, /* hash function */
ed9b544e 1186 NULL, /* key dup */
1187 NULL, /* val dup */
942a3961 1188 dictEncObjKeyCompare, /* key compare */
ed9b544e 1189 dictRedisObjectDestructor, /* key destructor */
1190 NULL /* val destructor */
1191};
1192
f2d9f50f 1193/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1812e024 1194static dictType zsetDictType = {
1195 dictEncObjHash, /* hash function */
1196 NULL, /* key dup */
1197 NULL, /* val dup */
1198 dictEncObjKeyCompare, /* key compare */
1199 dictRedisObjectDestructor, /* key destructor */
da0a1620 1200 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1201};
1202
f2d9f50f 1203/* Db->dict */
5234952b 1204static dictType dbDictType = {
942a3961 1205 dictObjHash, /* hash function */
ed9b544e 1206 NULL, /* key dup */
1207 NULL, /* val dup */
942a3961 1208 dictObjKeyCompare, /* key compare */
ed9b544e 1209 dictRedisObjectDestructor, /* key destructor */
1210 dictRedisObjectDestructor /* val destructor */
1211};
1212
f2d9f50f 1213/* Db->expires */
1214static dictType keyptrDictType = {
1215 dictObjHash, /* hash function */
1216 NULL, /* key dup */
1217 NULL, /* val dup */
1218 dictObjKeyCompare, /* key compare */
1219 dictRedisObjectDestructor, /* key destructor */
1220 NULL /* val destructor */
1221};
1222
5234952b 1223/* Hash type hash table (note that small hashes are represented with zimpaps) */
1224static dictType hashDictType = {
1225 dictEncObjHash, /* hash function */
1226 NULL, /* key dup */
1227 NULL, /* val dup */
1228 dictEncObjKeyCompare, /* key compare */
1229 dictRedisObjectDestructor, /* key destructor */
1230 dictRedisObjectDestructor /* val destructor */
1231};
1232
4409877e 1233/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1234 * lists as values. It's used for blocking operations (BLPOP) and to
1235 * map swapped keys to a list of clients waiting for this keys to be loaded. */
4409877e 1236static dictType keylistDictType = {
1237 dictObjHash, /* hash function */
1238 NULL, /* key dup */
1239 NULL, /* val dup */
1240 dictObjKeyCompare, /* key compare */
1241 dictRedisObjectDestructor, /* key destructor */
1242 dictListDestructor /* val destructor */
1243};
1244
42ab0172
AO
1245static void version();
1246
ed9b544e 1247/* ========================= Random utility functions ======================= */
1248
1249/* Redis generally does not try to recover from out of memory conditions
1250 * when allocating objects or strings, it is not clear if it will be possible
1251 * to report this condition to the client since the networking layer itself
1252 * is based on heap allocation for send buffers, so we simply abort.
1253 * At least the code will be simpler to read... */
1254static void oom(const char *msg) {
71c54b21 1255 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1256 sleep(1);
1257 abort();
1258}
1259
1260/* ====================== Redis server networking stuff ===================== */
56906eef 1261static void closeTimedoutClients(void) {
ed9b544e 1262 redisClient *c;
ed9b544e 1263 listNode *ln;
1264 time_t now = time(NULL);
c7df85a4 1265 listIter li;
ed9b544e 1266
c7df85a4 1267 listRewind(server.clients,&li);
1268 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1269 c = listNodeValue(ln);
f86a74e9 1270 if (server.maxidletime &&
1271 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1272 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1273 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1274 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1275 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1276 {
f870935d 1277 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1278 freeClient(c);
f86a74e9 1279 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1280 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1281 addReply(c,shared.nullmultibulk);
b0d8747d 1282 unblockClientWaitingData(c);
f86a74e9 1283 }
ed9b544e 1284 }
1285 }
ed9b544e 1286}
1287
12fea928 1288static int htNeedsResize(dict *dict) {
1289 long long size, used;
1290
1291 size = dictSlots(dict);
1292 used = dictSize(dict);
1293 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1294 (used*100/size < REDIS_HT_MINFILL));
1295}
1296
0bc03378 1297/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1298 * we resize the hash table to save memory */
56906eef 1299static void tryResizeHashTables(void) {
0bc03378 1300 int j;
1301
1302 for (j = 0; j < server.dbnum; j++) {
5413c40d 1303 if (htNeedsResize(server.db[j].dict))
0bc03378 1304 dictResize(server.db[j].dict);
12fea928 1305 if (htNeedsResize(server.db[j].expires))
1306 dictResize(server.db[j].expires);
0bc03378 1307 }
1308}
1309
8ca3e9d1 1310/* Our hash table implementation performs rehashing incrementally while
1311 * we write/read from the hash table. Still if the server is idle, the hash
1312 * table will use two tables for a long time. So we try to use 1 millisecond
1313 * of CPU time at every serverCron() loop in order to rehash some key. */
1314static void incrementallyRehash(void) {
1315 int j;
1316
1317 for (j = 0; j < server.dbnum; j++) {
1318 if (dictIsRehashing(server.db[j].dict)) {
1319 dictRehashMilliseconds(server.db[j].dict,1);
1320 break; /* already used our millisecond for this loop... */
1321 }
1322 }
1323}
1324
9d65a1bb 1325/* A background saving child (BGSAVE) terminated its work. Handle this. */
1326void backgroundSaveDoneHandler(int statloc) {
1327 int exitcode = WEXITSTATUS(statloc);
1328 int bysignal = WIFSIGNALED(statloc);
1329
1330 if (!bysignal && exitcode == 0) {
1331 redisLog(REDIS_NOTICE,
1332 "Background saving terminated with success");
1333 server.dirty = 0;
1334 server.lastsave = time(NULL);
1335 } else if (!bysignal && exitcode != 0) {
1336 redisLog(REDIS_WARNING, "Background saving error");
1337 } else {
1338 redisLog(REDIS_WARNING,
454eea7c 1339 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1340 rdbRemoveTempFile(server.bgsavechildpid);
1341 }
1342 server.bgsavechildpid = -1;
1343 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1344 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1345 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1346}
1347
1348/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1349 * Handle this. */
1350void backgroundRewriteDoneHandler(int statloc) {
1351 int exitcode = WEXITSTATUS(statloc);
1352 int bysignal = WIFSIGNALED(statloc);
1353
1354 if (!bysignal && exitcode == 0) {
1355 int fd;
1356 char tmpfile[256];
1357
1358 redisLog(REDIS_NOTICE,
1359 "Background append only file rewriting terminated with success");
1360 /* Now it's time to flush the differences accumulated by the parent */
1361 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1362 fd = open(tmpfile,O_WRONLY|O_APPEND);
1363 if (fd == -1) {
1364 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1365 goto cleanup;
1366 }
1367 /* Flush our data... */
1368 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1369 (signed) sdslen(server.bgrewritebuf)) {
1370 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1371 close(fd);
1372 goto cleanup;
1373 }
b32627cd 1374 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1375 /* Now our work is to rename the temp file into the stable file. And
1376 * switch the file descriptor used by the server for append only. */
1377 if (rename(tmpfile,server.appendfilename) == -1) {
1378 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1379 close(fd);
1380 goto cleanup;
1381 }
1382 /* Mission completed... almost */
1383 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1384 if (server.appendfd != -1) {
1385 /* If append only is actually enabled... */
1386 close(server.appendfd);
1387 server.appendfd = fd;
1388 fsync(fd);
85a83172 1389 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1390 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1391 } else {
1392 /* If append only is disabled we just generate a dump in this
1393 * format. Why not? */
1394 close(fd);
1395 }
1396 } else if (!bysignal && exitcode != 0) {
1397 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1398 } else {
1399 redisLog(REDIS_WARNING,
454eea7c 1400 "Background append only file rewriting terminated by signal %d",
1401 WTERMSIG(statloc));
9d65a1bb 1402 }
1403cleanup:
1404 sdsfree(server.bgrewritebuf);
1405 server.bgrewritebuf = sdsempty();
1406 aofRemoveTempFile(server.bgrewritechildpid);
1407 server.bgrewritechildpid = -1;
1408}
1409
884d4b39 1410/* This function is called once a background process of some kind terminates,
1411 * as we want to avoid resizing the hash tables when there is a child in order
1412 * to play well with copy-on-write (otherwise when a resize happens lots of
1413 * memory pages are copied). The goal of this function is to update the ability
1414 * for dict.c to resize the hash tables accordingly to the fact we have o not
1415 * running childs. */
1416static void updateDictResizePolicy(void) {
1417 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1418 dictEnableResize();
1419 else
1420 dictDisableResize();
1421}
1422
56906eef 1423static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1424 int j, loops = server.cronloops++;
ed9b544e 1425 REDIS_NOTUSED(eventLoop);
1426 REDIS_NOTUSED(id);
1427 REDIS_NOTUSED(clientData);
1428
3a66edc7 1429 /* We take a cached value of the unix time in the global state because
1430 * with virtual memory and aging there is to store the current time
1431 * in objects at every object access, and accuracy is not needed.
1432 * To access a global var is faster than calling time(NULL) */
1433 server.unixtime = time(NULL);
1434
fab43727 1435 /* We received a SIGTERM, shutting down here in a safe way, as it is
1436 * not ok doing so inside the signal handler. */
1437 if (server.shutdown_asap) {
1438 if (prepareForShutdown() == REDIS_OK) exit(0);
1439 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1440 }
1441
0bc03378 1442 /* Show some info about non-empty databases */
ed9b544e 1443 for (j = 0; j < server.dbnum; j++) {
dec423d9 1444 long long size, used, vkeys;
94754ccc 1445
3305306f 1446 size = dictSlots(server.db[j].dict);
1447 used = dictSize(server.db[j].dict);
94754ccc 1448 vkeys = dictSize(server.db[j].expires);
1763929f 1449 if (!(loops % 50) && (used || vkeys)) {
f870935d 1450 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1451 /* dictPrintStats(server.dict); */
ed9b544e 1452 }
ed9b544e 1453 }
1454
0bc03378 1455 /* We don't want to resize the hash tables while a bacground saving
1456 * is in progress: the saving child is created using fork() that is
1457 * implemented with a copy-on-write semantic in most modern systems, so
1458 * if we resize the HT while there is the saving child at work actually
1459 * a lot of memory movements in the parent will cause a lot of pages
1460 * copied. */
8ca3e9d1 1461 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1462 if (!(loops % 10)) tryResizeHashTables();
1463 if (server.activerehashing) incrementallyRehash();
884d4b39 1464 }
0bc03378 1465
ed9b544e 1466 /* Show information about connected clients */
1763929f 1467 if (!(loops % 50)) {
bdcb92f2 1468 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1469 listLength(server.clients)-listLength(server.slaves),
1470 listLength(server.slaves),
bdcb92f2 1471 zmalloc_used_memory());
ed9b544e 1472 }
1473
1474 /* Close connections of timedout clients */
1763929f 1475 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1476 closeTimedoutClients();
1477
9d65a1bb 1478 /* Check if a background saving or AOF rewrite in progress terminated */
1479 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1480 int statloc;
9d65a1bb 1481 pid_t pid;
1482
1483 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1484 if (pid == server.bgsavechildpid) {
1485 backgroundSaveDoneHandler(statloc);
ed9b544e 1486 } else {
9d65a1bb 1487 backgroundRewriteDoneHandler(statloc);
ed9b544e 1488 }
884d4b39 1489 updateDictResizePolicy();
ed9b544e 1490 }
1491 } else {
1492 /* If there is not a background saving in progress check if
1493 * we have to save now */
1494 time_t now = time(NULL);
1495 for (j = 0; j < server.saveparamslen; j++) {
1496 struct saveparam *sp = server.saveparams+j;
1497
1498 if (server.dirty >= sp->changes &&
1499 now-server.lastsave > sp->seconds) {
1500 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1501 sp->changes, sp->seconds);
f78fd11b 1502 rdbSaveBackground(server.dbfilename);
ed9b544e 1503 break;
1504 }
1505 }
1506 }
94754ccc 1507
f2324293 1508 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1509 * will use few CPU cycles if there are few expiring keys, otherwise
1510 * it will get more aggressive to avoid that too much memory is used by
1511 * keys that can be removed from the keyspace. */
94754ccc 1512 for (j = 0; j < server.dbnum; j++) {
f2324293 1513 int expired;
94754ccc 1514 redisDb *db = server.db+j;
94754ccc 1515
f2324293 1516 /* Continue to expire if at the end of the cycle more than 25%
1517 * of the keys were expired. */
1518 do {
4ef8de8a 1519 long num = dictSize(db->expires);
94754ccc 1520 time_t now = time(NULL);
1521
f2324293 1522 expired = 0;
94754ccc 1523 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1524 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1525 while (num--) {
1526 dictEntry *de;
1527 time_t t;
1528
1529 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1530 t = (time_t) dictGetEntryVal(de);
1531 if (now > t) {
1532 deleteKey(db,dictGetEntryKey(de));
f2324293 1533 expired++;
2a6a2ed1 1534 server.stat_expiredkeys++;
94754ccc 1535 }
1536 }
f2324293 1537 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1538 }
1539
4ef8de8a 1540 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1541 * is enabled. Try to free objects from the free list first. */
7e69548d 1542 if (vmCanSwapOut()) {
1543 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1544 server.vm_max_memory)
1545 {
72e9fd40 1546 int retval;
1547
a5819310 1548 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1549 retval = (server.vm_max_threads == 0) ?
1550 vmSwapOneObjectBlocking() :
1551 vmSwapOneObjectThreaded();
1763929f 1552 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1553 zmalloc_used_memory() >
1554 (server.vm_max_memory+server.vm_max_memory/10))
1555 {
1556 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1557 }
72e9fd40 1558 /* Note that when using threaded I/O we free just one object,
1559 * because anyway when the I/O thread in charge to swap this
1560 * object out will finish, the handler of completed jobs
1561 * will try to swap more objects if we are still out of memory. */
1562 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1563 }
1564 }
1565
ed9b544e 1566 /* Check if we should connect to a MASTER */
1763929f 1567 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1568 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1569 if (syncWithMaster() == REDIS_OK) {
1570 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
8f63ddca 1571 if (server.appendonly) rewriteAppendOnlyFileBackground();
ed9b544e 1572 }
1573 }
1763929f 1574 return 100;
ed9b544e 1575}
1576
d5d55fc3 1577/* This function gets called every time Redis is entering the
1578 * main loop of the event driven library, that is, before to sleep
1579 * for ready file descriptors. */
1580static void beforeSleep(struct aeEventLoop *eventLoop) {
1581 REDIS_NOTUSED(eventLoop);
1582
28ed1f33 1583 /* Awake clients that got all the swapped keys they requested */
d5d55fc3 1584 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1585 listIter li;
1586 listNode *ln;
1587
1588 listRewind(server.io_ready_clients,&li);
1589 while((ln = listNext(&li))) {
1590 redisClient *c = ln->value;
1591 struct redisCommand *cmd;
1592
1593 /* Resume the client. */
1594 listDelNode(server.io_ready_clients,ln);
1595 c->flags &= (~REDIS_IO_WAIT);
1596 server.vm_blocked_clients--;
1597 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1598 readQueryFromClient, c);
1599 cmd = lookupCommand(c->argv[0]->ptr);
1600 assert(cmd != NULL);
1601 call(c,cmd);
1602 resetClient(c);
1603 /* There may be more data to process in the input buffer. */
1604 if (c->querybuf && sdslen(c->querybuf) > 0)
1605 processInputBuffer(c);
1606 }
1607 }
28ed1f33 1608 /* Write the AOF buffer on disk */
1609 flushAppendOnlyFile();
d5d55fc3 1610}
1611
ed9b544e 1612static void createSharedObjects(void) {
05df7621 1613 int j;
1614
ed9b544e 1615 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1616 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1617 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1618 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1619 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1620 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1621 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1622 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1623 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1624 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1625 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1626 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1627 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1628 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1629 "-ERR no such key\r\n"));
ed9b544e 1630 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1631 "-ERR syntax error\r\n"));
c937aa89 1632 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1633 "-ERR source and destination objects are the same\r\n"));
1634 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1635 "-ERR index out of range\r\n"));
ed9b544e 1636 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1637 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1638 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1639 shared.select0 = createStringObject("select 0\r\n",10);
1640 shared.select1 = createStringObject("select 1\r\n",10);
1641 shared.select2 = createStringObject("select 2\r\n",10);
1642 shared.select3 = createStringObject("select 3\r\n",10);
1643 shared.select4 = createStringObject("select 4\r\n",10);
1644 shared.select5 = createStringObject("select 5\r\n",10);
1645 shared.select6 = createStringObject("select 6\r\n",10);
1646 shared.select7 = createStringObject("select 7\r\n",10);
1647 shared.select8 = createStringObject("select 8\r\n",10);
1648 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1649 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
c8d0ea0e 1650 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
befec3cd 1651 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1652 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1653 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1654 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1655 shared.mbulk3 = createStringObject("*3\r\n",4);
c8d0ea0e 1656 shared.mbulk4 = createStringObject("*4\r\n",4);
05df7621 1657 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1658 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1659 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1660 }
ed9b544e 1661}
1662
1663static void appendServerSaveParams(time_t seconds, int changes) {
1664 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1665 server.saveparams[server.saveparamslen].seconds = seconds;
1666 server.saveparams[server.saveparamslen].changes = changes;
1667 server.saveparamslen++;
1668}
1669
bcfc686d 1670static void resetServerSaveParams() {
ed9b544e 1671 zfree(server.saveparams);
1672 server.saveparams = NULL;
1673 server.saveparamslen = 0;
1674}
1675
1676static void initServerConfig() {
1677 server.dbnum = REDIS_DEFAULT_DBNUM;
1678 server.port = REDIS_SERVERPORT;
f870935d 1679 server.verbosity = REDIS_VERBOSE;
ed9b544e 1680 server.maxidletime = REDIS_MAXIDLETIME;
1681 server.saveparams = NULL;
1682 server.logfile = NULL; /* NULL = log on standard output */
1683 server.bindaddr = NULL;
1684 server.glueoutputbuf = 1;
1685 server.daemonize = 0;
44b38ef4 1686 server.appendonly = 0;
1b677732 1687 server.appendfsync = APPENDFSYNC_EVERYSEC;
48f0308a 1688 server.lastfsync = time(NULL);
44b38ef4 1689 server.appendfd = -1;
1690 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1691 server.pidfile = zstrdup("/var/run/redis.pid");
1692 server.dbfilename = zstrdup("dump.rdb");
1693 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1694 server.requirepass = NULL;
b0553789 1695 server.rdbcompression = 1;
8ca3e9d1 1696 server.activerehashing = 1;
285add55 1697 server.maxclients = 0;
d5d55fc3 1698 server.blpop_blocked_clients = 0;
3fd78bcd 1699 server.maxmemory = 0;
75680a3c 1700 server.vm_enabled = 0;
054e426d 1701 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1702 server.vm_page_size = 256; /* 256 bytes per page */
1703 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1704 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1705 server.vm_max_threads = 4;
d5d55fc3 1706 server.vm_blocked_clients = 0;
cbba7dd7 1707 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1708 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
fab43727 1709 server.shutdown_asap = 0;
75680a3c 1710
bcfc686d 1711 resetServerSaveParams();
ed9b544e 1712
1713 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1714 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1715 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1716 /* Replication related */
1717 server.isslave = 0;
d0ccebcf 1718 server.masterauth = NULL;
ed9b544e 1719 server.masterhost = NULL;
1720 server.masterport = 6379;
1721 server.master = NULL;
1722 server.replstate = REDIS_REPL_NONE;
a7866db6 1723
1724 /* Double constants initialization */
1725 R_Zero = 0.0;
1726 R_PosInf = 1.0/R_Zero;
1727 R_NegInf = -1.0/R_Zero;
1728 R_Nan = R_Zero/R_Zero;
ed9b544e 1729}
1730
1731static void initServer() {
1732 int j;
1733
1734 signal(SIGHUP, SIG_IGN);
1735 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1736 setupSigSegvAction();
ed9b544e 1737
b9bc0eef 1738 server.devnull = fopen("/dev/null","w");
1739 if (server.devnull == NULL) {
1740 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1741 exit(1);
1742 }
ed9b544e 1743 server.clients = listCreate();
1744 server.slaves = listCreate();
87eca727 1745 server.monitors = listCreate();
ed9b544e 1746 server.objfreelist = listCreate();
1747 createSharedObjects();
1748 server.el = aeCreateEventLoop();
3305306f 1749 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1750 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1751 if (server.fd == -1) {
1752 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1753 exit(1);
1754 }
3305306f 1755 for (j = 0; j < server.dbnum; j++) {
5234952b 1756 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1757 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
37ab76c9 1758 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1759 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1760 if (server.vm_enabled)
1761 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1762 server.db[j].id = j;
1763 }
ffc6b7f8 1764 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1765 server.pubsub_patterns = listCreate();
1766 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1767 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1768 server.cronloops = 0;
9f3c422c 1769 server.bgsavechildpid = -1;
9d65a1bb 1770 server.bgrewritechildpid = -1;
1771 server.bgrewritebuf = sdsempty();
28ed1f33 1772 server.aofbuf = sdsempty();
ed9b544e 1773 server.lastsave = time(NULL);
1774 server.dirty = 0;
ed9b544e 1775 server.stat_numcommands = 0;
1776 server.stat_numconnections = 0;
2a6a2ed1 1777 server.stat_expiredkeys = 0;
ed9b544e 1778 server.stat_starttime = time(NULL);
3a66edc7 1779 server.unixtime = time(NULL);
d8f8b666 1780 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1781 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1782 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1783
1784 if (server.appendonly) {
3bb225d6 1785 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1786 if (server.appendfd == -1) {
1787 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1788 strerror(errno));
1789 exit(1);
1790 }
1791 }
75680a3c 1792
1793 if (server.vm_enabled) vmInit();
ed9b544e 1794}
1795
1796/* Empty the whole database */
ca37e9cd 1797static long long emptyDb() {
ed9b544e 1798 int j;
ca37e9cd 1799 long long removed = 0;
ed9b544e 1800
3305306f 1801 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1802 removed += dictSize(server.db[j].dict);
3305306f 1803 dictEmpty(server.db[j].dict);
1804 dictEmpty(server.db[j].expires);
1805 }
ca37e9cd 1806 return removed;
ed9b544e 1807}
1808
/* Convert a case-insensitive "yes" / "no" string into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1814
ed9b544e 1815/* I agree, this is a very rudimental way to load a configuration...
1816 will improve later if the config gets more complex */
1817static void loadServerConfig(char *filename) {
c9a111ac 1818 FILE *fp;
ed9b544e 1819 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1820 int linenum = 0;
1821 sds line = NULL;
c9a111ac 1822
1823 if (filename[0] == '-' && filename[1] == '\0')
1824 fp = stdin;
1825 else {
1826 if ((fp = fopen(filename,"r")) == NULL) {
9a22de82 1827 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
c9a111ac 1828 exit(1);
1829 }
ed9b544e 1830 }
c9a111ac 1831
ed9b544e 1832 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1833 sds *argv;
1834 int argc, j;
1835
1836 linenum++;
1837 line = sdsnew(buf);
1838 line = sdstrim(line," \t\r\n");
1839
1840 /* Skip comments and blank lines*/
1841 if (line[0] == '#' || line[0] == '\0') {
1842 sdsfree(line);
1843 continue;
1844 }
1845
1846 /* Split into arguments */
1847 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1848 sdstolower(argv[0]);
1849
1850 /* Execute config directives */
bb0b03a3 1851 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1852 server.maxidletime = atoi(argv[1]);
0150db36 1853 if (server.maxidletime < 0) {
ed9b544e 1854 err = "Invalid timeout value"; goto loaderr;
1855 }
bb0b03a3 1856 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1857 server.port = atoi(argv[1]);
1858 if (server.port < 1 || server.port > 65535) {
1859 err = "Invalid port"; goto loaderr;
1860 }
bb0b03a3 1861 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1862 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1863 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1864 int seconds = atoi(argv[1]);
1865 int changes = atoi(argv[2]);
1866 if (seconds < 1 || changes < 0) {
1867 err = "Invalid save parameters"; goto loaderr;
1868 }
1869 appendServerSaveParams(seconds,changes);
bb0b03a3 1870 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1871 if (chdir(argv[1]) == -1) {
1872 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1873 argv[1], strerror(errno));
1874 exit(1);
1875 }
bb0b03a3 1876 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1877 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1878 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1879 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1880 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1881 else {
1882 err = "Invalid log level. Must be one of debug, notice, warning";
1883 goto loaderr;
1884 }
bb0b03a3 1885 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1886 FILE *logfp;
ed9b544e 1887
1888 server.logfile = zstrdup(argv[1]);
bb0b03a3 1889 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1890 zfree(server.logfile);
1891 server.logfile = NULL;
1892 }
1893 if (server.logfile) {
1894 /* Test if we are able to open the file. The server will not
1895 * be able to abort just for this problem later... */
c9a111ac 1896 logfp = fopen(server.logfile,"a");
1897 if (logfp == NULL) {
ed9b544e 1898 err = sdscatprintf(sdsempty(),
1899 "Can't open the log file: %s", strerror(errno));
1900 goto loaderr;
1901 }
c9a111ac 1902 fclose(logfp);
ed9b544e 1903 }
bb0b03a3 1904 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1905 server.dbnum = atoi(argv[1]);
1906 if (server.dbnum < 1) {
1907 err = "Invalid number of databases"; goto loaderr;
1908 }
b3f83f12
JZ
1909 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1910 loadServerConfig(argv[1]);
285add55 1911 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1912 server.maxclients = atoi(argv[1]);
3fd78bcd 1913 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
2b619329 1914 server.maxmemory = memtoll(argv[1],NULL);
bb0b03a3 1915 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1916 server.masterhost = sdsnew(argv[1]);
1917 server.masterport = atoi(argv[2]);
1918 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1919 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1920 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1921 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1922 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1923 err = "argument must be 'yes' or 'no'"; goto loaderr;
1924 }
121f70cf 1925 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1926 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
8ca3e9d1 1927 err = "argument must be 'yes' or 'no'"; goto loaderr;
1928 }
1929 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1930 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
121f70cf 1931 err = "argument must be 'yes' or 'no'"; goto loaderr;
1932 }
bb0b03a3 1933 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1934 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1935 err = "argument must be 'yes' or 'no'"; goto loaderr;
1936 }
44b38ef4 1937 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1938 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1939 err = "argument must be 'yes' or 'no'"; goto loaderr;
1940 }
f3b52411
PN
1941 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1942 zfree(server.appendfilename);
1943 server.appendfilename = zstrdup(argv[1]);
48f0308a 1944 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1945 if (!strcasecmp(argv[1],"no")) {
48f0308a 1946 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1947 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1948 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1949 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1950 server.appendfsync = APPENDFSYNC_EVERYSEC;
1951 } else {
1952 err = "argument must be 'no', 'always' or 'everysec'";
1953 goto loaderr;
1954 }
bb0b03a3 1955 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1956 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1957 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 1958 zfree(server.pidfile);
054e426d 1959 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1960 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 1961 zfree(server.dbfilename);
054e426d 1962 server.dbfilename = zstrdup(argv[1]);
75680a3c 1963 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1964 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1965 err = "argument must be 'yes' or 'no'"; goto loaderr;
1966 }
054e426d 1967 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1968 zfree(server.vm_swap_file);
054e426d 1969 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1970 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2b619329 1971 server.vm_max_memory = memtoll(argv[1],NULL);
4ef8de8a 1972 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2b619329 1973 server.vm_page_size = memtoll(argv[1], NULL);
4ef8de8a 1974 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2b619329 1975 server.vm_pages = memtoll(argv[1], NULL);
92f8e882 1976 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1977 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 1978 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2b619329 1979 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
cbba7dd7 1980 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2b619329 1981 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
ed9b544e 1982 } else {
1983 err = "Bad directive or wrong number of arguments"; goto loaderr;
1984 }
1985 for (j = 0; j < argc; j++)
1986 sdsfree(argv[j]);
1987 zfree(argv);
1988 sdsfree(line);
1989 }
c9a111ac 1990 if (fp != stdin) fclose(fp);
ed9b544e 1991 return;
1992
1993loaderr:
1994 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1995 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1996 fprintf(stderr, ">>> '%s'\n", line);
1997 fprintf(stderr, "%s\n", err);
1998 exit(1);
1999}
2000
2001static void freeClientArgv(redisClient *c) {
2002 int j;
2003
2004 for (j = 0; j < c->argc; j++)
2005 decrRefCount(c->argv[j]);
e8a74421 2006 for (j = 0; j < c->mbargc; j++)
2007 decrRefCount(c->mbargv[j]);
ed9b544e 2008 c->argc = 0;
e8a74421 2009 c->mbargc = 0;
ed9b544e 2010}
2011
2012static void freeClient(redisClient *c) {
2013 listNode *ln;
2014
4409877e 2015 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 2016 * call, we have to set querybuf to NULL *before* to call
2017 * unblockClientWaitingData() to avoid processInputBuffer() will get
2018 * called. Also it is important to remove the file events after
2019 * this, because this call adds the READABLE event. */
4409877e 2020 sdsfree(c->querybuf);
2021 c->querybuf = NULL;
2022 if (c->flags & REDIS_BLOCKED)
b0d8747d 2023 unblockClientWaitingData(c);
4409877e 2024
37ab76c9 2025 /* UNWATCH all the keys */
2026 unwatchAllKeys(c);
2027 listRelease(c->watched_keys);
ffc6b7f8 2028 /* Unsubscribe from all the pubsub channels */
2029 pubsubUnsubscribeAllChannels(c,0);
2030 pubsubUnsubscribeAllPatterns(c,0);
2031 dictRelease(c->pubsub_channels);
2032 listRelease(c->pubsub_patterns);
befec3cd 2033 /* Obvious cleanup */
ed9b544e 2034 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2035 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 2036 listRelease(c->reply);
2037 freeClientArgv(c);
2038 close(c->fd);
92f8e882 2039 /* Remove from the list of clients */
ed9b544e 2040 ln = listSearchKey(server.clients,c);
dfc5e96c 2041 redisAssert(ln != NULL);
ed9b544e 2042 listDelNode(server.clients,ln);
37ab76c9 2043 /* Remove from the list of clients that are now ready to be restarted
2044 * after waiting for swapped keys */
d5d55fc3 2045 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2046 ln = listSearchKey(server.io_ready_clients,c);
2047 if (ln) {
2048 listDelNode(server.io_ready_clients,ln);
2049 server.vm_blocked_clients--;
2050 }
2051 }
37ab76c9 2052 /* Remove from the list of clients waiting for swapped keys */
d5d55fc3 2053 while (server.vm_enabled && listLength(c->io_keys)) {
2054 ln = listFirst(c->io_keys);
2055 dontWaitForSwappedKey(c,ln->value);
92f8e882 2056 }
b3e3d0d7 2057 listRelease(c->io_keys);
befec3cd 2058 /* Master/slave cleanup */
ed9b544e 2059 if (c->flags & REDIS_SLAVE) {
6208b3a7 2060 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2061 close(c->repldbfd);
87eca727 2062 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2063 ln = listSearchKey(l,c);
dfc5e96c 2064 redisAssert(ln != NULL);
87eca727 2065 listDelNode(l,ln);
ed9b544e 2066 }
2067 if (c->flags & REDIS_MASTER) {
2068 server.master = NULL;
2069 server.replstate = REDIS_REPL_CONNECT;
2070 }
befec3cd 2071 /* Release memory */
93ea3759 2072 zfree(c->argv);
e8a74421 2073 zfree(c->mbargv);
6e469882 2074 freeClientMultiState(c);
ed9b544e 2075 zfree(c);
2076}
2077
cc30e368 2078#define GLUEREPLY_UP_TO (1024)
ed9b544e 2079static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 2080 int copylen = 0;
2081 char buf[GLUEREPLY_UP_TO];
6208b3a7 2082 listNode *ln;
c7df85a4 2083 listIter li;
ed9b544e 2084 robj *o;
2085
c7df85a4 2086 listRewind(c->reply,&li);
2087 while((ln = listNext(&li))) {
c28b42ac 2088 int objlen;
2089
ed9b544e 2090 o = ln->value;
c28b42ac 2091 objlen = sdslen(o->ptr);
2092 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2093 memcpy(buf+copylen,o->ptr,objlen);
2094 copylen += objlen;
ed9b544e 2095 listDelNode(c->reply,ln);
c28b42ac 2096 } else {
2097 if (copylen == 0) return;
2098 break;
ed9b544e 2099 }
ed9b544e 2100 }
c28b42ac 2101 /* Now the output buffer is empty, add the new single element */
2102 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2103 listAddNodeHead(c->reply,o);
ed9b544e 2104}
2105
2106static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2107 redisClient *c = privdata;
2108 int nwritten = 0, totwritten = 0, objlen;
2109 robj *o;
2110 REDIS_NOTUSED(el);
2111 REDIS_NOTUSED(mask);
2112
2895e862 2113 /* Use writev() if we have enough buffers to send */
7ea870c0 2114 if (!server.glueoutputbuf &&
e0a62c7f 2115 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 2116 !(c->flags & REDIS_MASTER))
2895e862 2117 {
2118 sendReplyToClientWritev(el, fd, privdata, mask);
2119 return;
2120 }
2895e862 2121
ed9b544e 2122 while(listLength(c->reply)) {
c28b42ac 2123 if (server.glueoutputbuf && listLength(c->reply) > 1)
2124 glueReplyBuffersIfNeeded(c);
2125
ed9b544e 2126 o = listNodeValue(listFirst(c->reply));
2127 objlen = sdslen(o->ptr);
2128
2129 if (objlen == 0) {
2130 listDelNode(c->reply,listFirst(c->reply));
2131 continue;
2132 }
2133
2134 if (c->flags & REDIS_MASTER) {
6f376729 2135 /* Don't reply to a master */
ed9b544e 2136 nwritten = objlen - c->sentlen;
2137 } else {
a4d1ba9a 2138 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2139 if (nwritten <= 0) break;
2140 }
2141 c->sentlen += nwritten;
2142 totwritten += nwritten;
2143 /* If we fully sent the object on head go to the next one */
2144 if (c->sentlen == objlen) {
2145 listDelNode(c->reply,listFirst(c->reply));
2146 c->sentlen = 0;
2147 }
6f376729 2148 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2149 * bytes, in a single threaded server it's a good idea to serve
6f376729 2150 * other clients as well, even if a very large request comes from
2151 * super fast link that is always able to accept data (in real world
12f9d551 2152 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2153 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2154 }
2155 if (nwritten == -1) {
2156 if (errno == EAGAIN) {
2157 nwritten = 0;
2158 } else {
f870935d 2159 redisLog(REDIS_VERBOSE,
ed9b544e 2160 "Error writing to client: %s", strerror(errno));
2161 freeClient(c);
2162 return;
2163 }
2164 }
2165 if (totwritten > 0) c->lastinteraction = time(NULL);
2166 if (listLength(c->reply) == 0) {
2167 c->sentlen = 0;
2168 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2169 }
2170}
2171
2895e862 2172static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2173{
2174 redisClient *c = privdata;
2175 int nwritten = 0, totwritten = 0, objlen, willwrite;
2176 robj *o;
2177 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2178 int offset, ion = 0;
2179 REDIS_NOTUSED(el);
2180 REDIS_NOTUSED(mask);
2181
2182 listNode *node;
2183 while (listLength(c->reply)) {
2184 offset = c->sentlen;
2185 ion = 0;
2186 willwrite = 0;
2187
2188 /* fill-in the iov[] array */
2189 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2190 o = listNodeValue(node);
2191 objlen = sdslen(o->ptr);
2192
e0a62c7f 2193 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2194 break;
2195
2196 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2197 break; /* no more iovecs */
2198
2199 iov[ion].iov_base = ((char*)o->ptr) + offset;
2200 iov[ion].iov_len = objlen - offset;
2201 willwrite += objlen - offset;
2202 offset = 0; /* just for the first item */
2203 ion++;
2204 }
2205
2206 if(willwrite == 0)
2207 break;
2208
2209 /* write all collected blocks at once */
2210 if((nwritten = writev(fd, iov, ion)) < 0) {
2211 if (errno != EAGAIN) {
f870935d 2212 redisLog(REDIS_VERBOSE,
2895e862 2213 "Error writing to client: %s", strerror(errno));
2214 freeClient(c);
2215 return;
2216 }
2217 break;
2218 }
2219
2220 totwritten += nwritten;
2221 offset = c->sentlen;
2222
2223 /* remove written robjs from c->reply */
2224 while (nwritten && listLength(c->reply)) {
2225 o = listNodeValue(listFirst(c->reply));
2226 objlen = sdslen(o->ptr);
2227
2228 if(nwritten >= objlen - offset) {
2229 listDelNode(c->reply, listFirst(c->reply));
2230 nwritten -= objlen - offset;
2231 c->sentlen = 0;
2232 } else {
2233 /* partial write */
2234 c->sentlen += nwritten;
2235 break;
2236 }
2237 offset = 0;
2238 }
2239 }
2240
e0a62c7f 2241 if (totwritten > 0)
2895e862 2242 c->lastinteraction = time(NULL);
2243
2244 if (listLength(c->reply) == 0) {
2245 c->sentlen = 0;
2246 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2247 }
2248}
2249
ed9b544e 2250static struct redisCommand *lookupCommand(char *name) {
2251 int j = 0;
2252 while(cmdTable[j].name != NULL) {
bb0b03a3 2253 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 2254 j++;
2255 }
2256 return NULL;
2257}
2258
2259/* resetClient prepare the client to process the next command */
2260static void resetClient(redisClient *c) {
2261 freeClientArgv(c);
2262 c->bulklen = -1;
e8a74421 2263 c->multibulk = 0;
ed9b544e 2264}
2265
6e469882 2266/* Call() is the core of Redis execution of a command */
2267static void call(redisClient *c, struct redisCommand *cmd) {
2268 long long dirty;
2269
2270 dirty = server.dirty;
2271 cmd->proc(c);
4005fef1 2272 dirty = server.dirty-dirty;
2273
2274 if (server.appendonly && dirty)
6e469882 2275 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2276 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2277 listLength(server.slaves))
248ea310 2278 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2279 if (listLength(server.monitors))
dd142b9c 2280 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2281 server.stat_numcommands++;
2282}
2283
ed9b544e 2284/* If this function gets called we already read a whole
2285 * command, arguments are in the client argv/argc fields.
2286 * processCommand() executes the command or prepares the
2287 * server for a bulk read from the client.
2288 *
2289 * If 1 is returned the client is still alive and valid and
2290 * other operations can be performed by the caller. Otherwise
2291 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2292static int processCommand(redisClient *c) {
2293 struct redisCommand *cmd;
ed9b544e 2294
3fd78bcd 2295 /* Free some memory if needed (maxmemory setting) */
2296 if (server.maxmemory) freeMemoryIfNeeded();
2297
e8a74421 2298 /* Handle the multi bulk command type. This is an alternative protocol
2299 * supported by Redis in order to receive commands that are composed of
2300 * multiple binary-safe "bulk" arguments. The latency of processing is
2301 * a bit higher but this allows things like multi-sets, so if this
2302 * protocol is used only for MSET and similar commands this is a big win. */
2303 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2304 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2305 if (c->multibulk <= 0) {
2306 resetClient(c);
2307 return 1;
2308 } else {
2309 decrRefCount(c->argv[c->argc-1]);
2310 c->argc--;
2311 return 1;
2312 }
2313 } else if (c->multibulk) {
2314 if (c->bulklen == -1) {
2315 if (((char*)c->argv[0]->ptr)[0] != '$') {
2316 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2317 resetClient(c);
2318 return 1;
2319 } else {
2320 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2321 decrRefCount(c->argv[0]);
2322 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2323 c->argc--;
2324 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2325 resetClient(c);
2326 return 1;
2327 }
2328 c->argc--;
2329 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2330 return 1;
2331 }
2332 } else {
2333 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2334 c->mbargv[c->mbargc] = c->argv[0];
2335 c->mbargc++;
2336 c->argc--;
2337 c->multibulk--;
2338 if (c->multibulk == 0) {
2339 robj **auxargv;
2340 int auxargc;
2341
2342 /* Here we need to swap the multi-bulk argc/argv with the
2343 * normal argc/argv of the client structure. */
2344 auxargv = c->argv;
2345 c->argv = c->mbargv;
2346 c->mbargv = auxargv;
2347
2348 auxargc = c->argc;
2349 c->argc = c->mbargc;
2350 c->mbargc = auxargc;
2351
2352 /* We need to set bulklen to something different than -1
2353 * in order for the code below to process the command without
2354 * to try to read the last argument of a bulk command as
2355 * a special argument. */
2356 c->bulklen = 0;
2357 /* continue below and process the command */
2358 } else {
2359 c->bulklen = -1;
2360 return 1;
2361 }
2362 }
2363 }
2364 /* -- end of multi bulk commands processing -- */
2365
ed9b544e 2366 /* The QUIT command is handled as a special case. Normal command
2367 * procs are unable to close the client connection safely */
bb0b03a3 2368 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2369 freeClient(c);
2370 return 0;
2371 }
d5d55fc3 2372
2373 /* Now lookup the command and check ASAP about trivial error conditions
2374 * such wrong arity, bad command name and so forth. */
ed9b544e 2375 cmd = lookupCommand(c->argv[0]->ptr);
2376 if (!cmd) {
2c14807b 2377 addReplySds(c,
2378 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2379 (char*)c->argv[0]->ptr));
ed9b544e 2380 resetClient(c);
2381 return 1;
2382 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2383 (c->argc < -cmd->arity)) {
454d4e43 2384 addReplySds(c,
2385 sdscatprintf(sdsempty(),
2386 "-ERR wrong number of arguments for '%s' command\r\n",
2387 cmd->name));
ed9b544e 2388 resetClient(c);
2389 return 1;
2390 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2391 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2392 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2393
2394 decrRefCount(c->argv[c->argc-1]);
2395 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2396 c->argc--;
2397 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2398 resetClient(c);
2399 return 1;
2400 }
2401 c->argc--;
2402 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2403 /* It is possible that the bulk read is already in the
8d0490e7 2404 * buffer. Check this condition and handle it accordingly.
2405 * This is just a fast path, alternative to call processInputBuffer().
2406 * It's a good idea since the code is small and this condition
2407 * happens most of the times. */
ed9b544e 2408 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2409 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2410 c->argc++;
2411 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2412 } else {
d5d55fc3 2413 /* Otherwise return... there is to read the last argument
2414 * from the socket. */
ed9b544e 2415 return 1;
2416 }
2417 }
942a3961 2418 /* Let's try to encode the bulk object to save space. */
2419 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2420 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2421
e63943a4 2422 /* Check if the user is authenticated */
2423 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2424 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2425 resetClient(c);
2426 return 1;
2427 }
2428
b61a28fe 2429 /* Handle the maxmemory directive */
2430 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2431 zmalloc_used_memory() > server.maxmemory)
2432 {
2433 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2434 resetClient(c);
2435 return 1;
2436 }
2437
d6cc8867 2438 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
e6cca5db 2439 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2440 &&
ffc6b7f8 2441 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2442 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2443 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2444 resetClient(c);
2445 return 1;
2446 }
2447
ed9b544e 2448 /* Exec the command */
6531c94d 2449 if (c->flags & REDIS_MULTI &&
2450 cmd->proc != execCommand && cmd->proc != discardCommand &&
2451 cmd->proc != multiCommand && cmd->proc != watchCommand)
2452 {
6e469882 2453 queueMultiCommand(c,cmd);
2454 addReply(c,shared.queued);
2455 } else {
d5d55fc3 2456 if (server.vm_enabled && server.vm_max_threads > 0 &&
0a6f3f0f 2457 blockClientOnSwappedKeys(c,cmd)) return 1;
6e469882 2458 call(c,cmd);
2459 }
ed9b544e 2460
2461 /* Prepare the client for the next command */
ed9b544e 2462 resetClient(c);
2463 return 1;
2464}
2465
248ea310 2466static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2467 listNode *ln;
c7df85a4 2468 listIter li;
ed9b544e 2469 int outc = 0, j;
93ea3759 2470 robj **outv;
248ea310 2471 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2472 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2473 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2474 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2475 robj *lenobj;
93ea3759 2476
2477 if (argc <= REDIS_STATIC_ARGS) {
2478 outv = static_outv;
2479 } else {
248ea310 2480 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2481 }
248ea310 2482
2483 lenobj = createObject(REDIS_STRING,
2484 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2485 lenobj->refcount = 0;
2486 outv[outc++] = lenobj;
ed9b544e 2487 for (j = 0; j < argc; j++) {
248ea310 2488 lenobj = createObject(REDIS_STRING,
2489 sdscatprintf(sdsempty(),"$%lu\r\n",
2490 (unsigned long) stringObjectLen(argv[j])));
2491 lenobj->refcount = 0;
2492 outv[outc++] = lenobj;
ed9b544e 2493 outv[outc++] = argv[j];
248ea310 2494 outv[outc++] = shared.crlf;
ed9b544e 2495 }
ed9b544e 2496
40d224a9 2497 /* Increment all the refcounts at start and decrement at end in order to
2498 * be sure to free objects if there is no slave in a replication state
2499 * able to be feed with commands */
2500 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2501 listRewind(slaves,&li);
2502 while((ln = listNext(&li))) {
ed9b544e 2503 redisClient *slave = ln->value;
40d224a9 2504
2505 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2506 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2507
2508 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2509 if (slave->slaveseldb != dictid) {
2510 robj *selectcmd;
2511
2512 switch(dictid) {
2513 case 0: selectcmd = shared.select0; break;
2514 case 1: selectcmd = shared.select1; break;
2515 case 2: selectcmd = shared.select2; break;
2516 case 3: selectcmd = shared.select3; break;
2517 case 4: selectcmd = shared.select4; break;
2518 case 5: selectcmd = shared.select5; break;
2519 case 6: selectcmd = shared.select6; break;
2520 case 7: selectcmd = shared.select7; break;
2521 case 8: selectcmd = shared.select8; break;
2522 case 9: selectcmd = shared.select9; break;
2523 default:
2524 selectcmd = createObject(REDIS_STRING,
2525 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2526 selectcmd->refcount = 0;
2527 break;
2528 }
2529 addReply(slave,selectcmd);
2530 slave->slaveseldb = dictid;
2531 }
2532 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2533 }
40d224a9 2534 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2535 if (outv != static_outv) zfree(outv);
ed9b544e 2536}
2537
dd142b9c 2538static sds sdscatrepr(sds s, char *p, size_t len) {
2539 s = sdscatlen(s,"\"",1);
2540 while(len--) {
2541 switch(*p) {
2542 case '\\':
2543 case '"':
2544 s = sdscatprintf(s,"\\%c",*p);
2545 break;
2546 case '\n': s = sdscatlen(s,"\\n",1); break;
2547 case '\r': s = sdscatlen(s,"\\r",1); break;
2548 case '\t': s = sdscatlen(s,"\\t",1); break;
2549 case '\a': s = sdscatlen(s,"\\a",1); break;
2550 case '\b': s = sdscatlen(s,"\\b",1); break;
2551 default:
2552 if (isprint(*p))
2553 s = sdscatprintf(s,"%c",*p);
2554 else
2555 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2556 break;
2557 }
2558 p++;
2559 }
2560 return sdscatlen(s,"\"",1);
2561}
2562
2563static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2564 listNode *ln;
2565 listIter li;
2566 int j;
2567 sds cmdrepr = sdsnew("+");
2568 robj *cmdobj;
2569 struct timeval tv;
2570
2571 gettimeofday(&tv,NULL);
2572 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2573 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2574
2575 for (j = 0; j < argc; j++) {
2576 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2577 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2578 } else {
2579 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2580 sdslen(argv[j]->ptr));
2581 }
2582 if (j != argc-1)
2583 cmdrepr = sdscatlen(cmdrepr," ",1);
2584 }
2585 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2586 cmdobj = createObject(REDIS_STRING,cmdrepr);
2587
2588 listRewind(monitors,&li);
2589 while((ln = listNext(&li))) {
2590 redisClient *monitor = ln->value;
2591 addReply(monitor,cmdobj);
2592 }
2593 decrRefCount(cmdobj);
2594}
2595
638e42ac 2596static void processInputBuffer(redisClient *c) {
ed9b544e 2597again:
4409877e 2598 /* Before to process the input buffer, make sure the client is not
2599 * waitig for a blocking operation such as BLPOP. Note that the first
2600 * iteration the client is never blocked, otherwise the processInputBuffer
2601 * would not be called at all, but after the execution of the first commands
2602 * in the input buffer the client may be blocked, and the "goto again"
2603 * will try to reiterate. The following line will make it return asap. */
92f8e882 2604 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2605 if (c->bulklen == -1) {
2606 /* Read the first line of the query */
2607 char *p = strchr(c->querybuf,'\n');
2608 size_t querylen;
644fafa3 2609
ed9b544e 2610 if (p) {
2611 sds query, *argv;
2612 int argc, j;
e0a62c7f 2613
ed9b544e 2614 query = c->querybuf;
2615 c->querybuf = sdsempty();
2616 querylen = 1+(p-(query));
2617 if (sdslen(query) > querylen) {
2618 /* leave data after the first line of the query in the buffer */
2619 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2620 }
2621 *p = '\0'; /* remove "\n" */
2622 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2623 sdsupdatelen(query);
2624
2625 /* Now we can split the query in arguments */
ed9b544e 2626 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2627 sdsfree(query);
2628
2629 if (c->argv) zfree(c->argv);
2630 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2631
2632 for (j = 0; j < argc; j++) {
ed9b544e 2633 if (sdslen(argv[j])) {
2634 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2635 c->argc++;
2636 } else {
2637 sdsfree(argv[j]);
2638 }
2639 }
2640 zfree(argv);
7c49733c 2641 if (c->argc) {
2642 /* Execute the command. If the client is still valid
2643 * after processCommand() return and there is something
2644 * on the query buffer try to process the next command. */
2645 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2646 } else {
2647 /* Nothing to process, argc == 0. Just process the query
2648 * buffer if it's not empty or return to the caller */
2649 if (sdslen(c->querybuf)) goto again;
2650 }
ed9b544e 2651 return;
644fafa3 2652 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2653 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2654 freeClient(c);
2655 return;
2656 }
2657 } else {
2658 /* Bulk read handling. Note that if we are at this point
2659 the client already sent a command terminated with a newline,
2660 we are reading the bulk data that is actually the last
2661 argument of the command. */
2662 int qbl = sdslen(c->querybuf);
2663
2664 if (c->bulklen <= qbl) {
2665 /* Copy everything but the final CRLF as final argument */
2666 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2667 c->argc++;
2668 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2669 /* Process the command. If the client is still valid after
2670 * the processing and there is more data in the buffer
2671 * try to parse it. */
2672 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2673 return;
2674 }
2675 }
2676}
2677
638e42ac 2678static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2679 redisClient *c = (redisClient*) privdata;
2680 char buf[REDIS_IOBUF_LEN];
2681 int nread;
2682 REDIS_NOTUSED(el);
2683 REDIS_NOTUSED(mask);
2684
2685 nread = read(fd, buf, REDIS_IOBUF_LEN);
2686 if (nread == -1) {
2687 if (errno == EAGAIN) {
2688 nread = 0;
2689 } else {
f870935d 2690 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2691 freeClient(c);
2692 return;
2693 }
2694 } else if (nread == 0) {
f870935d 2695 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2696 freeClient(c);
2697 return;
2698 }
2699 if (nread) {
2700 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2701 c->lastinteraction = time(NULL);
2702 } else {
2703 return;
2704 }
168ac5c6 2705 processInputBuffer(c);
638e42ac 2706}
2707
ed9b544e 2708static int selectDb(redisClient *c, int id) {
2709 if (id < 0 || id >= server.dbnum)
2710 return REDIS_ERR;
3305306f 2711 c->db = &server.db[id];
ed9b544e 2712 return REDIS_OK;
2713}
2714
40d224a9 2715static void *dupClientReplyValue(void *o) {
2716 incrRefCount((robj*)o);
12d090d2 2717 return o;
40d224a9 2718}
2719
ffc6b7f8 2720static int listMatchObjects(void *a, void *b) {
bf028098 2721 return equalStringObjects(a,b);
ffc6b7f8 2722}
2723
ed9b544e 2724static redisClient *createClient(int fd) {
2725 redisClient *c = zmalloc(sizeof(*c));
2726
2727 anetNonBlock(NULL,fd);
2728 anetTcpNoDelay(NULL,fd);
2729 if (!c) return NULL;
2730 selectDb(c,0);
2731 c->fd = fd;
2732 c->querybuf = sdsempty();
2733 c->argc = 0;
93ea3759 2734 c->argv = NULL;
ed9b544e 2735 c->bulklen = -1;
e8a74421 2736 c->multibulk = 0;
2737 c->mbargc = 0;
2738 c->mbargv = NULL;
ed9b544e 2739 c->sentlen = 0;
2740 c->flags = 0;
2741 c->lastinteraction = time(NULL);
abcb223e 2742 c->authenticated = 0;
40d224a9 2743 c->replstate = REDIS_REPL_NONE;
6b47e12e 2744 c->reply = listCreate();
ed9b544e 2745 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2746 listSetDupMethod(c->reply,dupClientReplyValue);
37ab76c9 2747 c->blocking_keys = NULL;
2748 c->blocking_keys_num = 0;
92f8e882 2749 c->io_keys = listCreate();
87c68815 2750 c->watched_keys = listCreate();
92f8e882 2751 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2752 c->pubsub_channels = dictCreate(&setDictType,NULL);
2753 c->pubsub_patterns = listCreate();
2754 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2755 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2756 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2757 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2758 freeClient(c);
2759 return NULL;
2760 }
6b47e12e 2761 listAddNodeTail(server.clients,c);
6e469882 2762 initClientMultiState(c);
ed9b544e 2763 return c;
2764}
2765
2766static void addReply(redisClient *c, robj *obj) {
2767 if (listLength(c->reply) == 0 &&
6208b3a7 2768 (c->replstate == REDIS_REPL_NONE ||
2769 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2770 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2771 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2772
2773 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2774 obj = dupStringObject(obj);
2775 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2776 }
9d65a1bb 2777 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2778}
2779
2780static void addReplySds(redisClient *c, sds s) {
2781 robj *o = createObject(REDIS_STRING,s);
2782 addReply(c,o);
2783 decrRefCount(o);
2784}
2785
e2665397 2786static void addReplyDouble(redisClient *c, double d) {
2787 char buf[128];
2788
2789 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2790 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2791 (unsigned long) strlen(buf),buf));
e2665397 2792}
2793
aa7c2934
PN
2794static void addReplyLongLong(redisClient *c, long long ll) {
2795 char buf[128];
2796 size_t len;
2797
2798 if (ll == 0) {
2799 addReply(c,shared.czero);
2800 return;
2801 } else if (ll == 1) {
2802 addReply(c,shared.cone);
2803 return;
2804 }
482b672d 2805 buf[0] = ':';
2806 len = ll2string(buf+1,sizeof(buf)-1,ll);
2807 buf[len+1] = '\r';
2808 buf[len+2] = '\n';
2809 addReplySds(c,sdsnewlen(buf,len+3));
aa7c2934
PN
2810}
2811
92b27fe9 2812static void addReplyUlong(redisClient *c, unsigned long ul) {
2813 char buf[128];
2814 size_t len;
2815
dd88747b 2816 if (ul == 0) {
2817 addReply(c,shared.czero);
2818 return;
2819 } else if (ul == 1) {
2820 addReply(c,shared.cone);
2821 return;
2822 }
92b27fe9 2823 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2824 addReplySds(c,sdsnewlen(buf,len));
2825}
2826
942a3961 2827static void addReplyBulkLen(redisClient *c, robj *obj) {
482b672d 2828 size_t len, intlen;
2829 char buf[128];
942a3961 2830
2831 if (obj->encoding == REDIS_ENCODING_RAW) {
2832 len = sdslen(obj->ptr);
2833 } else {
2834 long n = (long)obj->ptr;
2835
e054afda 2836 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2837 len = 1;
2838 if (n < 0) {
2839 len++;
2840 n = -n;
2841 }
2842 while((n = n/10) != 0) {
2843 len++;
2844 }
2845 }
482b672d 2846 buf[0] = '$';
2847 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2848 buf[intlen+1] = '\r';
2849 buf[intlen+2] = '\n';
2850 addReplySds(c,sdsnewlen(buf,intlen+3));
942a3961 2851}
2852
dd88747b 2853static void addReplyBulk(redisClient *c, robj *obj) {
2854 addReplyBulkLen(c,obj);
2855 addReply(c,obj);
2856 addReply(c,shared.crlf);
2857}
2858
500ece7c 2859/* In the CONFIG command we need to add vanilla C string as bulk replies */
2860static void addReplyBulkCString(redisClient *c, char *s) {
2861 if (s == NULL) {
2862 addReply(c,shared.nullbulk);
2863 } else {
2864 robj *o = createStringObject(s,strlen(s));
2865 addReplyBulk(c,o);
2866 decrRefCount(o);
2867 }
2868}
2869
ed9b544e 2870static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2871 int cport, cfd;
2872 char cip[128];
285add55 2873 redisClient *c;
ed9b544e 2874 REDIS_NOTUSED(el);
2875 REDIS_NOTUSED(mask);
2876 REDIS_NOTUSED(privdata);
2877
2878 cfd = anetAccept(server.neterr, fd, cip, &cport);
2879 if (cfd == AE_ERR) {
f870935d 2880 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2881 return;
2882 }
f870935d 2883 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2884 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2885 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2886 close(cfd); /* May be already closed, just ingore errors */
2887 return;
2888 }
285add55 2889 /* If maxclient directive is set and this is one client more... close the
2890 * connection. Note that we create the client instead to check before
2891 * for this condition, since now the socket is already set in nonblocking
2892 * mode and we can send an error for free using the Kernel I/O */
2893 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2894 char *err = "-ERR max number of clients reached\r\n";
2895
2896 /* That's a best effort error message, don't check write errors */
fee803ba 2897 if (write(c->fd,err,strlen(err)) == -1) {
2898 /* Nothing to do, Just to avoid the warning... */
2899 }
285add55 2900 freeClient(c);
2901 return;
2902 }
ed9b544e 2903 server.stat_numconnections++;
2904}
2905
2906/* ======================= Redis objects implementation ===================== */
2907
2908static robj *createObject(int type, void *ptr) {
2909 robj *o;
2910
a5819310 2911 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2912 if (listLength(server.objfreelist)) {
2913 listNode *head = listFirst(server.objfreelist);
2914 o = listNodeValue(head);
2915 listDelNode(server.objfreelist,head);
a5819310 2916 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2917 } else {
75680a3c 2918 if (server.vm_enabled) {
a5819310 2919 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2920 o = zmalloc(sizeof(*o));
2921 } else {
2922 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2923 }
ed9b544e 2924 }
ed9b544e 2925 o->type = type;
942a3961 2926 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2927 o->ptr = ptr;
2928 o->refcount = 1;
3a66edc7 2929 if (server.vm_enabled) {
1064ef87 2930 /* Note that this code may run in the context of an I/O thread
2931 * and accessing to server.unixtime in theory is an error
2932 * (no locks). But in practice this is safe, and even if we read
2933 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2934 o->vm.atime = server.unixtime;
2935 o->storage = REDIS_VM_MEMORY;
2936 }
ed9b544e 2937 return o;
2938}
2939
2940static robj *createStringObject(char *ptr, size_t len) {
2941 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2942}
2943
3f973463
PN
2944static robj *createStringObjectFromLongLong(long long value) {
2945 robj *o;
2946 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2947 incrRefCount(shared.integers[value]);
2948 o = shared.integers[value];
2949 } else {
3f973463 2950 if (value >= LONG_MIN && value <= LONG_MAX) {
10dea8dc 2951 o = createObject(REDIS_STRING, NULL);
3f973463
PN
2952 o->encoding = REDIS_ENCODING_INT;
2953 o->ptr = (void*)((long)value);
2954 } else {
ee14da56 2955 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3f973463
PN
2956 }
2957 }
2958 return o;
2959}
2960
4ef8de8a 2961static robj *dupStringObject(robj *o) {
b9bc0eef 2962 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2963 return createStringObject(o->ptr,sdslen(o->ptr));
2964}
2965
ed9b544e 2966static robj *createListObject(void) {
2967 list *l = listCreate();
2968
ed9b544e 2969 listSetFreeMethod(l,decrRefCount);
2970 return createObject(REDIS_LIST,l);
2971}
2972
2973static robj *createSetObject(void) {
2974 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2975 return createObject(REDIS_SET,d);
2976}
2977
5234952b 2978static robj *createHashObject(void) {
2979 /* All the Hashes start as zipmaps. Will be automatically converted
2980 * into hash tables if there are enough elements or big elements
2981 * inside. */
2982 unsigned char *zm = zipmapNew();
2983 robj *o = createObject(REDIS_HASH,zm);
2984 o->encoding = REDIS_ENCODING_ZIPMAP;
2985 return o;
2986}
2987
1812e024 2988static robj *createZsetObject(void) {
6b47e12e 2989 zset *zs = zmalloc(sizeof(*zs));
2990
2991 zs->dict = dictCreate(&zsetDictType,NULL);
2992 zs->zsl = zslCreate();
2993 return createObject(REDIS_ZSET,zs);
1812e024 2994}
2995
ed9b544e 2996static void freeStringObject(robj *o) {
942a3961 2997 if (o->encoding == REDIS_ENCODING_RAW) {
2998 sdsfree(o->ptr);
2999 }
ed9b544e 3000}
3001
3002static void freeListObject(robj *o) {
3003 listRelease((list*) o->ptr);
3004}
3005
3006static void freeSetObject(robj *o) {
3007 dictRelease((dict*) o->ptr);
3008}
3009
fd8ccf44 3010static void freeZsetObject(robj *o) {
3011 zset *zs = o->ptr;
3012
3013 dictRelease(zs->dict);
3014 zslFree(zs->zsl);
3015 zfree(zs);
3016}
3017
ed9b544e 3018static void freeHashObject(robj *o) {
cbba7dd7 3019 switch (o->encoding) {
3020 case REDIS_ENCODING_HT:
3021 dictRelease((dict*) o->ptr);
3022 break;
3023 case REDIS_ENCODING_ZIPMAP:
3024 zfree(o->ptr);
3025 break;
3026 default:
f83c6cb5 3027 redisPanic("Unknown hash encoding type");
cbba7dd7 3028 break;
3029 }
ed9b544e 3030}
3031
3032static void incrRefCount(robj *o) {
3033 o->refcount++;
3034}
3035
3036static void decrRefCount(void *obj) {
3037 robj *o = obj;
94754ccc 3038
c651fd9e 3039 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
970e10bb 3040 /* Object is a key of a swapped out value, or in the process of being
3041 * loaded. */
996cb5f7 3042 if (server.vm_enabled &&
3043 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3044 {
996cb5f7 3045 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 3046 redisAssert(o->type == REDIS_STRING);
a35ddf12 3047 freeStringObject(o);
3048 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 3049 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 3050 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3051 !listAddNodeHead(server.objfreelist,o))
3052 zfree(o);
a5819310 3053 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 3054 server.vm_stats_swapped_objects--;
a35ddf12 3055 return;
3056 }
996cb5f7 3057 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 3058 if (--(o->refcount) == 0) {
996cb5f7 3059 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3060 vmCancelThreadedIOJob(obj);
ed9b544e 3061 switch(o->type) {
3062 case REDIS_STRING: freeStringObject(o); break;
3063 case REDIS_LIST: freeListObject(o); break;
3064 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 3065 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 3066 case REDIS_HASH: freeHashObject(o); break;
f83c6cb5 3067 default: redisPanic("Unknown object type"); break;
ed9b544e 3068 }
a5819310 3069 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 3070 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3071 !listAddNodeHead(server.objfreelist,o))
3072 zfree(o);
a5819310 3073 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3074 }
3075}
3076
942a3961 3077static robj *lookupKey(redisDb *db, robj *key) {
3078 dictEntry *de = dictFind(db->dict,key);
3a66edc7 3079 if (de) {
55cf8433 3080 robj *key = dictGetEntryKey(de);
3081 robj *val = dictGetEntryVal(de);
3a66edc7 3082
55cf8433 3083 if (server.vm_enabled) {
996cb5f7 3084 if (key->storage == REDIS_VM_MEMORY ||
3085 key->storage == REDIS_VM_SWAPPING)
3086 {
3087 /* If we were swapping the object out, stop it, this key
3088 * was requested. */
3089 if (key->storage == REDIS_VM_SWAPPING)
3090 vmCancelThreadedIOJob(key);
55cf8433 3091 /* Update the access time of the key for the aging algorithm. */
3092 key->vm.atime = server.unixtime;
3093 } else {
d5d55fc3 3094 int notify = (key->storage == REDIS_VM_LOADING);
3095
55cf8433 3096 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 3097 redisAssert(val == NULL);
55cf8433 3098 val = vmLoadObject(key);
3099 dictGetEntryVal(de) = val;
d5d55fc3 3100
3101 /* Clients blocked by the VM subsystem may be waiting for
3102 * this key... */
3103 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 3104 }
3105 }
3106 return val;
3a66edc7 3107 } else {
3108 return NULL;
3109 }
942a3961 3110}
3111
3112static robj *lookupKeyRead(redisDb *db, robj *key) {
3113 expireIfNeeded(db,key);
3114 return lookupKey(db,key);
3115}
3116
3117static robj *lookupKeyWrite(redisDb *db, robj *key) {
3118 deleteIfVolatile(db,key);
37ab76c9 3119 touchWatchedKey(db,key);
942a3961 3120 return lookupKey(db,key);
3121}
3122
92b27fe9 3123static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3124 robj *o = lookupKeyRead(c->db, key);
3125 if (!o) addReply(c,reply);
3126 return o;
3127}
3128
3129static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3130 robj *o = lookupKeyWrite(c->db, key);
3131 if (!o) addReply(c,reply);
3132 return o;
3133}
3134
3135static int checkType(redisClient *c, robj *o, int type) {
3136 if (o->type != type) {
3137 addReply(c,shared.wrongtypeerr);
3138 return 1;
3139 }
3140 return 0;
3141}
3142
942a3961 3143static int deleteKey(redisDb *db, robj *key) {
3144 int retval;
3145
3146 /* We need to protect key from destruction: after the first dictDelete()
3147 * it may happen that 'key' is no longer valid if we don't increment
3148 * it's count. This may happen when we get the object reference directly
3149 * from the hash table with dictRandomKey() or dict iterators */
3150 incrRefCount(key);
3151 if (dictSize(db->expires)) dictDelete(db->expires,key);
3152 retval = dictDelete(db->dict,key);
3153 decrRefCount(key);
3154
3155 return retval == DICT_OK;
3156}
3157
724a51b1 3158/* Check if the nul-terminated string 's' can be represented by a long
3159 * (that is, is a number that fits into long without any other space or
3160 * character before or after the digits).
3161 *
3162 * If so, the function returns REDIS_OK and *longval is set to the value
3163 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 3164static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 3165 char buf[32], *endptr;
3166 long value;
3167 int slen;
e0a62c7f 3168
724a51b1 3169 value = strtol(s, &endptr, 10);
3170 if (endptr[0] != '\0') return REDIS_ERR;
ee14da56 3171 slen = ll2string(buf,32,value);
724a51b1 3172
3173 /* If the number converted back into a string is not identical
3174 * then it's not possible to encode the string as integer */
f69f2cba 3175 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 3176 if (longval) *longval = value;
3177 return REDIS_OK;
3178}
3179
942a3961 3180/* Try to encode a string object in order to save space */
05df7621 3181static robj *tryObjectEncoding(robj *o) {
942a3961 3182 long value;
942a3961 3183 sds s = o->ptr;
3305306f 3184
942a3961 3185 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 3186 return o; /* Already encoded */
3305306f 3187
05df7621 3188 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 3189 * everywhere in the "object space" of Redis. Encoded objects can only
3190 * appear as "values" (and not, for instance, as keys) */
05df7621 3191 if (o->refcount > 1) return o;
3305306f 3192
942a3961 3193 /* Currently we try to encode only strings */
dfc5e96c 3194 redisAssert(o->type == REDIS_STRING);
94754ccc 3195
724a51b1 3196 /* Check if we can represent this string as a long integer */
05df7621 3197 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 3198
3199 /* Ok, this object can be encoded */
05df7621 3200 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3201 decrRefCount(o);
3202 incrRefCount(shared.integers[value]);
3203 return shared.integers[value];
3204 } else {
3205 o->encoding = REDIS_ENCODING_INT;
3206 sdsfree(o->ptr);
3207 o->ptr = (void*) value;
3208 return o;
3209 }
942a3961 3210}
3211
9d65a1bb 3212/* Get a decoded version of an encoded object (returned as a new object).
3213 * If the object is already raw-encoded just increment the ref count. */
3214static robj *getDecodedObject(robj *o) {
942a3961 3215 robj *dec;
e0a62c7f 3216
9d65a1bb 3217 if (o->encoding == REDIS_ENCODING_RAW) {
3218 incrRefCount(o);
3219 return o;
3220 }
942a3961 3221 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3222 char buf[32];
3223
ee14da56 3224 ll2string(buf,32,(long)o->ptr);
942a3961 3225 dec = createStringObject(buf,strlen(buf));
3226 return dec;
3227 } else {
08ee9b57 3228 redisPanic("Unknown encoding type");
942a3961 3229 }
3305306f 3230}
3231
d7f43c08 3232/* Compare two string objects via strcmp() or alike.
3233 * Note that the objects may be integer-encoded. In such a case we
ee14da56 3234 * use ll2string() to get a string representation of the numbers on the stack
1fd9bc8a 3235 * and compare the strings, it's much faster than calling getDecodedObject().
3236 *
3237 * Important note: if objects are not integer encoded, but binary-safe strings,
3238 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3239 * binary safe. */
724a51b1 3240static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3241 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3242 char bufa[128], bufb[128], *astr, *bstr;
3243 int bothsds = 1;
724a51b1 3244
e197b441 3245 if (a == b) return 0;
d7f43c08 3246 if (a->encoding != REDIS_ENCODING_RAW) {
ee14da56 3247 ll2string(bufa,sizeof(bufa),(long) a->ptr);
d7f43c08 3248 astr = bufa;
3249 bothsds = 0;
724a51b1 3250 } else {
d7f43c08 3251 astr = a->ptr;
724a51b1 3252 }
d7f43c08 3253 if (b->encoding != REDIS_ENCODING_RAW) {
ee14da56 3254 ll2string(bufb,sizeof(bufb),(long) b->ptr);
d7f43c08 3255 bstr = bufb;
3256 bothsds = 0;
3257 } else {
3258 bstr = b->ptr;
3259 }
3260 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3261}
3262
bf028098 3263/* Equal string objects return 1 if the two objects are the same from the
3264 * point of view of a string comparison, otherwise 0 is returned. Note that
3265 * this function is faster then checking for (compareStringObject(a,b) == 0)
3266 * because it can perform some more optimization. */
3267static int equalStringObjects(robj *a, robj *b) {
3268 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3269 return a->ptr == b->ptr;
3270 } else {
3271 return compareStringObjects(a,b) == 0;
3272 }
3273}
3274
0ea663ea 3275static size_t stringObjectLen(robj *o) {
dfc5e96c 3276 redisAssert(o->type == REDIS_STRING);
0ea663ea 3277 if (o->encoding == REDIS_ENCODING_RAW) {
3278 return sdslen(o->ptr);
3279 } else {
3280 char buf[32];
3281
ee14da56 3282 return ll2string(buf,32,(long)o->ptr);
0ea663ea 3283 }
3284}
3285
bd79a6bd
PN
3286static int getDoubleFromObject(robj *o, double *target) {
3287 double value;
682c73e8 3288 char *eptr;
bbe025e0 3289
bd79a6bd
PN
3290 if (o == NULL) {
3291 value = 0;
3292 } else {
3293 redisAssert(o->type == REDIS_STRING);
3294 if (o->encoding == REDIS_ENCODING_RAW) {
3295 value = strtod(o->ptr, &eptr);
682c73e8 3296 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3297 } else if (o->encoding == REDIS_ENCODING_INT) {
3298 value = (long)o->ptr;
3299 } else {
946342c1 3300 redisPanic("Unknown string encoding");
bd79a6bd
PN
3301 }
3302 }
3303
bd79a6bd
PN
3304 *target = value;
3305 return REDIS_OK;
3306}
bbe025e0 3307
bd79a6bd
PN
3308static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3309 double value;
3310 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3311 if (msg != NULL) {
3312 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3313 } else {
3314 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3315 }
bbe025e0
AM
3316 return REDIS_ERR;
3317 }
3318
bd79a6bd 3319 *target = value;
bbe025e0
AM
3320 return REDIS_OK;
3321}
3322
bd79a6bd
PN
3323static int getLongLongFromObject(robj *o, long long *target) {
3324 long long value;
682c73e8 3325 char *eptr;
bbe025e0 3326
bd79a6bd
PN
3327 if (o == NULL) {
3328 value = 0;
3329 } else {
3330 redisAssert(o->type == REDIS_STRING);
3331 if (o->encoding == REDIS_ENCODING_RAW) {
3332 value = strtoll(o->ptr, &eptr, 10);
682c73e8 3333 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3334 } else if (o->encoding == REDIS_ENCODING_INT) {
3335 value = (long)o->ptr;
3336 } else {
946342c1 3337 redisPanic("Unknown string encoding");
bd79a6bd
PN
3338 }
3339 }
3340
bd79a6bd
PN
3341 *target = value;
3342 return REDIS_OK;
3343}
bbe025e0 3344
bd79a6bd
PN
3345static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3346 long long value;
3347 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3348 if (msg != NULL) {
3349 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3350 } else {
3351 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3352 }
bbe025e0
AM
3353 return REDIS_ERR;
3354 }
3355
bd79a6bd 3356 *target = value;
bbe025e0
AM
3357 return REDIS_OK;
3358}
3359
bd79a6bd
PN
3360static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3361 long long value;
bbe025e0 3362
bd79a6bd
PN
3363 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3364 if (value < LONG_MIN || value > LONG_MAX) {
3365 if (msg != NULL) {
3366 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3367 } else {
3368 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3369 }
bbe025e0
AM
3370 return REDIS_ERR;
3371 }
3372
bd79a6bd 3373 *target = value;
bbe025e0
AM
3374 return REDIS_OK;
3375}
3376
06233c45 3377/*============================ RDB saving/loading =========================== */
ed9b544e 3378
/* Write the single byte 'type' to 'fp'. Returns 0 on success, -1 if the
 * byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 1) ? 0 : -1;
}
3383
/* Write 't' to 'fp' as a 32 bit signed integer in host byte order (the value
 * is truncated to 32 bits). Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    const int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,4,1,fp) == 1) ? 0 : -1;
}
3389
e3566d4b 3390/* check rdbLoadLen() comments for more info */
f78fd11b 3391static int rdbSaveLen(FILE *fp, uint32_t len) {
3392 unsigned char buf[2];
3393
3394 if (len < (1<<6)) {
3395 /* Save a 6 bit len */
10c43610 3396 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3397 if (fwrite(buf,1,1,fp) == 0) return -1;
3398 } else if (len < (1<<14)) {
3399 /* Save a 14 bit len */
10c43610 3400 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3401 buf[1] = len&0xFF;
17be1a4a 3402 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3403 } else {
3404 /* Save a 32 bit len */
10c43610 3405 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3406 if (fwrite(buf,1,1,fp) == 0) return -1;
3407 len = htonl(len);
3408 if (fwrite(&len,4,1,fp) == 0) return -1;
3409 }
3410 return 0;
3411}
3412
32a66513 3413/* Encode 'value' as an integer if possible (if integer will fit the
3414 * supported range). If the function sucessful encoded the integer
3415 * then the (up to 5 bytes) encoded representation is written in the
3416 * string pointed by 'enc' and the length is returned. Otherwise
3417 * 0 is returned. */
3418static int rdbEncodeInteger(long long value, unsigned char *enc) {
e3566d4b 3419 /* Finally check if it fits in our ranges */
3420 if (value >= -(1<<7) && value <= (1<<7)-1) {
3421 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3422 enc[1] = value&0xFF;
3423 return 2;
3424 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3425 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3426 enc[1] = value&0xFF;
3427 enc[2] = (value>>8)&0xFF;
3428 return 3;
3429 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3430 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3431 enc[1] = value&0xFF;
3432 enc[2] = (value>>8)&0xFF;
3433 enc[3] = (value>>16)&0xFF;
3434 enc[4] = (value>>24)&0xFF;
3435 return 5;
3436 } else {
3437 return 0;
3438 }
3439}
3440
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes that are NOT required to be null terminated.
 * On success the encoded form is written in 'enc' and its length is
 * returned, otherwise 0 is returned. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, buf[32], copy[32];

    /* strtoll() needs a null terminated string, but callers may hand us a
     * plain byte range, so work on a bounded private copy. A string that
     * does not fit in the copy can't be the canonical representation of a
     * long long anyway. */
    if (len == 0 || len >= sizeof(copy)) return 0;
    memcpy(copy,s,len);
    copy[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(copy, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,copy,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3459
b1befe6a 3460static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3461 size_t comprlen, outlen;
774e3047 3462 unsigned char byte;
3463 void *out;
3464
3465 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3466 if (len <= 4) return 0;
3467 outlen = len-4;
3a2694c4 3468 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3469 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3470 if (comprlen == 0) {
88e85998 3471 zfree(out);
774e3047 3472 return 0;
3473 }
3474 /* Data compressed! Let's save it on disk */
3475 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3476 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3477 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3478 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3479 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3480 zfree(out);
774e3047 3481 return comprlen;
3482
3483writeerr:
88e85998 3484 zfree(out);
774e3047 3485 return -1;
3486}
3487
e3566d4b 3488/* Save a string objet as [len][data] on disk. If the object is a string
3489 * representation of an integer value we try to safe it in a special form */
b1befe6a 3490static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3491 int enclen;
10c43610 3492
774e3047 3493 /* Try integer encoding */
e3566d4b 3494 if (len <= 11) {
3495 unsigned char buf[5];
b1befe6a 3496 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3497 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3498 return 0;
3499 }
3500 }
774e3047 3501
3502 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3503 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3504 if (server.rdbcompression && len > 20) {
774e3047 3505 int retval;
3506
b1befe6a 3507 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3508 if (retval == -1) return -1;
3509 if (retval > 0) return 0;
3510 /* retval == 0 means data can't be compressed, save the old way */
3511 }
3512
3513 /* Store verbatim */
10c43610 3514 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3515 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3516 return 0;
3517}
3518
942a3961 3519/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3520static int rdbSaveStringObject(FILE *fp, robj *obj) {
3521 int retval;
942a3961 3522
32a66513 3523 /* Avoid to decode the object, then encode it again, if the
3524 * object is alrady integer encoded. */
3525 if (obj->encoding == REDIS_ENCODING_INT) {
3526 long val = (long) obj->ptr;
3527 unsigned char buf[5];
3528 int enclen;
3529
3530 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3531 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3532 return 0;
3533 }
3534 /* otherwise... fall throught and continue with the usual
3535 * code path. */
3536 }
3537
f2d9f50f 3538 /* Avoid incr/decr ref count business when possible.
3539 * This plays well with copy-on-write given that we are probably
3540 * in a child process (BGSAVE). Also this makes sure key objects
3541 * of swapped objects are not incRefCount-ed (an assert does not allow
3542 * this in order to avoid bugs) */
3543 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3544 obj = getDecodedObject(obj);
b1befe6a 3545 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3546 decrRefCount(obj);
3547 } else {
b1befe6a 3548 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3549 }
9d65a1bb 3550 return retval;
942a3961 3551}
3552
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so one byte less is available */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3595
06233c45 3596/* Save a Redis object. */
3597static int rdbSaveObject(FILE *fp, robj *o) {
3598 if (o->type == REDIS_STRING) {
3599 /* Save a string value */
3600 if (rdbSaveStringObject(fp,o) == -1) return -1;
3601 } else if (o->type == REDIS_LIST) {
3602 /* Save a list value */
3603 list *list = o->ptr;
c7df85a4 3604 listIter li;
06233c45 3605 listNode *ln;
3606
06233c45 3607 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3608 listRewind(list,&li);
3609 while((ln = listNext(&li))) {
06233c45 3610 robj *eleobj = listNodeValue(ln);
3611
3612 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3613 }
3614 } else if (o->type == REDIS_SET) {
3615 /* Save a set value */
3616 dict *set = o->ptr;
3617 dictIterator *di = dictGetIterator(set);
3618 dictEntry *de;
3619
3620 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3621 while((de = dictNext(di)) != NULL) {
3622 robj *eleobj = dictGetEntryKey(de);
3623
3624 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3625 }
3626 dictReleaseIterator(di);
3627 } else if (o->type == REDIS_ZSET) {
3628 /* Save a set value */
3629 zset *zs = o->ptr;
3630 dictIterator *di = dictGetIterator(zs->dict);
3631 dictEntry *de;
3632
3633 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3634 while((de = dictNext(di)) != NULL) {
3635 robj *eleobj = dictGetEntryKey(de);
3636 double *score = dictGetEntryVal(de);
3637
3638 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3639 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3640 }
3641 dictReleaseIterator(di);
b1befe6a 3642 } else if (o->type == REDIS_HASH) {
3643 /* Save a hash value */
3644 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3645 unsigned char *p = zipmapRewind(o->ptr);
3646 unsigned int count = zipmapLen(o->ptr);
3647 unsigned char *key, *val;
3648 unsigned int klen, vlen;
3649
3650 if (rdbSaveLen(fp,count) == -1) return -1;
3651 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3652 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3653 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3654 }
3655 } else {
3656 dictIterator *di = dictGetIterator(o->ptr);
3657 dictEntry *de;
3658
3659 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3660 while((de = dictNext(di)) != NULL) {
3661 robj *key = dictGetEntryKey(de);
3662 robj *val = dictGetEntryVal(de);
3663
3664 if (rdbSaveStringObject(fp,key) == -1) return -1;
3665 if (rdbSaveStringObject(fp,val) == -1) return -1;
3666 }
3667 dictReleaseIterator(di);
3668 }
06233c45 3669 } else {
f83c6cb5 3670 redisPanic("Unknown object type");
06233c45 3671 }
3672 return 0;
3673}
3674
3675/* Return the length the object will have on disk if saved with
3676 * the rdbSaveObject() function. Currently we use a trick to get
3677 * this length with very little changes to the code. In the future
3678 * we could switch to a faster solution. */
b9bc0eef 3679static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3680 if (fp == NULL) fp = server.devnull;
06233c45 3681 rewind(fp);
3682 assert(rdbSaveObject(fp,o) != 1);
3683 return ftello(fp);
3684}
3685
06224fec 3686/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3687static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3688 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3689
06224fec 3690 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3691}
3692
ed9b544e 3693/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3694static int rdbSave(char *filename) {
ed9b544e 3695 dictIterator *di = NULL;
3696 dictEntry *de;
ed9b544e 3697 FILE *fp;
3698 char tmpfile[256];
3699 int j;
bb32ede5 3700 time_t now = time(NULL);
ed9b544e 3701
2316bb3b 3702 /* Wait for I/O therads to terminate, just in case this is a
3703 * foreground-saving, to avoid seeking the swap file descriptor at the
3704 * same time. */
3705 if (server.vm_enabled)
3706 waitEmptyIOJobsQueue();
3707
a3b21203 3708 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3709 fp = fopen(tmpfile,"w");
3710 if (!fp) {
3711 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3712 return REDIS_ERR;
3713 }
f78fd11b 3714 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3715 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3716 redisDb *db = server.db+j;
3717 dict *d = db->dict;
3305306f 3718 if (dictSize(d) == 0) continue;
ed9b544e 3719 di = dictGetIterator(d);
3720 if (!di) {
3721 fclose(fp);
3722 return REDIS_ERR;
3723 }
3724
3725 /* Write the SELECT DB opcode */
f78fd11b 3726 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3727 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3728
3729 /* Iterate this DB writing every entry */
3730 while((de = dictNext(di)) != NULL) {
3731 robj *key = dictGetEntryKey(de);
3732 robj *o = dictGetEntryVal(de);
bb32ede5 3733 time_t expiretime = getExpire(db,key);
3734
3735 /* Save the expire time */
3736 if (expiretime != -1) {
3737 /* If this key is already expired skip it */
3738 if (expiretime < now) continue;
3739 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3740 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3741 }
7e69548d 3742 /* Save the key and associated value. This requires special
3743 * handling if the value is swapped out. */
996cb5f7 3744 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3745 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3746 /* Save type, key, value */
3747 if (rdbSaveType(fp,o->type) == -1) goto werr;
3748 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3749 if (rdbSaveObject(fp,o) == -1) goto werr;
3750 } else {
996cb5f7 3751 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3752 robj *po;
7e69548d 3753 /* Get a preview of the object in memory */
3754 po = vmPreviewObject(key);
7e69548d 3755 /* Save type, key, value */
3756 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3757 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3758 if (rdbSaveObject(fp,po) == -1) goto werr;
3759 /* Remove the loaded object from memory */
3760 decrRefCount(po);
7e69548d 3761 }
ed9b544e 3762 }
3763 dictReleaseIterator(di);
3764 }
3765 /* EOF opcode */
f78fd11b 3766 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3767
3768 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3769 fflush(fp);
3770 fsync(fileno(fp));
3771 fclose(fp);
e0a62c7f 3772
ed9b544e 3773 /* Use RENAME to make sure the DB file is changed atomically only
3774 * if the generate DB file is ok. */
3775 if (rename(tmpfile,filename) == -1) {
325d1eb4 3776 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3777 unlink(tmpfile);
3778 return REDIS_ERR;
3779 }
3780 redisLog(REDIS_NOTICE,"DB saved on disk");
3781 server.dirty = 0;
3782 server.lastsave = time(NULL);
3783 return REDIS_OK;
3784
3785werr:
3786 fclose(fp);
3787 unlink(tmpfile);
3788 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3789 if (di) dictReleaseIterator(di);
3790 return REDIS_ERR;
3791}
3792
f78fd11b 3793static int rdbSaveBackground(char *filename) {
ed9b544e 3794 pid_t childpid;
3795
9d65a1bb 3796 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3797 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3798 if ((childpid = fork()) == 0) {
3799 /* Child */
054e426d 3800 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3801 close(server.fd);
f78fd11b 3802 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3803 _exit(0);
ed9b544e 3804 } else {
478c2c6f 3805 _exit(1);
ed9b544e 3806 }
3807 } else {
3808 /* Parent */
5a7c647e 3809 if (childpid == -1) {
3810 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3811 strerror(errno));
3812 return REDIS_ERR;
3813 }
ed9b544e 3814 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3815 server.bgsavechildpid = childpid;
884d4b39 3816 updateDictResizePolicy();
ed9b544e 3817 return REDIS_OK;
3818 }
3819 return REDIS_OK; /* unreached */
3820}
3821
/* Remove the temporary RDB file named after the process id 'childpid'.
 * The result of the removal is not checked. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3828
/* Read a single byte from 'fp' and return it as a non negative integer,
 * or -1 if the byte can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char opcode;

    return (fread(&opcode,1,1,fp) == 1) ? opcode : -1;
}
3834
/* Read a 32 bit signed integer in host byte order from 'fp' and return it as
 * a time_t. Returns -1 if the four bytes can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,4,1,fp) != 1) return -1;
    return (time_t) stamp;
}
3840
e3566d4b 3841/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3842 * of this file for a description of how this are stored on disk.
3843 *
3844 * isencoded is set to 1 if the readed length is not actually a length but
3845 * an "encoding type", check the above comments for more info */
c78a8ccc 3846static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3847 unsigned char buf[2];
3848 uint32_t len;
c78a8ccc 3849 int type;
f78fd11b 3850
e3566d4b 3851 if (isencoded) *isencoded = 0;
c78a8ccc 3852 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3853 type = (buf[0]&0xC0)>>6;
3854 if (type == REDIS_RDB_6BITLEN) {
3855 /* Read a 6 bit len */
3856 return buf[0]&0x3F;
3857 } else if (type == REDIS_RDB_ENCVAL) {
3858 /* Read a 6 bit len encoding type */
3859 if (isencoded) *isencoded = 1;
3860 return buf[0]&0x3F;
3861 } else if (type == REDIS_RDB_14BITLEN) {
3862 /* Read a 14 bit len */
3863 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3864 return ((buf[0]&0x3F)<<8)|buf[1];
3865 } else {
3866 /* Read a 32 bit len */
f78fd11b 3867 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3868 return ntohl(len);
f78fd11b 3869 }
f78fd11b 3870}
3871
ad30aa60 3872/* Load an integer-encoded object from file 'fp', with the specified
3873 * encoding type 'enctype'. If encode is true the function may return
3874 * an integer-encoded object as reply, otherwise the returned object
3875 * will always be encoded as a raw string. */
3876static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
e3566d4b 3877 unsigned char enc[4];
3878 long long val;
3879
3880 if (enctype == REDIS_RDB_ENC_INT8) {
3881 if (fread(enc,1,1,fp) == 0) return NULL;
3882 val = (signed char)enc[0];
3883 } else if (enctype == REDIS_RDB_ENC_INT16) {
3884 uint16_t v;
3885 if (fread(enc,2,1,fp) == 0) return NULL;
3886 v = enc[0]|(enc[1]<<8);
3887 val = (int16_t)v;
3888 } else if (enctype == REDIS_RDB_ENC_INT32) {
3889 uint32_t v;
3890 if (fread(enc,4,1,fp) == 0) return NULL;
3891 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3892 val = (int32_t)v;
3893 } else {
3894 val = 0; /* anti-warning */
f83c6cb5 3895 redisPanic("Unknown RDB integer encoding type");
e3566d4b 3896 }
ad30aa60 3897 if (encode)
3898 return createStringObjectFromLongLong(val);
3899 else
3900 return createObject(REDIS_STRING,sdsfromlonglong(val));
e3566d4b 3901}
3902
c78a8ccc 3903static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3904 unsigned int len, clen;
3905 unsigned char *c = NULL;
3906 sds val = NULL;
3907
c78a8ccc 3908 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3909 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3910 if ((c = zmalloc(clen)) == NULL) goto err;
3911 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3912 if (fread(c,clen,1,fp) == 0) goto err;
3913 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3914 zfree(c);
88e85998 3915 return createObject(REDIS_STRING,val);
3916err:
3917 zfree(c);
3918 sdsfree(val);
3919 return NULL;
3920}
3921
ad30aa60 3922static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
e3566d4b 3923 int isencoded;
3924 uint32_t len;
f78fd11b 3925 sds val;
3926
c78a8ccc 3927 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3928 if (isencoded) {
3929 switch(len) {
3930 case REDIS_RDB_ENC_INT8:
3931 case REDIS_RDB_ENC_INT16:
3932 case REDIS_RDB_ENC_INT32:
ad30aa60 3933 return rdbLoadIntegerObject(fp,len,encode);
88e85998 3934 case REDIS_RDB_ENC_LZF:
bdcb92f2 3935 return rdbLoadLzfStringObject(fp);
e3566d4b 3936 default:
f83c6cb5 3937 redisPanic("Unknown RDB encoding type");
e3566d4b 3938 }
3939 }
3940
f78fd11b 3941 if (len == REDIS_RDB_LENERR) return NULL;
3942 val = sdsnewlen(NULL,len);
3943 if (len && fread(val,len,1,fp) == 0) {
3944 sdsfree(val);
3945 return NULL;
3946 }
bdcb92f2 3947 return createObject(REDIS_STRING,val);
f78fd11b 3948}
3949
ad30aa60 3950static robj *rdbLoadStringObject(FILE *fp) {
3951 return rdbGenericLoadStringObject(fp,0);
3952}
3953
3954static robj *rdbLoadEncodedStringObject(FILE *fp) {
3955 return rdbGenericLoadStringObject(fp,1);
3956}
3957
a7866db6 3958/* For information about double serialization check rdbSaveDoubleValue() */
3959static int rdbLoadDoubleValue(FILE *fp, double *val) {
3960 char buf[128];
3961 unsigned char len;
3962
3963 if (fread(&len,1,1,fp) == 0) return -1;
3964 switch(len) {
3965 case 255: *val = R_NegInf; return 0;
3966 case 254: *val = R_PosInf; return 0;
3967 case 253: *val = R_Nan; return 0;
3968 default:
3969 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3970 buf[len] = '\0';
a7866db6 3971 sscanf(buf, "%lg", val);
3972 return 0;
3973 }
3974}
3975
c78a8ccc 3976/* Load a Redis object of the specified type from the specified file.
3977 * On success a newly allocated object is returned, otherwise NULL. */
3978static robj *rdbLoadObject(int type, FILE *fp) {
3979 robj *o;
3980
bcd11906 3981 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 3982 if (type == REDIS_STRING) {
3983 /* Read string value */
ad30aa60 3984 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 3985 o = tryObjectEncoding(o);
c78a8ccc 3986 } else if (type == REDIS_LIST || type == REDIS_SET) {
3987 /* Read list/set value */
3988 uint32_t listlen;
3989
3990 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3991 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 3992 /* It's faster to expand the dict to the right size asap in order
3993 * to avoid rehashing */
3994 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3995 dictExpand(o->ptr,listlen);
c78a8ccc 3996 /* Load every single element of the list/set */
3997 while(listlen--) {
3998 robj *ele;
3999
ad30aa60 4000 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4001 ele = tryObjectEncoding(ele);
c78a8ccc 4002 if (type == REDIS_LIST) {
4003 listAddNodeTail((list*)o->ptr,ele);
4004 } else {
4005 dictAdd((dict*)o->ptr,ele,NULL);
4006 }
4007 }
4008 } else if (type == REDIS_ZSET) {
4009 /* Read list/set value */
ada386b2 4010 size_t zsetlen;
c78a8ccc 4011 zset *zs;
4012
4013 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4014 o = createZsetObject();
4015 zs = o->ptr;
4016 /* Load every single element of the list/set */
4017 while(zsetlen--) {
4018 robj *ele;
4019 double *score = zmalloc(sizeof(double));
4020
ad30aa60 4021 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4022 ele = tryObjectEncoding(ele);
c78a8ccc 4023 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4024 dictAdd(zs->dict,ele,score);
4025 zslInsert(zs->zsl,*score,ele);
4026 incrRefCount(ele); /* added to skiplist */
4027 }
ada386b2 4028 } else if (type == REDIS_HASH) {
4029 size_t hashlen;
4030
4031 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4032 o = createHashObject();
4033 /* Too many entries? Use an hash table. */
4034 if (hashlen > server.hash_max_zipmap_entries)
4035 convertToRealHash(o);
4036 /* Load every key/value, then set it into the zipmap or hash
4037 * table, as needed. */
4038 while(hashlen--) {
4039 robj *key, *val;
4040
4041 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4042 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4043 /* If we are using a zipmap and there are too big values
4044 * the object is converted to real hash table encoding. */
4045 if (o->encoding != REDIS_ENCODING_HT &&
4046 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4047 sdslen(val->ptr) > server.hash_max_zipmap_value))
4048 {
4049 convertToRealHash(o);
4050 }
4051
4052 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4053 unsigned char *zm = o->ptr;
4054
4055 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4056 val->ptr,sdslen(val->ptr),NULL);
4057 o->ptr = zm;
4058 decrRefCount(key);
4059 decrRefCount(val);
4060 } else {
05df7621 4061 key = tryObjectEncoding(key);
4062 val = tryObjectEncoding(val);
ada386b2 4063 dictAdd((dict*)o->ptr,key,val);
ada386b2 4064 }
4065 }
c78a8ccc 4066 } else {
f83c6cb5 4067 redisPanic("Unknown object type");
c78a8ccc 4068 }
4069 return o;
4070}
4071
f78fd11b 4072static int rdbLoad(char *filename) {
ed9b544e 4073 FILE *fp;
f78fd11b 4074 uint32_t dbid;
bb32ede5 4075 int type, retval, rdbver;
585af7e2 4076 int swap_all_values = 0;
3305306f 4077 dict *d = server.db[0].dict;
bb32ede5 4078 redisDb *db = server.db+0;
f78fd11b 4079 char buf[1024];
242a64f3 4080 time_t expiretime, now = time(NULL);
b492cf00 4081 long long loadedkeys = 0;
bb32ede5 4082
ed9b544e 4083 fp = fopen(filename,"r");
4084 if (!fp) return REDIS_ERR;
4085 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 4086 buf[9] = '\0';
4087 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 4088 fclose(fp);
4089 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4090 return REDIS_ERR;
4091 }
f78fd11b 4092 rdbver = atoi(buf+5);
c78a8ccc 4093 if (rdbver != 1) {
f78fd11b 4094 fclose(fp);
4095 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4096 return REDIS_ERR;
4097 }
ed9b544e 4098 while(1) {
585af7e2 4099 robj *key, *val;
ed9b544e 4100
585af7e2 4101 expiretime = -1;
ed9b544e 4102 /* Read type. */
f78fd11b 4103 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 4104 if (type == REDIS_EXPIRETIME) {
4105 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4106 /* We read the time so we need to read the object type again */
4107 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4108 }
ed9b544e 4109 if (type == REDIS_EOF) break;
4110 /* Handle SELECT DB opcode as a special case */
4111 if (type == REDIS_SELECTDB) {
c78a8ccc 4112 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 4113 goto eoferr;
ed9b544e 4114 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 4115 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 4116 exit(1);
4117 }
bb32ede5 4118 db = server.db+dbid;
4119 d = db->dict;
ed9b544e 4120 continue;
4121 }
4122 /* Read key */
585af7e2 4123 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
c78a8ccc 4124 /* Read value */
585af7e2 4125 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
89e689c5 4126 /* Check if the key already expired */
4127 if (expiretime != -1 && expiretime < now) {
4128 decrRefCount(key);
4129 decrRefCount(val);
4130 continue;
4131 }
ed9b544e 4132 /* Add the new object in the hash table */
585af7e2 4133 retval = dictAdd(d,key,val);
ed9b544e 4134 if (retval == DICT_ERR) {
585af7e2 4135 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
ed9b544e 4136 exit(1);
4137 }
242a64f3 4138 loadedkeys++;
bb32ede5 4139 /* Set the expire time if needed */
89e689c5 4140 if (expiretime != -1) setExpire(db,key,expiretime);
242a64f3 4141
b492cf00 4142 /* Handle swapping while loading big datasets when VM is on */
242a64f3 4143
4144 /* If we detecter we are hopeless about fitting something in memory
4145 * we just swap every new key on disk. Directly...
4146 * Note that's important to check for this condition before resorting
4147 * to random sampling, otherwise we may try to swap already
4148 * swapped keys. */
585af7e2 4149 if (swap_all_values) {
4150 dictEntry *de = dictFind(d,key);
242a64f3 4151
4152 /* de may be NULL since the key already expired */
4153 if (de) {
585af7e2 4154 key = dictGetEntryKey(de);
4155 val = dictGetEntryVal(de);
242a64f3 4156
585af7e2 4157 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
242a64f3 4158 dictGetEntryVal(de) = NULL;
4159 }
4160 }
4161 continue;
4162 }
4163
4164 /* If we have still some hope of having some value fitting memory
4165 * then we try random sampling. */
585af7e2 4166 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
b492cf00 4167 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 4168 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 4169 }
242a64f3 4170 if (zmalloc_used_memory() > server.vm_max_memory)
585af7e2 4171 swap_all_values = 1; /* We are already using too much mem */
b492cf00 4172 }
ed9b544e 4173 }
4174 fclose(fp);
4175 return REDIS_OK;
4176
4177eoferr: /* unexpected end of file is handled here with a fatal exit */
f80dff62 4178 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 4179 exit(1);
4180 return REDIS_ERR; /* Just to avoid warning */
4181}
4182
b58ba105 4183/*================================== Shutdown =============================== */
fab43727 4184static int prepareForShutdown() {
b58ba105
AM
4185 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4186 /* Kill the saving child if there is a background saving in progress.
4187 We want to avoid race conditions, for instance our saving child may
4188 overwrite the synchronous saving did by SHUTDOWN. */
4189 if (server.bgsavechildpid != -1) {
4190 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4191 kill(server.bgsavechildpid,SIGKILL);
4192 rdbRemoveTempFile(server.bgsavechildpid);
4193 }
4194 if (server.appendonly) {
4195 /* Append only file: fsync() the AOF and exit */
4196 fsync(server.appendfd);
4197 if (server.vm_enabled) unlink(server.vm_swap_file);
b58ba105
AM
4198 } else {
4199 /* Snapshotting. Perform a SYNC SAVE and exit */
4200 if (rdbSave(server.dbfilename) == REDIS_OK) {
4201 if (server.daemonize)
4202 unlink(server.pidfile);
4203 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
b58ba105
AM
4204 } else {
4205 /* Ooops.. error saving! The best we can do is to continue
4206 * operating. Note that if there was a background saving process,
4207 * in the next cron() Redis will be notified that the background
4208 * saving aborted, handling special stuff like slaves pending for
4209 * synchronization... */
4210 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
fab43727 4211 return REDIS_ERR;
b58ba105
AM
4212 }
4213 }
8513a757 4214 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
fab43727 4215 return REDIS_OK;
b58ba105
AM
4216}
4217
ed9b544e 4218/*================================== Commands =============================== */
4219
abcb223e 4220static void authCommand(redisClient *c) {
2e77c2ee 4221 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
4222 c->authenticated = 1;
4223 addReply(c,shared.ok);
4224 } else {
4225 c->authenticated = 0;
fa4c0aba 4226 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
4227 }
4228}
4229
ed9b544e 4230static void pingCommand(redisClient *c) {
4231 addReply(c,shared.pong);
4232}
4233
4234static void echoCommand(redisClient *c) {
dd88747b 4235 addReplyBulk(c,c->argv[1]);
ed9b544e 4236}
4237
4238/*=================================== Strings =============================== */
4239
526d00a5 4240static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
ed9b544e 4241 int retval;
10ce1276 4242 long seconds = 0; /* initialized to avoid an harmness warning */
ed9b544e 4243
526d00a5 4244 if (expire) {
4245 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4246 return;
4247 if (seconds <= 0) {
4248 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4249 return;
4250 }
4251 }
4252
37ab76c9 4253 touchWatchedKey(c->db,key);
526d00a5 4254 if (nx) deleteIfVolatile(c->db,key);
4255 retval = dictAdd(c->db->dict,key,val);
ed9b544e 4256 if (retval == DICT_ERR) {
4257 if (!nx) {
1b03836c 4258 /* If the key is about a swapped value, we want a new key object
4259 * to overwrite the old. So we delete the old key in the database.
4260 * This will also make sure that swap pages about the old object
4261 * will be marked as free. */
526d00a5 4262 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4263 incrRefCount(key);
4264 dictReplace(c->db->dict,key,val);
4265 incrRefCount(val);
ed9b544e 4266 } else {
c937aa89 4267 addReply(c,shared.czero);
ed9b544e 4268 return;
4269 }
4270 } else {
526d00a5 4271 incrRefCount(key);
4272 incrRefCount(val);
ed9b544e 4273 }
4274 server.dirty++;
526d00a5 4275 removeExpire(c->db,key);
4276 if (expire) setExpire(c->db,key,time(NULL)+seconds);
c937aa89 4277 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 4278}
4279
4280static void setCommand(redisClient *c) {
526d00a5 4281 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
ed9b544e 4282}
4283
4284static void setnxCommand(redisClient *c) {
526d00a5 4285 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4286}
4287
4288static void setexCommand(redisClient *c) {
4289 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
ed9b544e 4290}
4291
322fc7d8 4292static int getGenericCommand(redisClient *c) {
dd88747b 4293 robj *o;
e0a62c7f 4294
dd88747b 4295 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 4296 return REDIS_OK;
dd88747b 4297
4298 if (o->type != REDIS_STRING) {
4299 addReply(c,shared.wrongtypeerr);
4300 return REDIS_ERR;
ed9b544e 4301 } else {
dd88747b 4302 addReplyBulk(c,o);
4303 return REDIS_OK;
ed9b544e 4304 }
4305}
4306
322fc7d8 4307static void getCommand(redisClient *c) {
4308 getGenericCommand(c);
4309}
4310
f6b141c5 4311static void getsetCommand(redisClient *c) {
322fc7d8 4312 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 4313 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4314 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4315 } else {
4316 incrRefCount(c->argv[1]);
4317 }
4318 incrRefCount(c->argv[2]);
4319 server.dirty++;
4320 removeExpire(c->db,c->argv[1]);
4321}
4322
70003d28 4323static void mgetCommand(redisClient *c) {
70003d28 4324 int j;
e0a62c7f 4325
c937aa89 4326 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 4327 for (j = 1; j < c->argc; j++) {
3305306f 4328 robj *o = lookupKeyRead(c->db,c->argv[j]);
4329 if (o == NULL) {
c937aa89 4330 addReply(c,shared.nullbulk);
70003d28 4331 } else {
70003d28 4332 if (o->type != REDIS_STRING) {
c937aa89 4333 addReply(c,shared.nullbulk);
70003d28 4334 } else {
dd88747b 4335 addReplyBulk(c,o);
70003d28 4336 }
4337 }
4338 }
4339}
4340
6c446631 4341static void msetGenericCommand(redisClient *c, int nx) {
906573e7 4342 int j, busykeys = 0;
6c446631 4343
4344 if ((c->argc % 2) == 0) {
454d4e43 4345 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 4346 return;
4347 }
4348 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4349 * set nothing at all if at least one already key exists. */
4350 if (nx) {
4351 for (j = 1; j < c->argc; j += 2) {
906573e7 4352 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4353 busykeys++;
6c446631 4354 }
4355 }
4356 }
906573e7 4357 if (busykeys) {
4358 addReply(c, shared.czero);
4359 return;
4360 }
6c446631 4361
4362 for (j = 1; j < c->argc; j += 2) {
4363 int retval;
4364
05df7621 4365 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
6c446631 4366 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4367 if (retval == DICT_ERR) {
4368 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4369 incrRefCount(c->argv[j+1]);
4370 } else {
4371 incrRefCount(c->argv[j]);
4372 incrRefCount(c->argv[j+1]);
4373 }
4374 removeExpire(c->db,c->argv[j]);
4375 }
4376 server.dirty += (c->argc-1)/2;
4377 addReply(c, nx ? shared.cone : shared.ok);
4378}
4379
4380static void msetCommand(redisClient *c) {
4381 msetGenericCommand(c,0);
4382}
4383
4384static void msetnxCommand(redisClient *c) {
4385 msetGenericCommand(c,1);
4386}
4387
d68ed120 4388static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4389 long long value;
4390 int retval;
4391 robj *o;
e0a62c7f 4392
3305306f 4393 o = lookupKeyWrite(c->db,c->argv[1]);
6485f293
PN
4394 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4395 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
ed9b544e 4396
4397 value += incr;
d6f4c262 4398 o = createStringObjectFromLongLong(value);
3305306f 4399 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 4400 if (retval == DICT_ERR) {
3305306f 4401 dictReplace(c->db->dict,c->argv[1],o);
4402 removeExpire(c->db,c->argv[1]);
ed9b544e 4403 } else {
4404 incrRefCount(c->argv[1]);
4405 }
4406 server.dirty++;
c937aa89 4407 addReply(c,shared.colon);
ed9b544e 4408 addReply(c,o);
4409 addReply(c,shared.crlf);
4410}
4411
4412static void incrCommand(redisClient *c) {
a4d1ba9a 4413 incrDecrCommand(c,1);
ed9b544e 4414}
4415
4416static void decrCommand(redisClient *c) {
a4d1ba9a 4417 incrDecrCommand(c,-1);
ed9b544e 4418}
4419
4420static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4421 long long incr;
4422
bd79a6bd 4423 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4424 incrDecrCommand(c,incr);
ed9b544e 4425}
4426
4427static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4428 long long incr;
4429
bd79a6bd 4430 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4431 incrDecrCommand(c,-incr);
ed9b544e 4432}
4433
4b00bebd 4434static void appendCommand(redisClient *c) {
4435 int retval;
4436 size_t totlen;
4437 robj *o;
4438
4439 o = lookupKeyWrite(c->db,c->argv[1]);
4440 if (o == NULL) {
4441 /* Create the key */
4442 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4443 incrRefCount(c->argv[1]);
4444 incrRefCount(c->argv[2]);
4445 totlen = stringObjectLen(c->argv[2]);
4446 } else {
4447 dictEntry *de;
e0a62c7f 4448
4b00bebd 4449 de = dictFind(c->db->dict,c->argv[1]);
4450 assert(de != NULL);
4451
4452 o = dictGetEntryVal(de);
4453 if (o->type != REDIS_STRING) {
4454 addReply(c,shared.wrongtypeerr);
4455 return;
4456 }
4457 /* If the object is specially encoded or shared we have to make
4458 * a copy */
4459 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4460 robj *decoded = getDecodedObject(o);
4461
4462 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4463 decrRefCount(decoded);
4464 dictReplace(c->db->dict,c->argv[1],o);
4465 }
4466 /* APPEND! */
4467 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4468 o->ptr = sdscatlen(o->ptr,
4469 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4470 } else {
4471 o->ptr = sdscatprintf(o->ptr, "%ld",
4472 (unsigned long) c->argv[2]->ptr);
4473 }
4474 totlen = sdslen(o->ptr);
4475 }
4476 server.dirty++;
4477 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4478}
4479
39191553 4480static void substrCommand(redisClient *c) {
4481 robj *o;
4482 long start = atoi(c->argv[2]->ptr);
4483 long end = atoi(c->argv[3]->ptr);
dd88747b 4484 size_t rangelen, strlen;
4485 sds range;
39191553 4486
dd88747b 4487 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4488 checkType(c,o,REDIS_STRING)) return;
39191553 4489
dd88747b 4490 o = getDecodedObject(o);
4491 strlen = sdslen(o->ptr);
8fe7fad7 4492
dd88747b 4493 /* convert negative indexes */
4494 if (start < 0) start = strlen+start;
4495 if (end < 0) end = strlen+end;
4496 if (start < 0) start = 0;
4497 if (end < 0) end = 0;
39191553 4498
dd88747b 4499 /* indexes sanity checks */
4500 if (start > end || (size_t)start >= strlen) {
4501 /* Out of range start or start > end result in null reply */
4502 addReply(c,shared.nullbulk);
4503 decrRefCount(o);
4504 return;
39191553 4505 }
dd88747b 4506 if ((size_t)end >= strlen) end = strlen-1;
4507 rangelen = (end-start)+1;
4508
4509 /* Return the result */
4510 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4511 range = sdsnewlen((char*)o->ptr+start,rangelen);
4512 addReplySds(c,range);
4513 addReply(c,shared.crlf);
4514 decrRefCount(o);
39191553 4515}
4516
ed9b544e 4517/* ========================= Type agnostic commands ========================= */
4518
4519static void delCommand(redisClient *c) {
5109cdff 4520 int deleted = 0, j;
4521
4522 for (j = 1; j < c->argc; j++) {
4523 if (deleteKey(c->db,c->argv[j])) {
37ab76c9 4524 touchWatchedKey(c->db,c->argv[j]);
5109cdff 4525 server.dirty++;
4526 deleted++;
4527 }
4528 }
482b672d 4529 addReplyLongLong(c,deleted);
ed9b544e 4530}
4531
4532static void existsCommand(redisClient *c) {
f4f06efc
PN
4533 expireIfNeeded(c->db,c->argv[1]);
4534 if (dictFind(c->db->dict,c->argv[1])) {
4535 addReply(c, shared.cone);
4536 } else {
4537 addReply(c, shared.czero);
4538 }
ed9b544e 4539}
4540
4541static void selectCommand(redisClient *c) {
4542 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4543
ed9b544e 4544 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4545 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4546 } else {
4547 addReply(c,shared.ok);
4548 }
4549}
4550
4551static void randomkeyCommand(redisClient *c) {
4552 dictEntry *de;
dc4be23e 4553 robj *key;
e0a62c7f 4554
3305306f 4555 while(1) {
4556 de = dictGetRandomKey(c->db->dict);
ce7bef07 4557 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4558 }
2b619329 4559
ed9b544e 4560 if (de == NULL) {
dc4be23e 4561 addReply(c,shared.nullbulk);
4562 return;
4563 }
4564
4565 key = dictGetEntryKey(de);
4566 if (server.vm_enabled) {
4567 key = dupStringObject(key);
4568 addReplyBulk(c,key);
4569 decrRefCount(key);
ed9b544e 4570 } else {
dc4be23e 4571 addReplyBulk(c,key);
ed9b544e 4572 }
4573}
4574
4575static void keysCommand(redisClient *c) {
4576 dictIterator *di;
4577 dictEntry *de;
4578 sds pattern = c->argv[1]->ptr;
4579 int plen = sdslen(pattern);
a3f9eec2 4580 unsigned long numkeys = 0;
ed9b544e 4581 robj *lenobj = createObject(REDIS_STRING,NULL);
4582
3305306f 4583 di = dictGetIterator(c->db->dict);
ed9b544e 4584 addReply(c,lenobj);
4585 decrRefCount(lenobj);
4586 while((de = dictNext(di)) != NULL) {
4587 robj *keyobj = dictGetEntryKey(de);
3305306f 4588
ed9b544e 4589 sds key = keyobj->ptr;
4590 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4591 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4592 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4593 addReplyBulk(c,keyobj);
3305306f 4594 numkeys++;
3305306f 4595 }
ed9b544e 4596 }
4597 }
4598 dictReleaseIterator(di);
a3f9eec2 4599 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4600}
4601
4602static void dbsizeCommand(redisClient *c) {
4603 addReplySds(c,
3305306f 4604 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4605}
4606
4607static void lastsaveCommand(redisClient *c) {
4608 addReplySds(c,
c937aa89 4609 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4610}
4611
4612static void typeCommand(redisClient *c) {
3305306f 4613 robj *o;
ed9b544e 4614 char *type;
3305306f 4615
4616 o = lookupKeyRead(c->db,c->argv[1]);
4617 if (o == NULL) {
c937aa89 4618 type = "+none";
ed9b544e 4619 } else {
ed9b544e 4620 switch(o->type) {
c937aa89 4621 case REDIS_STRING: type = "+string"; break;
4622 case REDIS_LIST: type = "+list"; break;
4623 case REDIS_SET: type = "+set"; break;
412a8bce 4624 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4625 case REDIS_HASH: type = "+hash"; break;
4626 default: type = "+unknown"; break;
ed9b544e 4627 }
4628 }
4629 addReplySds(c,sdsnew(type));
4630 addReply(c,shared.crlf);
4631}
4632
4633static void saveCommand(redisClient *c) {
9d65a1bb 4634 if (server.bgsavechildpid != -1) {
05557f6d 4635 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4636 return;
4637 }
f78fd11b 4638 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4639 addReply(c,shared.ok);
4640 } else {
4641 addReply(c,shared.err);
4642 }
4643}
4644
4645static void bgsaveCommand(redisClient *c) {
9d65a1bb 4646 if (server.bgsavechildpid != -1) {
ed9b544e 4647 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4648 return;
4649 }
f78fd11b 4650 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4651 char *status = "+Background saving started\r\n";
4652 addReplySds(c,sdsnew(status));
ed9b544e 4653 } else {
4654 addReply(c,shared.err);
4655 }
4656}
4657
4658static void shutdownCommand(redisClient *c) {
fab43727 4659 if (prepareForShutdown() == REDIS_OK)
4660 exit(0);
4661 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
ed9b544e 4662}
4663
4664static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4665 robj *o;
4666
4667 /* To use the same key as src and dst is probably an error */
4668 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4669 addReply(c,shared.sameobjecterr);
ed9b544e 4670 return;
4671 }
4672
dd88747b 4673 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4674 return;
dd88747b 4675
ed9b544e 4676 incrRefCount(o);
3305306f 4677 deleteIfVolatile(c->db,c->argv[2]);
4678 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4679 if (nx) {
4680 decrRefCount(o);
c937aa89 4681 addReply(c,shared.czero);
ed9b544e 4682 return;
4683 }
3305306f 4684 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4685 } else {
4686 incrRefCount(c->argv[2]);
4687 }
3305306f 4688 deleteKey(c->db,c->argv[1]);
b167f877 4689 touchWatchedKey(c->db,c->argv[2]);
ed9b544e 4690 server.dirty++;
c937aa89 4691 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4692}
4693
4694static void renameCommand(redisClient *c) {
4695 renameGenericCommand(c,0);
4696}
4697
4698static void renamenxCommand(redisClient *c) {
4699 renameGenericCommand(c,1);
4700}
4701
4702static void moveCommand(redisClient *c) {
3305306f 4703 robj *o;
4704 redisDb *src, *dst;
ed9b544e 4705 int srcid;
4706
4707 /* Obtain source and target DB pointers */
3305306f 4708 src = c->db;
4709 srcid = c->db->id;
ed9b544e 4710 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4711 addReply(c,shared.outofrangeerr);
ed9b544e 4712 return;
4713 }
3305306f 4714 dst = c->db;
4715 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4716
4717 /* If the user is moving using as target the same
4718 * DB as the source DB it is probably an error. */
4719 if (src == dst) {
c937aa89 4720 addReply(c,shared.sameobjecterr);
ed9b544e 4721 return;
4722 }
4723
4724 /* Check if the element exists and get a reference */
3305306f 4725 o = lookupKeyWrite(c->db,c->argv[1]);
4726 if (!o) {
c937aa89 4727 addReply(c,shared.czero);
ed9b544e 4728 return;
4729 }
4730
4731 /* Try to add the element to the target DB */
3305306f 4732 deleteIfVolatile(dst,c->argv[1]);
4733 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4734 addReply(c,shared.czero);
ed9b544e 4735 return;
4736 }
3305306f 4737 incrRefCount(c->argv[1]);
ed9b544e 4738 incrRefCount(o);
4739
4740 /* OK! key moved, free the entry in the source DB */
3305306f 4741 deleteKey(src,c->argv[1]);
ed9b544e 4742 server.dirty++;
c937aa89 4743 addReply(c,shared.cone);
ed9b544e 4744}
4745
4746/* =================================== Lists ================================ */
4747static void pushGenericCommand(redisClient *c, int where) {
4748 robj *lobj;
ed9b544e 4749 list *list;
3305306f 4750
4751 lobj = lookupKeyWrite(c->db,c->argv[1]);
4752 if (lobj == NULL) {
95242ab5 4753 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4754 addReply(c,shared.cone);
95242ab5 4755 return;
4756 }
ed9b544e 4757 lobj = createListObject();
4758 list = lobj->ptr;
4759 if (where == REDIS_HEAD) {
6b47e12e 4760 listAddNodeHead(list,c->argv[2]);
ed9b544e 4761 } else {
6b47e12e 4762 listAddNodeTail(list,c->argv[2]);
ed9b544e 4763 }
3305306f 4764 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4765 incrRefCount(c->argv[1]);
4766 incrRefCount(c->argv[2]);
4767 } else {
ed9b544e 4768 if (lobj->type != REDIS_LIST) {
4769 addReply(c,shared.wrongtypeerr);
4770 return;
4771 }
95242ab5 4772 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4773 addReply(c,shared.cone);
95242ab5 4774 return;
4775 }
ed9b544e 4776 list = lobj->ptr;
4777 if (where == REDIS_HEAD) {
6b47e12e 4778 listAddNodeHead(list,c->argv[2]);
ed9b544e 4779 } else {
6b47e12e 4780 listAddNodeTail(list,c->argv[2]);
ed9b544e 4781 }
4782 incrRefCount(c->argv[2]);
4783 }
4784 server.dirty++;
482b672d 4785 addReplyLongLong(c,listLength(list));
ed9b544e 4786}
4787
4788static void lpushCommand(redisClient *c) {
4789 pushGenericCommand(c,REDIS_HEAD);
4790}
4791
4792static void rpushCommand(redisClient *c) {
4793 pushGenericCommand(c,REDIS_TAIL);
4794}
4795
4796static void llenCommand(redisClient *c) {
3305306f 4797 robj *o;
ed9b544e 4798 list *l;
dd88747b 4799
4800 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4801 checkType(c,o,REDIS_LIST)) return;
e0a62c7f 4802
dd88747b 4803 l = o->ptr;
4804 addReplyUlong(c,listLength(l));
ed9b544e 4805}
4806
4807static void lindexCommand(redisClient *c) {
3305306f 4808 robj *o;
ed9b544e 4809 int index = atoi(c->argv[2]->ptr);
dd88747b 4810 list *list;
4811 listNode *ln;
4812
4813 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4814 checkType(c,o,REDIS_LIST)) return;
4815 list = o->ptr;
4816
4817 ln = listIndex(list, index);
4818 if (ln == NULL) {
c937aa89 4819 addReply(c,shared.nullbulk);
ed9b544e 4820 } else {
dd88747b 4821 robj *ele = listNodeValue(ln);
4822 addReplyBulk(c,ele);
ed9b544e 4823 }
4824}
4825
4826static void lsetCommand(redisClient *c) {
3305306f 4827 robj *o;
ed9b544e 4828 int index = atoi(c->argv[2]->ptr);
dd88747b 4829 list *list;
4830 listNode *ln;
4831
4832 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4833 checkType(c,o,REDIS_LIST)) return;
4834 list = o->ptr;
4835
4836 ln = listIndex(list, index);
4837 if (ln == NULL) {
4838 addReply(c,shared.outofrangeerr);
ed9b544e 4839 } else {
dd88747b 4840 robj *ele = listNodeValue(ln);
ed9b544e 4841
dd88747b 4842 decrRefCount(ele);
4843 listNodeValue(ln) = c->argv[3];
4844 incrRefCount(c->argv[3]);
4845 addReply(c,shared.ok);
4846 server.dirty++;
ed9b544e 4847 }
4848}
4849
4850static void popGenericCommand(redisClient *c, int where) {
3305306f 4851 robj *o;
dd88747b 4852 list *list;
4853 listNode *ln;
3305306f 4854
dd88747b 4855 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4856 checkType(c,o,REDIS_LIST)) return;
4857 list = o->ptr;
ed9b544e 4858
dd88747b 4859 if (where == REDIS_HEAD)
4860 ln = listFirst(list);
4861 else
4862 ln = listLast(list);
ed9b544e 4863
dd88747b 4864 if (ln == NULL) {
4865 addReply(c,shared.nullbulk);
4866 } else {
4867 robj *ele = listNodeValue(ln);
4868 addReplyBulk(c,ele);
4869 listDelNode(list,ln);
3ea27d37 4870 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4871 server.dirty++;
ed9b544e 4872 }
4873}
4874
4875static void lpopCommand(redisClient *c) {
4876 popGenericCommand(c,REDIS_HEAD);
4877}
4878
4879static void rpopCommand(redisClient *c) {
4880 popGenericCommand(c,REDIS_TAIL);
4881}
4882
4883static void lrangeCommand(redisClient *c) {
3305306f 4884 robj *o;
ed9b544e 4885 int start = atoi(c->argv[2]->ptr);
4886 int end = atoi(c->argv[3]->ptr);
dd88747b 4887 int llen;
4888 int rangelen, j;
4889 list *list;
4890 listNode *ln;
4891 robj *ele;
4892
4e27f268 4893 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4894 || checkType(c,o,REDIS_LIST)) return;
dd88747b 4895 list = o->ptr;
4896 llen = listLength(list);
4897
4898 /* convert negative indexes */
4899 if (start < 0) start = llen+start;
4900 if (end < 0) end = llen+end;
4901 if (start < 0) start = 0;
4902 if (end < 0) end = 0;
4903
4904 /* indexes sanity checks */
4905 if (start > end || start >= llen) {
4906 /* Out of range start or start > end result in empty list */
4907 addReply(c,shared.emptymultibulk);
4908 return;
4909 }
4910 if (end >= llen) end = llen-1;
4911 rangelen = (end-start)+1;
3305306f 4912
dd88747b 4913 /* Return the result in form of a multi-bulk reply */
4914 ln = listIndex(list, start);
4915 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4916 for (j = 0; j < rangelen; j++) {
4917 ele = listNodeValue(ln);
4918 addReplyBulk(c,ele);
4919 ln = ln->next;
ed9b544e 4920 }
4921}
4922
4923static void ltrimCommand(redisClient *c) {
3305306f 4924 robj *o;
ed9b544e 4925 int start = atoi(c->argv[2]->ptr);
4926 int end = atoi(c->argv[3]->ptr);
dd88747b 4927 int llen;
4928 int j, ltrim, rtrim;
4929 list *list;
4930 listNode *ln;
4931
4932 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4933 checkType(c,o,REDIS_LIST)) return;
4934 list = o->ptr;
4935 llen = listLength(list);
4936
4937 /* convert negative indexes */
4938 if (start < 0) start = llen+start;
4939 if (end < 0) end = llen+end;
4940 if (start < 0) start = 0;
4941 if (end < 0) end = 0;
4942
4943 /* indexes sanity checks */
4944 if (start > end || start >= llen) {
4945 /* Out of range start or start > end result in empty list */
4946 ltrim = llen;
4947 rtrim = 0;
ed9b544e 4948 } else {
dd88747b 4949 if (end >= llen) end = llen-1;
4950 ltrim = start;
4951 rtrim = llen-end-1;
4952 }
ed9b544e 4953
dd88747b 4954 /* Remove list elements to perform the trim */
4955 for (j = 0; j < ltrim; j++) {
4956 ln = listFirst(list);
4957 listDelNode(list,ln);
4958 }
4959 for (j = 0; j < rtrim; j++) {
4960 ln = listLast(list);
4961 listDelNode(list,ln);
ed9b544e 4962 }
3ea27d37 4963 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4964 server.dirty++;
4965 addReply(c,shared.ok);
ed9b544e 4966}
4967
4968static void lremCommand(redisClient *c) {
3305306f 4969 robj *o;
dd88747b 4970 list *list;
4971 listNode *ln, *next;
4972 int toremove = atoi(c->argv[2]->ptr);
4973 int removed = 0;
4974 int fromtail = 0;
a4d1ba9a 4975
dd88747b 4976 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4977 checkType(c,o,REDIS_LIST)) return;
4978 list = o->ptr;
4979
4980 if (toremove < 0) {
4981 toremove = -toremove;
4982 fromtail = 1;
4983 }
4984 ln = fromtail ? list->tail : list->head;
4985 while (ln) {
4986 robj *ele = listNodeValue(ln);
4987
4988 next = fromtail ? ln->prev : ln->next;
bf028098 4989 if (equalStringObjects(ele,c->argv[3])) {
dd88747b 4990 listDelNode(list,ln);
4991 server.dirty++;
4992 removed++;
4993 if (toremove && removed == toremove) break;
ed9b544e 4994 }
dd88747b 4995 ln = next;
ed9b544e 4996 }
3ea27d37 4997 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4998 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 4999}
5000
12f9d551 5001/* This is the semantic of this command:
0f5f7e9a 5002 * RPOPLPUSH srclist dstlist:
12f9d551 5003 * IF LLEN(srclist) > 0
5004 * element = RPOP srclist
5005 * LPUSH dstlist element
5006 * RETURN element
5007 * ELSE
5008 * RETURN nil
5009 * END
5010 * END
5011 *
5012 * The idea is to be able to get an element from a list in a reliable way
5013 * since the element is not just returned but pushed against another list
5014 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5015 */
0f5f7e9a 5016static void rpoplpushcommand(redisClient *c) {
12f9d551 5017 robj *sobj;
dd88747b 5018 list *srclist;
5019 listNode *ln;
5020
5021 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5022 checkType(c,sobj,REDIS_LIST)) return;
5023 srclist = sobj->ptr;
5024 ln = listLast(srclist);
12f9d551 5025
dd88747b 5026 if (ln == NULL) {
12f9d551 5027 addReply(c,shared.nullbulk);
5028 } else {
dd88747b 5029 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5030 robj *ele = listNodeValue(ln);
5031 list *dstlist;
e20fb74f 5032
dd88747b 5033 if (dobj && dobj->type != REDIS_LIST) {
5034 addReply(c,shared.wrongtypeerr);
5035 return;
5036 }
12f9d551 5037
dd88747b 5038 /* Add the element to the target list (unless it's directly
5039 * passed to some BLPOP-ing client */
5040 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
5041 if (dobj == NULL) {
5042 /* Create the list if the key does not exist */
5043 dobj = createListObject();
5044 dictAdd(c->db->dict,c->argv[2],dobj);
5045 incrRefCount(c->argv[2]);
12f9d551 5046 }
dd88747b 5047 dstlist = dobj->ptr;
5048 listAddNodeHead(dstlist,ele);
5049 incrRefCount(ele);
12f9d551 5050 }
dd88747b 5051
5052 /* Send the element to the client as reply as well */
5053 addReplyBulk(c,ele);
5054
5055 /* Finally remove the element from the source list */
5056 listDelNode(srclist,ln);
3ea27d37 5057 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5058 server.dirty++;
12f9d551 5059 }
5060}
5061
ed9b544e 5062/* ==================================== Sets ================================ */
5063
5064static void saddCommand(redisClient *c) {
ed9b544e 5065 robj *set;
5066
3305306f 5067 set = lookupKeyWrite(c->db,c->argv[1]);
5068 if (set == NULL) {
ed9b544e 5069 set = createSetObject();
3305306f 5070 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 5071 incrRefCount(c->argv[1]);
5072 } else {
ed9b544e 5073 if (set->type != REDIS_SET) {
c937aa89 5074 addReply(c,shared.wrongtypeerr);
ed9b544e 5075 return;
5076 }
5077 }
5078 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5079 incrRefCount(c->argv[2]);
5080 server.dirty++;
c937aa89 5081 addReply(c,shared.cone);
ed9b544e 5082 } else {
c937aa89 5083 addReply(c,shared.czero);
ed9b544e 5084 }
5085}
5086
5087static void sremCommand(redisClient *c) {
3305306f 5088 robj *set;
ed9b544e 5089
dd88747b 5090 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5091 checkType(c,set,REDIS_SET)) return;
5092
5093 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5094 server.dirty++;
5095 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5096 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5097 addReply(c,shared.cone);
ed9b544e 5098 } else {
dd88747b 5099 addReply(c,shared.czero);
ed9b544e 5100 }
5101}
5102
a4460ef4 5103static void smoveCommand(redisClient *c) {
5104 robj *srcset, *dstset;
5105
5106 srcset = lookupKeyWrite(c->db,c->argv[1]);
5107 dstset = lookupKeyWrite(c->db,c->argv[2]);
5108
5109 /* If the source key does not exist return 0, if it's of the wrong type
5110 * raise an error */
5111 if (srcset == NULL || srcset->type != REDIS_SET) {
5112 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5113 return;
5114 }
5115 /* Error if the destination key is not a set as well */
5116 if (dstset && dstset->type != REDIS_SET) {
5117 addReply(c,shared.wrongtypeerr);
5118 return;
5119 }
5120 /* Remove the element from the source set */
5121 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5122 /* Key not found in the src set! return zero */
5123 addReply(c,shared.czero);
5124 return;
5125 }
3ea27d37 5126 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5127 deleteKey(c->db,c->argv[1]);
a4460ef4 5128 server.dirty++;
5129 /* Add the element to the destination set */
5130 if (!dstset) {
5131 dstset = createSetObject();
5132 dictAdd(c->db->dict,c->argv[2],dstset);
5133 incrRefCount(c->argv[2]);
5134 }
5135 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5136 incrRefCount(c->argv[3]);
5137 addReply(c,shared.cone);
5138}
5139
ed9b544e 5140static void sismemberCommand(redisClient *c) {
3305306f 5141 robj *set;
ed9b544e 5142
dd88747b 5143 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5144 checkType(c,set,REDIS_SET)) return;
5145
5146 if (dictFind(set->ptr,c->argv[2]))
5147 addReply(c,shared.cone);
5148 else
c937aa89 5149 addReply(c,shared.czero);
ed9b544e 5150}
5151
5152static void scardCommand(redisClient *c) {
3305306f 5153 robj *o;
ed9b544e 5154 dict *s;
dd88747b 5155
5156 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5157 checkType(c,o,REDIS_SET)) return;
e0a62c7f 5158
dd88747b 5159 s = o->ptr;
5160 addReplyUlong(c,dictSize(s));
ed9b544e 5161}
5162
12fea928 5163static void spopCommand(redisClient *c) {
5164 robj *set;
5165 dictEntry *de;
5166
dd88747b 5167 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5168 checkType(c,set,REDIS_SET)) return;
5169
5170 de = dictGetRandomKey(set->ptr);
5171 if (de == NULL) {
12fea928 5172 addReply(c,shared.nullbulk);
5173 } else {
dd88747b 5174 robj *ele = dictGetEntryKey(de);
12fea928 5175
dd88747b 5176 addReplyBulk(c,ele);
5177 dictDelete(set->ptr,ele);
5178 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5179 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5180 server.dirty++;
12fea928 5181 }
5182}
5183
2abb95a9 5184static void srandmemberCommand(redisClient *c) {
5185 robj *set;
5186 dictEntry *de;
5187
dd88747b 5188 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5189 checkType(c,set,REDIS_SET)) return;
5190
5191 de = dictGetRandomKey(set->ptr);
5192 if (de == NULL) {
2abb95a9 5193 addReply(c,shared.nullbulk);
5194 } else {
dd88747b 5195 robj *ele = dictGetEntryKey(de);
2abb95a9 5196
dd88747b 5197 addReplyBulk(c,ele);
2abb95a9 5198 }
5199}
5200
ed9b544e 5201static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5202 dict **d1 = (void*) s1, **d2 = (void*) s2;
5203
3305306f 5204 return dictSize(*d1)-dictSize(*d2);
ed9b544e 5205}
5206
682ac724 5207static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 5208 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5209 dictIterator *di;
5210 dictEntry *de;
5211 robj *lenobj = NULL, *dstset = NULL;
682ac724 5212 unsigned long j, cardinality = 0;
ed9b544e 5213
ed9b544e 5214 for (j = 0; j < setsnum; j++) {
5215 robj *setobj;
3305306f 5216
5217 setobj = dstkey ?
5218 lookupKeyWrite(c->db,setskeys[j]) :
5219 lookupKeyRead(c->db,setskeys[j]);
5220 if (!setobj) {
ed9b544e 5221 zfree(dv);
5faa6025 5222 if (dstkey) {
fdcaae84 5223 if (deleteKey(c->db,dstkey))
5224 server.dirty++;
0d36ded0 5225 addReply(c,shared.czero);
5faa6025 5226 } else {
4e27f268 5227 addReply(c,shared.emptymultibulk);
5faa6025 5228 }
ed9b544e 5229 return;
5230 }
ed9b544e 5231 if (setobj->type != REDIS_SET) {
5232 zfree(dv);
c937aa89 5233 addReply(c,shared.wrongtypeerr);
ed9b544e 5234 return;
5235 }
5236 dv[j] = setobj->ptr;
5237 }
5238 /* Sort sets from the smallest to largest, this will improve our
5239 * algorithm's performace */
5240 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5241
5242 /* The first thing we should output is the total number of elements...
5243 * since this is a multi-bulk write, but at this stage we don't know
5244 * the intersection set size, so we use a trick, append an empty object
5245 * to the output list and save the pointer to later modify it with the
5246 * right length */
5247 if (!dstkey) {
5248 lenobj = createObject(REDIS_STRING,NULL);
5249 addReply(c,lenobj);
5250 decrRefCount(lenobj);
5251 } else {
5252 /* If we have a target key where to store the resulting set
5253 * create this key with an empty set inside */
5254 dstset = createSetObject();
ed9b544e 5255 }
5256
5257 /* Iterate all the elements of the first (smallest) set, and test
5258 * the element against all the other sets, if at least one set does
5259 * not include the element it is discarded */
5260 di = dictGetIterator(dv[0]);
ed9b544e 5261
5262 while((de = dictNext(di)) != NULL) {
5263 robj *ele;
5264
5265 for (j = 1; j < setsnum; j++)
5266 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5267 if (j != setsnum)
5268 continue; /* at least one set does not contain the member */
5269 ele = dictGetEntryKey(de);
5270 if (!dstkey) {
dd88747b 5271 addReplyBulk(c,ele);
ed9b544e 5272 cardinality++;
5273 } else {
5274 dictAdd(dstset->ptr,ele,NULL);
5275 incrRefCount(ele);
5276 }
5277 }
5278 dictReleaseIterator(di);
5279
83cdfe18 5280 if (dstkey) {
3ea27d37 5281 /* Store the resulting set into the target, if the intersection
5282 * is not an empty set. */
83cdfe18 5283 deleteKey(c->db,dstkey);
3ea27d37 5284 if (dictSize((dict*)dstset->ptr) > 0) {
5285 dictAdd(c->db->dict,dstkey,dstset);
5286 incrRefCount(dstkey);
482b672d 5287 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5288 } else {
5289 decrRefCount(dstset);
d36c4e97 5290 addReply(c,shared.czero);
3ea27d37 5291 }
40d224a9 5292 server.dirty++;
d36c4e97 5293 } else {
5294 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 5295 }
ed9b544e 5296 zfree(dv);
5297}
5298
5299static void sinterCommand(redisClient *c) {
5300 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5301}
5302
5303static void sinterstoreCommand(redisClient *c) {
5304 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5305}
5306
f4f56e1d 5307#define REDIS_OP_UNION 0
5308#define REDIS_OP_DIFF 1
2830ca53 5309#define REDIS_OP_INTER 2
f4f56e1d 5310
5311static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 5312 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5313 dictIterator *di;
5314 dictEntry *de;
f4f56e1d 5315 robj *dstset = NULL;
40d224a9 5316 int j, cardinality = 0;
5317
40d224a9 5318 for (j = 0; j < setsnum; j++) {
5319 robj *setobj;
5320
5321 setobj = dstkey ?
5322 lookupKeyWrite(c->db,setskeys[j]) :
5323 lookupKeyRead(c->db,setskeys[j]);
5324 if (!setobj) {
5325 dv[j] = NULL;
5326 continue;
5327 }
5328 if (setobj->type != REDIS_SET) {
5329 zfree(dv);
5330 addReply(c,shared.wrongtypeerr);
5331 return;
5332 }
5333 dv[j] = setobj->ptr;
5334 }
5335
5336 /* We need a temp set object to store our union. If the dstkey
5337 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5338 * this set object will be the resulting object to set into the target key*/
5339 dstset = createSetObject();
5340
40d224a9 5341 /* Iterate all the elements of all the sets, add every element a single
5342 * time to the result set */
5343 for (j = 0; j < setsnum; j++) {
51829ed3 5344 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 5345 if (!dv[j]) continue; /* non existing keys are like empty sets */
5346
5347 di = dictGetIterator(dv[j]);
40d224a9 5348
5349 while((de = dictNext(di)) != NULL) {
5350 robj *ele;
5351
5352 /* dictAdd will not add the same element multiple times */
5353 ele = dictGetEntryKey(de);
f4f56e1d 5354 if (op == REDIS_OP_UNION || j == 0) {
5355 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5356 incrRefCount(ele);
40d224a9 5357 cardinality++;
5358 }
f4f56e1d 5359 } else if (op == REDIS_OP_DIFF) {
5360 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5361 cardinality--;
5362 }
40d224a9 5363 }
5364 }
5365 dictReleaseIterator(di);
51829ed3 5366
d36c4e97 5367 /* result set is empty? Exit asap. */
5368 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5369 }
5370
f4f56e1d 5371 /* Output the content of the resulting set, if not in STORE mode */
5372 if (!dstkey) {
5373 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5374 di = dictGetIterator(dstset->ptr);
f4f56e1d 5375 while((de = dictNext(di)) != NULL) {
5376 robj *ele;
5377
5378 ele = dictGetEntryKey(de);
dd88747b 5379 addReplyBulk(c,ele);
f4f56e1d 5380 }
5381 dictReleaseIterator(di);
d36c4e97 5382 decrRefCount(dstset);
83cdfe18
AG
5383 } else {
5384 /* If we have a target key where to store the resulting set
5385 * create this key with the result set inside */
5386 deleteKey(c->db,dstkey);
3ea27d37 5387 if (dictSize((dict*)dstset->ptr) > 0) {
5388 dictAdd(c->db->dict,dstkey,dstset);
5389 incrRefCount(dstkey);
482b672d 5390 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5391 } else {
5392 decrRefCount(dstset);
d36c4e97 5393 addReply(c,shared.czero);
3ea27d37 5394 }
40d224a9 5395 server.dirty++;
5396 }
5397 zfree(dv);
5398}
5399
5400static void sunionCommand(redisClient *c) {
f4f56e1d 5401 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 5402}
5403
5404static void sunionstoreCommand(redisClient *c) {
f4f56e1d 5405 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5406}
5407
5408static void sdiffCommand(redisClient *c) {
5409 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5410}
5411
5412static void sdiffstoreCommand(redisClient *c) {
5413 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 5414}
5415
6b47e12e 5416/* ==================================== ZSets =============================== */
5417
/* ZSETs are ordered sets that use two data structures to hold the same
 * elements, in order to get O(log(N)) INSERT and REMOVE operations on a
 * sorted data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by score in this "view"). */
5425
/* This skiplist implementation is almost a C translation of the original
 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
 * Alternative to Balanced Trees", modified in three ways:
 * a) this implementation allows for repeated values.
 * b) the comparison is not just by key (our 'score') but by satellite data.
 * c) there is a back pointer, so it's a doubly linked list with the back
 * pointers being only at "level 1". This makes it possible to traverse the
 * list from tail to head, which is useful for ZREVRANGE. */
5434
5435static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5436 zskiplistNode *zn = zmalloc(sizeof(*zn));
5437
5438 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2f4dd7e0 5439 if (level > 1)
2b37892e 5440 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
2f4dd7e0 5441 else
5442 zn->span = NULL;
6b47e12e 5443 zn->score = score;
5444 zn->obj = obj;
5445 return zn;
5446}
5447
5448static zskiplist *zslCreate(void) {
5449 int j;
5450 zskiplist *zsl;
e0a62c7f 5451
6b47e12e 5452 zsl = zmalloc(sizeof(*zsl));
5453 zsl->level = 1;
cc812361 5454 zsl->length = 0;
6b47e12e 5455 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5456 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5457 zsl->header->forward[j] = NULL;
94e543b5 5458
5459 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5460 if (j < ZSKIPLIST_MAXLEVEL-1)
5461 zsl->header->span[j] = 0;
69d95c3e 5462 }
e3870fab 5463 zsl->header->backward = NULL;
5464 zsl->tail = NULL;
6b47e12e 5465 return zsl;
5466}
5467
fd8ccf44 5468static void zslFreeNode(zskiplistNode *node) {
5469 decrRefCount(node->obj);
ad807e6f 5470 zfree(node->forward);
69d95c3e 5471 zfree(node->span);
fd8ccf44 5472 zfree(node);
5473}
5474
5475static void zslFree(zskiplist *zsl) {
ad807e6f 5476 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5477
ad807e6f 5478 zfree(zsl->header->forward);
69d95c3e 5479 zfree(zsl->header->span);
ad807e6f 5480 zfree(zsl->header);
fd8ccf44 5481 while(node) {
599379dd 5482 next = node->forward[0];
fd8ccf44 5483 zslFreeNode(node);
5484 node = next;
5485 }
ad807e6f 5486 zfree(zsl);
fd8ccf44 5487}
5488
6b47e12e 5489static int zslRandomLevel(void) {
5490 int level = 1;
5491 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5492 level += 1;
10c2baa5 5493 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5494}
5495
5496static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5497 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5498 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5499 int i, level;
5500
5501 x = zsl->header;
5502 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5503 /* store rank that is crossed to reach the insert position */
5504 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5505
9d60e6e4 5506 while (x->forward[i] &&
5507 (x->forward[i]->score < score ||
5508 (x->forward[i]->score == score &&
69d95c3e 5509 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5510 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5511 x = x->forward[i];
69d95c3e 5512 }
6b47e12e 5513 update[i] = x;
5514 }
6b47e12e 5515 /* we assume the key is not already inside, since we allow duplicated
5516 * scores, and the re-insertion of score and redis object should never
5517 * happpen since the caller of zslInsert() should test in the hash table
5518 * if the element is already inside or not. */
5519 level = zslRandomLevel();
5520 if (level > zsl->level) {
69d95c3e 5521 for (i = zsl->level; i < level; i++) {
2b37892e 5522 rank[i] = 0;
6b47e12e 5523 update[i] = zsl->header;
2b37892e 5524 update[i]->span[i-1] = zsl->length;
69d95c3e 5525 }
6b47e12e 5526 zsl->level = level;
5527 }
5528 x = zslCreateNode(level,score,obj);
5529 for (i = 0; i < level; i++) {
5530 x->forward[i] = update[i]->forward[i];
5531 update[i]->forward[i] = x;
69d95c3e
PN
5532
5533 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5534 if (i > 0) {
5535 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5536 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5537 }
6b47e12e 5538 }
69d95c3e
PN
5539
5540 /* increment span for untouched levels */
5541 for (i = level; i < zsl->level; i++) {
2b37892e 5542 update[i]->span[i-1]++;
69d95c3e
PN
5543 }
5544
bb975144 5545 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5546 if (x->forward[0])
5547 x->forward[0]->backward = x;
5548 else
5549 zsl->tail = x;
cc812361 5550 zsl->length++;
6b47e12e 5551}
5552
84105336
PN
5553/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5554void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5555 int i;
5556 for (i = 0; i < zsl->level; i++) {
5557 if (update[i]->forward[i] == x) {
5558 if (i > 0) {
5559 update[i]->span[i-1] += x->span[i-1] - 1;
5560 }
5561 update[i]->forward[i] = x->forward[i];
5562 } else {
5563 /* invariant: i > 0, because update[0]->forward[0]
5564 * is always equal to x */
5565 update[i]->span[i-1] -= 1;
5566 }
5567 }
5568 if (x->forward[0]) {
5569 x->forward[0]->backward = x->backward;
5570 } else {
5571 zsl->tail = x->backward;
5572 }
5573 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5574 zsl->level--;
5575 zsl->length--;
5576}
5577
50c55df5 5578/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5579static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5580 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5581 int i;
5582
5583 x = zsl->header;
5584 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5585 while (x->forward[i] &&
5586 (x->forward[i]->score < score ||
5587 (x->forward[i]->score == score &&
5588 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5589 x = x->forward[i];
5590 update[i] = x;
5591 }
5592 /* We may have multiple elements with the same score, what we need
5593 * is to find the element with both the right score and object. */
5594 x = x->forward[0];
bf028098 5595 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
84105336 5596 zslDeleteNode(zsl, x, update);
9d60e6e4 5597 zslFreeNode(x);
9d60e6e4 5598 return 1;
5599 } else {
5600 return 0; /* not found */
e197b441 5601 }
5602 return 0; /* not found */
fd8ccf44 5603}
5604
1807985b 5605/* Delete all the elements with score between min and max from the skiplist.
5606 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5607 * Note that this function takes the reference to the hash table view of the
5608 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5609static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5610 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5611 unsigned long removed = 0;
5612 int i;
5613
5614 x = zsl->header;
5615 for (i = zsl->level-1; i >= 0; i--) {
5616 while (x->forward[i] && x->forward[i]->score < min)
5617 x = x->forward[i];
5618 update[i] = x;
5619 }
5620 /* We may have multiple elements with the same score, what we need
5621 * is to find the element with both the right score and object. */
5622 x = x->forward[0];
5623 while (x && x->score <= max) {
84105336
PN
5624 zskiplistNode *next = x->forward[0];
5625 zslDeleteNode(zsl, x, update);
1807985b 5626 dictDelete(dict,x->obj);
5627 zslFreeNode(x);
1807985b 5628 removed++;
5629 x = next;
5630 }
5631 return removed; /* not found */
5632}
1807985b 5633
9212eafd 5634/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5635 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5636static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5637 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5638 unsigned long traversed = 0, removed = 0;
5639 int i;
5640
9212eafd
PN
5641 x = zsl->header;
5642 for (i = zsl->level-1; i >= 0; i--) {
5643 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5644 traversed += i > 0 ? x->span[i-1] : 1;
5645 x = x->forward[i];
1807985b 5646 }
9212eafd
PN
5647 update[i] = x;
5648 }
5649
5650 traversed++;
5651 x = x->forward[0];
5652 while (x && traversed <= end) {
84105336
PN
5653 zskiplistNode *next = x->forward[0];
5654 zslDeleteNode(zsl, x, update);
1807985b 5655 dictDelete(dict,x->obj);
5656 zslFreeNode(x);
1807985b 5657 removed++;
9212eafd 5658 traversed++;
1807985b 5659 x = next;
5660 }
9212eafd 5661 return removed;
1807985b 5662}
5663
50c55df5 5664/* Find the first node having a score equal or greater than the specified one.
5665 * Returns NULL if there is no match. */
5666static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5667 zskiplistNode *x;
5668 int i;
5669
5670 x = zsl->header;
5671 for (i = zsl->level-1; i >= 0; i--) {
5672 while (x->forward[i] && x->forward[i]->score < score)
5673 x = x->forward[i];
5674 }
5675 /* We may have multiple elements with the same score, what we need
5676 * is to find the element with both the right score and object. */
5677 return x->forward[0];
5678}
5679
27b0ccca
PN
5680/* Find the rank for an element by both score and key.
5681 * Returns 0 when the element cannot be found, rank otherwise.
5682 * Note that the rank is 1-based due to the span of zsl->header to the
5683 * first element. */
5684static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5685 zskiplistNode *x;
5686 unsigned long rank = 0;
5687 int i;
5688
5689 x = zsl->header;
5690 for (i = zsl->level-1; i >= 0; i--) {
5691 while (x->forward[i] &&
5692 (x->forward[i]->score < score ||
5693 (x->forward[i]->score == score &&
5694 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5695 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5696 x = x->forward[i];
5697 }
5698
5699 /* x might be equal to zsl->header, so test if obj is non-NULL */
bf028098 5700 if (x->obj && equalStringObjects(x->obj,o)) {
27b0ccca
PN
5701 return rank;
5702 }
5703 }
5704 return 0;
5705}
5706
e74825c2
PN
5707/* Finds an element by its rank. The rank argument needs to be 1-based. */
5708zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5709 zskiplistNode *x;
5710 unsigned long traversed = 0;
5711 int i;
5712
5713 x = zsl->header;
5714 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 5715 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5716 {
a50ea45c 5717 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5718 x = x->forward[i];
5719 }
e74825c2
PN
5720 if (traversed == rank) {
5721 return x;
5722 }
5723 }
5724 return NULL;
5725}
5726
fd8ccf44 5727/* The actual Z-commands implementations */
5728
7db723ad 5729/* This generic command implements both ZADD and ZINCRBY.
e2665397 5730 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5731 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5732static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5733 robj *zsetobj;
5734 zset *zs;
5735 double *score;
5736
e2665397 5737 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5738 if (zsetobj == NULL) {
5739 zsetobj = createZsetObject();
e2665397 5740 dictAdd(c->db->dict,key,zsetobj);
5741 incrRefCount(key);
fd8ccf44 5742 } else {
5743 if (zsetobj->type != REDIS_ZSET) {
5744 addReply(c,shared.wrongtypeerr);
5745 return;
5746 }
5747 }
fd8ccf44 5748 zs = zsetobj->ptr;
e2665397 5749
7db723ad 5750 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5751 * needs to handle the two different conditions. It's all about setting
5752 * '*score', that is, the new score to set, to the right value. */
5753 score = zmalloc(sizeof(double));
5754 if (doincrement) {
5755 dictEntry *de;
5756
5757 /* Read the old score. If the element was not present starts from 0 */
5758 de = dictFind(zs->dict,ele);
5759 if (de) {
5760 double *oldscore = dictGetEntryVal(de);
5761 *score = *oldscore + scoreval;
5762 } else {
5763 *score = scoreval;
5764 }
5765 } else {
5766 *score = scoreval;
5767 }
5768
5769 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5770 * to both ZADD and ZINCRBY... */
e2665397 5771 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5772 /* case 1: New element */
e2665397 5773 incrRefCount(ele); /* added to hash */
5774 zslInsert(zs->zsl,*score,ele);
5775 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5776 server.dirty++;
e2665397 5777 if (doincrement)
e2665397 5778 addReplyDouble(c,*score);
91d71bfc 5779 else
5780 addReply(c,shared.cone);
fd8ccf44 5781 } else {
5782 dictEntry *de;
5783 double *oldscore;
e0a62c7f 5784
fd8ccf44 5785 /* case 2: Score update operation */
e2665397 5786 de = dictFind(zs->dict,ele);
dfc5e96c 5787 redisAssert(de != NULL);
fd8ccf44 5788 oldscore = dictGetEntryVal(de);
5789 if (*score != *oldscore) {
5790 int deleted;
5791
e2665397 5792 /* Remove and insert the element in the skip list with new score */
5793 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5794 redisAssert(deleted != 0);
e2665397 5795 zslInsert(zs->zsl,*score,ele);
5796 incrRefCount(ele);
5797 /* Update the score in the hash table */
5798 dictReplace(zs->dict,ele,score);
fd8ccf44 5799 server.dirty++;
2161a965 5800 } else {
5801 zfree(score);
fd8ccf44 5802 }
e2665397 5803 if (doincrement)
5804 addReplyDouble(c,*score);
5805 else
5806 addReply(c,shared.czero);
fd8ccf44 5807 }
5808}
5809
e2665397 5810static void zaddCommand(redisClient *c) {
5811 double scoreval;
5812
bd79a6bd 5813 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5814 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5815}
5816
7db723ad 5817static void zincrbyCommand(redisClient *c) {
e2665397 5818 double scoreval;
5819
bd79a6bd 5820 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5821 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5822}
5823
1b7106e7 5824static void zremCommand(redisClient *c) {
5825 robj *zsetobj;
5826 zset *zs;
dd88747b 5827 dictEntry *de;
5828 double *oldscore;
5829 int deleted;
1b7106e7 5830
dd88747b 5831 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5832 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 5833
dd88747b 5834 zs = zsetobj->ptr;
5835 de = dictFind(zs->dict,c->argv[2]);
5836 if (de == NULL) {
5837 addReply(c,shared.czero);
5838 return;
1b7106e7 5839 }
dd88747b 5840 /* Delete from the skiplist */
5841 oldscore = dictGetEntryVal(de);
5842 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5843 redisAssert(deleted != 0);
5844
5845 /* Delete from the hash table */
5846 dictDelete(zs->dict,c->argv[2]);
5847 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5848 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5849 server.dirty++;
5850 addReply(c,shared.cone);
1b7106e7 5851}
5852
1807985b 5853static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
5854 double min;
5855 double max;
dd88747b 5856 long deleted;
1807985b 5857 robj *zsetobj;
5858 zset *zs;
5859
bd79a6bd
PN
5860 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5861 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
bbe025e0 5862
dd88747b 5863 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5864 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 5865
dd88747b 5866 zs = zsetobj->ptr;
5867 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5868 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5869 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5870 server.dirty += deleted;
482b672d 5871 addReplyLongLong(c,deleted);
1807985b 5872}
5873
9212eafd 5874static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
5875 long start;
5876 long end;
dd88747b 5877 int llen;
5878 long deleted;
9212eafd
PN
5879 robj *zsetobj;
5880 zset *zs;
5881
bd79a6bd
PN
5882 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5883 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 5884
dd88747b 5885 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5886 checkType(c,zsetobj,REDIS_ZSET)) return;
5887 zs = zsetobj->ptr;
5888 llen = zs->zsl->length;
9212eafd 5889
dd88747b 5890 /* convert negative indexes */
5891 if (start < 0) start = llen+start;
5892 if (end < 0) end = llen+end;
5893 if (start < 0) start = 0;
5894 if (end < 0) end = 0;
9212eafd 5895
dd88747b 5896 /* indexes sanity checks */
5897 if (start > end || start >= llen) {
5898 addReply(c,shared.czero);
5899 return;
9212eafd 5900 }
dd88747b 5901 if (end >= llen) end = llen-1;
5902
5903 /* increment start and end because zsl*Rank functions
5904 * use 1-based rank */
5905 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5906 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5907 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5908 server.dirty += deleted;
482b672d 5909 addReplyLongLong(c, deleted);
9212eafd
PN
5910}
5911
8f92e768
PN
5912typedef struct {
5913 dict *dict;
5914 double weight;
5915} zsetopsrc;
5916
5917static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5918 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5919 unsigned long size1, size2;
5920 size1 = d1->dict ? dictSize(d1->dict) : 0;
5921 size2 = d2->dict ? dictSize(d2->dict) : 0;
5922 return size1 - size2;
5923}
5924
d2764cd6
PN
/* Aggregation modes for sorted set union/intersection. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score of a dict entry: entries with a NULL value count as 1.0.
 * Note that the argument is evaluated more than once. */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))

/* Fold 'val' into '*target' according to 'aggregate': add it for
 * REDIS_AGGR_SUM, keep the smaller of the two for REDIS_AGGR_MIN, the larger
 * for REDIS_AGGR_MAX. Any other mode triggers redisPanic(). */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
    }
}
5942
2830ca53 5943static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
bc000c1d 5944 int i, j, setnum;
d2764cd6 5945 int aggregate = REDIS_AGGR_SUM;
8f92e768 5946 zsetopsrc *src;
2830ca53
PN
5947 robj *dstobj;
5948 zset *dstzset;
b287c9bb
PN
5949 dictIterator *di;
5950 dictEntry *de;
5951
bc000c1d
JC
5952 /* expect setnum input keys to be given */
5953 setnum = atoi(c->argv[2]->ptr);
5954 if (setnum < 1) {
5d373da9 5955 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
2830ca53 5956 return;
b287c9bb 5957 }
2830ca53
PN
5958
5959 /* test if the expected number of keys would overflow */
bc000c1d 5960 if (3+setnum > c->argc) {
b287c9bb
PN
5961 addReply(c,shared.syntaxerr);
5962 return;
5963 }
5964
2830ca53 5965 /* read keys to be used for input */
bc000c1d
JC
5966 src = zmalloc(sizeof(zsetopsrc) * setnum);
5967 for (i = 0, j = 3; i < setnum; i++, j++) {
5968 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
5969 if (!obj) {
8f92e768 5970 src[i].dict = NULL;
b287c9bb 5971 } else {
bc000c1d
JC
5972 if (obj->type == REDIS_ZSET) {
5973 src[i].dict = ((zset*)obj->ptr)->dict;
5974 } else if (obj->type == REDIS_SET) {
5975 src[i].dict = (obj->ptr);
5976 } else {
8f92e768 5977 zfree(src);
b287c9bb
PN
5978 addReply(c,shared.wrongtypeerr);
5979 return;
5980 }
b287c9bb 5981 }
2830ca53
PN
5982
5983 /* default all weights to 1 */
8f92e768 5984 src[i].weight = 1.0;
b287c9bb
PN
5985 }
5986
2830ca53
PN
5987 /* parse optional extra arguments */
5988 if (j < c->argc) {
d2764cd6 5989 int remaining = c->argc - j;
b287c9bb 5990
2830ca53 5991 while (remaining) {
bc000c1d 5992 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 5993 j++; remaining--;
bc000c1d 5994 for (i = 0; i < setnum; i++, j++, remaining--) {
bd79a6bd 5995 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
bbe025e0 5996 return;
2830ca53 5997 }
d2764cd6
PN
5998 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5999 j++; remaining--;
6000 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6001 aggregate = REDIS_AGGR_SUM;
6002 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6003 aggregate = REDIS_AGGR_MIN;
6004 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6005 aggregate = REDIS_AGGR_MAX;
6006 } else {
6007 zfree(src);
6008 addReply(c,shared.syntaxerr);
6009 return;
6010 }
6011 j++; remaining--;
2830ca53 6012 } else {
8f92e768 6013 zfree(src);
2830ca53
PN
6014 addReply(c,shared.syntaxerr);
6015 return;
6016 }
6017 }
6018 }
b287c9bb 6019
d2764cd6
PN
6020 /* sort sets from the smallest to largest, this will improve our
6021 * algorithm's performance */
bc000c1d 6022 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
d2764cd6 6023
2830ca53
PN
6024 dstobj = createZsetObject();
6025 dstzset = dstobj->ptr;
6026
6027 if (op == REDIS_OP_INTER) {
8f92e768
PN
6028 /* skip going over all entries if the smallest zset is NULL or empty */
6029 if (src[0].dict && dictSize(src[0].dict) > 0) {
6030 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6031 * from small to large, all src[i > 0].dict are non-empty too */
6032 di = dictGetIterator(src[0].dict);
2830ca53 6033 while((de = dictNext(di)) != NULL) {
d2764cd6 6034 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6035 *score = src[0].weight * zunionInterDictValue(de);
2830ca53 6036
bc000c1d 6037 for (j = 1; j < setnum; j++) {
d2764cd6 6038 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6039 if (other) {
bc000c1d 6040 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6041 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6042 } else {
6043 break;
6044 }
6045 }
b287c9bb 6046
2830ca53 6047 /* skip entry when not present in every source dict */
bc000c1d 6048 if (j != setnum) {
2830ca53
PN
6049 zfree(score);
6050 } else {
6051 robj *o = dictGetEntryKey(de);
6052 dictAdd(dstzset->dict,o,score);
6053 incrRefCount(o); /* added to dictionary */
6054 zslInsert(dstzset->zsl,*score,o);
6055 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
6056 }
6057 }
2830ca53
PN
6058 dictReleaseIterator(di);
6059 }
6060 } else if (op == REDIS_OP_UNION) {
bc000c1d 6061 for (i = 0; i < setnum; i++) {
8f92e768 6062 if (!src[i].dict) continue;
2830ca53 6063
8f92e768 6064 di = dictGetIterator(src[i].dict);
2830ca53
PN
6065 while((de = dictNext(di)) != NULL) {
6066 /* skip key when already processed */
6067 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6068
d2764cd6 6069 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6070 *score = src[i].weight * zunionInterDictValue(de);
2830ca53 6071
d2764cd6
PN
6072 /* because the zsets are sorted by size, its only possible
6073 * for sets at larger indices to hold this entry */
bc000c1d 6074 for (j = (i+1); j < setnum; j++) {
d2764cd6 6075 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6076 if (other) {
bc000c1d 6077 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6078 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6079 }
6080 }
b287c9bb 6081
2830ca53
PN
6082 robj *o = dictGetEntryKey(de);
6083 dictAdd(dstzset->dict,o,score);
6084 incrRefCount(o); /* added to dictionary */
6085 zslInsert(dstzset->zsl,*score,o);
6086 incrRefCount(o); /* added to skiplist */
6087 }
6088 dictReleaseIterator(di);
b287c9bb 6089 }
2830ca53
PN
6090 } else {
6091 /* unknown operator */
6092 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
6093 }
6094
6095 deleteKey(c->db,dstkey);
3ea27d37 6096 if (dstzset->zsl->length) {
6097 dictAdd(c->db->dict,dstkey,dstobj);
6098 incrRefCount(dstkey);
482b672d 6099 addReplyLongLong(c, dstzset->zsl->length);
3ea27d37 6100 server.dirty++;
6101 } else {
8bca8773 6102 decrRefCount(dstobj);
3ea27d37 6103 addReply(c, shared.czero);
6104 }
8f92e768 6105 zfree(src);
b287c9bb
PN
6106}
6107
5d373da9 6108static void zunionstoreCommand(redisClient *c) {
2830ca53 6109 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
6110}
6111
5d373da9 6112static void zinterstoreCommand(redisClient *c) {
2830ca53 6113 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
6114}
6115
e3870fab 6116static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 6117 robj *o;
bbe025e0
AM
6118 long start;
6119 long end;
752da584 6120 int withscores = 0;
dd88747b 6121 int llen;
6122 int rangelen, j;
6123 zset *zsetobj;
6124 zskiplist *zsl;
6125 zskiplistNode *ln;
6126 robj *ele;
752da584 6127
bd79a6bd
PN
6128 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6129 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6130
752da584 6131 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6132 withscores = 1;
6133 } else if (c->argc >= 5) {
6134 addReply(c,shared.syntaxerr);
6135 return;
6136 }
cc812361 6137
4e27f268 6138 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6139 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 6140 zsetobj = o->ptr;
6141 zsl = zsetobj->zsl;
6142 llen = zsl->length;
cc812361 6143
dd88747b 6144 /* convert negative indexes */
6145 if (start < 0) start = llen+start;
6146 if (end < 0) end = llen+end;
6147 if (start < 0) start = 0;
6148 if (end < 0) end = 0;
cc812361 6149
dd88747b 6150 /* indexes sanity checks */
6151 if (start > end || start >= llen) {
6152 /* Out of range start or start > end result in empty list */
6153 addReply(c,shared.emptymultibulk);
6154 return;
6155 }
6156 if (end >= llen) end = llen-1;
6157 rangelen = (end-start)+1;
cc812361 6158
dd88747b 6159 /* check if starting point is trivial, before searching
6160 * the element in log(N) time */
6161 if (reverse) {
6162 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6163 } else {
6164 ln = start == 0 ?
6165 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6166 }
cc812361 6167
dd88747b 6168 /* Return the result in form of a multi-bulk reply */
6169 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6170 withscores ? (rangelen*2) : rangelen));
6171 for (j = 0; j < rangelen; j++) {
6172 ele = ln->obj;
6173 addReplyBulk(c,ele);
6174 if (withscores)
6175 addReplyDouble(c,ln->score);
6176 ln = reverse ? ln->backward : ln->forward[0];
cc812361 6177 }
6178}
6179
e3870fab 6180static void zrangeCommand(redisClient *c) {
6181 zrangeGenericCommand(c,0);
6182}
6183
6184static void zrevrangeCommand(redisClient *c) {
6185 zrangeGenericCommand(c,1);
6186}
6187
f44dd428 6188/* This command implements both ZRANGEBYSCORE and ZCOUNT.
6189 * If justcount is non-zero, just the count is returned. */
6190static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 6191 robj *o;
f44dd428 6192 double min, max;
6193 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 6194 int offset = 0, limit = -1;
0500ef27
SH
6195 int withscores = 0;
6196 int badsyntax = 0;
6197
f44dd428 6198 /* Parse the min-max interval. If one of the values is prefixed
6199 * by the "(" character, it's considered "open". For instance
6200 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6201 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6202 if (((char*)c->argv[2]->ptr)[0] == '(') {
6203 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6204 minex = 1;
6205 } else {
6206 min = strtod(c->argv[2]->ptr,NULL);
6207 }
6208 if (((char*)c->argv[3]->ptr)[0] == '(') {
6209 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6210 maxex = 1;
6211 } else {
6212 max = strtod(c->argv[3]->ptr,NULL);
6213 }
6214
6215 /* Parse "WITHSCORES": note that if the command was called with
6216 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6217 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 6218 if (c->argc == 5 || c->argc == 8) {
3a3978b1 6219 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6220 withscores = 1;
6221 else
6222 badsyntax = 1;
0500ef27 6223 }
3a3978b1 6224 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 6225 badsyntax = 1;
0500ef27 6226 if (badsyntax) {
454d4e43 6227 addReplySds(c,
6228 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 6229 return;
0500ef27
SH
6230 }
6231
f44dd428 6232 /* Parse "LIMIT" */
0500ef27 6233 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 6234 addReply(c,shared.syntaxerr);
6235 return;
0500ef27 6236 } else if (c->argc == (7 + withscores)) {
80181f78 6237 offset = atoi(c->argv[5]->ptr);
6238 limit = atoi(c->argv[6]->ptr);
0b13687c 6239 if (offset < 0) offset = 0;
80181f78 6240 }
50c55df5 6241
f44dd428 6242 /* Ok, lookup the key and get the range */
50c55df5 6243 o = lookupKeyRead(c->db,c->argv[1]);
6244 if (o == NULL) {
4e27f268 6245 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6246 } else {
6247 if (o->type != REDIS_ZSET) {
6248 addReply(c,shared.wrongtypeerr);
6249 } else {
6250 zset *zsetobj = o->ptr;
6251 zskiplist *zsl = zsetobj->zsl;
6252 zskiplistNode *ln;
f44dd428 6253 robj *ele, *lenobj = NULL;
6254 unsigned long rangelen = 0;
50c55df5 6255
f44dd428 6256 /* Get the first node with the score >= min, or with
6257 * score > min if 'minex' is true. */
50c55df5 6258 ln = zslFirstWithScore(zsl,min);
f44dd428 6259 while (minex && ln && ln->score == min) ln = ln->forward[0];
6260
50c55df5 6261 if (ln == NULL) {
6262 /* No element matching the speciifed interval */
f44dd428 6263 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6264 return;
6265 }
6266
6267 /* We don't know in advance how many matching elements there
6268 * are in the list, so we push this object that will represent
6269 * the multi-bulk length in the output buffer, and will "fix"
6270 * it later */
f44dd428 6271 if (!justcount) {
6272 lenobj = createObject(REDIS_STRING,NULL);
6273 addReply(c,lenobj);
6274 decrRefCount(lenobj);
6275 }
50c55df5 6276
f44dd428 6277 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 6278 if (offset) {
6279 offset--;
6280 ln = ln->forward[0];
6281 continue;
6282 }
6283 if (limit == 0) break;
f44dd428 6284 if (!justcount) {
6285 ele = ln->obj;
dd88747b 6286 addReplyBulk(c,ele);
f44dd428 6287 if (withscores)
6288 addReplyDouble(c,ln->score);
6289 }
50c55df5 6290 ln = ln->forward[0];
6291 rangelen++;
80181f78 6292 if (limit > 0) limit--;
50c55df5 6293 }
f44dd428 6294 if (justcount) {
482b672d 6295 addReplyLongLong(c,(long)rangelen);
f44dd428 6296 } else {
6297 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6298 withscores ? (rangelen*2) : rangelen);
6299 }
50c55df5 6300 }
6301 }
6302}
6303
f44dd428 6304static void zrangebyscoreCommand(redisClient *c) {
6305 genericZrangebyscoreCommand(c,0);
6306}
6307
6308static void zcountCommand(redisClient *c) {
6309 genericZrangebyscoreCommand(c,1);
6310}
6311
3c41331e 6312static void zcardCommand(redisClient *c) {
e197b441 6313 robj *o;
6314 zset *zs;
dd88747b 6315
6316 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6317 checkType(c,o,REDIS_ZSET)) return;
6318
6319 zs = o->ptr;
6320 addReplyUlong(c,zs->zsl->length);
e197b441 6321}
6322
6e333bbe 6323static void zscoreCommand(redisClient *c) {
6324 robj *o;
6325 zset *zs;
dd88747b 6326 dictEntry *de;
6327
6328 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6329 checkType(c,o,REDIS_ZSET)) return;
6330
6331 zs = o->ptr;
6332 de = dictFind(zs->dict,c->argv[2]);
6333 if (!de) {
96d8b4ee 6334 addReply(c,shared.nullbulk);
6e333bbe 6335 } else {
dd88747b 6336 double *score = dictGetEntryVal(de);
6e333bbe 6337
dd88747b 6338 addReplyDouble(c,*score);
6e333bbe 6339 }
6340}
6341
798d9e55 6342static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 6343 robj *o;
dd88747b 6344 zset *zs;
6345 zskiplist *zsl;
6346 dictEntry *de;
6347 unsigned long rank;
6348 double *score;
6349
6350 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6351 checkType(c,o,REDIS_ZSET)) return;
6352
6353 zs = o->ptr;
6354 zsl = zs->zsl;
6355 de = dictFind(zs->dict,c->argv[2]);
6356 if (!de) {
69d95c3e
PN
6357 addReply(c,shared.nullbulk);
6358 return;
6359 }
69d95c3e 6360
dd88747b 6361 score = dictGetEntryVal(de);
6362 rank = zslGetRank(zsl, *score, c->argv[2]);
6363 if (rank) {
6364 if (reverse) {
482b672d 6365 addReplyLongLong(c, zsl->length - rank);
27b0ccca 6366 } else {
482b672d 6367 addReplyLongLong(c, rank-1);
69d95c3e 6368 }
dd88747b 6369 } else {
6370 addReply(c,shared.nullbulk);
978c2c94 6371 }
6372}
6373
798d9e55
PN
6374static void zrankCommand(redisClient *c) {
6375 zrankGenericCommand(c, 0);
6376}
6377
6378static void zrevrankCommand(redisClient *c) {
6379 zrankGenericCommand(c, 1);
6380}
6381
7fb16bac
PN
6382/* ========================= Hashes utility functions ======================= */
/* Selectors for the field or the value of a hash entry. They are distinct
 * bits: code in this file tests them with '&' and combines them with '|'. */
#define REDIS_HASH_KEY 1
#define REDIS_HASH_VALUE 2
978c2c94 6385
7fb16bac
PN
6386/* Check the length of a number of objects to see if we need to convert a
6387 * zipmap to a real hash. Note that we only check string encoded objects
6388 * as their string length can be queried in constant time. */
6389static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6390 int i;
6391 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
978c2c94 6392
7fb16bac
PN
6393 for (i = start; i <= end; i++) {
6394 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6395 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6396 {
6397 convertToRealHash(subject);
978c2c94 6398 return;
6399 }
6400 }
7fb16bac 6401}
bae2c7ec 6402
97224de7
PN
6403/* Encode given objects in-place when the hash uses a dict. */
6404static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6405 if (subject->encoding == REDIS_ENCODING_HT) {
3f973463
PN
6406 if (o1) *o1 = tryObjectEncoding(*o1);
6407 if (o2) *o2 = tryObjectEncoding(*o2);
97224de7
PN
6408 }
6409}
6410
7fb16bac 6411/* Get the value from a hash identified by key. Returns either a string
a3f3af86
PN
6412 * object or NULL if the value cannot be found. The refcount of the object
6413 * is always increased by 1 when the value was found. */
7fb16bac
PN
6414static robj *hashGet(robj *o, robj *key) {
6415 robj *value = NULL;
978c2c94 6416 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7fb16bac
PN
6417 unsigned char *v;
6418 unsigned int vlen;
6419 key = getDecodedObject(key);
6420 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6421 value = createStringObject((char*)v,vlen);
6422 }
6423 decrRefCount(key);
6424 } else {
6425 dictEntry *de = dictFind(o->ptr,key);
6426 if (de != NULL) {
6427 value = dictGetEntryVal(de);
a3f3af86 6428 incrRefCount(value);
7fb16bac
PN
6429 }
6430 }
6431 return value;
6432}
978c2c94 6433
7fb16bac
PN
6434/* Test if the key exists in the given hash. Returns 1 if the key
6435 * exists and 0 when it doesn't. */
6436static int hashExists(robj *o, robj *key) {
6437 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6438 key = getDecodedObject(key);
6439 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6440 decrRefCount(key);
6441 return 1;
6442 }
6443 decrRefCount(key);
6444 } else {
6445 if (dictFind(o->ptr,key) != NULL) {
6446 return 1;
6447 }
6448 }
6449 return 0;
6450}
bae2c7ec 6451
7fb16bac
PN
6452/* Add an element, discard the old if the key already exists.
6453 * Return 0 on insert and 1 on update. */
feb8d7e6 6454static int hashSet(robj *o, robj *key, robj *value) {
7fb16bac
PN
6455 int update = 0;
6456 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6457 key = getDecodedObject(key);
6458 value = getDecodedObject(value);
6459 o->ptr = zipmapSet(o->ptr,
6460 key->ptr,sdslen(key->ptr),
6461 value->ptr,sdslen(value->ptr), &update);
6462 decrRefCount(key);
6463 decrRefCount(value);
6464
6465 /* Check if the zipmap needs to be upgraded to a real hash table */
6466 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
bae2c7ec 6467 convertToRealHash(o);
978c2c94 6468 } else {
7fb16bac
PN
6469 if (dictReplace(o->ptr,key,value)) {
6470 /* Insert */
6471 incrRefCount(key);
978c2c94 6472 } else {
7fb16bac 6473 /* Update */
978c2c94 6474 update = 1;
6475 }
7fb16bac 6476 incrRefCount(value);
978c2c94 6477 }
7fb16bac 6478 return update;
978c2c94 6479}
6480
7fb16bac
PN
6481/* Delete an element from a hash.
6482 * Return 1 on deleted and 0 on not found. */
6483static int hashDelete(robj *o, robj *key) {
6484 int deleted = 0;
6485 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6486 key = getDecodedObject(key);
6487 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6488 decrRefCount(key);
6489 } else {
6490 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6491 /* Always check if the dictionary needs a resize after a delete. */
6492 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
d33278d1 6493 }
7fb16bac
PN
6494 return deleted;
6495}
d33278d1 6496
7fb16bac 6497/* Return the number of elements in a hash. */
c811bb38 6498static unsigned long hashLength(robj *o) {
7fb16bac
PN
6499 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6500 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6501}
6502
6503/* Structure to hold hash iteration abstration. Note that iteration over
6504 * hashes involves both fields and values. Because it is possible that
6505 * not both are required, store pointers in the iterator to avoid
6506 * unnecessary memory allocation for fields/values. */
6507typedef struct {
6508 int encoding;
6509 unsigned char *zi;
6510 unsigned char *zk, *zv;
6511 unsigned int zklen, zvlen;
6512
6513 dictIterator *di;
6514 dictEntry *de;
6515} hashIterator;
6516
c44d3b56
PN
6517static hashIterator *hashInitIterator(robj *subject) {
6518 hashIterator *hi = zmalloc(sizeof(hashIterator));
7fb16bac
PN
6519 hi->encoding = subject->encoding;
6520 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6521 hi->zi = zipmapRewind(subject->ptr);
6522 } else if (hi->encoding == REDIS_ENCODING_HT) {
6523 hi->di = dictGetIterator(subject->ptr);
d33278d1 6524 } else {
7fb16bac 6525 redisAssert(NULL);
d33278d1 6526 }
c44d3b56 6527 return hi;
7fb16bac 6528}
d33278d1 6529
7fb16bac
PN
6530static void hashReleaseIterator(hashIterator *hi) {
6531 if (hi->encoding == REDIS_ENCODING_HT) {
6532 dictReleaseIterator(hi->di);
d33278d1 6533 }
c44d3b56 6534 zfree(hi);
7fb16bac 6535}
d33278d1 6536
7fb16bac
PN
6537/* Move to the next entry in the hash. Return REDIS_OK when the next entry
6538 * could be found and REDIS_ERR when the iterator reaches the end. */
c811bb38 6539static int hashNext(hashIterator *hi) {
7fb16bac
PN
6540 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6541 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6542 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6543 } else {
6544 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6545 }
6546 return REDIS_OK;
6547}
d33278d1 6548
0c390abc 6549/* Get key or value object at current iteration position.
a3f3af86 6550 * This increases the refcount of the field object by 1. */
c811bb38 6551static robj *hashCurrent(hashIterator *hi, int what) {
7fb16bac
PN
6552 robj *o;
6553 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6554 if (what & REDIS_HASH_KEY) {
6555 o = createStringObject((char*)hi->zk,hi->zklen);
6556 } else {
6557 o = createStringObject((char*)hi->zv,hi->zvlen);
d33278d1 6558 }
d33278d1 6559 } else {
7fb16bac
PN
6560 if (what & REDIS_HASH_KEY) {
6561 o = dictGetEntryKey(hi->de);
6562 } else {
6563 o = dictGetEntryVal(hi->de);
d33278d1 6564 }
a3f3af86 6565 incrRefCount(o);
d33278d1 6566 }
7fb16bac 6567 return o;
d33278d1
PN
6568}
6569
7fb16bac
PN
6570static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6571 robj *o = lookupKeyWrite(c->db,key);
01426b05
PN
6572 if (o == NULL) {
6573 o = createHashObject();
7fb16bac
PN
6574 dictAdd(c->db->dict,key,o);
6575 incrRefCount(key);
01426b05
PN
6576 } else {
6577 if (o->type != REDIS_HASH) {
6578 addReply(c,shared.wrongtypeerr);
7fb16bac 6579 return NULL;
01426b05
PN
6580 }
6581 }
7fb16bac
PN
6582 return o;
6583}
01426b05 6584
7fb16bac
PN
6585/* ============================= Hash commands ============================== */
6586static void hsetCommand(redisClient *c) {
6e9e463f 6587 int update;
7fb16bac 6588 robj *o;
bbe025e0 6589
7fb16bac
PN
6590 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6591 hashTryConversion(o,c->argv,2,3);
97224de7 6592 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6593 update = hashSet(o,c->argv[2],c->argv[3]);
6e9e463f 6594 addReply(c, update ? shared.czero : shared.cone);
7fb16bac
PN
6595 server.dirty++;
6596}
01426b05 6597
1f1c7695
PN
6598static void hsetnxCommand(redisClient *c) {
6599 robj *o;
6600 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6601 hashTryConversion(o,c->argv,2,3);
6602
6603 if (hashExists(o, c->argv[2])) {
6604 addReply(c, shared.czero);
01426b05 6605 } else {
97224de7 6606 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6607 hashSet(o,c->argv[2],c->argv[3]);
1f1c7695
PN
6608 addReply(c, shared.cone);
6609 server.dirty++;
6610 }
6611}
01426b05 6612
7fb16bac
PN
6613static void hmsetCommand(redisClient *c) {
6614 int i;
6615 robj *o;
01426b05 6616
7fb16bac
PN
6617 if ((c->argc % 2) == 1) {
6618 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6619 return;
6620 }
01426b05 6621
7fb16bac
PN
6622 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6623 hashTryConversion(o,c->argv,2,c->argc-1);
6624 for (i = 2; i < c->argc; i += 2) {
97224de7 6625 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
feb8d7e6 6626 hashSet(o,c->argv[i],c->argv[i+1]);
7fb16bac
PN
6627 }
6628 addReply(c, shared.ok);
edc2f63a 6629 server.dirty++;
7fb16bac
PN
6630}
6631
6632static void hincrbyCommand(redisClient *c) {
6633 long long value, incr;
6634 robj *o, *current, *new;
6635
bd79a6bd 6636 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7fb16bac
PN
6637 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6638 if ((current = hashGet(o,c->argv[2])) != NULL) {
946342c1
PN
6639 if (getLongLongFromObjectOrReply(c,current,&value,
6640 "hash value is not an integer") != REDIS_OK) {
6641 decrRefCount(current);
6642 return;
6643 }
a3f3af86 6644 decrRefCount(current);
7fb16bac
PN
6645 } else {
6646 value = 0;
01426b05
PN
6647 }
6648
7fb16bac 6649 value += incr;
3f973463
PN
6650 new = createStringObjectFromLongLong(value);
6651 hashTryObjectEncoding(o,&c->argv[2],NULL);
feb8d7e6 6652 hashSet(o,c->argv[2],new);
7fb16bac
PN
6653 decrRefCount(new);
6654 addReplyLongLong(c,value);
01426b05 6655 server.dirty++;
01426b05
PN
6656}
6657
978c2c94 6658static void hgetCommand(redisClient *c) {
7fb16bac 6659 robj *o, *value;
dd88747b 6660 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6661 checkType(c,o,REDIS_HASH)) return;
6662
7fb16bac
PN
6663 if ((value = hashGet(o,c->argv[2])) != NULL) {
6664 addReplyBulk(c,value);
a3f3af86 6665 decrRefCount(value);
dd88747b 6666 } else {
7fb16bac 6667 addReply(c,shared.nullbulk);
69d95c3e 6668 }
69d95c3e
PN
6669}
6670
09aeb579
PN
6671static void hmgetCommand(redisClient *c) {
6672 int i;
7fb16bac
PN
6673 robj *o, *value;
6674 o = lookupKeyRead(c->db,c->argv[1]);
6675 if (o != NULL && o->type != REDIS_HASH) {
6676 addReply(c,shared.wrongtypeerr);
09aeb579
PN
6677 }
6678
7fb16bac
PN
6679 /* Note the check for o != NULL happens inside the loop. This is
6680 * done because objects that cannot be found are considered to be
6681 * an empty hash. The reply should then be a series of NULLs. */
09aeb579 6682 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7fb16bac
PN
6683 for (i = 2; i < c->argc; i++) {
6684 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6685 addReplyBulk(c,value);
a3f3af86 6686 decrRefCount(value);
7fb16bac
PN
6687 } else {
6688 addReply(c,shared.nullbulk);
09aeb579
PN
6689 }
6690 }
6691}
6692
07efaf74 6693static void hdelCommand(redisClient *c) {
dd88747b 6694 robj *o;
dd88747b 6695 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6696 checkType(c,o,REDIS_HASH)) return;
07efaf74 6697
7fb16bac
PN
6698 if (hashDelete(o,c->argv[2])) {
6699 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6700 addReply(c,shared.cone);
6701 server.dirty++;
dd88747b 6702 } else {
7fb16bac 6703 addReply(c,shared.czero);
07efaf74 6704 }
6705}
6706
92b27fe9 6707static void hlenCommand(redisClient *c) {
6708 robj *o;
dd88747b 6709 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 6710 checkType(c,o,REDIS_HASH)) return;
6711
7fb16bac 6712 addReplyUlong(c,hashLength(o));
92b27fe9 6713}
6714
78409a0f 6715static void genericHgetallCommand(redisClient *c, int flags) {
7fb16bac 6716 robj *o, *lenobj, *obj;
78409a0f 6717 unsigned long count = 0;
c44d3b56 6718 hashIterator *hi;
78409a0f 6719
4e27f268 6720 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 6721 || checkType(c,o,REDIS_HASH)) return;
6722
6723 lenobj = createObject(REDIS_STRING,NULL);
6724 addReply(c,lenobj);
6725 decrRefCount(lenobj);
6726
c44d3b56
PN
6727 hi = hashInitIterator(o);
6728 while (hashNext(hi) != REDIS_ERR) {
7fb16bac 6729 if (flags & REDIS_HASH_KEY) {
c44d3b56 6730 obj = hashCurrent(hi,REDIS_HASH_KEY);
7fb16bac 6731 addReplyBulk(c,obj);
a3f3af86 6732 decrRefCount(obj);
7fb16bac 6733 count++;
78409a0f 6734 }
7fb16bac 6735 if (flags & REDIS_HASH_VALUE) {
c44d3b56 6736 obj = hashCurrent(hi,REDIS_HASH_VALUE);
7fb16bac 6737 addReplyBulk(c,obj);
a3f3af86 6738 decrRefCount(obj);
7fb16bac 6739 count++;
78409a0f 6740 }
78409a0f 6741 }
c44d3b56 6742 hashReleaseIterator(hi);
7fb16bac 6743
78409a0f 6744 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6745}
6746
6747static void hkeysCommand(redisClient *c) {
7fb16bac 6748 genericHgetallCommand(c,REDIS_HASH_KEY);
78409a0f 6749}
6750
6751static void hvalsCommand(redisClient *c) {
7fb16bac 6752 genericHgetallCommand(c,REDIS_HASH_VALUE);
78409a0f 6753}
6754
6755static void hgetallCommand(redisClient *c) {
7fb16bac 6756 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
78409a0f 6757}
6758
a86f14b1 6759static void hexistsCommand(redisClient *c) {
6760 robj *o;
a86f14b1 6761 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6762 checkType(c,o,REDIS_HASH)) return;
6763
7fb16bac 6764 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
a86f14b1 6765}
6766
ada386b2 6767static void convertToRealHash(robj *o) {
6768 unsigned char *key, *val, *p, *zm = o->ptr;
6769 unsigned int klen, vlen;
6770 dict *dict = dictCreate(&hashDictType,NULL);
6771
6772 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6773 p = zipmapRewind(zm);
6774 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6775 robj *keyobj, *valobj;
6776
6777 keyobj = createStringObject((char*)key,klen);
6778 valobj = createStringObject((char*)val,vlen);
05df7621 6779 keyobj = tryObjectEncoding(keyobj);
6780 valobj = tryObjectEncoding(valobj);
ada386b2 6781 dictAdd(dict,keyobj,valobj);
6782 }
6783 o->encoding = REDIS_ENCODING_HT;
6784 o->ptr = dict;
6785 zfree(zm);
6786}
6787
6b47e12e 6788/* ========================= Non type-specific commands ==================== */
6789
ed9b544e 6790static void flushdbCommand(redisClient *c) {
ca37e9cd 6791 server.dirty += dictSize(c->db->dict);
9b30e1a2 6792 touchWatchedKeysOnFlush(c->db->id);
3305306f 6793 dictEmpty(c->db->dict);
6794 dictEmpty(c->db->expires);
ed9b544e 6795 addReply(c,shared.ok);
ed9b544e 6796}
6797
6798static void flushallCommand(redisClient *c) {
9b30e1a2 6799 touchWatchedKeysOnFlush(-1);
ca37e9cd 6800 server.dirty += emptyDb();
ed9b544e 6801 addReply(c,shared.ok);
500ece7c 6802 if (server.bgsavechildpid != -1) {
6803 kill(server.bgsavechildpid,SIGKILL);
6804 rdbRemoveTempFile(server.bgsavechildpid);
6805 }
f78fd11b 6806 rdbSave(server.dbfilename);
ca37e9cd 6807 server.dirty++;
ed9b544e 6808}
6809
56906eef 6810static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 6811 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 6812 so->type = type;
6813 so->pattern = pattern;
6814 return so;
6815}
6816
6817/* Return the value associated to the key with a name obtained
55017f9d
PN
6818 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6819 * The returned object will always have its refcount increased by 1
6820 * when it is non-NULL. */
56906eef 6821static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6d7d1370 6822 char *p, *f;
ed9b544e 6823 sds spat, ssub;
6d7d1370
PN
6824 robj keyobj, fieldobj, *o;
6825 int prefixlen, sublen, postfixlen, fieldlen;
ed9b544e 6826 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6827 struct {
f1017b3f 6828 long len;
6829 long free;
ed9b544e 6830 char buf[REDIS_SORTKEY_MAX+1];
6d7d1370 6831 } keyname, fieldname;
ed9b544e 6832
28173a49 6833 /* If the pattern is "#" return the substitution object itself in order
6834 * to implement the "SORT ... GET #" feature. */
6835 spat = pattern->ptr;
6836 if (spat[0] == '#' && spat[1] == '\0') {
55017f9d 6837 incrRefCount(subst);
28173a49 6838 return subst;
6839 }
6840
6841 /* The substitution object may be specially encoded. If so we create
9d65a1bb 6842 * a decoded object on the fly. Otherwise getDecodedObject will just
6843 * increment the ref count, that we'll decrement later. */
6844 subst = getDecodedObject(subst);
942a3961 6845
ed9b544e 6846 ssub = subst->ptr;
6847 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6848 p = strchr(spat,'*');
ed5a857a 6849 if (!p) {
6850 decrRefCount(subst);
6851 return NULL;
6852 }
ed9b544e 6853
6d7d1370
PN
6854 /* Find out if we're dealing with a hash dereference. */
6855 if ((f = strstr(p+1, "->")) != NULL) {
6856 fieldlen = sdslen(spat)-(f-spat);
6857 /* this also copies \0 character */
6858 memcpy(fieldname.buf,f+2,fieldlen-1);
6859 fieldname.len = fieldlen-2;
6860 } else {
6861 fieldlen = 0;
6862 }
6863
ed9b544e 6864 prefixlen = p-spat;
6865 sublen = sdslen(ssub);
6d7d1370 6866 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
ed9b544e 6867 memcpy(keyname.buf,spat,prefixlen);
6868 memcpy(keyname.buf+prefixlen,ssub,sublen);
6869 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6870 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6871 keyname.len = prefixlen+sublen+postfixlen;
942a3961 6872 decrRefCount(subst);
6873
6d7d1370
PN
6874 /* Lookup substituted key */
6875 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6876 o = lookupKeyRead(db,&keyobj);
55017f9d
PN
6877 if (o == NULL) return NULL;
6878
6879 if (fieldlen > 0) {
6880 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6d7d1370 6881
705dad38
PN
6882 /* Retrieve value from hash by the field name. This operation
6883 * already increases the refcount of the returned object. */
6d7d1370
PN
6884 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6885 o = hashGet(o, &fieldobj);
705dad38 6886 } else {
55017f9d 6887 if (o->type != REDIS_STRING) return NULL;
b6f07345 6888
705dad38
PN
6889 /* Every object that this function returns needs to have its refcount
6890 * increased. sortCommand decreases it again. */
6891 incrRefCount(o);
6d7d1370
PN
6892 }
6893
6894 return o;
ed9b544e 6895}
6896
6897/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6898 * the additional parameter is not standard but a BSD-specific we have to
6899 * pass sorting parameters via the global 'server' structure */
6900static int sortCompare(const void *s1, const void *s2) {
6901 const redisSortObject *so1 = s1, *so2 = s2;
6902 int cmp;
6903
6904 if (!server.sort_alpha) {
6905 /* Numeric sorting. Here it's trivial as we precomputed scores */
6906 if (so1->u.score > so2->u.score) {
6907 cmp = 1;
6908 } else if (so1->u.score < so2->u.score) {
6909 cmp = -1;
6910 } else {
6911 cmp = 0;
6912 }
6913 } else {
6914 /* Alphanumeric sorting */
6915 if (server.sort_bypattern) {
6916 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6917 /* At least one compare object is NULL */
6918 if (so1->u.cmpobj == so2->u.cmpobj)
6919 cmp = 0;
6920 else if (so1->u.cmpobj == NULL)
6921 cmp = -1;
6922 else
6923 cmp = 1;
6924 } else {
6925 /* We have both the objects, use strcoll */
6926 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6927 }
6928 } else {
08ee9b57 6929 /* Compare elements directly. */
6930 cmp = compareStringObjects(so1->obj,so2->obj);
ed9b544e 6931 }
6932 }
6933 return server.sort_desc ? -cmp : cmp;
6934}
6935
6936/* The SORT command is the most complex command in Redis. Warning: this code
6937 * is optimized for speed and a bit less for readability */
6938static void sortCommand(redisClient *c) {
ed9b544e 6939 list *operations;
6940 int outputlen = 0;
6941 int desc = 0, alpha = 0;
6942 int limit_start = 0, limit_count = -1, start, end;
6943 int j, dontsort = 0, vectorlen;
6944 int getop = 0; /* GET operation counter */
443c6409 6945 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 6946 redisSortObject *vector; /* Resulting vector to sort */
6947
6948 /* Lookup the key to sort. It must be of the right types */
3305306f 6949 sortval = lookupKeyRead(c->db,c->argv[1]);
6950 if (sortval == NULL) {
4e27f268 6951 addReply(c,shared.emptymultibulk);
ed9b544e 6952 return;
6953 }
a5eb649b 6954 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6955 sortval->type != REDIS_ZSET)
6956 {
c937aa89 6957 addReply(c,shared.wrongtypeerr);
ed9b544e 6958 return;
6959 }
6960
6961 /* Create a list of operations to perform for every sorted element.
6962 * Operations can be GET/DEL/INCR/DECR */
6963 operations = listCreate();
092dac2a 6964 listSetFreeMethod(operations,zfree);
ed9b544e 6965 j = 2;
6966
6967 /* Now we need to protect sortval incrementing its count, in the future
6968 * SORT may have options able to overwrite/delete keys during the sorting
6969 * and the sorted key itself may get destroied */
6970 incrRefCount(sortval);
6971
6972 /* The SORT command has an SQL-alike syntax, parse it */
6973 while(j < c->argc) {
6974 int leftargs = c->argc-j-1;
6975 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6976 desc = 0;
6977 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6978 desc = 1;
6979 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6980 alpha = 1;
6981 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6982 limit_start = atoi(c->argv[j+1]->ptr);
6983 limit_count = atoi(c->argv[j+2]->ptr);
6984 j+=2;
443c6409 6985 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6986 storekey = c->argv[j+1];
6987 j++;
ed9b544e 6988 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6989 sortby = c->argv[j+1];
6990 /* If the BY pattern does not contain '*', i.e. it is constant,
6991 * we don't need to sort nor to lookup the weight keys. */
6992 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6993 j++;
6994 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6995 listAddNodeTail(operations,createSortOperation(
6996 REDIS_SORT_GET,c->argv[j+1]));
6997 getop++;
6998 j++;
ed9b544e 6999 } else {
7000 decrRefCount(sortval);
7001 listRelease(operations);
c937aa89 7002 addReply(c,shared.syntaxerr);
ed9b544e 7003 return;
7004 }
7005 j++;
7006 }
7007
7008 /* Load the sorting vector with all the objects to sort */
a5eb649b 7009 switch(sortval->type) {
7010 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
7011 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7012 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
f83c6cb5 7013 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
a5eb649b 7014 }
ed9b544e 7015 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 7016 j = 0;
a5eb649b 7017
ed9b544e 7018 if (sortval->type == REDIS_LIST) {
7019 list *list = sortval->ptr;
6208b3a7 7020 listNode *ln;
c7df85a4 7021 listIter li;
6208b3a7 7022
c7df85a4 7023 listRewind(list,&li);
7024 while((ln = listNext(&li))) {
ed9b544e 7025 robj *ele = ln->value;
7026 vector[j].obj = ele;
7027 vector[j].u.score = 0;
7028 vector[j].u.cmpobj = NULL;
ed9b544e 7029 j++;
7030 }
7031 } else {
a5eb649b 7032 dict *set;
ed9b544e 7033 dictIterator *di;
7034 dictEntry *setele;
7035
a5eb649b 7036 if (sortval->type == REDIS_SET) {
7037 set = sortval->ptr;
7038 } else {
7039 zset *zs = sortval->ptr;
7040 set = zs->dict;
7041 }
7042
ed9b544e 7043 di = dictGetIterator(set);
ed9b544e 7044 while((setele = dictNext(di)) != NULL) {
7045 vector[j].obj = dictGetEntryKey(setele);
7046 vector[j].u.score = 0;
7047 vector[j].u.cmpobj = NULL;
7048 j++;
7049 }
7050 dictReleaseIterator(di);
7051 }
dfc5e96c 7052 redisAssert(j == vectorlen);
ed9b544e 7053
7054 /* Now it's time to load the right scores in the sorting vector */
7055 if (dontsort == 0) {
7056 for (j = 0; j < vectorlen; j++) {
6d7d1370 7057 robj *byval;
ed9b544e 7058 if (sortby) {
6d7d1370 7059 /* lookup value to sort by */
3305306f 7060 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
705dad38 7061 if (!byval) continue;
ed9b544e 7062 } else {
6d7d1370
PN
7063 /* use object itself to sort by */
7064 byval = vector[j].obj;
7065 }
7066
7067 if (alpha) {
08ee9b57 7068 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6d7d1370
PN
7069 } else {
7070 if (byval->encoding == REDIS_ENCODING_RAW) {
7071 vector[j].u.score = strtod(byval->ptr,NULL);
16fa22f1 7072 } else if (byval->encoding == REDIS_ENCODING_INT) {
6d7d1370
PN
7073 /* Don't need to decode the object if it's
7074 * integer-encoded (the only encoding supported) so
7075 * far. We can just cast it */
16fa22f1
PN
7076 vector[j].u.score = (long)byval->ptr;
7077 } else {
7078 redisAssert(1 != 1);
942a3961 7079 }
ed9b544e 7080 }
6d7d1370 7081
705dad38
PN
7082 /* when the object was retrieved using lookupKeyByPattern,
7083 * its refcount needs to be decreased. */
7084 if (sortby) {
7085 decrRefCount(byval);
ed9b544e 7086 }
7087 }
7088 }
7089
7090 /* We are ready to sort the vector... perform a bit of sanity check
7091 * on the LIMIT option too. We'll use a partial version of quicksort. */
7092 start = (limit_start < 0) ? 0 : limit_start;
7093 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7094 if (start >= vectorlen) {
7095 start = vectorlen-1;
7096 end = vectorlen-2;
7097 }
7098 if (end >= vectorlen) end = vectorlen-1;
7099
7100 if (dontsort == 0) {
7101 server.sort_desc = desc;
7102 server.sort_alpha = alpha;
7103 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 7104 if (sortby && (start != 0 || end != vectorlen-1))
7105 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7106 else
7107 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 7108 }
7109
7110 /* Send command output to the output buffer, performing the specified
7111 * GET/DEL/INCR/DECR operations if any. */
7112 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 7113 if (storekey == NULL) {
7114 /* STORE option not specified, sent the sorting result to client */
7115 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7116 for (j = start; j <= end; j++) {
7117 listNode *ln;
c7df85a4 7118 listIter li;
7119
dd88747b 7120 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 7121 listRewind(operations,&li);
7122 while((ln = listNext(&li))) {
443c6409 7123 redisSortOperation *sop = ln->value;
7124 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7125 vector[j].obj);
7126
7127 if (sop->type == REDIS_SORT_GET) {
55017f9d 7128 if (!val) {
443c6409 7129 addReply(c,shared.nullbulk);
7130 } else {
dd88747b 7131 addReplyBulk(c,val);
55017f9d 7132 decrRefCount(val);
443c6409 7133 }
7134 } else {
dfc5e96c 7135 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 7136 }
7137 }
ed9b544e 7138 }
443c6409 7139 } else {
7140 robj *listObject = createListObject();
7141 list *listPtr = (list*) listObject->ptr;
7142
7143 /* STORE option specified, set the sorting result as a List object */
7144 for (j = start; j <= end; j++) {
7145 listNode *ln;
c7df85a4 7146 listIter li;
7147
443c6409 7148 if (!getop) {
7149 listAddNodeTail(listPtr,vector[j].obj);
7150 incrRefCount(vector[j].obj);
7151 }
c7df85a4 7152 listRewind(operations,&li);
7153 while((ln = listNext(&li))) {
443c6409 7154 redisSortOperation *sop = ln->value;
7155 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7156 vector[j].obj);
7157
7158 if (sop->type == REDIS_SORT_GET) {
55017f9d 7159 if (!val) {
443c6409 7160 listAddNodeTail(listPtr,createStringObject("",0));
7161 } else {
55017f9d
PN
7162 /* We should do a incrRefCount on val because it is
7163 * added to the list, but also a decrRefCount because
7164 * it is returned by lookupKeyByPattern. This results
7165 * in doing nothing at all. */
443c6409 7166 listAddNodeTail(listPtr,val);
443c6409 7167 }
ed9b544e 7168 } else {
dfc5e96c 7169 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 7170 }
ed9b544e 7171 }
ed9b544e 7172 }
121796f7 7173 if (dictReplace(c->db->dict,storekey,listObject)) {
7174 incrRefCount(storekey);
7175 }
443c6409 7176 /* Note: we add 1 because the DB is dirty anyway since even if the
7177 * SORT result is empty a new key is set and maybe the old content
7178 * replaced. */
7179 server.dirty += 1+outputlen;
7180 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 7181 }
7182
7183 /* Cleanup */
7184 decrRefCount(sortval);
7185 listRelease(operations);
7186 for (j = 0; j < vectorlen; j++) {
16fa22f1 7187 if (alpha && vector[j].u.cmpobj)
ed9b544e 7188 decrRefCount(vector[j].u.cmpobj);
7189 }
7190 zfree(vector);
7191}
7192
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' receives the NUL terminated result and is always written: values of
 * 1 TiB and above are rendered with a 'T' suffix instead of leaving the
 * buffer untouched. The longest possible output ("16777216.00T") needs 13
 * bytes including the terminator, so 's' must be at least that large. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* Terabytes and beyond */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
7213
1c85b79f 7214/* Create the string returned by the INFO command. This is decoupled
7215 * by the INFO command itself as we need to report the same information
7216 * on memory corruption problems. */
7217static sds genRedisInfoString(void) {
ed9b544e 7218 sds info;
7219 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 7220 int j;
ec6c7a1d 7221 char hmem[64];
55a8298f 7222
b72f6a4b 7223 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 7224 info = sdscatprintf(sdsempty(),
7225 "redis_version:%s\r\n"
5436146c
PN
7226 "redis_git_sha1:%s\r\n"
7227 "redis_git_dirty:%d\r\n"
f1017b3f 7228 "arch_bits:%s\r\n"
7a932b74 7229 "multiplexing_api:%s\r\n"
0d7170a4 7230 "process_id:%ld\r\n"
682ac724 7231 "uptime_in_seconds:%ld\r\n"
7232 "uptime_in_days:%ld\r\n"
ed9b544e 7233 "connected_clients:%d\r\n"
7234 "connected_slaves:%d\r\n"
f86a74e9 7235 "blocked_clients:%d\r\n"
5fba9f71 7236 "used_memory:%zu\r\n"
ec6c7a1d 7237 "used_memory_human:%s\r\n"
ed9b544e 7238 "changes_since_last_save:%lld\r\n"
be2bb6b0 7239 "bgsave_in_progress:%d\r\n"
682ac724 7240 "last_save_time:%ld\r\n"
b3fad521 7241 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 7242 "total_connections_received:%lld\r\n"
7243 "total_commands_processed:%lld\r\n"
2a6a2ed1 7244 "expired_keys:%lld\r\n"
3be2c9d7 7245 "hash_max_zipmap_entries:%zu\r\n"
7246 "hash_max_zipmap_value:%zu\r\n"
ffc6b7f8 7247 "pubsub_channels:%ld\r\n"
7248 "pubsub_patterns:%u\r\n"
7d98e08c 7249 "vm_enabled:%d\r\n"
a0f643ea 7250 "role:%s\r\n"
ed9b544e 7251 ,REDIS_VERSION,
5436146c 7252 REDIS_GIT_SHA1,
274e45e3 7253 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
f1017b3f 7254 (sizeof(long) == 8) ? "64" : "32",
7a932b74 7255 aeGetApiName(),
0d7170a4 7256 (long) getpid(),
a0f643ea 7257 uptime,
7258 uptime/(3600*24),
ed9b544e 7259 listLength(server.clients)-listLength(server.slaves),
7260 listLength(server.slaves),
d5d55fc3 7261 server.blpop_blocked_clients,
b72f6a4b 7262 zmalloc_used_memory(),
ec6c7a1d 7263 hmem,
ed9b544e 7264 server.dirty,
9d65a1bb 7265 server.bgsavechildpid != -1,
ed9b544e 7266 server.lastsave,
b3fad521 7267 server.bgrewritechildpid != -1,
ed9b544e 7268 server.stat_numconnections,
7269 server.stat_numcommands,
2a6a2ed1 7270 server.stat_expiredkeys,
55a8298f 7271 server.hash_max_zipmap_entries,
7272 server.hash_max_zipmap_value,
ffc6b7f8 7273 dictSize(server.pubsub_channels),
7274 listLength(server.pubsub_patterns),
7d98e08c 7275 server.vm_enabled != 0,
a0f643ea 7276 server.masterhost == NULL ? "master" : "slave"
ed9b544e 7277 );
a0f643ea 7278 if (server.masterhost) {
7279 info = sdscatprintf(info,
7280 "master_host:%s\r\n"
7281 "master_port:%d\r\n"
7282 "master_link_status:%s\r\n"
7283 "master_last_io_seconds_ago:%d\r\n"
7284 ,server.masterhost,
7285 server.masterport,
7286 (server.replstate == REDIS_REPL_CONNECTED) ?
7287 "up" : "down",
f72b934d 7288 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 7289 );
7290 }
7d98e08c 7291 if (server.vm_enabled) {
1064ef87 7292 lockThreadedIO();
7d98e08c 7293 info = sdscatprintf(info,
7294 "vm_conf_max_memory:%llu\r\n"
7295 "vm_conf_page_size:%llu\r\n"
7296 "vm_conf_pages:%llu\r\n"
7297 "vm_stats_used_pages:%llu\r\n"
7298 "vm_stats_swapped_objects:%llu\r\n"
7299 "vm_stats_swappin_count:%llu\r\n"
7300 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 7301 "vm_stats_io_newjobs_len:%lu\r\n"
7302 "vm_stats_io_processing_len:%lu\r\n"
7303 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 7304 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 7305 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 7306 ,(unsigned long long) server.vm_max_memory,
7307 (unsigned long long) server.vm_page_size,
7308 (unsigned long long) server.vm_pages,
7309 (unsigned long long) server.vm_stats_used_pages,
7310 (unsigned long long) server.vm_stats_swapped_objects,
7311 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 7312 (unsigned long long) server.vm_stats_swapouts,
7313 (unsigned long) listLength(server.io_newjobs),
7314 (unsigned long) listLength(server.io_processing),
7315 (unsigned long) listLength(server.io_processed),
d5d55fc3 7316 (unsigned long) server.io_active_threads,
7317 (unsigned long) server.vm_blocked_clients
7d98e08c 7318 );
1064ef87 7319 unlockThreadedIO();
7d98e08c 7320 }
c3cb078d 7321 for (j = 0; j < server.dbnum; j++) {
7322 long long keys, vkeys;
7323
7324 keys = dictSize(server.db[j].dict);
7325 vkeys = dictSize(server.db[j].expires);
7326 if (keys || vkeys) {
9d65a1bb 7327 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 7328 j, keys, vkeys);
7329 }
7330 }
1c85b79f 7331 return info;
7332}
7333
7334static void infoCommand(redisClient *c) {
7335 sds info = genRedisInfoString();
83c6a618 7336 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7337 (unsigned long)sdslen(info)));
ed9b544e 7338 addReplySds(c,info);
70003d28 7339 addReply(c,shared.crlf);
ed9b544e 7340}
7341
3305306f 7342static void monitorCommand(redisClient *c) {
7343 /* ignore MONITOR if aleady slave or in monitor mode */
7344 if (c->flags & REDIS_SLAVE) return;
7345
7346 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7347 c->slaveseldb = 0;
6b47e12e 7348 listAddNodeTail(server.monitors,c);
3305306f 7349 addReply(c,shared.ok);
7350}
7351
7352/* ================================= Expire ================================= */
7353static int removeExpire(redisDb *db, robj *key) {
7354 if (dictDelete(db->expires,key) == DICT_OK) {
7355 return 1;
7356 } else {
7357 return 0;
7358 }
7359}
7360
7361static int setExpire(redisDb *db, robj *key, time_t when) {
7362 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7363 return 0;
7364 } else {
7365 incrRefCount(key);
7366 return 1;
7367 }
7368}
7369
bb32ede5 7370/* Return the expire time of the specified key, or -1 if no expire
7371 * is associated with this key (i.e. the key is non volatile) */
7372static time_t getExpire(redisDb *db, robj *key) {
7373 dictEntry *de;
7374
7375 /* No expire? return ASAP */
7376 if (dictSize(db->expires) == 0 ||
7377 (de = dictFind(db->expires,key)) == NULL) return -1;
7378
7379 return (time_t) dictGetEntryVal(de);
7380}
7381
3305306f 7382static int expireIfNeeded(redisDb *db, robj *key) {
7383 time_t when;
7384 dictEntry *de;
7385
7386 /* No expire? return ASAP */
7387 if (dictSize(db->expires) == 0 ||
7388 (de = dictFind(db->expires,key)) == NULL) return 0;
7389
7390 /* Lookup the expire */
7391 when = (time_t) dictGetEntryVal(de);
7392 if (time(NULL) <= when) return 0;
7393
7394 /* Delete the key */
7395 dictDelete(db->expires,key);
2a6a2ed1 7396 server.stat_expiredkeys++;
3305306f 7397 return dictDelete(db->dict,key) == DICT_OK;
7398}
7399
7400static int deleteIfVolatile(redisDb *db, robj *key) {
7401 dictEntry *de;
7402
7403 /* No expire? return ASAP */
7404 if (dictSize(db->expires) == 0 ||
7405 (de = dictFind(db->expires,key)) == NULL) return 0;
7406
7407 /* Delete the key */
0c66a471 7408 server.dirty++;
2a6a2ed1 7409 server.stat_expiredkeys++;
3305306f 7410 dictDelete(db->expires,key);
7411 return dictDelete(db->dict,key) == DICT_OK;
7412}
7413
bbe025e0 7414static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 7415 dictEntry *de;
bbe025e0
AM
7416 time_t seconds;
7417
bd79a6bd 7418 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
bbe025e0
AM
7419
7420 seconds -= offset;
3305306f 7421
802e8373 7422 de = dictFind(c->db->dict,key);
3305306f 7423 if (de == NULL) {
7424 addReply(c,shared.czero);
7425 return;
7426 }
d4dd6556 7427 if (seconds <= 0) {
43e5ccdf 7428 if (deleteKey(c->db,key)) server.dirty++;
7429 addReply(c, shared.cone);
3305306f 7430 return;
7431 } else {
7432 time_t when = time(NULL)+seconds;
802e8373 7433 if (setExpire(c->db,key,when)) {
3305306f 7434 addReply(c,shared.cone);
77423026 7435 server.dirty++;
7436 } else {
3305306f 7437 addReply(c,shared.czero);
77423026 7438 }
3305306f 7439 return;
7440 }
7441}
7442
802e8373 7443static void expireCommand(redisClient *c) {
bbe025e0 7444 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 7445}
7446
7447static void expireatCommand(redisClient *c) {
bbe025e0 7448 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 7449}
7450
fd88489a 7451static void ttlCommand(redisClient *c) {
7452 time_t expire;
7453 int ttl = -1;
7454
7455 expire = getExpire(c->db,c->argv[1]);
7456 if (expire != -1) {
7457 ttl = (int) (expire-time(NULL));
7458 if (ttl < 0) ttl = -1;
7459 }
7460 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7461}
7462
6e469882 7463/* ================================ MULTI/EXEC ============================== */
7464
7465/* Client state initialization for MULTI/EXEC */
7466static void initClientMultiState(redisClient *c) {
7467 c->mstate.commands = NULL;
7468 c->mstate.count = 0;
7469}
7470
7471/* Release all the resources associated with MULTI/EXEC state */
7472static void freeClientMultiState(redisClient *c) {
7473 int j;
7474
7475 for (j = 0; j < c->mstate.count; j++) {
7476 int i;
7477 multiCmd *mc = c->mstate.commands+j;
7478
7479 for (i = 0; i < mc->argc; i++)
7480 decrRefCount(mc->argv[i]);
7481 zfree(mc->argv);
7482 }
7483 zfree(c->mstate.commands);
7484}
7485
7486/* Add a new command into the MULTI commands queue */
7487static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7488 multiCmd *mc;
7489 int j;
7490
7491 c->mstate.commands = zrealloc(c->mstate.commands,
7492 sizeof(multiCmd)*(c->mstate.count+1));
7493 mc = c->mstate.commands+c->mstate.count;
7494 mc->cmd = cmd;
7495 mc->argc = c->argc;
7496 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7497 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7498 for (j = 0; j < c->argc; j++)
7499 incrRefCount(mc->argv[j]);
7500 c->mstate.count++;
7501}
7502
7503static void multiCommand(redisClient *c) {
6531c94d 7504 if (c->flags & REDIS_MULTI) {
7505 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7506 return;
7507 }
6e469882 7508 c->flags |= REDIS_MULTI;
36c548f0 7509 addReply(c,shared.ok);
6e469882 7510}
7511
18b6cb76
DJ
7512static void discardCommand(redisClient *c) {
7513 if (!(c->flags & REDIS_MULTI)) {
7514 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7515 return;
7516 }
7517
7518 freeClientMultiState(c);
7519 initClientMultiState(c);
7520 c->flags &= (~REDIS_MULTI);
7521 addReply(c,shared.ok);
7522}
7523
66c8853f 7524/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7525 * implememntation for more information. */
7526static void execCommandReplicateMulti(redisClient *c) {
7527 struct redisCommand *cmd;
7528 robj *multistring = createStringObject("MULTI",5);
7529
7530 cmd = lookupCommand("multi");
7531 if (server.appendonly)
7532 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7533 if (listLength(server.slaves))
7534 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7535 decrRefCount(multistring);
7536}
7537
6e469882 7538static void execCommand(redisClient *c) {
7539 int j;
7540 robj **orig_argv;
7541 int orig_argc;
7542
7543 if (!(c->flags & REDIS_MULTI)) {
7544 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7545 return;
7546 }
7547
37ab76c9 7548 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7549 * A failed EXEC will return a multi bulk nil object. */
7550 if (c->flags & REDIS_DIRTY_CAS) {
7551 freeClientMultiState(c);
7552 initClientMultiState(c);
7553 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7554 unwatchAllKeys(c);
7555 addReply(c,shared.nullmultibulk);
7556 return;
7557 }
7558
66c8853f 7559 /* Replicate a MULTI request now that we are sure the block is executed.
7560 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7561 * both the AOF and the replication link will have the same consistency
7562 * and atomicity guarantees. */
7563 execCommandReplicateMulti(c);
7564
7565 /* Exec all the queued commands */
1ad4d316 7566 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
6e469882 7567 orig_argv = c->argv;
7568 orig_argc = c->argc;
7569 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7570 for (j = 0; j < c->mstate.count; j++) {
7571 c->argc = c->mstate.commands[j].argc;
7572 c->argv = c->mstate.commands[j].argv;
7573 call(c,c->mstate.commands[j].cmd);
7574 }
7575 c->argv = orig_argv;
7576 c->argc = orig_argc;
7577 freeClientMultiState(c);
7578 initClientMultiState(c);
1ad4d316 7579 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
66c8853f 7580 /* Make sure the EXEC command is always replicated / AOF, since we
7581 * always send the MULTI command (we can't know beforehand if the
7582 * next operations will contain at least a modification to the DB). */
7583 server.dirty++;
6e469882 7584}
7585
4409877e 7586/* =========================== Blocking Operations ========================= */
7587
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, using BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   when there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blocking_keys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
7616
7617/* Set a client in blocking mode for the specified key, with the specified
7618 * timeout */
b177fd30 7619static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 7620 dictEntry *de;
7621 list *l;
b177fd30 7622 int j;
4409877e 7623
37ab76c9 7624 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7625 c->blocking_keys_num = numkeys;
4409877e 7626 c->blockingto = timeout;
b177fd30 7627 for (j = 0; j < numkeys; j++) {
7628 /* Add the key in the client structure, to map clients -> keys */
37ab76c9 7629 c->blocking_keys[j] = keys[j];
b177fd30 7630 incrRefCount(keys[j]);
4409877e 7631
b177fd30 7632 /* And in the other "side", to map keys -> clients */
37ab76c9 7633 de = dictFind(c->db->blocking_keys,keys[j]);
b177fd30 7634 if (de == NULL) {
7635 int retval;
7636
7637 /* For every key we take a list of clients blocked for it */
7638 l = listCreate();
37ab76c9 7639 retval = dictAdd(c->db->blocking_keys,keys[j],l);
b177fd30 7640 incrRefCount(keys[j]);
7641 assert(retval == DICT_OK);
7642 } else {
7643 l = dictGetEntryVal(de);
7644 }
7645 listAddNodeTail(l,c);
4409877e 7646 }
b177fd30 7647 /* Mark the client as a blocked client */
4409877e 7648 c->flags |= REDIS_BLOCKED;
d5d55fc3 7649 server.blpop_blocked_clients++;
4409877e 7650}
7651
7652/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 7653static void unblockClientWaitingData(redisClient *c) {
4409877e 7654 dictEntry *de;
7655 list *l;
b177fd30 7656 int j;
4409877e 7657
37ab76c9 7658 assert(c->blocking_keys != NULL);
b177fd30 7659 /* The client may wait for multiple keys, so unblock it for every key. */
37ab76c9 7660 for (j = 0; j < c->blocking_keys_num; j++) {
b177fd30 7661 /* Remove this client from the list of clients waiting for this key. */
37ab76c9 7662 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
b177fd30 7663 assert(de != NULL);
7664 l = dictGetEntryVal(de);
7665 listDelNode(l,listSearchKey(l,c));
7666 /* If the list is empty we need to remove it to avoid wasting memory */
7667 if (listLength(l) == 0)
37ab76c9 7668 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
7669 decrRefCount(c->blocking_keys[j]);
b177fd30 7670 }
7671 /* Cleanup the client structure */
37ab76c9 7672 zfree(c->blocking_keys);
7673 c->blocking_keys = NULL;
4409877e 7674 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 7675 server.blpop_blocked_clients--;
5921aa36 7676 /* We want to process data if there is some command waiting
b0d8747d 7677 * in the input buffer. Note that this is safe even if
7678 * unblockClientWaitingData() gets called from freeClient() because
7679 * freeClient() will be smart enough to call this function
7680 * *after* c->querybuf was set to NULL. */
4409877e 7681 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7682}
7683
7684/* This should be called from any function PUSHing into lists.
7685 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7686 * 'ele' is the element pushed.
7687 *
7688 * If the function returns 0 there was no client waiting for a list push
7689 * against this key.
7690 *
7691 * If the function returns 1 there was a client waiting for a list push
7692 * against this key, the element was passed to this client thus it's not
7693 * needed to actually add it to the list and the caller should return asap. */
7694static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7695 struct dictEntry *de;
7696 redisClient *receiver;
7697 list *l;
7698 listNode *ln;
7699
37ab76c9 7700 de = dictFind(c->db->blocking_keys,key);
4409877e 7701 if (de == NULL) return 0;
7702 l = dictGetEntryVal(de);
7703 ln = listFirst(l);
7704 assert(ln != NULL);
7705 receiver = ln->value;
4409877e 7706
b177fd30 7707 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 7708 addReplyBulk(receiver,key);
7709 addReplyBulk(receiver,ele);
b0d8747d 7710 unblockClientWaitingData(receiver);
4409877e 7711 return 1;
7712}
7713
7714/* Blocking RPOP/LPOP */
7715static void blockingPopGenericCommand(redisClient *c, int where) {
7716 robj *o;
7717 time_t timeout;
b177fd30 7718 int j;
4409877e 7719
b177fd30 7720 for (j = 1; j < c->argc-1; j++) {
7721 o = lookupKeyWrite(c->db,c->argv[j]);
7722 if (o != NULL) {
7723 if (o->type != REDIS_LIST) {
7724 addReply(c,shared.wrongtypeerr);
4409877e 7725 return;
b177fd30 7726 } else {
7727 list *list = o->ptr;
7728 if (listLength(list) != 0) {
7729 /* If the list contains elements fall back to the usual
7730 * non-blocking POP operation */
7731 robj *argv[2], **orig_argv;
7732 int orig_argc;
e0a62c7f 7733
b177fd30 7734 /* We need to alter the command arguments before to call
7735 * popGenericCommand() as the command takes a single key. */
7736 orig_argv = c->argv;
7737 orig_argc = c->argc;
7738 argv[1] = c->argv[j];
7739 c->argv = argv;
7740 c->argc = 2;
7741
7742 /* Also the return value is different, we need to output
7743 * the multi bulk reply header and the key name. The
7744 * "real" command will add the last element (the value)
7745 * for us. If this souds like an hack to you it's just
7746 * because it is... */
7747 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 7748 addReplyBulk(c,argv[1]);
b177fd30 7749 popGenericCommand(c,where);
7750
7751 /* Fix the client structure with the original stuff */
7752 c->argv = orig_argv;
7753 c->argc = orig_argc;
7754 return;
7755 }
4409877e 7756 }
7757 }
7758 }
7759 /* If the list is empty or the key does not exists we must block */
b177fd30 7760 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 7761 if (timeout > 0) timeout += time(NULL);
b177fd30 7762 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 7763}
7764
7765static void blpopCommand(redisClient *c) {
7766 blockingPopGenericCommand(c,REDIS_HEAD);
7767}
7768
7769static void brpopCommand(redisClient *c) {
7770 blockingPopGenericCommand(c,REDIS_TAIL);
7771}
7772
ed9b544e 7773/* =============================== Replication ============================= */
7774
a4d1ba9a 7775static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7776 ssize_t nwritten, ret = size;
7777 time_t start = time(NULL);
7778
7779 timeout++;
7780 while(size) {
7781 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7782 nwritten = write(fd,ptr,size);
7783 if (nwritten == -1) return -1;
7784 ptr += nwritten;
7785 size -= nwritten;
7786 }
7787 if ((time(NULL)-start) > timeout) {
7788 errno = ETIMEDOUT;
7789 return -1;
7790 }
7791 }
7792 return ret;
7793}
7794
a4d1ba9a 7795static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7796 ssize_t nread, totread = 0;
7797 time_t start = time(NULL);
7798
7799 timeout++;
7800 while(size) {
7801 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7802 nread = read(fd,ptr,size);
7803 if (nread == -1) return -1;
7804 ptr += nread;
7805 size -= nread;
7806 totread += nread;
7807 }
7808 if ((time(NULL)-start) > timeout) {
7809 errno = ETIMEDOUT;
7810 return -1;
7811 }
7812 }
7813 return totread;
7814}
7815
/* Read a line from 'fd', one byte at a time via syncRead(), into the buffer
 * 'ptr' of 'size' bytes. 'timeout' is passed to every syncRead() call.
 *
 * The line ends at the first '\n', which is not stored; a '\r' right before
 * it is replaced by the string terminator (but is still counted in the
 * return value). The buffer is always NUL terminated and at most size-1
 * characters are stored: when the buffer fills up before a newline is
 * seen, the function returns what it has read so far.
 *
 * Returns the number of characters stored, or -1 if syncRead() fails or if
 * 'size' is not positive (errno set to EINVAL in the latter case). */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) {
        /* No room even for the terminator. */
        errno = EINVAL;
        return -1;
    }
    *ptr = '\0';

    size--; /* Reserve one byte for the terminator. */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* Account for the byte just stored, so that the loop stops
             * when the buffer is full. */
            size--;
        }
    }
    return nread;
}
7836
7837static void syncCommand(redisClient *c) {
40d224a9 7838 /* ignore SYNC if aleady slave or in monitor mode */
7839 if (c->flags & REDIS_SLAVE) return;
7840
7841 /* SYNC can't be issued when the server has pending data to send to
7842 * the client about already issued commands. We need a fresh reply
7843 * buffer registering the differences between the BGSAVE and the current
7844 * dataset, so that we can copy to other slaves if needed. */
7845 if (listLength(c->reply) != 0) {
7846 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7847 return;
7848 }
7849
7850 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7851 /* Here we need to check if there is a background saving operation
7852 * in progress, or if it is required to start one */
9d65a1bb 7853 if (server.bgsavechildpid != -1) {
40d224a9 7854 /* Ok a background save is in progress. Let's check if it is a good
7855 * one for replication, i.e. if there is another slave that is
7856 * registering differences since the server forked to save */
7857 redisClient *slave;
7858 listNode *ln;
c7df85a4 7859 listIter li;
40d224a9 7860
c7df85a4 7861 listRewind(server.slaves,&li);
7862 while((ln = listNext(&li))) {
40d224a9 7863 slave = ln->value;
7864 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 7865 }
7866 if (ln) {
7867 /* Perfect, the server is already registering differences for
7868 * another slave. Set the right state, and copy the buffer. */
7869 listRelease(c->reply);
7870 c->reply = listDup(slave->reply);
40d224a9 7871 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7872 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7873 } else {
7874 /* No way, we need to wait for the next BGSAVE in order to
7875 * register differences */
7876 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7877 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7878 }
7879 } else {
7880 /* Ok we don't have a BGSAVE in progress, let's start one */
7881 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7882 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7883 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7884 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7885 return;
7886 }
7887 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7888 }
6208b3a7 7889 c->repldbfd = -1;
40d224a9 7890 c->flags |= REDIS_SLAVE;
7891 c->slaveseldb = 0;
6b47e12e 7892 listAddNodeTail(server.slaves,c);
40d224a9 7893 return;
7894}
7895
6208b3a7 7896static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7897 redisClient *slave = privdata;
7898 REDIS_NOTUSED(el);
7899 REDIS_NOTUSED(mask);
7900 char buf[REDIS_IOBUF_LEN];
7901 ssize_t nwritten, buflen;
7902
7903 if (slave->repldboff == 0) {
7904 /* Write the bulk write count before to transfer the DB. In theory here
7905 * we don't know how much room there is in the output buffer of the
7906 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7907 * operations) will never be smaller than the few bytes we need. */
7908 sds bulkcount;
7909
7910 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7911 slave->repldbsize);
7912 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7913 {
7914 sdsfree(bulkcount);
7915 freeClient(slave);
7916 return;
7917 }
7918 sdsfree(bulkcount);
7919 }
7920 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7921 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7922 if (buflen <= 0) {
7923 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7924 (buflen == 0) ? "premature EOF" : strerror(errno));
7925 freeClient(slave);
7926 return;
7927 }
7928 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 7929 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 7930 strerror(errno));
7931 freeClient(slave);
7932 return;
7933 }
7934 slave->repldboff += nwritten;
7935 if (slave->repldboff == slave->repldbsize) {
7936 close(slave->repldbfd);
7937 slave->repldbfd = -1;
7938 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7939 slave->replstate = REDIS_REPL_ONLINE;
7940 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 7941 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 7942 freeClient(slave);
7943 return;
7944 }
7945 addReplySds(slave,sdsempty());
7946 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7947 }
7948}
ed9b544e 7949
a3b21203 7950/* This function is called at the end of every backgrond saving.
7951 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7952 * otherwise REDIS_ERR is passed to the function.
7953 *
7954 * The goal of this function is to handle slaves waiting for a successful
7955 * background saving in order to perform non-blocking synchronization. */
7956static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 7957 listNode *ln;
7958 int startbgsave = 0;
c7df85a4 7959 listIter li;
ed9b544e 7960
c7df85a4 7961 listRewind(server.slaves,&li);
7962 while((ln = listNext(&li))) {
6208b3a7 7963 redisClient *slave = ln->value;
ed9b544e 7964
6208b3a7 7965 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7966 startbgsave = 1;
7967 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7968 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 7969 struct redis_stat buf;
e0a62c7f 7970
6208b3a7 7971 if (bgsaveerr != REDIS_OK) {
7972 freeClient(slave);
7973 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7974 continue;
7975 }
7976 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 7977 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 7978 freeClient(slave);
7979 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7980 continue;
7981 }
7982 slave->repldboff = 0;
7983 slave->repldbsize = buf.st_size;
7984 slave->replstate = REDIS_REPL_SEND_BULK;
7985 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 7986 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 7987 freeClient(slave);
7988 continue;
7989 }
7990 }
ed9b544e 7991 }
6208b3a7 7992 if (startbgsave) {
7993 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 7994 listIter li;
7995
7996 listRewind(server.slaves,&li);
6208b3a7 7997 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 7998 while((ln = listNext(&li))) {
6208b3a7 7999 redisClient *slave = ln->value;
ed9b544e 8000
6208b3a7 8001 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8002 freeClient(slave);
8003 }
8004 }
8005 }
ed9b544e 8006}
8007
8008static int syncWithMaster(void) {
d0ccebcf 8009 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 8010 long dumpsize;
ed9b544e 8011 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 8012 int dfd, maxtries = 5;
ed9b544e 8013
8014 if (fd == -1) {
8015 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8016 strerror(errno));
8017 return REDIS_ERR;
8018 }
d0ccebcf 8019
8020 /* AUTH with the master if required. */
8021 if(server.masterauth) {
8022 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8023 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8024 close(fd);
8025 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8026 strerror(errno));
8027 return REDIS_ERR;
8028 }
8029 /* Read the AUTH result. */
8030 if (syncReadLine(fd,buf,1024,3600) == -1) {
8031 close(fd);
8032 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8033 strerror(errno));
8034 return REDIS_ERR;
8035 }
8036 if (buf[0] != '+') {
8037 close(fd);
8038 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8039 return REDIS_ERR;
8040 }
8041 }
8042
ed9b544e 8043 /* Issue the SYNC command */
8044 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8045 close(fd);
8046 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8047 strerror(errno));
8048 return REDIS_ERR;
8049 }
8050 /* Read the bulk write count */
8c4d91fc 8051 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 8052 close(fd);
8053 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8054 strerror(errno));
8055 return REDIS_ERR;
8056 }
4aa701c1 8057 if (buf[0] != '$') {
8058 close(fd);
8059 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8060 return REDIS_ERR;
8061 }
18e61fa2 8062 dumpsize = strtol(buf+1,NULL,10);
8063 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 8064 /* Read the bulk write data on a temp file */
8c5abee8 8065 while(maxtries--) {
8066 snprintf(tmpfile,256,
8067 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8068 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8069 if (dfd != -1) break;
5de9ad7c 8070 sleep(1);
8c5abee8 8071 }
ed9b544e 8072 if (dfd == -1) {
8073 close(fd);
8074 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8075 return REDIS_ERR;
8076 }
8077 while(dumpsize) {
8078 int nread, nwritten;
8079
8080 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8081 if (nread == -1) {
8082 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8083 strerror(errno));
8084 close(fd);
8085 close(dfd);
8086 return REDIS_ERR;
8087 }
8088 nwritten = write(dfd,buf,nread);
8089 if (nwritten == -1) {
8090 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8091 close(fd);
8092 close(dfd);
8093 return REDIS_ERR;
8094 }
8095 dumpsize -= nread;
8096 }
8097 close(dfd);
8098 if (rename(tmpfile,server.dbfilename) == -1) {
8099 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8100 unlink(tmpfile);
8101 close(fd);
8102 return REDIS_ERR;
8103 }
8104 emptyDb();
f78fd11b 8105 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 8106 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8107 close(fd);
8108 return REDIS_ERR;
8109 }
8110 server.master = createClient(fd);
8111 server.master->flags |= REDIS_MASTER;
179b3952 8112 server.master->authenticated = 1;
ed9b544e 8113 server.replstate = REDIS_REPL_CONNECTED;
8114 return REDIS_OK;
8115}
8116
321b0e13 8117static void slaveofCommand(redisClient *c) {
8118 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8119 !strcasecmp(c->argv[2]->ptr,"one")) {
8120 if (server.masterhost) {
8121 sdsfree(server.masterhost);
8122 server.masterhost = NULL;
8123 if (server.master) freeClient(server.master);
8124 server.replstate = REDIS_REPL_NONE;
8125 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8126 }
8127 } else {
8128 sdsfree(server.masterhost);
8129 server.masterhost = sdsdup(c->argv[1]->ptr);
8130 server.masterport = atoi(c->argv[2]->ptr);
8131 if (server.master) freeClient(server.master);
8132 server.replstate = REDIS_REPL_CONNECT;
8133 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8134 server.masterhost, server.masterport);
8135 }
8136 addReply(c,shared.ok);
8137}
8138
3fd78bcd 8139/* ============================ Maxmemory directive ======================== */
8140
a5819310 8141/* Try to free one object form the pre-allocated objects free list.
8142 * This is useful under low mem conditions as by default we take 1 million
8143 * free objects allocated. On success REDIS_OK is returned, otherwise
8144 * REDIS_ERR. */
8145static int tryFreeOneObjectFromFreelist(void) {
f870935d 8146 robj *o;
8147
a5819310 8148 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8149 if (listLength(server.objfreelist)) {
8150 listNode *head = listFirst(server.objfreelist);
8151 o = listNodeValue(head);
8152 listDelNode(server.objfreelist,head);
8153 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8154 zfree(o);
8155 return REDIS_OK;
8156 } else {
8157 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8158 return REDIS_ERR;
8159 }
f870935d 8160}
8161
3fd78bcd 8162/* This function gets called when 'maxmemory' is set on the config file to limit
8163 * the max memory used by the server, and we are out of memory.
8164 * This function will try to, in order:
8165 *
8166 * - Free objects from the free list
8167 * - Try to remove keys with an EXPIRE set
8168 *
8169 * It is not possible to free enough memory to reach used-memory < maxmemory
8170 * the server will start refusing commands that will enlarge even more the
8171 * memory usage.
8172 */
8173static void freeMemoryIfNeeded(void) {
8174 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 8175 int j, k, freed = 0;
8176
8177 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8178 for (j = 0; j < server.dbnum; j++) {
8179 int minttl = -1;
8180 robj *minkey = NULL;
8181 struct dictEntry *de;
8182
8183 if (dictSize(server.db[j].expires)) {
8184 freed = 1;
8185 /* From a sample of three keys drop the one nearest to
8186 * the natural expire */
8187 for (k = 0; k < 3; k++) {
8188 time_t t;
8189
8190 de = dictGetRandomKey(server.db[j].expires);
8191 t = (time_t) dictGetEntryVal(de);
8192 if (minttl == -1 || t < minttl) {
8193 minkey = dictGetEntryKey(de);
8194 minttl = t;
3fd78bcd 8195 }
3fd78bcd 8196 }
a5819310 8197 deleteKey(server.db+j,minkey);
3fd78bcd 8198 }
3fd78bcd 8199 }
a5819310 8200 if (!freed) return; /* nothing to free... */
3fd78bcd 8201 }
8202}
8203
f80dff62 8204/* ============================== Append Only file ========================== */
8205
28ed1f33 8206/* Write the append only file buffer on disk.
8207 *
8208 * Since we are required to write the AOF before replying to the client,
8209 * and the only way the client socket can get a write is entering when the
8210 * the event loop, we accumulate all the AOF writes in a memory
8211 * buffer and write it on disk using this function just before entering
8212 * the event loop again. */
8213static void flushAppendOnlyFile(void) {
8214 time_t now;
8215 ssize_t nwritten;
8216
8217 if (sdslen(server.aofbuf) == 0) return;
8218
8219 /* We want to perform a single write. This should be guaranteed atomic
8220 * at least if the filesystem we are writing is a real physical one.
8221 * While this will save us against the server being killed I don't think
8222 * there is much to do about the whole server stopping for power problems
8223 * or alike */
8224 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8225 if (nwritten != (signed)sdslen(server.aofbuf)) {
8226 /* Ooops, we are in troubles. The best thing to do for now is
8227 * aborting instead of giving the illusion that everything is
8228 * working as expected. */
8229 if (nwritten == -1) {
8230 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8231 } else {
8232 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8233 }
8234 exit(1);
8235 }
8236 sdsfree(server.aofbuf);
8237 server.aofbuf = sdsempty();
8238
8239 /* Fsync if needed */
8240 now = time(NULL);
8241 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8242 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8243 now-server.lastfsync > 1))
8244 {
8245 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8246 * flushing metadata. */
8247 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8248 server.lastfsync = now;
8249 }
8250}
8251
9376e434
PN
8252static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8253 int j;
8254 buf = sdscatprintf(buf,"*%d\r\n",argc);
8255 for (j = 0; j < argc; j++) {
8256 robj *o = getDecodedObject(argv[j]);
8257 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8258 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8259 buf = sdscatlen(buf,"\r\n",2);
8260 decrRefCount(o);
8261 }
8262 return buf;
8263}
8264
8265static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8266 int argc = 3;
8267 long when;
8268 robj *argv[3];
8269
8270 /* Make sure we can use strtol */
8271 seconds = getDecodedObject(seconds);
8272 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8273 decrRefCount(seconds);
8274
8275 argv[0] = createStringObject("EXPIREAT",8);
8276 argv[1] = key;
8277 argv[2] = createObject(REDIS_STRING,
8278 sdscatprintf(sdsempty(),"%ld",when));
8279 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8280 decrRefCount(argv[0]);
8281 decrRefCount(argv[2]);
8282 return buf;
8283}
8284
f80dff62 8285static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8286 sds buf = sdsempty();
f80dff62 8287 robj *tmpargv[3];
8288
8289 /* The DB this command was targetting is not the same as the last command
8290 * we appendend. To issue a SELECT command is needed. */
8291 if (dictid != server.appendseldb) {
8292 char seldb[64];
8293
8294 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 8295 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 8296 (unsigned long)strlen(seldb),seldb);
f80dff62 8297 server.appendseldb = dictid;
8298 }
8299
f80dff62 8300 if (cmd->proc == expireCommand) {
9376e434
PN
8301 /* Translate EXPIRE into EXPIREAT */
8302 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8303 } else if (cmd->proc == setexCommand) {
8304 /* Translate SETEX to SET and EXPIREAT */
8305 tmpargv[0] = createStringObject("SET",3);
f80dff62 8306 tmpargv[1] = argv[1];
9376e434
PN
8307 tmpargv[2] = argv[3];
8308 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8309 decrRefCount(tmpargv[0]);
8310 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8311 } else {
8312 buf = catAppendOnlyGenericCommand(buf,argc,argv);
f80dff62 8313 }
8314
28ed1f33 8315 /* Append to the AOF buffer. This will be flushed on disk just before
8316 * of re-entering the event loop, so before the client will get a
8317 * positive reply about the operation performed. */
8318 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8319
85a83172 8320 /* If a background append only file rewriting is in progress we want to
8321 * accumulate the differences between the child DB and the current one
8322 * in a buffer, so that when the child process will do its work we
8323 * can append the differences to the new append only file. */
8324 if (server.bgrewritechildpid != -1)
8325 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8326
8327 sdsfree(buf);
f80dff62 8328}
8329
8330/* In Redis commands are always executed in the context of a client, so in
8331 * order to load the append only file we need to create a fake client. */
8332static struct redisClient *createFakeClient(void) {
8333 struct redisClient *c = zmalloc(sizeof(*c));
8334
8335 selectDb(c,0);
8336 c->fd = -1;
8337 c->querybuf = sdsempty();
8338 c->argc = 0;
8339 c->argv = NULL;
8340 c->flags = 0;
9387d17d 8341 /* We set the fake client as a slave waiting for the synchronization
8342 * so that Redis will not try to send replies to this client. */
8343 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 8344 c->reply = listCreate();
8345 listSetFreeMethod(c->reply,decrRefCount);
8346 listSetDupMethod(c->reply,dupClientReplyValue);
4132ad8d 8347 initClientMultiState(c);
f80dff62 8348 return c;
8349}
8350
8351static void freeFakeClient(struct redisClient *c) {
8352 sdsfree(c->querybuf);
8353 listRelease(c->reply);
4132ad8d 8354 freeClientMultiState(c);
f80dff62 8355 zfree(c);
8356}
8357
8358/* Replay the append log file. On error REDIS_OK is returned. On non fatal
8359 * error (the append only file is zero-length) REDIS_ERR is returned. On
8360 * fatal error an error message is logged and the program exists. */
8361int loadAppendOnlyFile(char *filename) {
8362 struct redisClient *fakeClient;
8363 FILE *fp = fopen(filename,"r");
8364 struct redis_stat sb;
b492cf00 8365 unsigned long long loadedkeys = 0;
4132ad8d 8366 int appendonly = server.appendonly;
f80dff62 8367
8368 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8369 return REDIS_ERR;
8370
8371 if (fp == NULL) {
8372 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8373 exit(1);
8374 }
8375
4132ad8d
PN
8376 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8377 * to the same file we're about to read. */
8378 server.appendonly = 0;
8379
f80dff62 8380 fakeClient = createFakeClient();
8381 while(1) {
8382 int argc, j;
8383 unsigned long len;
8384 robj **argv;
8385 char buf[128];
8386 sds argsds;
8387 struct redisCommand *cmd;
8388
8389 if (fgets(buf,sizeof(buf),fp) == NULL) {
8390 if (feof(fp))
8391 break;
8392 else
8393 goto readerr;
8394 }
8395 if (buf[0] != '*') goto fmterr;
8396 argc = atoi(buf+1);
8397 argv = zmalloc(sizeof(robj*)*argc);
8398 for (j = 0; j < argc; j++) {
8399 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8400 if (buf[0] != '$') goto fmterr;
8401 len = strtol(buf+1,NULL,10);
8402 argsds = sdsnewlen(NULL,len);
0f151ef1 8403 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 8404 argv[j] = createObject(REDIS_STRING,argsds);
8405 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8406 }
8407
8408 /* Command lookup */
8409 cmd = lookupCommand(argv[0]->ptr);
8410 if (!cmd) {
8411 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8412 exit(1);
8413 }
bdcb92f2 8414 /* Try object encoding */
f80dff62 8415 if (cmd->flags & REDIS_CMD_BULK)
05df7621 8416 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 8417 /* Run the command in the context of a fake client */
8418 fakeClient->argc = argc;
8419 fakeClient->argv = argv;
8420 cmd->proc(fakeClient);
8421 /* Discard the reply objects list from the fake client */
8422 while(listLength(fakeClient->reply))
8423 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8424 /* Clean up, ready for the next command */
8425 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8426 zfree(argv);
b492cf00 8427 /* Handle swapping while loading big datasets when VM is on */
8428 loadedkeys++;
8429 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8430 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 8431 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 8432 }
8433 }
f80dff62 8434 }
4132ad8d
PN
8435
8436 /* This point can only be reached when EOF is reached without errors.
8437 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8438 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8439
f80dff62 8440 fclose(fp);
8441 freeFakeClient(fakeClient);
4132ad8d 8442 server.appendonly = appendonly;
f80dff62 8443 return REDIS_OK;
8444
8445readerr:
8446 if (feof(fp)) {
8447 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8448 } else {
8449 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8450 }
8451 exit(1);
8452fmterr:
8453 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8454 exit(1);
8455}
8456
9d65a1bb 8457/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
9c8e3cee 8458static int fwriteBulkObject(FILE *fp, robj *obj) {
9d65a1bb 8459 char buf[128];
b9bc0eef 8460 int decrrc = 0;
8461
f2d9f50f 8462 /* Avoid the incr/decr ref count business if possible to help
8463 * copy-on-write (we are often in a child process when this function
8464 * is called).
8465 * Also makes sure that key objects don't get incrRefCount-ed when VM
8466 * is enabled */
8467 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 8468 obj = getDecodedObject(obj);
8469 decrrc = 1;
8470 }
9d65a1bb 8471 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8472 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 8473 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8474 goto err;
9d65a1bb 8475 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 8476 if (decrrc) decrRefCount(obj);
9d65a1bb 8477 return 1;
8478err:
b9bc0eef 8479 if (decrrc) decrRefCount(obj);
9d65a1bb 8480 return 0;
8481}
8482
/* Write a binary-safe string into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to the payload and 'len' is its length in bytes; the payload
 * write is skipped when 'len' is zero. Returns 1 on success, 0 on write
 * error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: use the matching conversion specifier. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8494
/* Write the double 'd', formatted with "%.17g", into 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    unsigned long digits;

    /* The payload already carries its trailing CRLF, which must not be
     * counted in the length advertised by the header. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    digits = (unsigned long)strlen(payload)-2;
    snprintf(header,sizeof(header),"$%lu\r\n",digits);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,strlen(payload),1,fp) == 0) return 0;
    return 1;
}
8505
/* Write the long 'l', in decimal, into 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    unsigned long digits;

    /* The payload already carries its trailing CRLF, which must not be
     * counted in the length advertised by the header. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    digits = (unsigned long)strlen(payload)-2;
    snprintf(header,sizeof(header),"$%lu\r\n",digits);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,strlen(payload),1,fp) == 0) return 0;
    return 1;
}
8516
8517/* Write a sequence of commands able to fully rebuild the dataset into
8518 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8519static int rewriteAppendOnlyFile(char *filename) {
8520 dictIterator *di = NULL;
8521 dictEntry *de;
8522 FILE *fp;
8523 char tmpfile[256];
8524 int j;
8525 time_t now = time(NULL);
8526
8527 /* Note that we have to use a different temp name here compared to the
8528 * one used by rewriteAppendOnlyFileBackground() function. */
8529 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8530 fp = fopen(tmpfile,"w");
8531 if (!fp) {
8532 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8533 return REDIS_ERR;
8534 }
8535 for (j = 0; j < server.dbnum; j++) {
8536 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8537 redisDb *db = server.db+j;
8538 dict *d = db->dict;
8539 if (dictSize(d) == 0) continue;
8540 di = dictGetIterator(d);
8541 if (!di) {
8542 fclose(fp);
8543 return REDIS_ERR;
8544 }
8545
8546 /* SELECT the new DB */
8547 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 8548 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 8549
8550 /* Iterate this DB writing every entry */
8551 while((de = dictNext(di)) != NULL) {
e7546c63 8552 robj *key, *o;
8553 time_t expiretime;
8554 int swapped;
8555
8556 key = dictGetEntryKey(de);
b9bc0eef 8557 /* If the value for this key is swapped, load a preview in memory.
8558 * We use a "swapped" flag to remember if we need to free the
8559 * value object instead to just increment the ref count anyway
8560 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 8561 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8562 key->storage == REDIS_VM_SWAPPING) {
e7546c63 8563 o = dictGetEntryVal(de);
8564 swapped = 0;
8565 } else {
8566 o = vmPreviewObject(key);
e7546c63 8567 swapped = 1;
8568 }
8569 expiretime = getExpire(db,key);
9d65a1bb 8570
8571 /* Save the key and associated value */
9d65a1bb 8572 if (o->type == REDIS_STRING) {
8573 /* Emit a SET command */
8574 char cmd[]="*3\r\n$3\r\nSET\r\n";
8575 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8576 /* Key and value */
9c8e3cee 8577 if (fwriteBulkObject(fp,key) == 0) goto werr;
8578 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 8579 } else if (o->type == REDIS_LIST) {
8580 /* Emit the RPUSHes needed to rebuild the list */
8581 list *list = o->ptr;
8582 listNode *ln;
c7df85a4 8583 listIter li;
9d65a1bb 8584
c7df85a4 8585 listRewind(list,&li);
8586 while((ln = listNext(&li))) {
9d65a1bb 8587 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8588 robj *eleobj = listNodeValue(ln);
8589
8590 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8591 if (fwriteBulkObject(fp,key) == 0) goto werr;
8592 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8593 }
8594 } else if (o->type == REDIS_SET) {
8595 /* Emit the SADDs needed to rebuild the set */
8596 dict *set = o->ptr;
8597 dictIterator *di = dictGetIterator(set);
8598 dictEntry *de;
8599
8600 while((de = dictNext(di)) != NULL) {
8601 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8602 robj *eleobj = dictGetEntryKey(de);
8603
8604 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8605 if (fwriteBulkObject(fp,key) == 0) goto werr;
8606 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8607 }
8608 dictReleaseIterator(di);
8609 } else if (o->type == REDIS_ZSET) {
8610 /* Emit the ZADDs needed to rebuild the sorted set */
8611 zset *zs = o->ptr;
8612 dictIterator *di = dictGetIterator(zs->dict);
8613 dictEntry *de;
8614
8615 while((de = dictNext(di)) != NULL) {
8616 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8617 robj *eleobj = dictGetEntryKey(de);
8618 double *score = dictGetEntryVal(de);
8619
8620 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8621 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8622 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 8623 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8624 }
8625 dictReleaseIterator(di);
9c8e3cee 8626 } else if (o->type == REDIS_HASH) {
8627 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8628
8629 /* Emit the HSETs needed to rebuild the hash */
8630 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8631 unsigned char *p = zipmapRewind(o->ptr);
8632 unsigned char *field, *val;
8633 unsigned int flen, vlen;
8634
8635 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8636 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8637 if (fwriteBulkObject(fp,key) == 0) goto werr;
8638 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8639 return -1;
8640 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8641 return -1;
8642 }
8643 } else {
8644 dictIterator *di = dictGetIterator(o->ptr);
8645 dictEntry *de;
8646
8647 while((de = dictNext(di)) != NULL) {
8648 robj *field = dictGetEntryKey(de);
8649 robj *val = dictGetEntryVal(de);
8650
8651 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8652 if (fwriteBulkObject(fp,key) == 0) goto werr;
8653 if (fwriteBulkObject(fp,field) == -1) return -1;
8654 if (fwriteBulkObject(fp,val) == -1) return -1;
8655 }
8656 dictReleaseIterator(di);
8657 }
9d65a1bb 8658 } else {
f83c6cb5 8659 redisPanic("Unknown object type");
9d65a1bb 8660 }
8661 /* Save the expire time */
8662 if (expiretime != -1) {
e96e4fbf 8663 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 8664 /* If this key is already expired skip it */
8665 if (expiretime < now) continue;
8666 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8667 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8668 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8669 }
b9bc0eef 8670 if (swapped) decrRefCount(o);
9d65a1bb 8671 }
8672 dictReleaseIterator(di);
8673 }
8674
8675 /* Make sure data will not remain on the OS's output buffers */
8676 fflush(fp);
8677 fsync(fileno(fp));
8678 fclose(fp);
e0a62c7f 8679
9d65a1bb 8680 /* Use RENAME to make sure the DB file is changed atomically only
8681 * if the generate DB file is ok. */
8682 if (rename(tmpfile,filename) == -1) {
8683 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8684 unlink(tmpfile);
8685 return REDIS_ERR;
8686 }
8687 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8688 return REDIS_OK;
8689
8690werr:
8691 fclose(fp);
8692 unlink(tmpfile);
e96e4fbf 8693 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 8694 if (di) dictReleaseIterator(di);
8695 return REDIS_ERR;
8696}
8697
8698/* This is how rewriting of the append only file in background works:
8699 *
8700 * 1) The user calls BGREWRITEAOF
8701 * 2) Redis calls this function, that forks():
8702 * 2a) the child rewrite the append only file in a temp file.
8703 * 2b) the parent accumulates differences in server.bgrewritebuf.
8704 * 3) When the child finished '2a' exists.
8705 * 4) The parent will trap the exit code, if it's OK, will append the
8706 * data accumulated into server.bgrewritebuf into the temp file, and
8707 * finally will rename(2) the temp file in the actual file name.
8708 * The the new file is reopened as the new append only file. Profit!
8709 */
8710static int rewriteAppendOnlyFileBackground(void) {
8711 pid_t childpid;
8712
8713 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 8714 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 8715 if ((childpid = fork()) == 0) {
8716 /* Child */
8717 char tmpfile[256];
9d65a1bb 8718
054e426d 8719 if (server.vm_enabled) vmReopenSwapFile();
8720 close(server.fd);
9d65a1bb 8721 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8722 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 8723 _exit(0);
9d65a1bb 8724 } else {
478c2c6f 8725 _exit(1);
9d65a1bb 8726 }
8727 } else {
8728 /* Parent */
8729 if (childpid == -1) {
8730 redisLog(REDIS_WARNING,
8731 "Can't rewrite append only file in background: fork: %s",
8732 strerror(errno));
8733 return REDIS_ERR;
8734 }
8735 redisLog(REDIS_NOTICE,
8736 "Background append only file rewriting started by pid %d",childpid);
8737 server.bgrewritechildpid = childpid;
884d4b39 8738 updateDictResizePolicy();
85a83172 8739 /* We set appendseldb to -1 in order to force the next call to the
8740 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8741 * accumulated by the parent into server.bgrewritebuf will start
8742 * with a SELECT statement and it will be safe to merge. */
8743 server.appendseldb = -1;
9d65a1bb 8744 return REDIS_OK;
8745 }
8746 return REDIS_OK; /* unreached */
8747}
8748
8749static void bgrewriteaofCommand(redisClient *c) {
8750 if (server.bgrewritechildpid != -1) {
8751 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8752 return;
8753 }
8754 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 8755 char *status = "+Background append only file rewriting started\r\n";
8756 addReplySds(c,sdsnew(status));
9d65a1bb 8757 } else {
8758 addReply(c,shared.err);
8759 }
8760}
8761
/* Unlink the temp file that the background AOF rewrite child with the
 * given pid writes to (same name pattern used by the child itself). */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int) childpid);
    unlink(path);
}
8768
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make it clearer which functionality belongs to the blocking VM and which
 * to the threaded (non-blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
8789
2e5eb04e 8790/* Called when the user switches from "appendonly yes" to "appendonly no"
8791 * at runtime using the CONFIG command. */
8792static void stopAppendOnly(void) {
8793 flushAppendOnlyFile();
8794 fsync(server.appendfd);
8795 close(server.appendfd);
8796
8797 server.appendfd = -1;
8798 server.appendseldb = -1;
8799 server.appendonly = 0;
8800 /* rewrite operation in progress? kill it, wait child exit */
8801 if (server.bgsavechildpid != -1) {
8802 int statloc;
8803
30dd89b6 8804 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8805 wait3(&statloc,0,NULL);
2e5eb04e 8806 /* reset the buffer accumulating changes while the child saves */
8807 sdsfree(server.bgrewritebuf);
8808 server.bgrewritebuf = sdsempty();
30dd89b6 8809 server.bgsavechildpid = -1;
2e5eb04e 8810 }
8811}
8812
8813/* Called when the user switches from "appendonly no" to "appendonly yes"
8814 * at runtime using the CONFIG command. */
8815static int startAppendOnly(void) {
8816 server.appendonly = 1;
8817 server.lastfsync = time(NULL);
8818 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8819 if (server.appendfd == -1) {
8820 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8821 return REDIS_ERR;
8822 }
8823 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8824 server.appendonly = 0;
8825 close(server.appendfd);
8826 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8827 return REDIS_ERR;
8828 }
8829 return REDIS_OK;
8830}
8831
996cb5f7 8832/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 8833
75680a3c 8834static void vmInit(void) {
8835 off_t totsize;
996cb5f7 8836 int pipefds[2];
bcaa7a4f 8837 size_t stacksize;
8b5bb414 8838 struct flock fl;
75680a3c 8839
4ad37480 8840 if (server.vm_max_threads != 0)
8841 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8842
054e426d 8843 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8b5bb414 8844 /* Try to open the old swap file, otherwise create it */
6fa987e3 8845 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8846 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8847 }
75680a3c 8848 if (server.vm_fp == NULL) {
6fa987e3 8849 redisLog(REDIS_WARNING,
8b5bb414 8850 "Can't open the swap file: %s. Exiting.",
6fa987e3 8851 strerror(errno));
75680a3c 8852 exit(1);
8853 }
8854 server.vm_fd = fileno(server.vm_fp);
8b5bb414 8855 /* Lock the swap file for writing, this is useful in order to avoid
8856 * another instance to use the same swap file for a config error. */
8857 fl.l_type = F_WRLCK;
8858 fl.l_whence = SEEK_SET;
8859 fl.l_start = fl.l_len = 0;
8860 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8861 redisLog(REDIS_WARNING,
8862 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8863 exit(1);
8864 }
8865 /* Initialize */
75680a3c 8866 server.vm_next_page = 0;
8867 server.vm_near_pages = 0;
7d98e08c 8868 server.vm_stats_used_pages = 0;
8869 server.vm_stats_swapped_objects = 0;
8870 server.vm_stats_swapouts = 0;
8871 server.vm_stats_swapins = 0;
75680a3c 8872 totsize = server.vm_pages*server.vm_page_size;
8873 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8874 if (ftruncate(server.vm_fd,totsize) == -1) {
8875 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8876 strerror(errno));
8877 exit(1);
8878 } else {
8879 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8880 }
7d30035d 8881 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 8882 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 8883 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 8884 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 8885
996cb5f7 8886 /* Initialize threaded I/O (used by Virtual Memory) */
8887 server.io_newjobs = listCreate();
8888 server.io_processing = listCreate();
8889 server.io_processed = listCreate();
d5d55fc3 8890 server.io_ready_clients = listCreate();
92f8e882 8891 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 8892 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8893 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 8894 server.io_active_threads = 0;
996cb5f7 8895 if (pipe(pipefds) == -1) {
8896 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8897 ,strerror(errno));
8898 exit(1);
8899 }
8900 server.io_ready_pipe_read = pipefds[0];
8901 server.io_ready_pipe_write = pipefds[1];
8902 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 8903 /* LZF requires a lot of stack */
8904 pthread_attr_init(&server.io_threads_attr);
8905 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8906 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8907 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 8908 /* Listen for events in the threaded I/O pipe */
8909 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8910 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8911 oom("creating file event");
75680a3c 8912}
8913
06224fec 8914/* Mark the page as used */
8915static void vmMarkPageUsed(off_t page) {
8916 off_t byte = page/8;
8917 int bit = page&7;
970e10bb 8918 redisAssert(vmFreePage(page) == 1);
06224fec 8919 server.vm_bitmap[byte] |= 1<<bit;
8920}
8921
8922/* Mark N contiguous pages as used, with 'page' being the first. */
8923static void vmMarkPagesUsed(off_t page, off_t count) {
8924 off_t j;
8925
8926 for (j = 0; j < count; j++)
7d30035d 8927 vmMarkPageUsed(page+j);
7d98e08c 8928 server.vm_stats_used_pages += count;
7c775e09 8929 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8930 (long long)count, (long long)page);
06224fec 8931}
8932
8933/* Mark the page as free */
8934static void vmMarkPageFree(off_t page) {
8935 off_t byte = page/8;
8936 int bit = page&7;
970e10bb 8937 redisAssert(vmFreePage(page) == 0);
06224fec 8938 server.vm_bitmap[byte] &= ~(1<<bit);
8939}
8940
8941/* Mark N contiguous pages as free, with 'page' being the first. */
8942static void vmMarkPagesFree(off_t page, off_t count) {
8943 off_t j;
8944
8945 for (j = 0; j < count; j++)
7d30035d 8946 vmMarkPageFree(page+j);
7d98e08c 8947 server.vm_stats_used_pages -= count;
7c775e09 8948 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8949 (long long)count, (long long)page);
06224fec 8950}
8951
8952/* Test if the page is free */
8953static int vmFreePage(off_t page) {
8954 off_t byte = page/8;
8955 int bit = page&7;
7d30035d 8956 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 8957}
8958
8959/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 8960 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 8961 * REDIS_ERR is returned.
06224fec 8962 *
8963 * This function uses a simple algorithm: we try to allocate
8964 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8965 * again from the start of the swap file searching for free spaces.
8966 *
8967 * If it looks pretty clear that there are no free pages near our offset
8968 * we try to find less populated places doing a forward jump of
8969 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8970 * without hurry, and then we jump again and so forth...
e0a62c7f 8971 *
06224fec 8972 * This function can be improved using a free list to avoid to guess
8973 * too much, since we could collect data about freed pages.
8974 *
8975 * note: I implemented this function just after watching an episode of
8976 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8977 */
c7df85a4 8978static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 8979 off_t base, offset = 0, since_jump = 0, numfree = 0;
8980
8981 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8982 server.vm_near_pages = 0;
8983 server.vm_next_page = 0;
8984 }
8985 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8986 base = server.vm_next_page;
8987
8988 while(offset < server.vm_pages) {
8989 off_t this = base+offset;
8990
8991 /* If we overflow, restart from page zero */
8992 if (this >= server.vm_pages) {
8993 this -= server.vm_pages;
8994 if (this == 0) {
8995 /* Just overflowed, what we found on tail is no longer
8996 * interesting, as it's no longer contiguous. */
8997 numfree = 0;
8998 }
8999 }
9000 if (vmFreePage(this)) {
9001 /* This is a free page */
9002 numfree++;
9003 /* Already got N free pages? Return to the caller, with success */
9004 if (numfree == n) {
7d30035d 9005 *first = this-(n-1);
9006 server.vm_next_page = this+1;
7c775e09 9007 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 9008 return REDIS_OK;
06224fec 9009 }
9010 } else {
9011 /* The current one is not a free page */
9012 numfree = 0;
9013 }
9014
9015 /* Fast-forward if the current page is not free and we already
9016 * searched enough near this place. */
9017 since_jump++;
9018 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9019 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9020 since_jump = 0;
9021 /* Note that even if we rewind after the jump, we are don't need
9022 * to make sure numfree is set to zero as we only jump *if* it
9023 * is set to zero. */
9024 } else {
9025 /* Otherwise just check the next page */
9026 offset++;
9027 }
9028 }
3a66edc7 9029 return REDIS_ERR;
9030}
9031
a5819310 9032/* Write the specified object at the specified page of the swap file */
9033static int vmWriteObjectOnSwap(robj *o, off_t page) {
9034 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9035 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9036 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9037 redisLog(REDIS_WARNING,
9ebed7cf 9038 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 9039 strerror(errno));
9040 return REDIS_ERR;
9041 }
9042 rdbSaveObject(server.vm_fp,o);
ba76a8f9 9043 fflush(server.vm_fp);
a5819310 9044 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9045 return REDIS_OK;
9046}
9047
3a66edc7 9048/* Swap the 'val' object relative to 'key' into disk. Store all the information
9049 * needed to later retrieve the object into the key object.
9050 * If we can't find enough contiguous empty pages to swap the object on disk
9051 * REDIS_ERR is returned. */
a69a0c9c 9052static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 9053 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 9054 off_t page;
9055
9056 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 9057 assert(key->refcount == 1);
3a66edc7 9058 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 9059 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 9060 key->vm.page = page;
9061 key->vm.usedpages = pages;
9062 key->storage = REDIS_VM_SWAPPED;
d894161b 9063 key->vtype = val->type;
3a66edc7 9064 decrRefCount(val); /* Deallocate the object from memory. */
9065 vmMarkPagesUsed(page,pages);
7d30035d 9066 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
9067 (unsigned char*) key->ptr,
9068 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 9069 server.vm_stats_swapped_objects++;
9070 server.vm_stats_swapouts++;
3a66edc7 9071 return REDIS_OK;
9072}
9073
a5819310 9074static robj *vmReadObjectFromSwap(off_t page, int type) {
9075 robj *o;
3a66edc7 9076
a5819310 9077 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9078 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 9079 redisLog(REDIS_WARNING,
d5d55fc3 9080 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 9081 strerror(errno));
478c2c6f 9082 _exit(1);
3a66edc7 9083 }
a5819310 9084 o = rdbLoadObject(type,server.vm_fp);
9085 if (o == NULL) {
d5d55fc3 9086 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 9087 _exit(1);
3a66edc7 9088 }
a5819310 9089 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9090 return o;
9091}
9092
9093/* Load the value object relative to the 'key' object from swap to memory.
9094 * The newly allocated object is returned.
9095 *
9096 * If preview is true the unserialized object is returned to the caller but
9097 * no changes are made to the key object, nor the pages are marked as freed */
9098static robj *vmGenericLoadObject(robj *key, int preview) {
9099 robj *val;
9100
d5d55fc3 9101 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 9102 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 9103 if (!preview) {
9104 key->storage = REDIS_VM_MEMORY;
9105 key->vm.atime = server.unixtime;
9106 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9107 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
9108 (unsigned char*) key->ptr);
7d98e08c 9109 server.vm_stats_swapped_objects--;
38aba9a1 9110 } else {
9111 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
9112 (unsigned char*) key->ptr);
7e69548d 9113 }
7d98e08c 9114 server.vm_stats_swapins++;
3a66edc7 9115 return val;
06224fec 9116}
9117
7e69548d 9118/* Plain object loading, from swap to memory */
9119static robj *vmLoadObject(robj *key) {
996cb5f7 9120 /* If we are loading the object in background, stop it, we
9121 * need to load this object synchronously ASAP. */
9122 if (key->storage == REDIS_VM_LOADING)
9123 vmCancelThreadedIOJob(key);
7e69548d 9124 return vmGenericLoadObject(key,0);
9125}
9126
9127/* Just load the value on disk, without to modify the key.
9128 * This is useful when we want to perform some operation on the value
9129 * without to really bring it from swap to memory, like while saving the
9130 * dataset or rewriting the append only log. */
9131static robj *vmPreviewObject(robj *key) {
9132 return vmGenericLoadObject(key,1);
9133}
9134
4ef8de8a 9135/* How a good candidate is this object for swapping?
9136 * The better candidate it is, the greater the returned value.
9137 *
9138 * Currently we try to perform a fast estimation of the object size in
9139 * memory, and combine it with aging informations.
9140 *
9141 * Basically swappability = idle-time * log(estimated size)
9142 *
9143 * Bigger objects are preferred over smaller objects, but not
9144 * proportionally, this is why we use the logarithm. This algorithm is
9145 * just a first try and will probably be tuned later. */
9146static double computeObjectSwappability(robj *o) {
9147 time_t age = server.unixtime - o->vm.atime;
9148 long asize = 0;
9149 list *l;
9150 dict *d;
9151 struct dictEntry *de;
9152 int z;
9153
9154 if (age <= 0) return 0;
9155 switch(o->type) {
9156 case REDIS_STRING:
9157 if (o->encoding != REDIS_ENCODING_RAW) {
9158 asize = sizeof(*o);
9159 } else {
9160 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9161 }
9162 break;
9163 case REDIS_LIST:
9164 l = o->ptr;
9165 listNode *ln = listFirst(l);
9166
9167 asize = sizeof(list);
9168 if (ln) {
9169 robj *ele = ln->value;
9170 long elesize;
9171
9172 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9173 (sizeof(*o)+sdslen(ele->ptr)) :
9174 sizeof(*o);
9175 asize += (sizeof(listNode)+elesize)*listLength(l);
9176 }
9177 break;
9178 case REDIS_SET:
9179 case REDIS_ZSET:
9180 z = (o->type == REDIS_ZSET);
9181 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9182
9183 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9184 if (z) asize += sizeof(zset)-sizeof(dict);
9185 if (dictSize(d)) {
9186 long elesize;
9187 robj *ele;
9188
9189 de = dictGetRandomKey(d);
9190 ele = dictGetEntryKey(de);
9191 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9192 (sizeof(*o)+sdslen(ele->ptr)) :
9193 sizeof(*o);
9194 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9195 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9196 }
9197 break;
a97b9060 9198 case REDIS_HASH:
9199 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9200 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9201 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9202 unsigned int klen, vlen;
9203 unsigned char *key, *val;
9204
9205 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9206 klen = 0;
9207 vlen = 0;
9208 }
9209 asize = len*(klen+vlen+3);
9210 } else if (o->encoding == REDIS_ENCODING_HT) {
9211 d = o->ptr;
9212 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9213 if (dictSize(d)) {
9214 long elesize;
9215 robj *ele;
9216
9217 de = dictGetRandomKey(d);
9218 ele = dictGetEntryKey(de);
9219 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9220 (sizeof(*o)+sdslen(ele->ptr)) :
9221 sizeof(*o);
9222 ele = dictGetEntryVal(de);
9223 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9224 (sizeof(*o)+sdslen(ele->ptr)) :
9225 sizeof(*o);
9226 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9227 }
9228 }
9229 break;
4ef8de8a 9230 }
c8c72447 9231 return (double)age*log(1+asize);
4ef8de8a 9232}
9233
9234/* Try to swap an object that's a good candidate for swapping.
9235 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 9236 * to swap any object at all.
9237 *
9238 * If 'usethreaded' is true, Redis will try to swap the object in background
9239 * using I/O threads. */
9240static int vmSwapOneObject(int usethreads) {
4ef8de8a 9241 int j, i;
9242 struct dictEntry *best = NULL;
9243 double best_swappability = 0;
b9bc0eef 9244 redisDb *best_db = NULL;
4ef8de8a 9245 robj *key, *val;
9246
9247 for (j = 0; j < server.dbnum; j++) {
9248 redisDb *db = server.db+j;
b72f6a4b 9249 /* Why maxtries is set to 100?
9250 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9251 * are swappable objects */
b0d8747d 9252 int maxtries = 100;
4ef8de8a 9253
9254 if (dictSize(db->dict) == 0) continue;
9255 for (i = 0; i < 5; i++) {
9256 dictEntry *de;
9257 double swappability;
9258
e3cadb8a 9259 if (maxtries) maxtries--;
4ef8de8a 9260 de = dictGetRandomKey(db->dict);
9261 key = dictGetEntryKey(de);
9262 val = dictGetEntryVal(de);
1064ef87 9263 /* Only swap objects that are currently in memory.
9264 *
9265 * Also don't swap shared objects if threaded VM is on, as we
9266 * try to ensure that the main thread does not touch the
9267 * object while the I/O thread is using it, but we can't
9268 * control other keys without adding additional mutex. */
9269 if (key->storage != REDIS_VM_MEMORY ||
9270 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 9271 if (maxtries) i--; /* don't count this try */
9272 continue;
9273 }
4ef8de8a 9274 swappability = computeObjectSwappability(val);
9275 if (!best || swappability > best_swappability) {
9276 best = de;
9277 best_swappability = swappability;
b9bc0eef 9278 best_db = db;
4ef8de8a 9279 }
9280 }
9281 }
7c775e09 9282 if (best == NULL) return REDIS_ERR;
4ef8de8a 9283 key = dictGetEntryKey(best);
9284 val = dictGetEntryVal(best);
9285
e3cadb8a 9286 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 9287 key->ptr, best_swappability);
9288
9289 /* Unshare the key if needed */
9290 if (key->refcount > 1) {
9291 robj *newkey = dupStringObject(key);
9292 decrRefCount(key);
9293 key = dictGetEntryKey(best) = newkey;
9294 }
9295 /* Swap it */
a69a0c9c 9296 if (usethreads) {
b9bc0eef 9297 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 9298 return REDIS_OK;
9299 } else {
a69a0c9c 9300 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9301 dictGetEntryVal(best) = NULL;
9302 return REDIS_OK;
9303 } else {
9304 return REDIS_ERR;
9305 }
4ef8de8a 9306 }
9307}
9308
/* Swap one object out synchronously, in the calling thread. */
static int vmSwapOneObjectBlocking() {
    const int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
9312
/* Swap one object out in background, using the I/O threads. */
static int vmSwapOneObjectThreaded() {
    const int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
9316
7e69548d 9317/* Return true if it's safe to swap out objects in a given moment.
9318 * Basically we don't want to swap objects out while there is a BGSAVE
9319 * or a BGAEOREWRITE running in backgroud. */
9320static int vmCanSwapOut(void) {
9321 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9322}
9323
1b03836c 9324/* Delete a key if swapped. Returns 1 if the key was found, was swapped
9325 * and was deleted. Otherwise 0 is returned. */
9326static int deleteIfSwapped(redisDb *db, robj *key) {
9327 dictEntry *de;
9328 robj *foundkey;
9329
9330 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9331 foundkey = dictGetEntryKey(de);
9332 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9333 deleteKey(db,key);
9334 return 1;
9335}
9336
996cb5f7 9337/* =================== Virtual Memory - Threaded I/O ======================= */
9338
b9bc0eef 9339static void freeIOJob(iojob *j) {
d5d55fc3 9340 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9341 j->type == REDIS_IOJOB_DO_SWAP ||
9342 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 9343 decrRefCount(j->val);
78ebe4c8 9344 /* We don't decrRefCount the j->key field as we did't incremented
9345 * the count creating IO Jobs. This is because the key field here is
9346 * just used as an indentifier and if a key is removed the Job should
9347 * never be touched again. */
b9bc0eef 9348 zfree(j);
9349}
9350
996cb5f7 9351/* Every time a thread finished a Job, it writes a byte into the write side
9352 * of an unix pipe in order to "awake" the main thread, and this function
9353 * is called. */
9354static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9355 int mask)
9356{
9357 char buf[1];
b0d8747d 9358 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 9359 REDIS_NOTUSED(el);
9360 REDIS_NOTUSED(mask);
9361 REDIS_NOTUSED(privdata);
9362
9363 /* For every byte we read in the read side of the pipe, there is one
9364 * I/O job completed to process. */
9365 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 9366 iojob *j;
9367 listNode *ln;
9368 robj *key;
9369 struct dictEntry *de;
9370
996cb5f7 9371 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 9372
9373 /* Get the processed element (the oldest one) */
9374 lockThreadedIO();
1064ef87 9375 assert(listLength(server.io_processed) != 0);
f6c0bba8 9376 if (toprocess == -1) {
9377 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9378 if (toprocess <= 0) toprocess = 1;
9379 }
b9bc0eef 9380 ln = listFirst(server.io_processed);
9381 j = ln->value;
9382 listDelNode(server.io_processed,ln);
9383 unlockThreadedIO();
9384 /* If this job is marked as canceled, just ignore it */
9385 if (j->canceled) {
9386 freeIOJob(j);
9387 continue;
9388 }
9389 /* Post process it in the main thread, as there are things we
9390 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 9391 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 9392 de = dictFind(j->db->dict,j->key);
9393 assert(de != NULL);
9394 key = dictGetEntryKey(de);
9395 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9396 redisDb *db;
9397
b9bc0eef 9398 /* Key loaded, bring it at home */
9399 key->storage = REDIS_VM_MEMORY;
9400 key->vm.atime = server.unixtime;
9401 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9402 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9403 (unsigned char*) key->ptr);
9404 server.vm_stats_swapped_objects--;
9405 server.vm_stats_swapins++;
d5d55fc3 9406 dictGetEntryVal(de) = j->val;
9407 incrRefCount(j->val);
9408 db = j->db;
b9bc0eef 9409 freeIOJob(j);
d5d55fc3 9410 /* Handle clients waiting for this key to be loaded. */
9411 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 9412 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9413 /* Now we know the amount of pages required to swap this object.
9414 * Let's find some space for it, and queue this task again
9415 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 9416 if (!vmCanSwapOut() ||
9417 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9418 {
9419 /* Ooops... no space or we can't swap as there is
9420 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 9421 freeIOJob(j);
054e426d 9422 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 9423 } else {
c7df85a4 9424 /* Note that we need to mark this pages as used now,
9425 * if the job will be canceled, we'll mark them as freed
9426 * again. */
9427 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 9428 j->type = REDIS_IOJOB_DO_SWAP;
9429 lockThreadedIO();
9430 queueIOJob(j);
9431 unlockThreadedIO();
9432 }
9433 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9434 robj *val;
9435
9436 /* Key swapped. We can finally free some memory. */
6c96ba7d 9437 if (key->storage != REDIS_VM_SWAPPING) {
9438 printf("key->storage: %d\n",key->storage);
9439 printf("key->name: %s\n",(char*)key->ptr);
9440 printf("key->refcount: %d\n",key->refcount);
9441 printf("val: %p\n",(void*)j->val);
9442 printf("val->type: %d\n",j->val->type);
9443 printf("val->ptr: %s\n",(char*)j->val->ptr);
9444 }
9445 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 9446 val = dictGetEntryVal(de);
9447 key->vm.page = j->page;
9448 key->vm.usedpages = j->pages;
9449 key->storage = REDIS_VM_SWAPPED;
9450 key->vtype = j->val->type;
9451 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 9452 dictGetEntryVal(de) = NULL;
b9bc0eef 9453 redisLog(REDIS_DEBUG,
9454 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9455 (unsigned char*) key->ptr,
9456 (unsigned long long) j->page, (unsigned long long) j->pages);
9457 server.vm_stats_swapped_objects++;
9458 server.vm_stats_swapouts++;
9459 freeIOJob(j);
f11b8647 9460 /* Put a few more swap requests in queue if we are still
9461 * out of memory */
b0d8747d 9462 if (trytoswap && vmCanSwapOut() &&
9463 zmalloc_used_memory() > server.vm_max_memory)
9464 {
f11b8647 9465 int more = 1;
9466 while(more) {
9467 lockThreadedIO();
9468 more = listLength(server.io_newjobs) <
9469 (unsigned) server.vm_max_threads;
9470 unlockThreadedIO();
9471 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 9472 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9473 trytoswap = 0;
9474 break;
9475 }
f11b8647 9476 }
9477 }
b9bc0eef 9478 }
c953f24b 9479 processed++;
f6c0bba8 9480 if (processed == toprocess) return;
996cb5f7 9481 }
9482 if (retval < 0 && errno != EAGAIN) {
9483 redisLog(REDIS_WARNING,
9484 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9485 strerror(errno));
9486 }
9487}
9488
9489static void lockThreadedIO(void) {
9490 pthread_mutex_lock(&server.io_mutex);
9491}
9492
9493static void unlockThreadedIO(void) {
9494 pthread_mutex_unlock(&server.io_mutex);
9495}
9496
9497/* Remove the specified object from the threaded I/O queue if still not
9498 * processed, otherwise make sure to flag it as canceled. */
9499static void vmCancelThreadedIOJob(robj *o) {
9500 list *lists[3] = {
6c96ba7d 9501 server.io_newjobs, /* 0 */
9502 server.io_processing, /* 1 */
9503 server.io_processed /* 2 */
996cb5f7 9504 };
9505 int i;
9506
9507 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 9508again:
996cb5f7 9509 lockThreadedIO();
9510 /* Search for a matching key in one of the queues */
9511 for (i = 0; i < 3; i++) {
9512 listNode *ln;
c7df85a4 9513 listIter li;
996cb5f7 9514
c7df85a4 9515 listRewind(lists[i],&li);
9516 while ((ln = listNext(&li)) != NULL) {
996cb5f7 9517 iojob *job = ln->value;
9518
6c96ba7d 9519 if (job->canceled) continue; /* Skip this, already canceled. */
78ebe4c8 9520 if (job->key == o) {
970e10bb 9521 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9522 (void*)job, (char*)o->ptr, job->type, i);
427a2153 9523 /* Mark the pages as free since the swap didn't happened
9524 * or happened but is now discarded. */
970e10bb 9525 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 9526 vmMarkPagesFree(job->page,job->pages);
9527 /* Cancel the job. It depends on the list the job is
9528 * living in. */
996cb5f7 9529 switch(i) {
9530 case 0: /* io_newjobs */
6c96ba7d 9531 /* If the job was yet not processed the best thing to do
996cb5f7 9532 * is to remove it from the queue at all */
6c96ba7d 9533 freeIOJob(job);
996cb5f7 9534 listDelNode(lists[i],ln);
9535 break;
9536 case 1: /* io_processing */
d5d55fc3 9537 /* Oh Shi- the thread is messing with the Job:
9538 *
9539 * Probably it's accessing the object if this is a
9540 * PREPARE_SWAP or DO_SWAP job.
9541 * If it's a LOAD job it may be reading from disk and
9542 * if we don't wait for the job to terminate before to
9543 * cancel it, maybe in a few microseconds data can be
9544 * corrupted in this pages. So the short story is:
9545 *
9546 * Better to wait for the job to move into the
9547 * next queue (processed)... */
9548
9549 /* We try again and again until the job is completed. */
9550 unlockThreadedIO();
9551 /* But let's wait some time for the I/O thread
9552 * to finish with this job. After all this condition
9553 * should be very rare. */
9554 usleep(1);
9555 goto again;
996cb5f7 9556 case 2: /* io_processed */
2e111efe 9557 /* The job was already processed, that's easy...
9558 * just mark it as canceled so that we'll ignore it
9559 * when processing completed jobs. */
996cb5f7 9560 job->canceled = 1;
9561 break;
9562 }
c7df85a4 9563 /* Finally we have to adjust the storage type of the object
9564 * in order to "UNDO" the operaiton. */
996cb5f7 9565 if (o->storage == REDIS_VM_LOADING)
9566 o->storage = REDIS_VM_SWAPPED;
9567 else if (o->storage == REDIS_VM_SWAPPING)
9568 o->storage = REDIS_VM_MEMORY;
9569 unlockThreadedIO();
9570 return;
9571 }
9572 }
9573 }
9574 unlockThreadedIO();
9575 assert(1 != 1); /* We should never reach this */
9576}
9577
b9bc0eef 9578static void *IOThreadEntryPoint(void *arg) {
9579 iojob *j;
9580 listNode *ln;
9581 REDIS_NOTUSED(arg);
9582
9583 pthread_detach(pthread_self());
9584 while(1) {
9585 /* Get a new job to process */
9586 lockThreadedIO();
9587 if (listLength(server.io_newjobs) == 0) {
9588 /* No new jobs in queue, exit. */
9ebed7cf 9589 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9590 (long) pthread_self());
b9bc0eef 9591 server.io_active_threads--;
9592 unlockThreadedIO();
9593 return NULL;
9594 }
9595 ln = listFirst(server.io_newjobs);
9596 j = ln->value;
9597 listDelNode(server.io_newjobs,ln);
9598 /* Add the job in the processing queue */
9599 j->thread = pthread_self();
9600 listAddNodeTail(server.io_processing,j);
9601 ln = listLast(server.io_processing); /* We use ln later to remove it */
9602 unlockThreadedIO();
9ebed7cf 9603 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9604 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 9605
9606 /* Process the Job */
9607 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9608 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 9609 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9610 FILE *fp = fopen("/dev/null","w+");
9611 j->pages = rdbSavedObjectPages(j->val,fp);
9612 fclose(fp);
9613 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 9614 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9615 j->canceled = 1;
b9bc0eef 9616 }
9617
9618 /* Done: insert the job into the processed queue */
9ebed7cf 9619 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9620 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 9621 lockThreadedIO();
9622 listDelNode(server.io_processing,ln);
9623 listAddNodeTail(server.io_processed,j);
9624 unlockThreadedIO();
e0a62c7f 9625
b9bc0eef 9626 /* Signal the main thread there is new stuff to process */
9627 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9628 }
9629 return NULL; /* never reached */
9630}
9631
9632static void spawnIOThread(void) {
9633 pthread_t thread;
478c2c6f 9634 sigset_t mask, omask;
a97b9060 9635 int err;
b9bc0eef 9636
478c2c6f 9637 sigemptyset(&mask);
9638 sigaddset(&mask,SIGCHLD);
9639 sigaddset(&mask,SIGHUP);
9640 sigaddset(&mask,SIGPIPE);
9641 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 9642 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9643 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9644 strerror(err));
9645 usleep(1000000);
9646 }
478c2c6f 9647 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 9648 server.io_active_threads++;
9649}
9650
4ee9488d 9651/* We need to wait for the last thread to exit before we are able to
9652 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 9653static void waitEmptyIOJobsQueue(void) {
4ee9488d 9654 while(1) {
76b7233a 9655 int io_processed_len;
9656
4ee9488d 9657 lockThreadedIO();
054e426d 9658 if (listLength(server.io_newjobs) == 0 &&
9659 listLength(server.io_processing) == 0 &&
9660 server.io_active_threads == 0)
9661 {
4ee9488d 9662 unlockThreadedIO();
9663 return;
9664 }
76b7233a 9665 /* While waiting for empty jobs queue condition we post-process some
9666 * finshed job, as I/O threads may be hanging trying to write against
9667 * the io_ready_pipe_write FD but there are so much pending jobs that
9668 * it's blocking. */
9669 io_processed_len = listLength(server.io_processed);
4ee9488d 9670 unlockThreadedIO();
76b7233a 9671 if (io_processed_len) {
9672 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9673 usleep(1000); /* 1 millisecond */
9674 } else {
9675 usleep(10000); /* 10 milliseconds */
9676 }
4ee9488d 9677 }
9678}
9679
054e426d 9680static void vmReopenSwapFile(void) {
478c2c6f 9681 /* Note: we don't close the old one as we are in the child process
9682 * and don't want to mess at all with the original file object. */
054e426d 9683 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9684 if (server.vm_fp == NULL) {
9685 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9686 server.vm_swap_file);
478c2c6f 9687 _exit(1);
054e426d 9688 }
9689 server.vm_fd = fileno(server.vm_fp);
9690}
9691
b9bc0eef 9692/* This function must be called while with threaded IO locked */
9693static void queueIOJob(iojob *j) {
6c96ba7d 9694 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9695 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 9696 listAddNodeTail(server.io_newjobs,j);
9697 if (server.io_active_threads < server.vm_max_threads)
9698 spawnIOThread();
9699}
9700
9701static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9702 iojob *j;
e0a62c7f 9703
b9bc0eef 9704 assert(key->storage == REDIS_VM_MEMORY);
9705 assert(key->refcount == 1);
9706
9707 j = zmalloc(sizeof(*j));
9708 j->type = REDIS_IOJOB_PREPARE_SWAP;
9709 j->db = db;
78ebe4c8 9710 j->key = key;
b9bc0eef 9711 j->val = val;
9712 incrRefCount(val);
9713 j->canceled = 0;
9714 j->thread = (pthread_t) -1;
f11b8647 9715 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 9716
9717 lockThreadedIO();
9718 queueIOJob(j);
9719 unlockThreadedIO();
9720 return REDIS_OK;
9721}
9722
b0d8747d 9723/* ============ Virtual Memory - Blocking clients on missing keys =========== */
9724
d5d55fc3 9725/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9726 * If there is not already a job loading the key, it is craeted.
9727 * The key is added to the io_keys list in the client structure, and also
9728 * in the hash table mapping swapped keys to waiting clients, that is,
9729 * server.io_waited_keys. */
9730static int waitForSwappedKey(redisClient *c, robj *key) {
9731 struct dictEntry *de;
9732 robj *o;
9733 list *l;
9734
9735 /* If the key does not exist or is already in RAM we don't need to
9736 * block the client at all. */
9737 de = dictFind(c->db->dict,key);
9738 if (de == NULL) return 0;
9739 o = dictGetEntryKey(de);
9740 if (o->storage == REDIS_VM_MEMORY) {
9741 return 0;
9742 } else if (o->storage == REDIS_VM_SWAPPING) {
9743 /* We were swapping the key, undo it! */
9744 vmCancelThreadedIOJob(o);
9745 return 0;
9746 }
e0a62c7f 9747
d5d55fc3 9748 /* OK: the key is either swapped, or being loaded just now. */
9749
9750 /* Add the key to the list of keys this client is waiting for.
9751 * This maps clients to keys they are waiting for. */
9752 listAddNodeTail(c->io_keys,key);
9753 incrRefCount(key);
9754
9755 /* Add the client to the swapped keys => clients waiting map. */
9756 de = dictFind(c->db->io_keys,key);
9757 if (de == NULL) {
9758 int retval;
9759
9760 /* For every key we take a list of clients blocked for it */
9761 l = listCreate();
9762 retval = dictAdd(c->db->io_keys,key,l);
9763 incrRefCount(key);
9764 assert(retval == DICT_OK);
9765 } else {
9766 l = dictGetEntryVal(de);
9767 }
9768 listAddNodeTail(l,c);
9769
9770 /* Are we already loading the key from disk? If not create a job */
9771 if (o->storage == REDIS_VM_SWAPPED) {
9772 iojob *j;
9773
9774 o->storage = REDIS_VM_LOADING;
9775 j = zmalloc(sizeof(*j));
9776 j->type = REDIS_IOJOB_LOAD;
9777 j->db = c->db;
78ebe4c8 9778 j->key = o;
d5d55fc3 9779 j->key->vtype = o->vtype;
9780 j->page = o->vm.page;
9781 j->val = NULL;
9782 j->canceled = 0;
9783 j->thread = (pthread_t) -1;
9784 lockThreadedIO();
9785 queueIOJob(j);
9786 unlockThreadedIO();
9787 }
9788 return 1;
9789}
9790
6f078746
PN
9791/* Preload keys for any command with first, last and step values for
9792 * the command keys prototype, as defined in the command table. */
9793static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9794 int j, last;
9795 if (cmd->vm_firstkey == 0) return;
9796 last = cmd->vm_lastkey;
9797 if (last < 0) last = argc+last;
9798 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9799 redisAssert(j < argc);
9800 waitForSwappedKey(c,argv[j]);
9801 }
9802}
9803
5d373da9 9804/* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
739ba0d2
PN
9805 * Note that the number of keys to preload is user-defined, so we need to
9806 * apply a sanity check against argc. */
ca1788b5 9807static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
76583ea4 9808 int i, num;
ca1788b5 9809 REDIS_NOTUSED(cmd);
ca1788b5
PN
9810
9811 num = atoi(argv[2]->ptr);
739ba0d2 9812 if (num > (argc-3)) return;
76583ea4 9813 for (i = 0; i < num; i++) {
ca1788b5 9814 waitForSwappedKey(c,argv[3+i]);
76583ea4
PN
9815 }
9816}
9817
3805e04f
PN
9818/* Preload keys needed to execute the entire MULTI/EXEC block.
9819 *
9820 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9821 * and will block the client when any command requires a swapped out value. */
9822static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9823 int i, margc;
9824 struct redisCommand *mcmd;
9825 robj **margv;
9826 REDIS_NOTUSED(cmd);
9827 REDIS_NOTUSED(argc);
9828 REDIS_NOTUSED(argv);
9829
9830 if (!(c->flags & REDIS_MULTI)) return;
9831 for (i = 0; i < c->mstate.count; i++) {
9832 mcmd = c->mstate.commands[i].cmd;
9833 margc = c->mstate.commands[i].argc;
9834 margv = c->mstate.commands[i].argv;
9835
9836 if (mcmd->vm_preload_proc != NULL) {
9837 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9838 } else {
9839 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9840 }
76583ea4
PN
9841 }
9842}
9843
b0d8747d 9844/* Is this client attempting to run a command against swapped keys?
d5d55fc3 9845 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 9846 *
d5d55fc3 9847 * The important idea about this function is that it can fail! If keys will
9848 * still be swapped when the client is resumed, this key lookups will
9849 * just block loading keys from disk. In practical terms this should only
9850 * happen with SORT BY command or if there is a bug in this function.
9851 *
9852 * Return 1 if the client is marked as blocked, 0 if the client can
9853 * continue as the keys it is going to access appear to be in memory. */
0a6f3f0f 9854static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
76583ea4 9855 if (cmd->vm_preload_proc != NULL) {
ca1788b5 9856 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
76583ea4 9857 } else {
6f078746 9858 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
76583ea4
PN
9859 }
9860
d5d55fc3 9861 /* If the client was blocked for at least one key, mark it as blocked. */
9862 if (listLength(c->io_keys)) {
9863 c->flags |= REDIS_IO_WAIT;
9864 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9865 server.vm_blocked_clients++;
9866 return 1;
9867 } else {
9868 return 0;
9869 }
9870}
9871
9872/* Remove the 'key' from the list of blocked keys for a given client.
9873 *
9874 * The function returns 1 when there are no longer blocking keys after
9875 * the current one was removed (and the client can be unblocked). */
9876static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9877 list *l;
9878 listNode *ln;
9879 listIter li;
9880 struct dictEntry *de;
9881
9882 /* Remove the key from the list of keys this client is waiting for. */
9883 listRewind(c->io_keys,&li);
9884 while ((ln = listNext(&li)) != NULL) {
bf028098 9885 if (equalStringObjects(ln->value,key)) {
d5d55fc3 9886 listDelNode(c->io_keys,ln);
9887 break;
9888 }
9889 }
9890 assert(ln != NULL);
9891
9892 /* Remove the client form the key => waiting clients map. */
9893 de = dictFind(c->db->io_keys,key);
9894 assert(de != NULL);
9895 l = dictGetEntryVal(de);
9896 ln = listSearchKey(l,c);
9897 assert(ln != NULL);
9898 listDelNode(l,ln);
9899 if (listLength(l) == 0)
9900 dictDelete(c->db->io_keys,key);
9901
9902 return listLength(c->io_keys) == 0;
9903}
9904
9905static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9906 struct dictEntry *de;
9907 list *l;
9908 listNode *ln;
9909 int len;
9910
9911 de = dictFind(db->io_keys,key);
9912 if (!de) return;
9913
9914 l = dictGetEntryVal(de);
9915 len = listLength(l);
9916 /* Note: we can't use something like while(listLength(l)) as the list
9917 * can be freed by the calling function when we remove the last element. */
9918 while (len--) {
9919 ln = listFirst(l);
9920 redisClient *c = ln->value;
9921
9922 if (dontWaitForSwappedKey(c,key)) {
9923 /* Put the client in the list of clients ready to go as we
9924 * loaded all the keys about it. */
9925 listAddNodeTail(server.io_ready_clients,c);
9926 }
9927 }
b0d8747d 9928}
b0d8747d 9929
500ece7c 9930/* =========================== Remote Configuration ========================= */
9931
9932static void configSetCommand(redisClient *c) {
9933 robj *o = getDecodedObject(c->argv[3]);
2e5eb04e 9934 long long ll;
9935
500ece7c 9936 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9937 zfree(server.dbfilename);
9938 server.dbfilename = zstrdup(o->ptr);
9939 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9940 zfree(server.requirepass);
9941 server.requirepass = zstrdup(o->ptr);
9942 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9943 zfree(server.masterauth);
9944 server.masterauth = zstrdup(o->ptr);
9945 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
2e5eb04e 9946 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9947 ll < 0) goto badfmt;
9948 server.maxmemory = ll;
9949 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
9950 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
9951 ll < 0 || ll > LONG_MAX) goto badfmt;
9952 server.maxidletime = ll;
1b677732 9953 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
9954 if (!strcasecmp(o->ptr,"no")) {
9955 server.appendfsync = APPENDFSYNC_NO;
9956 } else if (!strcasecmp(o->ptr,"everysec")) {
9957 server.appendfsync = APPENDFSYNC_EVERYSEC;
9958 } else if (!strcasecmp(o->ptr,"always")) {
9959 server.appendfsync = APPENDFSYNC_ALWAYS;
9960 } else {
9961 goto badfmt;
9962 }
2e5eb04e 9963 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
9964 int old = server.appendonly;
9965 int new = yesnotoi(o->ptr);
9966
9967 if (new == -1) goto badfmt;
9968 if (old != new) {
9969 if (new == 0) {
9970 stopAppendOnly();
9971 } else {
9972 if (startAppendOnly() == REDIS_ERR) {
9973 addReplySds(c,sdscatprintf(sdsempty(),
9974 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
9975 decrRefCount(o);
9976 return;
9977 }
9978 }
9979 }
a34e0a25 9980 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
9981 int vlen, j;
9982 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
9983
9984 /* Perform sanity check before setting the new config:
9985 * - Even number of args
9986 * - Seconds >= 1, changes >= 0 */
9987 if (vlen & 1) {
9988 sdsfreesplitres(v,vlen);
9989 goto badfmt;
9990 }
9991 for (j = 0; j < vlen; j++) {
9992 char *eptr;
9993 long val;
9994
9995 val = strtoll(v[j], &eptr, 10);
9996 if (eptr[0] != '\0' ||
9997 ((j & 1) == 0 && val < 1) ||
9998 ((j & 1) == 1 && val < 0)) {
9999 sdsfreesplitres(v,vlen);
10000 goto badfmt;
10001 }
10002 }
10003 /* Finally set the new config */
10004 resetServerSaveParams();
10005 for (j = 0; j < vlen; j += 2) {
10006 time_t seconds;
10007 int changes;
10008
10009 seconds = strtoll(v[j],NULL,10);
10010 changes = strtoll(v[j+1],NULL,10);
10011 appendServerSaveParams(seconds, changes);
10012 }
10013 sdsfreesplitres(v,vlen);
500ece7c 10014 } else {
10015 addReplySds(c,sdscatprintf(sdsempty(),
10016 "-ERR not supported CONFIG parameter %s\r\n",
10017 (char*)c->argv[2]->ptr));
10018 decrRefCount(o);
10019 return;
10020 }
10021 decrRefCount(o);
10022 addReply(c,shared.ok);
a34e0a25 10023 return;
10024
10025badfmt: /* Bad format errors */
10026 addReplySds(c,sdscatprintf(sdsempty(),
10027 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10028 (char*)o->ptr,
10029 (char*)c->argv[2]->ptr));
10030 decrRefCount(o);
500ece7c 10031}
10032
10033static void configGetCommand(redisClient *c) {
10034 robj *o = getDecodedObject(c->argv[2]);
10035 robj *lenobj = createObject(REDIS_STRING,NULL);
10036 char *pattern = o->ptr;
10037 int matches = 0;
10038
10039 addReply(c,lenobj);
10040 decrRefCount(lenobj);
10041
10042 if (stringmatch(pattern,"dbfilename",0)) {
10043 addReplyBulkCString(c,"dbfilename");
10044 addReplyBulkCString(c,server.dbfilename);
10045 matches++;
10046 }
10047 if (stringmatch(pattern,"requirepass",0)) {
10048 addReplyBulkCString(c,"requirepass");
10049 addReplyBulkCString(c,server.requirepass);
10050 matches++;
10051 }
10052 if (stringmatch(pattern,"masterauth",0)) {
10053 addReplyBulkCString(c,"masterauth");
10054 addReplyBulkCString(c,server.masterauth);
10055 matches++;
10056 }
10057 if (stringmatch(pattern,"maxmemory",0)) {
10058 char buf[128];
10059
2e5eb04e 10060 ll2string(buf,128,server.maxmemory);
500ece7c 10061 addReplyBulkCString(c,"maxmemory");
10062 addReplyBulkCString(c,buf);
10063 matches++;
10064 }
2e5eb04e 10065 if (stringmatch(pattern,"timeout",0)) {
10066 char buf[128];
10067
10068 ll2string(buf,128,server.maxidletime);
10069 addReplyBulkCString(c,"timeout");
10070 addReplyBulkCString(c,buf);
10071 matches++;
10072 }
10073 if (stringmatch(pattern,"appendonly",0)) {
10074 addReplyBulkCString(c,"appendonly");
10075 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10076 matches++;
10077 }
1b677732 10078 if (stringmatch(pattern,"appendfsync",0)) {
10079 char *policy;
10080
10081 switch(server.appendfsync) {
10082 case APPENDFSYNC_NO: policy = "no"; break;
10083 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10084 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10085 default: policy = "unknown"; break; /* too harmless to panic */
10086 }
10087 addReplyBulkCString(c,"appendfsync");
10088 addReplyBulkCString(c,policy);
10089 matches++;
10090 }
a34e0a25 10091 if (stringmatch(pattern,"save",0)) {
10092 sds buf = sdsempty();
10093 int j;
10094
10095 for (j = 0; j < server.saveparamslen; j++) {
10096 buf = sdscatprintf(buf,"%ld %d",
10097 server.saveparams[j].seconds,
10098 server.saveparams[j].changes);
10099 if (j != server.saveparamslen-1)
10100 buf = sdscatlen(buf," ",1);
10101 }
10102 addReplyBulkCString(c,"save");
10103 addReplyBulkCString(c,buf);
10104 sdsfree(buf);
10105 matches++;
10106 }
500ece7c 10107 decrRefCount(o);
10108 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10109}
10110
10111static void configCommand(redisClient *c) {
10112 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10113 if (c->argc != 4) goto badarity;
10114 configSetCommand(c);
10115 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10116 if (c->argc != 3) goto badarity;
10117 configGetCommand(c);
10118 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10119 if (c->argc != 2) goto badarity;
10120 server.stat_numcommands = 0;
10121 server.stat_numconnections = 0;
10122 server.stat_expiredkeys = 0;
10123 server.stat_starttime = time(NULL);
10124 addReply(c,shared.ok);
10125 } else {
10126 addReplySds(c,sdscatprintf(sdsempty(),
10127 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10128 }
10129 return;
10130
10131badarity:
10132 addReplySds(c,sdscatprintf(sdsempty(),
10133 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10134 (char*) c->argv[1]->ptr));
10135}
10136
befec3cd 10137/* =========================== Pubsub implementation ======================== */
10138
ffc6b7f8 10139static void freePubsubPattern(void *p) {
10140 pubsubPattern *pat = p;
10141
10142 decrRefCount(pat->pattern);
10143 zfree(pat);
10144}
10145
10146static int listMatchPubsubPattern(void *a, void *b) {
10147 pubsubPattern *pa = a, *pb = b;
10148
10149 return (pa->client == pb->client) &&
bf028098 10150 (equalStringObjects(pa->pattern,pb->pattern));
ffc6b7f8 10151}
10152
10153/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10154 * 0 if the client was already subscribed to that channel. */
10155static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 10156 struct dictEntry *de;
10157 list *clients = NULL;
10158 int retval = 0;
10159
ffc6b7f8 10160 /* Add the channel to the client -> channels hash table */
10161 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 10162 retval = 1;
ffc6b7f8 10163 incrRefCount(channel);
10164 /* Add the client to the channel -> list of clients hash table */
10165 de = dictFind(server.pubsub_channels,channel);
befec3cd 10166 if (de == NULL) {
10167 clients = listCreate();
ffc6b7f8 10168 dictAdd(server.pubsub_channels,channel,clients);
10169 incrRefCount(channel);
befec3cd 10170 } else {
10171 clients = dictGetEntryVal(de);
10172 }
10173 listAddNodeTail(clients,c);
10174 }
10175 /* Notify the client */
10176 addReply(c,shared.mbulk3);
10177 addReply(c,shared.subscribebulk);
ffc6b7f8 10178 addReplyBulk(c,channel);
482b672d 10179 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 10180 return retval;
10181}
10182
ffc6b7f8 10183/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10184 * 0 if the client was not subscribed to the specified channel. */
10185static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 10186 struct dictEntry *de;
10187 list *clients;
10188 listNode *ln;
10189 int retval = 0;
10190
ffc6b7f8 10191 /* Remove the channel from the client -> channels hash table */
10192 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 10193 we have in the hash tables. Protect it... */
ffc6b7f8 10194 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 10195 retval = 1;
ffc6b7f8 10196 /* Remove the client from the channel -> clients list hash table */
10197 de = dictFind(server.pubsub_channels,channel);
befec3cd 10198 assert(de != NULL);
10199 clients = dictGetEntryVal(de);
10200 ln = listSearchKey(clients,c);
10201 assert(ln != NULL);
10202 listDelNode(clients,ln);
ff767a75 10203 if (listLength(clients) == 0) {
10204 /* Free the list and associated hash entry at all if this was
10205 * the latest client, so that it will be possible to abuse
ffc6b7f8 10206 * Redis PUBSUB creating millions of channels. */
10207 dictDelete(server.pubsub_channels,channel);
ff767a75 10208 }
befec3cd 10209 }
10210 /* Notify the client */
10211 if (notify) {
10212 addReply(c,shared.mbulk3);
10213 addReply(c,shared.unsubscribebulk);
ffc6b7f8 10214 addReplyBulk(c,channel);
482b672d 10215 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10216 listLength(c->pubsub_patterns));
10217
10218 }
10219 decrRefCount(channel); /* it is finally safe to release it */
10220 return retval;
10221}
10222
10223/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10224static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10225 int retval = 0;
10226
10227 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10228 retval = 1;
10229 pubsubPattern *pat;
10230 listAddNodeTail(c->pubsub_patterns,pattern);
10231 incrRefCount(pattern);
10232 pat = zmalloc(sizeof(*pat));
10233 pat->pattern = getDecodedObject(pattern);
10234 pat->client = c;
10235 listAddNodeTail(server.pubsub_patterns,pat);
10236 }
10237 /* Notify the client */
10238 addReply(c,shared.mbulk3);
10239 addReply(c,shared.psubscribebulk);
10240 addReplyBulk(c,pattern);
482b672d 10241 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
ffc6b7f8 10242 return retval;
10243}
10244
10245/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10246 * 0 if the client was not subscribed to the specified channel. */
10247static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10248 listNode *ln;
10249 pubsubPattern pat;
10250 int retval = 0;
10251
10252 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10253 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10254 retval = 1;
10255 listDelNode(c->pubsub_patterns,ln);
10256 pat.client = c;
10257 pat.pattern = pattern;
10258 ln = listSearchKey(server.pubsub_patterns,&pat);
10259 listDelNode(server.pubsub_patterns,ln);
10260 }
10261 /* Notify the client */
10262 if (notify) {
10263 addReply(c,shared.mbulk3);
10264 addReply(c,shared.punsubscribebulk);
10265 addReplyBulk(c,pattern);
482b672d 10266 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10267 listLength(c->pubsub_patterns));
befec3cd 10268 }
ffc6b7f8 10269 decrRefCount(pattern);
befec3cd 10270 return retval;
10271}
10272
ffc6b7f8 10273/* Unsubscribe from all the channels. Return the number of channels the
10274 * client was subscribed from. */
10275static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10276 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 10277 dictEntry *de;
10278 int count = 0;
10279
10280 while((de = dictNext(di)) != NULL) {
ffc6b7f8 10281 robj *channel = dictGetEntryKey(de);
befec3cd 10282
ffc6b7f8 10283 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 10284 }
10285 dictReleaseIterator(di);
10286 return count;
10287}
10288
ffc6b7f8 10289/* Unsubscribe from all the patterns. Return the number of patterns the
10290 * client was subscribed from. */
10291static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10292 listNode *ln;
10293 listIter li;
10294 int count = 0;
10295
10296 listRewind(c->pubsub_patterns,&li);
10297 while ((ln = listNext(&li)) != NULL) {
10298 robj *pattern = ln->value;
10299
10300 count += pubsubUnsubscribePattern(c,pattern,notify);
10301 }
10302 return count;
10303}
10304
befec3cd 10305/* Publish a message */
ffc6b7f8 10306static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 10307 int receivers = 0;
10308 struct dictEntry *de;
ffc6b7f8 10309 listNode *ln;
10310 listIter li;
befec3cd 10311
ffc6b7f8 10312 /* Send to clients listening for that channel */
10313 de = dictFind(server.pubsub_channels,channel);
befec3cd 10314 if (de) {
10315 list *list = dictGetEntryVal(de);
10316 listNode *ln;
10317 listIter li;
10318
10319 listRewind(list,&li);
10320 while ((ln = listNext(&li)) != NULL) {
10321 redisClient *c = ln->value;
10322
10323 addReply(c,shared.mbulk3);
10324 addReply(c,shared.messagebulk);
ffc6b7f8 10325 addReplyBulk(c,channel);
befec3cd 10326 addReplyBulk(c,message);
10327 receivers++;
10328 }
10329 }
ffc6b7f8 10330 /* Send to clients listening to matching channels */
10331 if (listLength(server.pubsub_patterns)) {
10332 listRewind(server.pubsub_patterns,&li);
10333 channel = getDecodedObject(channel);
10334 while ((ln = listNext(&li)) != NULL) {
10335 pubsubPattern *pat = ln->value;
10336
10337 if (stringmatchlen((char*)pat->pattern->ptr,
10338 sdslen(pat->pattern->ptr),
10339 (char*)channel->ptr,
10340 sdslen(channel->ptr),0)) {
c8d0ea0e 10341 addReply(pat->client,shared.mbulk4);
10342 addReply(pat->client,shared.pmessagebulk);
10343 addReplyBulk(pat->client,pat->pattern);
ffc6b7f8 10344 addReplyBulk(pat->client,channel);
10345 addReplyBulk(pat->client,message);
10346 receivers++;
10347 }
10348 }
10349 decrRefCount(channel);
10350 }
befec3cd 10351 return receivers;
10352}
10353
10354static void subscribeCommand(redisClient *c) {
10355 int j;
10356
10357 for (j = 1; j < c->argc; j++)
ffc6b7f8 10358 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 10359}
10360
10361static void unsubscribeCommand(redisClient *c) {
10362 if (c->argc == 1) {
ffc6b7f8 10363 pubsubUnsubscribeAllChannels(c,1);
10364 return;
10365 } else {
10366 int j;
10367
10368 for (j = 1; j < c->argc; j++)
10369 pubsubUnsubscribeChannel(c,c->argv[j],1);
10370 }
10371}
10372
10373static void psubscribeCommand(redisClient *c) {
10374 int j;
10375
10376 for (j = 1; j < c->argc; j++)
10377 pubsubSubscribePattern(c,c->argv[j]);
10378}
10379
10380static void punsubscribeCommand(redisClient *c) {
10381 if (c->argc == 1) {
10382 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 10383 return;
10384 } else {
10385 int j;
10386
10387 for (j = 1; j < c->argc; j++)
ffc6b7f8 10388 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 10389 }
10390}
10391
10392static void publishCommand(redisClient *c) {
10393 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
482b672d 10394 addReplyLongLong(c,receivers);
befec3cd 10395}
10396
37ab76c9 10397/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10398 *
10399 * The implementation uses a per-DB hash table mapping keys to list of clients
10400 * WATCHing those keys, so that given a key that is going to be modified
10401 * we can mark all the associated clients as dirty.
10402 *
10403 * Also every client contains a list of WATCHed keys so that's possible to
10404 * un-watch such keys when the client is freed or when UNWATCH is called. */
10405
10406/* In the client->watched_keys list we need to use watchedKey structures
10407 * as in order to identify a key in Redis we need both the key name and the
10408 * DB */
10409typedef struct watchedKey {
10410 robj *key;
10411 redisDb *db;
10412} watchedKey;
10413
10414/* Watch for the specified key */
10415static void watchForKey(redisClient *c, robj *key) {
10416 list *clients = NULL;
10417 listIter li;
10418 listNode *ln;
10419 watchedKey *wk;
10420
10421 /* Check if we are already watching for this key */
10422 listRewind(c->watched_keys,&li);
10423 while((ln = listNext(&li))) {
10424 wk = listNodeValue(ln);
10425 if (wk->db == c->db && equalStringObjects(key,wk->key))
10426 return; /* Key already watched */
10427 }
10428 /* This key is not already watched in this DB. Let's add it */
10429 clients = dictFetchValue(c->db->watched_keys,key);
10430 if (!clients) {
10431 clients = listCreate();
10432 dictAdd(c->db->watched_keys,key,clients);
10433 incrRefCount(key);
10434 }
10435 listAddNodeTail(clients,c);
10436 /* Add the new key to the lits of keys watched by this client */
10437 wk = zmalloc(sizeof(*wk));
10438 wk->key = key;
10439 wk->db = c->db;
10440 incrRefCount(key);
10441 listAddNodeTail(c->watched_keys,wk);
10442}
10443
10444/* Unwatch all the keys watched by this client. To clean the EXEC dirty
10445 * flag is up to the caller. */
10446static void unwatchAllKeys(redisClient *c) {
10447 listIter li;
10448 listNode *ln;
10449
10450 if (listLength(c->watched_keys) == 0) return;
10451 listRewind(c->watched_keys,&li);
10452 while((ln = listNext(&li))) {
10453 list *clients;
10454 watchedKey *wk;
10455
10456 /* Lookup the watched key -> clients list and remove the client
10457 * from the list */
10458 wk = listNodeValue(ln);
10459 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10460 assert(clients != NULL);
10461 listDelNode(clients,listSearchKey(clients,c));
10462 /* Kill the entry at all if this was the only client */
10463 if (listLength(clients) == 0)
10464 dictDelete(wk->db->watched_keys, wk->key);
10465 /* Remove this watched key from the client->watched list */
10466 listDelNode(c->watched_keys,ln);
10467 decrRefCount(wk->key);
10468 zfree(wk);
10469 }
10470}
10471
10472/* "Touch" a key, so that if this key is being WATCHed by soem client the
10473 * next EXEC will fail. */
10474static void touchWatchedKey(redisDb *db, robj *key) {
10475 list *clients;
10476 listIter li;
10477 listNode *ln;
10478
10479 if (dictSize(db->watched_keys) == 0) return;
10480 clients = dictFetchValue(db->watched_keys, key);
10481 if (!clients) return;
10482
10483 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10484 /* Check if we are already watching for this key */
10485 listRewind(clients,&li);
10486 while((ln = listNext(&li))) {
10487 redisClient *c = listNodeValue(ln);
10488
10489 c->flags |= REDIS_DIRTY_CAS;
10490 }
10491}
10492
9b30e1a2 10493/* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10494 * flush but will be deleted as effect of the flushing operation should
10495 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10496 * a FLUSHALL operation (all the DBs flushed). */
10497static void touchWatchedKeysOnFlush(int dbid) {
10498 listIter li1, li2;
10499 listNode *ln;
10500
10501 /* For every client, check all the waited keys */
10502 listRewind(server.clients,&li1);
10503 while((ln = listNext(&li1))) {
10504 redisClient *c = listNodeValue(ln);
10505 listRewind(c->watched_keys,&li2);
10506 while((ln = listNext(&li2))) {
10507 watchedKey *wk = listNodeValue(ln);
10508
10509 /* For every watched key matching the specified DB, if the
10510 * key exists, mark the client as dirty, as the key will be
10511 * removed. */
10512 if (dbid == -1 || wk->db->id == dbid) {
10513 if (dictFind(wk->db->dict, wk->key) != NULL)
10514 c->flags |= REDIS_DIRTY_CAS;
10515 }
10516 }
10517 }
10518}
10519
37ab76c9 10520static void watchCommand(redisClient *c) {
10521 int j;
10522
6531c94d 10523 if (c->flags & REDIS_MULTI) {
10524 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
10525 return;
10526 }
37ab76c9 10527 for (j = 1; j < c->argc; j++)
10528 watchForKey(c,c->argv[j]);
10529 addReply(c,shared.ok);
10530}
10531
10532static void unwatchCommand(redisClient *c) {
10533 unwatchAllKeys(c);
10534 c->flags &= (~REDIS_DIRTY_CAS);
10535 addReply(c,shared.ok);
10536}
10537
7f957c92 10538/* ================================= Debugging ============================== */
10539
ba798261 10540/* Compute the sha1 of string at 's' with 'len' bytes long.
10541 * The SHA1 is then xored againt the string pointed by digest.
10542 * Since xor is commutative, this operation is used in order to
10543 * "add" digests relative to unordered elements.
10544 *
10545 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10546static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10547 SHA1_CTX ctx;
10548 unsigned char hash[20], *s = ptr;
10549 int j;
10550
10551 SHA1Init(&ctx);
10552 SHA1Update(&ctx,s,len);
10553 SHA1Final(hash,&ctx);
10554
10555 for (j = 0; j < 20; j++)
10556 digest[j] ^= hash[j];
10557}
10558
10559static void xorObjectDigest(unsigned char *digest, robj *o) {
10560 o = getDecodedObject(o);
10561 xorDigest(digest,o->ptr,sdslen(o->ptr));
10562 decrRefCount(o);
10563}
10564
10565/* This function instead of just computing the SHA1 and xoring it
10566 * against diget, also perform the digest of "digest" itself and
10567 * replace the old value with the new one.
10568 *
10569 * So the final digest will be:
10570 *
10571 * digest = SHA1(digest xor SHA1(data))
10572 *
10573 * This function is used every time we want to preserve the order so
10574 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10575 *
10576 * Also note that mixdigest("foo") followed by mixdigest("bar")
10577 * will lead to a different digest compared to "fo", "obar".
10578 */
10579static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10580 SHA1_CTX ctx;
10581 char *s = ptr;
10582
10583 xorDigest(digest,s,len);
10584 SHA1Init(&ctx);
10585 SHA1Update(&ctx,digest,20);
10586 SHA1Final(digest,&ctx);
10587}
10588
10589static void mixObjectDigest(unsigned char *digest, robj *o) {
10590 o = getDecodedObject(o);
10591 mixDigest(digest,o->ptr,sdslen(o->ptr));
10592 decrRefCount(o);
10593}
10594
10595/* Compute the dataset digest. Since keys, sets elements, hashes elements
10596 * are not ordered, we use a trick: every aggregate digest is the xor
10597 * of the digests of their elements. This way the order will not change
10598 * the result. For list instead we use a feedback entering the output digest
10599 * as input in order to ensure that a different ordered list will result in
10600 * a different digest. */
10601static void computeDatasetDigest(unsigned char *final) {
10602 unsigned char digest[20];
10603 char buf[128];
10604 dictIterator *di = NULL;
10605 dictEntry *de;
10606 int j;
10607 uint32_t aux;
10608
10609 memset(final,0,20); /* Start with a clean result */
10610
10611 for (j = 0; j < server.dbnum; j++) {
10612 redisDb *db = server.db+j;
10613
10614 if (dictSize(db->dict) == 0) continue;
10615 di = dictGetIterator(db->dict);
10616
10617 /* hash the DB id, so the same dataset moved in a different
10618 * DB will lead to a different digest */
10619 aux = htonl(j);
10620 mixDigest(final,&aux,sizeof(aux));
10621
10622 /* Iterate this DB writing every entry */
10623 while((de = dictNext(di)) != NULL) {
cbae1d34 10624 robj *key, *o, *kcopy;
ba798261 10625 time_t expiretime;
10626
10627 memset(digest,0,20); /* This key-val digest */
10628 key = dictGetEntryKey(de);
cbae1d34 10629
10630 if (!server.vm_enabled) {
10631 mixObjectDigest(digest,key);
ba798261 10632 o = dictGetEntryVal(de);
ba798261 10633 } else {
cbae1d34 10634 /* Don't work with the key directly as when VM is active
10635 * this is unsafe: TODO: fix decrRefCount to check if the
10636 * count really reached 0 to avoid this mess */
10637 kcopy = dupStringObject(key);
10638 mixObjectDigest(digest,kcopy);
10639 o = lookupKeyRead(db,kcopy);
10640 decrRefCount(kcopy);
ba798261 10641 }
10642 aux = htonl(o->type);
10643 mixDigest(digest,&aux,sizeof(aux));
10644 expiretime = getExpire(db,key);
10645
10646 /* Save the key and associated value */
10647 if (o->type == REDIS_STRING) {
10648 mixObjectDigest(digest,o);
10649 } else if (o->type == REDIS_LIST) {
10650 list *list = o->ptr;
10651 listNode *ln;
10652 listIter li;
10653
10654 listRewind(list,&li);
10655 while((ln = listNext(&li))) {
10656 robj *eleobj = listNodeValue(ln);
10657
10658 mixObjectDigest(digest,eleobj);
10659 }
10660 } else if (o->type == REDIS_SET) {
10661 dict *set = o->ptr;
10662 dictIterator *di = dictGetIterator(set);
10663 dictEntry *de;
10664
10665 while((de = dictNext(di)) != NULL) {
10666 robj *eleobj = dictGetEntryKey(de);
10667
10668 xorObjectDigest(digest,eleobj);
10669 }
10670 dictReleaseIterator(di);
10671 } else if (o->type == REDIS_ZSET) {
10672 zset *zs = o->ptr;
10673 dictIterator *di = dictGetIterator(zs->dict);
10674 dictEntry *de;
10675
10676 while((de = dictNext(di)) != NULL) {
10677 robj *eleobj = dictGetEntryKey(de);
10678 double *score = dictGetEntryVal(de);
10679 unsigned char eledigest[20];
10680
10681 snprintf(buf,sizeof(buf),"%.17g",*score);
10682 memset(eledigest,0,20);
10683 mixObjectDigest(eledigest,eleobj);
10684 mixDigest(eledigest,buf,strlen(buf));
10685 xorDigest(digest,eledigest,20);
10686 }
10687 dictReleaseIterator(di);
10688 } else if (o->type == REDIS_HASH) {
10689 hashIterator *hi;
10690 robj *obj;
10691
10692 hi = hashInitIterator(o);
10693 while (hashNext(hi) != REDIS_ERR) {
10694 unsigned char eledigest[20];
10695
10696 memset(eledigest,0,20);
10697 obj = hashCurrent(hi,REDIS_HASH_KEY);
10698 mixObjectDigest(eledigest,obj);
10699 decrRefCount(obj);
10700 obj = hashCurrent(hi,REDIS_HASH_VALUE);
10701 mixObjectDigest(eledigest,obj);
10702 decrRefCount(obj);
10703 xorDigest(digest,eledigest,20);
10704 }
10705 hashReleaseIterator(hi);
10706 } else {
10707 redisPanic("Unknown object type");
10708 }
ba798261 10709 /* If the key has an expire, add it to the mix */
10710 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
10711 /* We can finally xor the key-val digest to the final digest */
10712 xorDigest(final,digest,20);
10713 }
10714 dictReleaseIterator(di);
10715 }
10716}
10717
7f957c92 10718static void debugCommand(redisClient *c) {
10719 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10720 *((char*)-1) = 'x';
210e29f7 10721 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10722 if (rdbSave(server.dbfilename) != REDIS_OK) {
10723 addReply(c,shared.err);
10724 return;
10725 }
10726 emptyDb();
10727 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10728 addReply(c,shared.err);
10729 return;
10730 }
10731 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10732 addReply(c,shared.ok);
71c2b467 10733 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10734 emptyDb();
10735 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10736 addReply(c,shared.err);
10737 return;
10738 }
10739 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10740 addReply(c,shared.ok);
333298da 10741 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10742 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10743 robj *key, *val;
10744
10745 if (!de) {
10746 addReply(c,shared.nokeyerr);
10747 return;
10748 }
10749 key = dictGetEntryKey(de);
10750 val = dictGetEntryVal(de);
59146ef3 10751 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10752 key->storage == REDIS_VM_SWAPPING)) {
07efaf74 10753 char *strenc;
10754 char buf[128];
10755
10756 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10757 strenc = strencoding[val->encoding];
10758 } else {
10759 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10760 strenc = buf;
10761 }
ace06542 10762 addReplySds(c,sdscatprintf(sdsempty(),
10763 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 10764 "encoding:%s serializedlength:%lld\r\n",
682ac724 10765 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 10766 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 10767 } else {
10768 addReplySds(c,sdscatprintf(sdsempty(),
10769 "+Key at:%p refcount:%d, value swapped at: page %llu "
10770 "using %llu pages\r\n",
10771 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10772 (unsigned long long) key->vm.usedpages));
10773 }
78ebe4c8 10774 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10775 lookupKeyRead(c->db,c->argv[2]);
10776 addReply(c,shared.ok);
7d30035d 10777 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10778 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10779 robj *key, *val;
10780
10781 if (!server.vm_enabled) {
10782 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10783 return;
10784 }
10785 if (!de) {
10786 addReply(c,shared.nokeyerr);
10787 return;
10788 }
10789 key = dictGetEntryKey(de);
10790 val = dictGetEntryVal(de);
4ef8de8a 10791 /* If the key is shared we want to create a copy */
10792 if (key->refcount > 1) {
10793 robj *newkey = dupStringObject(key);
10794 decrRefCount(key);
10795 key = dictGetEntryKey(de) = newkey;
10796 }
10797 /* Swap it */
7d30035d 10798 if (key->storage != REDIS_VM_MEMORY) {
10799 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 10800 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 10801 dictGetEntryVal(de) = NULL;
10802 addReply(c,shared.ok);
10803 } else {
10804 addReply(c,shared.err);
10805 }
59305dc7 10806 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10807 long keys, j;
10808 robj *key, *val;
10809 char buf[128];
10810
10811 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10812 return;
10813 for (j = 0; j < keys; j++) {
10814 snprintf(buf,sizeof(buf),"key:%lu",j);
10815 key = createStringObject(buf,strlen(buf));
10816 if (lookupKeyRead(c->db,key) != NULL) {
10817 decrRefCount(key);
10818 continue;
10819 }
10820 snprintf(buf,sizeof(buf),"value:%lu",j);
10821 val = createStringObject(buf,strlen(buf));
10822 dictAdd(c->db->dict,key,val);
10823 }
10824 addReply(c,shared.ok);
ba798261 10825 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
10826 unsigned char digest[20];
10827 sds d = sdsnew("+");
10828 int j;
10829
10830 computeDatasetDigest(digest);
10831 for (j = 0; j < 20; j++)
10832 d = sdscatprintf(d, "%02x",digest[j]);
10833
10834 d = sdscatlen(d,"\r\n",2);
10835 addReplySds(c,d);
7f957c92 10836 } else {
333298da 10837 addReplySds(c,sdsnew(
bdcb92f2 10838 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 10839 }
10840}
56906eef 10841
6c96ba7d 10842static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 10843 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
fdfb02e7 10844 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
dfc5e96c 10845#ifdef HAVE_BACKTRACE
10846 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10847 *((char*)-1) = 'x';
10848#endif
10849}
10850
c651fd9e 10851static void _redisPanic(char *msg, char *file, int line) {
10852 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
17772754 10853 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
c651fd9e 10854#ifdef HAVE_BACKTRACE
10855 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10856 *((char*)-1) = 'x';
10857#endif
10858}
10859
bcfc686d 10860/* =================================== Main! ================================ */
56906eef 10861
bcfc686d 10862#ifdef __linux__
/* Read the current value of /proc/sys/vm/overcommit_memory.
 *
 * Returns the integer found in the file, or -1 if the file can't be
 * opened, can't be read, or does not start with a number. Unparsable
 * content is reported as -1 and never as 0, so that callers comparing
 * the result against 0 are not fooled by garbage. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *end;
    long value;

    if (!fp) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    value = strtol(buf,&end,10);
    if (end == buf) return -1; /* No digits at all */
    return (int)value;
}
10876
10877void linuxOvercommitMemoryWarning(void) {
10878 if (linuxOvercommitMemoryValue() == 0) {
7ccd2d0a 10879 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
bcfc686d 10880 }
10881}
10882#endif /* __linux__ */
10883
10884static void daemonize(void) {
10885 int fd;
10886 FILE *fp;
10887
10888 if (fork() != 0) exit(0); /* parent exits */
10889 setsid(); /* create a new session */
10890
10891 /* Every output goes to /dev/null. If Redis is daemonized but
10892 * the 'logfile' is set to 'stdout' in the configuration file
10893 * it will not log at all. */
10894 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10895 dup2(fd, STDIN_FILENO);
10896 dup2(fd, STDOUT_FILENO);
10897 dup2(fd, STDERR_FILENO);
10898 if (fd > STDERR_FILENO) close(fd);
10899 }
10900 /* Try to write the pid file */
10901 fp = fopen(server.pidfile,"w");
10902 if (fp) {
10903 fprintf(fp,"%d\n",getpid());
10904 fclose(fp);
56906eef 10905 }
56906eef 10906}
10907
42ab0172
AO
10908static void version() {
10909 printf("Redis server version %s\n", REDIS_VERSION);
10910 exit(0);
10911}
10912
723fb69b
AO
/* Print the command line synopsis on standard error and exit with
 * status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n", stderr);
    fputs(" ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
10918
bcfc686d 10919int main(int argc, char **argv) {
9651a787 10920 time_t start;
10921
bcfc686d 10922 initServerConfig();
10923 if (argc == 2) {
44efe66e 10924 if (strcmp(argv[1], "-v") == 0 ||
10925 strcmp(argv[1], "--version") == 0) version();
10926 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 10927 resetServerSaveParams();
10928 loadServerConfig(argv[1]);
723fb69b
AO
10929 } else if ((argc > 2)) {
10930 usage();
bcfc686d 10931 } else {
10932 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10933 }
bcfc686d 10934 if (server.daemonize) daemonize();
71c54b21 10935 initServer();
bcfc686d 10936 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10937#ifdef __linux__
10938 linuxOvercommitMemoryWarning();
10939#endif
9651a787 10940 start = time(NULL);
bcfc686d 10941 if (server.appendonly) {
10942 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 10943 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 10944 } else {
10945 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 10946 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 10947 }
bcfc686d 10948 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 10949 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 10950 aeMain(server.el);
10951 aeDeleteEventLoop(server.el);
10952 return 0;
10953}
10954
10955/* ============================= Backtrace support ========================= */
10956
10957#ifdef HAVE_BACKTRACE
10958static char *findFuncName(void *pointer, unsigned long *offset);
10959
56906eef 10960static void *getMcontextEip(ucontext_t *uc) {
10961#if defined(__FreeBSD__)
10962 return (void*) uc->uc_mcontext.mc_eip;
10963#elif defined(__dietlibc__)
10964 return (void*) uc->uc_mcontext.eip;
06db1f50 10965#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 10966 #if __x86_64__
10967 return (void*) uc->uc_mcontext->__ss.__rip;
10968 #else
56906eef 10969 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 10970 #endif
06db1f50 10971#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 10972 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 10973 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 10974 #else
10975 return (void*) uc->uc_mcontext->__ss.__eip;
e0a62c7f 10976 #endif
54bac49d 10977#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 10978 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 10979#elif defined(__ia64__) /* Linux IA64 */
10980 return (void*) uc->uc_mcontext.sc_ip;
10981#else
10982 return NULL;
56906eef 10983#endif
10984}
10985
10986static void segvHandler(int sig, siginfo_t *info, void *secret) {
10987 void *trace[100];
10988 char **messages = NULL;
10989 int i, trace_size = 0;
10990 unsigned long offset=0;
56906eef 10991 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 10992 sds infostring;
56906eef 10993 REDIS_NOTUSED(info);
10994
10995 redisLog(REDIS_WARNING,
10996 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 10997 infostring = genRedisInfoString();
10998 redisLog(REDIS_WARNING, "%s",infostring);
10999 /* It's not safe to sdsfree() the returned string under memory
11000 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 11001
56906eef 11002 trace_size = backtrace(trace, 100);
de96dbfe 11003 /* overwrite sigaction with caller's address */
b91cf5ef 11004 if (getMcontextEip(uc) != NULL) {
11005 trace[1] = getMcontextEip(uc);
11006 }
56906eef 11007 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 11008
d76412d1 11009 for (i=1; i<trace_size; ++i) {
56906eef 11010 char *fn = findFuncName(trace[i], &offset), *p;
11011
11012 p = strchr(messages[i],'+');
11013 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11014 redisLog(REDIS_WARNING,"%s", messages[i]);
11015 } else {
11016 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11017 }
11018 }
b177fd30 11019 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 11020 _exit(0);
fe3bbfbe 11021}
56906eef 11022
fab43727 11023static void sigtermHandler(int sig) {
11024 REDIS_NOTUSED(sig);
b58ba105 11025
fab43727 11026 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11027 server.shutdown_asap = 1;
b58ba105
AM
11028}
11029
56906eef 11030static void setupSigSegvAction(void) {
11031 struct sigaction act;
11032
11033 sigemptyset (&act.sa_mask);
11034 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11035 * is used. Otherwise, sa_handler is used */
11036 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11037 act.sa_sigaction = segvHandler;
11038 sigaction (SIGSEGV, &act, NULL);
11039 sigaction (SIGBUS, &act, NULL);
12fea928 11040 sigaction (SIGFPE, &act, NULL);
11041 sigaction (SIGILL, &act, NULL);
11042 sigaction (SIGBUS, &act, NULL);
b58ba105
AM
11043
11044 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
fab43727 11045 act.sa_handler = sigtermHandler;
b58ba105 11046 sigaction (SIGTERM, &act, NULL);
e65fdc78 11047 return;
56906eef 11048}
e65fdc78 11049
bcfc686d 11050#include "staticsymbols.h"
11051/* This function try to convert a pointer into a function name. It's used in
11052 * oreder to provide a backtrace under segmentation fault that's able to
11053 * display functions declared as static (otherwise the backtrace is useless). */
11054static char *findFuncName(void *pointer, unsigned long *offset){
11055 int i, ret = -1;
11056 unsigned long off, minoff = 0;
ed9b544e 11057
bcfc686d 11058 /* Try to match against the Symbol with the smallest offset */
11059 for (i=0; symsTable[i].pointer; i++) {
11060 unsigned long lp = (unsigned long) pointer;
0bc03378 11061
bcfc686d 11062 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11063 off=lp-symsTable[i].pointer;
11064 if (ret < 0 || off < minoff) {
11065 minoff=off;
11066 ret=i;
11067 }
11068 }
0bc03378 11069 }
bcfc686d 11070 if (ret == -1) return NULL;
11071 *offset = minoff;
11072 return symsTable[ret].name;
0bc03378 11073}
bcfc686d 11074#else /* HAVE_BACKTRACE */
static void setupSigSegvAction(void) {
    /* Without backtrace support there is no handler to install. */
}
bcfc686d 11077#endif /* HAVE_BACKTRACE */
0bc03378 11078
ed9b544e 11079
ed9b544e 11080
bcfc686d 11081/* The End */
11082
11083
ed9b544e 11084