]> git.saurik.com Git - redis.git/blame - redis.c
build fixed when simpler shells are used to create release.h
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
26ef09a8 30#define REDIS_VERSION "1.3.12"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
40#include <signal.h>
fbf9bcdb 41
42#ifdef HAVE_BACKTRACE
c9468bcf 43#include <execinfo.h>
44#include <ucontext.h>
fbf9bcdb 45#endif /* HAVE_BACKTRACE */
46
ed9b544e 47#include <sys/wait.h>
48#include <errno.h>
49#include <assert.h>
50#include <ctype.h>
51#include <stdarg.h>
52#include <inttypes.h>
53#include <arpa/inet.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#include <sys/time.h>
57#include <sys/resource.h>
2895e862 58#include <sys/uio.h>
f78fd11b 59#include <limits.h>
fb82e75c 60#include <float.h>
a7866db6 61#include <math.h>
92f8e882 62#include <pthread.h>
0bc1b2f6 63
64#if defined(__sun)
5043dff3 65#include "solarisfixes.h"
66#endif
ed9b544e 67
c9468bcf 68#include "redis.h"
ed9b544e 69#include "ae.h" /* Event driven programming library */
70#include "sds.h" /* Dynamic safe strings */
71#include "anet.h" /* Networking the easy way */
72#include "dict.h" /* Hash tables */
73#include "adlist.h" /* Linked lists */
74#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 75#include "lzf.h" /* LZF compression library */
76#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
ba798261 77#include "zipmap.h" /* Compact dictionary-alike data structure */
78#include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
5436146c 79#include "release.h" /* Release and/or git repository information */
ed9b544e 80
81/* Error codes */
82#define REDIS_OK 0
83#define REDIS_ERR -1
84
85/* Static server configuration */
86#define REDIS_SERVERPORT 6379 /* TCP port */
87#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 88#define REDIS_IOBUF_LEN 1024
ed9b544e 89#define REDIS_LOADBUF_LEN 1024
248ea310 90#define REDIS_STATIC_ARGS 8
ed9b544e 91#define REDIS_DEFAULT_DBNUM 16
92#define REDIS_CONFIGLINE_MAX 1024
93#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
94#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
8ca3e9d1 95#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
6f376729 96#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 97#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
98
99/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
100#define REDIS_WRITEV_THRESHOLD 3
101/* Max number of iovecs used for each writev call */
102#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 103
104/* Hash table parameters */
105#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 106
107/* Command flags */
3fd78bcd 108#define REDIS_CMD_BULK 1 /* Bulk write command */
109#define REDIS_CMD_INLINE 2 /* Inline command */
110/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
111 this flag will return an error when the 'maxmemory' option is set in the
112 config file and the server is using more than maxmemory bytes of memory.
113 In short these commands are denied on low memory conditions. */
114#define REDIS_CMD_DENYOOM 4
4005fef1 115#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 116
117/* Object types */
118#define REDIS_STRING 0
119#define REDIS_LIST 1
120#define REDIS_SET 2
1812e024 121#define REDIS_ZSET 3
122#define REDIS_HASH 4
f78fd11b 123
5234952b 124/* Objects encoding. Some kinds of objects like Strings and Hashes can be
125 * internally represented in multiple ways. The 'encoding' field of the object
126 * is set to one of these values for this object. */
942a3961 127#define REDIS_ENCODING_RAW 0 /* Raw representation */
128#define REDIS_ENCODING_INT 1 /* Encoded as integer */
5234952b 129#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
130#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
942a3961 131
07efaf74 132static char* strencoding[] = { /* names listed in REDIS_ENCODING_* value order */
133 "raw", "int", "zipmap", "hashtable"
134};
135
f78fd11b 136/* Object types only used for dumping to disk */
bb32ede5 137#define REDIS_EXPIRETIME 253
ed9b544e 138#define REDIS_SELECTDB 254
139#define REDIS_EOF 255
140
f78fd11b 141/* Defines related to the dump file format. To store 32 bits lengths for short
142 * keys requires a lot of space, so we check the most significant 2 bits of
143 * the first byte to interpret the length:
144 *
145 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
146 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
147 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 148 * 11|000000 this means: specially encoded object will follow. The six bits
149 * number specify the kind of object that follows.
150 * See the REDIS_RDB_ENC_* defines.
f78fd11b 151 *
10c43610 152 * Lengths up to 63 are stored using a single byte, most DB keys, and many
153 * values, will fit inside. */
f78fd11b 154#define REDIS_RDB_6BITLEN 0
155#define REDIS_RDB_14BITLEN 1
156#define REDIS_RDB_32BITLEN 2
17be1a4a 157#define REDIS_RDB_ENCVAL 3
f78fd11b 158#define REDIS_RDB_LENERR UINT_MAX
159
a4d1ba9a 160/* When a length of a string object stored on disk has the first two bits
161 * set, the remaining six bits specify a special encoding for the object
162 * according to the following defines: */
163#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
164#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
165#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 166#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
a4d1ba9a 167
75680a3c 168/* Virtual memory object->where field. */
169#define REDIS_VM_MEMORY 0 /* The object is on memory */
170#define REDIS_VM_SWAPPED 1 /* The object is on disk */
171#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
172#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
173
06224fec 174/* Virtual memory static configuration stuff.
175 * Check vmFindContiguousPages() to know more about these magic numbers. */
176#define REDIS_VM_MAX_NEAR_PAGES 65536
177#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 178#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 179#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
f6c0bba8 180/* The following is the *percentage* of completed I/O jobs to process when the
181 * handler is called. While Virtual Memory I/O operations are performed by
182 * threads, these operations must be processed by the main thread when completed
183 * in order to take effect. */
c953f24b 184#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 185
ed9b544e 186/* Client flags */
d5d55fc3 187#define REDIS_SLAVE 1 /* This client is a slave server */
188#define REDIS_MASTER 2 /* This client is a master server */
189#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
190#define REDIS_MULTI 8 /* This client is in a MULTI context */
191#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
192#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
ed9b544e 193
40d224a9 194/* Slave replication state - slave side */
ed9b544e 195#define REDIS_REPL_NONE 0 /* No active replication */
196#define REDIS_REPL_CONNECT 1 /* Must connect to master */
197#define REDIS_REPL_CONNECTED 2 /* Connected to master */
198
40d224a9 199/* Slave replication state - from the point of view of master
200 * Note that in SEND_BULK and ONLINE state the slave receives new updates
201 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
202 * to start the next background saving in order to send updates to it. */
203#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
204#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
205#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
206#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
207
ed9b544e 208/* List related stuff */
209#define REDIS_HEAD 0
210#define REDIS_TAIL 1
211
212/* Sort operations */
213#define REDIS_SORT_GET 0
443c6409 214#define REDIS_SORT_ASC 1
215#define REDIS_SORT_DESC 2
ed9b544e 216#define REDIS_SORTKEY_MAX 1024
217
218/* Log levels */
219#define REDIS_DEBUG 0
f870935d 220#define REDIS_VERBOSE 1
221#define REDIS_NOTICE 2
222#define REDIS_WARNING 3
ed9b544e 223
224/* Anti-warning macro... */
225#define REDIS_NOTUSED(V) ((void) V)
226
6b47e12e 227#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
228#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 229
48f0308a 230/* Append only defines */
231#define APPENDFSYNC_NO 0
232#define APPENDFSYNC_ALWAYS 1
233#define APPENDFSYNC_EVERYSEC 2
234
cbba7dd7 235/* Hashes related defaults */
236#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
237#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
238
dfc5e96c 239/* We can print the stacktrace, so our assert/panic are macros; each expands to one parenthesized expression: */
478c2c6f 240#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
c651fd9e 241#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
6c96ba7d 242static void _redisAssert(char *estr, char *file, int line);
c651fd9e 243static void _redisPanic(char *msg, char *file, int line);
dfc5e96c 244
ed9b544e 245/*================================= Data types ============================== */
246
247/* A redis object, that is a type able to hold a string / list / set */
75680a3c 248
249/* The VM object structure (type only: no object of this type is defined here) */
250struct redisObjectVM {
3a66edc7 251 off_t page; /* the page at which the object is stored on disk */
252 off_t usedpages; /* number of pages used on disk */
253 time_t atime; /* Last access time */
75680a3c 254};
255
256/* The actual Redis Object */
ed9b544e 257typedef struct redisObject {
ed9b544e 258 void *ptr;
942a3961 259 unsigned char type; /* One of the object types, e.g. REDIS_STRING */
260 unsigned char encoding; /* One of REDIS_ENCODING_*, e.g. REDIS_ENCODING_RAW */
d894161b 261 unsigned char storage; /* If this object is a key, where is the value?
262 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
263 unsigned char vtype; /* If this object is a key, and value is swapped out,
264 * this is the type of the swapped out object. */
ed9b544e 265 int refcount;
75680a3c 266 /* VM fields, these are only allocated if VM is active, otherwise the
267 * object allocation function will just allocate
268 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
269 * Redis without VM active will not have any overhead. */
270 struct redisObjectVM vm;
ed9b544e 271} robj;
272
dfc5e96c 273/* Macro used to initialize a Redis object allocated on the stack.
274 * It is kept near the structure definition so that it gets updated when the
275 * structure changes (see bug #85). NOTE(review): the expansion ends with ';',
276 * so an unbraced "if (x) initStaticStringObject(..); else" won't compile. */
277#define initStaticStringObject(_var,_ptr) do { \
278 (_var).refcount = 1; \
279 (_var).type = REDIS_STRING; \
280 (_var).encoding = REDIS_ENCODING_RAW; \
281 (_var).ptr = (_ptr); \
3a66edc7 282 if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
dfc5e96c 283} while(0);
284
3305306f 285typedef struct redisDb {
4409877e 286 dict *dict; /* The keyspace for this DB */
287 dict *expires; /* Timeout of keys with a timeout set */
288 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 289 dict *io_keys; /* Keys with clients waiting for VM I/O */
3305306f 290 int id; /* presumably the index of this DB - TODO confirm */
291} redisDb;
292
6e469882 293/* Client MULTI/EXEC state */
294typedef struct multiCmd {
295 robj **argv; /* arguments of the command */
296 int argc; /* presumably the number of entries in argv - TODO confirm */
297 struct redisCommand *cmd; /* the command these arguments belong to */
298} multiCmd;
299
300typedef struct multiState {
301 multiCmd *commands; /* Array of MULTI commands */
302 int count; /* Total number of MULTI commands */
303} multiState;
304
ed9b544e 305/* With multiplexing we need to take per-client state.
306 * Clients are taken in a linked list. */
307typedef struct redisClient {
308 int fd;
3305306f 309 redisDb *db;
ed9b544e 310 int dictid;
311 sds querybuf;
e8a74421 312 robj **argv, **mbargv;
313 int argc, mbargc;
40d224a9 314 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 315 int multibulk; /* multi bulk command format active */
ed9b544e 316 list *reply;
317 int sentlen;
318 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 319 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 320 int slaveseldb; /* slave selected db, if this client is a slave */
321 int authenticated; /* when requirepass is non-NULL */
322 int replstate; /* replication state if this is a slave */
323 int repldbfd; /* replication DB file descriptor */
6e469882 324 long repldboff; /* replication DB file offset. NOTE(review): long, while repldbsize is off_t */
40d224a9 325 off_t repldbsize; /* replication DB file size */
6e469882 326 multiState mstate; /* MULTI/EXEC state */
d5d55fc3 327 robj **blockingkeys; /* The keys we are waiting to terminate a blocking
4409877e 328 * operation such as BLPOP. Otherwise NULL. */
b177fd30 329 int blockingkeysnum; /* Number of blocking keys */
4409877e 330 time_t blockingto; /* Blocking operation timeout. If UNIX current time
331 * is >= blockingto then the operation timed out. */
92f8e882 332 list *io_keys; /* Keys this client is waiting to be loaded from the
333 * swap file in order to continue. */
ffc6b7f8 334 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
335 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 336} redisClient;
337
338struct saveparam { /* element type of the redisServer.saveparams array */
339 time_t seconds;
340 int changes;
341};
342
343/* Global server state structure */
344struct redisServer {
345 int port;
346 int fd;
3305306f 347 redisDb *db;
ed9b544e 348 long long dirty; /* changes to DB from the last save */
349 list *clients;
87eca727 350 list *slaves, *monitors;
ed9b544e 351 char neterr[ANET_ERR_LEN];
352 aeEventLoop *el;
353 int cronloops; /* number of times the cron function run */
354 list *objfreelist; /* A list of freed objects to avoid malloc() */
355 time_t lastsave; /* Unix time of last save succeeded */
ed9b544e 356 /* Fields used only for stats */
357 time_t stat_starttime; /* server start time */
358 long long stat_numcommands; /* number of processed commands */
359 long long stat_numconnections; /* number of connections received */
2a6a2ed1 360 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 361 /* Configuration */
362 int verbosity;
363 int glueoutputbuf;
364 int maxidletime;
365 int dbnum;
366 int daemonize;
44b38ef4 367 int appendonly;
48f0308a 368 int appendfsync;
369 time_t lastfsync;
44b38ef4 370 int appendfd;
371 int appendseldb;
ed329fcf 372 char *pidfile;
9f3c422c 373 pid_t bgsavechildpid;
9d65a1bb 374 pid_t bgrewritechildpid;
375 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
28ed1f33 376 sds aofbuf; /* AOF buffer, written before entering the event loop */
ed9b544e 377 struct saveparam *saveparams;
378 int saveparamslen;
379 char *logfile;
380 char *bindaddr;
381 char *dbfilename;
44b38ef4 382 char *appendfilename;
abcb223e 383 char *requirepass;
121f70cf 384 int rdbcompression;
8ca3e9d1 385 int activerehashing;
ed9b544e 386 /* Replication related */
387 int isslave;
d0ccebcf 388 char *masterauth;
ed9b544e 389 char *masterhost;
390 int masterport;
40d224a9 391 redisClient *master; /* client that is master for this slave */
ed9b544e 392 int replstate;
285add55 393 unsigned int maxclients;
4ef8de8a 394 unsigned long long maxmemory;
d5d55fc3 395 unsigned int blpop_blocked_clients;
396 unsigned int vm_blocked_clients;
ed9b544e 397 /* Sort parameters - qsort_r() is only available under BSD so we
398 * have to take this state global, in order to pass it to sortCompare() */
399 int sort_desc;
400 int sort_alpha;
401 int sort_bypattern;
75680a3c 402 /* Virtual memory configuration */
403 int vm_enabled;
054e426d 404 char *vm_swap_file;
75680a3c 405 off_t vm_page_size;
406 off_t vm_pages;
4ef8de8a 407 unsigned long long vm_max_memory;
cbba7dd7 408 /* Hashes config */
409 size_t hash_max_zipmap_entries;
410 size_t hash_max_zipmap_value;
75680a3c 411 /* Virtual memory state */
412 FILE *vm_fp;
413 int vm_fd;
414 off_t vm_next_page; /* Next probably empty page */
415 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 416 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 417 time_t unixtime; /* Unix time sampled every second. */
92f8e882 418 /* Virtual memory I/O threads stuff */
92f8e882 419 /* An I/O thread processes an element taken from the io_newjobs queue and
996cb5f7 420 * puts the result of the operation in the io_processed list. While the
421 * job is being processed, it's put on the io_processing queue. */
422 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
423 list *io_processing; /* List of VM I/O jobs being processed */
424 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 425 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 426 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 427 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
428 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 429 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 430 int io_active_threads; /* Number of running I/O threads */
431 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 432 /* Our main thread is blocked on the event loop, looking for sockets ready
433 * to be read or written, so when a threaded I/O operation is ready to be
434 * processed by the main thread, the I/O thread will use a unix pipe to
435 * awake the main thread. The following are the two pipe FDs. */
436 int io_ready_pipe_read;
437 int io_ready_pipe_write;
7d98e08c 438 /* Virtual memory stats */
439 unsigned long long vm_stats_used_pages;
440 unsigned long long vm_stats_swapped_objects;
441 unsigned long long vm_stats_swapouts;
442 unsigned long long vm_stats_swapins;
befec3cd 443 /* Pubsub */
ffc6b7f8 444 dict *pubsub_channels; /* Map channels to list of subscribed clients */
445 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 446 /* Misc */
b9bc0eef 447 FILE *devnull;
ed9b544e 448};
449
ffc6b7f8 450typedef struct pubsubPattern { /* pairs a client with a pattern object */
451 redisClient *client;
452 robj *pattern;
453} pubsubPattern;
454
ed9b544e 455typedef void redisCommandProc(redisClient *c);
ca1788b5 456typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
ed9b544e 457struct redisCommand {
458 char *name;
459 redisCommandProc *proc;
460 int arity;
461 int flags; /* presumably a mask of REDIS_CMD_* flags - TODO confirm */
76583ea4
PN
462 /* Use a function to determine which keys need to be loaded
463 * in the background prior to executing this command. Takes precedence
464 * over vm_firstkey and others, ignored when NULL */
ca1788b5 465 redisVmPreloadProc *vm_preload_proc;
7c775e09 466 /* What keys should be loaded in background when calling this command? */
467 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
468 int vm_lastkey; /* The last argument that's a key */
469 int vm_keystep; /* The step between first and last key */
ed9b544e 470};
471
de96dbfe 472struct redisFunctionSym { /* a name paired with an address-sized integer */
473 char *name;
56906eef 474 unsigned long pointer; /* presumably the function address - TODO confirm */
de96dbfe 475};
476
ed9b544e 477typedef struct _redisSortObject {
478 robj *obj;
479 union { /* only one of the two members is valid at a time */
480 double score;
481 robj *cmpobj;
482 } u;
483} redisSortObject;
484
485typedef struct _redisSortOperation {
486 int type; /* presumably one of REDIS_SORT_* - TODO confirm */
487 robj *pattern;
488} redisSortOperation;
489
6b47e12e 490/* ZSETs use a specialized version of Skiplists */
491
492typedef struct zskiplistNode {
493 struct zskiplistNode **forward;
e3870fab 494 struct zskiplistNode *backward;
912b9165 495 unsigned int *span;
6b47e12e 496 double score;
497 robj *obj;
498} zskiplistNode;
499
500typedef struct zskiplist {
e3870fab 501 struct zskiplistNode *header, *tail;
d13f767c 502 unsigned long length;
6b47e12e 503 int level; /* presumably bounded by ZSKIPLIST_MAXLEVEL - TODO confirm */
504} zskiplist;
505
1812e024 506typedef struct zset { /* a sorted set holds both a dict and a skiplist */
507 dict *dict;
6b47e12e 508 zskiplist *zsl;
1812e024 509} zset;
510
6b47e12e 511/* Our shared "common" objects */
512
05df7621 513#define REDIS_SHARED_INTEGERS 10000 /* number of entries in shared.integers[] */
ed9b544e 514struct sharedObjectsStruct {
c937aa89 515 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 516 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 517 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
518 *outofrangeerr, *plus,
ed9b544e 519 *select0, *select1, *select2, *select3, *select4,
befec3cd 520 *select5, *select6, *select7, *select8, *select9,
c8d0ea0e 521 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
522 *mbulk4, *psubscribebulk, *punsubscribebulk,
523 *integers[REDIS_SHARED_INTEGERS];
ed9b544e 524} shared; /* the single file-level instance of the shared objects */
525
a7866db6 526/* Global vars that are actually used as constants. The following double
527 * values are used for double on-disk serialization, and are initialized
528 * at runtime to avoid strange compiler optimizations. */
529
530static double R_Zero, R_PosInf, R_NegInf, R_Nan;
531
92f8e882 532/* VM threaded I/O request message */
b9bc0eef 533#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
534#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
535#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 536typedef struct iojob {
996cb5f7 537 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 538 redisDb *db;/* Redis database */
92f8e882 539 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 540 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
92f8e882 541 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
542 off_t page; /* Swap page where to read/write the object */
248ea310 543 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 544 int canceled; /* True if this command was canceled by blocking side of VM */
545 pthread_t thread; /* ID of the thread processing this entry */
546} iojob;
92f8e882 547
ed9b544e 548/*================================ Prototypes =============================== */
549
550static void freeStringObject(robj *o);
551static void freeListObject(robj *o);
552static void freeSetObject(robj *o);
553static void decrRefCount(void *o);
554static robj *createObject(int type, void *ptr);
555static void freeClient(redisClient *c);
f78fd11b 556static int rdbLoad(char *filename);
ed9b544e 557static void addReply(redisClient *c, robj *obj);
558static void addReplySds(redisClient *c, sds s);
559static void incrRefCount(robj *o);
f78fd11b 560static int rdbSaveBackground(char *filename);
ed9b544e 561static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 562static robj *dupStringObject(robj *o);
248ea310 563static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
dd142b9c 564static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
28ed1f33 565static void flushAppendOnlyFile(void);
44b38ef4 566static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 567static int syncWithMaster(void);
05df7621 568static robj *tryObjectEncoding(robj *o);
9d65a1bb 569static robj *getDecodedObject(robj *o);
3305306f 570static int removeExpire(redisDb *db, robj *key);
571static int expireIfNeeded(redisDb *db, robj *key);
572static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 573static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 574static int deleteKey(redisDb *db, robj *key);
bb32ede5 575static time_t getExpire(redisDb *db, robj *key);
576static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 577static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 578static void freeMemoryIfNeeded(void);
de96dbfe 579static int processCommand(redisClient *c);
56906eef 580static void setupSigSegvAction(void);
a3b21203 581static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 582static void aofRemoveTempFile(pid_t childpid);
0ea663ea 583static size_t stringObjectLen(robj *o);
638e42ac 584static void processInputBuffer(redisClient *c);
6b47e12e 585static zskiplist *zslCreate(void);
fd8ccf44 586static void zslFree(zskiplist *zsl);
2b59cfdf 587static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 588static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 589static void initClientMultiState(redisClient *c);
590static void freeClientMultiState(redisClient *c);
591static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 592static void unblockClientWaitingData(redisClient *c);
4409877e 593static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 594static void vmInit(void);
a35ddf12 595static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 596static robj *vmLoadObject(robj *key);
7e69548d 597static robj *vmPreviewObject(robj *key);
a69a0c9c 598static int vmSwapOneObjectBlocking(void);
599static int vmSwapOneObjectThreaded(void);
7e69548d 600static int vmCanSwapOut(void);
a5819310 601static int tryFreeOneObjectFromFreelist(void);
996cb5f7 602static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
603static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
604static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 605static void lockThreadedIO(void);
606static void unlockThreadedIO(void);
607static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
608static void freeIOJob(iojob *j);
609static void queueIOJob(iojob *j);
a5819310 610static int vmWriteObjectOnSwap(robj *o, off_t page);
611static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 612static void waitEmptyIOJobsQueue(void);
613static void vmReopenSwapFile(void);
970e10bb 614static int vmFreePage(off_t page);
ca1788b5 615static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
3805e04f 616static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
0a6f3f0f 617static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
d5d55fc3 618static int dontWaitForSwappedKey(redisClient *c, robj *key);
619static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
620static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
621static struct redisCommand *lookupCommand(char *name);
622static void call(redisClient *c, struct redisCommand *cmd);
623static void resetClient(redisClient *c);
ada386b2 624static void convertToRealHash(robj *o);
ffc6b7f8 625static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
626static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
627static void freePubsubPattern(void *p);
628static int listMatchPubsubPattern(void *a, void *b);
629static int compareStringObjects(robj *a, robj *b);
bf028098 630static int equalStringObjects(robj *a, robj *b);
befec3cd 631static void usage(void); /* explicit (void): a real prototype, not an unspecified-parameter declaration */
8f63ddca 632static int rewriteAppendOnlyFileBackground(void);
242a64f3 633static int vmSwapObjectBlocking(robj *key, robj *val);
ed9b544e 634
abcb223e 635static void authCommand(redisClient *c);
ed9b544e 636static void pingCommand(redisClient *c);
637static void echoCommand(redisClient *c);
638static void setCommand(redisClient *c);
639static void setnxCommand(redisClient *c);
526d00a5 640static void setexCommand(redisClient *c);
ed9b544e 641static void getCommand(redisClient *c);
642static void delCommand(redisClient *c);
643static void existsCommand(redisClient *c);
644static void incrCommand(redisClient *c);
645static void decrCommand(redisClient *c);
646static void incrbyCommand(redisClient *c);
647static void decrbyCommand(redisClient *c);
648static void selectCommand(redisClient *c);
649static void randomkeyCommand(redisClient *c);
650static void keysCommand(redisClient *c);
651static void dbsizeCommand(redisClient *c);
652static void lastsaveCommand(redisClient *c);
653static void saveCommand(redisClient *c);
654static void bgsaveCommand(redisClient *c);
9d65a1bb 655static void bgrewriteaofCommand(redisClient *c);
ed9b544e 656static void shutdownCommand(redisClient *c);
657static void moveCommand(redisClient *c);
658static void renameCommand(redisClient *c);
659static void renamenxCommand(redisClient *c);
660static void lpushCommand(redisClient *c);
661static void rpushCommand(redisClient *c);
662static void lpopCommand(redisClient *c);
663static void rpopCommand(redisClient *c);
664static void llenCommand(redisClient *c);
665static void lindexCommand(redisClient *c);
666static void lrangeCommand(redisClient *c);
667static void ltrimCommand(redisClient *c);
668static void typeCommand(redisClient *c);
669static void lsetCommand(redisClient *c);
670static void saddCommand(redisClient *c);
671static void sremCommand(redisClient *c);
a4460ef4 672static void smoveCommand(redisClient *c);
ed9b544e 673static void sismemberCommand(redisClient *c);
674static void scardCommand(redisClient *c);
12fea928 675static void spopCommand(redisClient *c);
2abb95a9 676static void srandmemberCommand(redisClient *c);
ed9b544e 677static void sinterCommand(redisClient *c);
678static void sinterstoreCommand(redisClient *c);
40d224a9 679static void sunionCommand(redisClient *c);
680static void sunionstoreCommand(redisClient *c);
f4f56e1d 681static void sdiffCommand(redisClient *c);
682static void sdiffstoreCommand(redisClient *c);
ed9b544e 683static void syncCommand(redisClient *c);
684static void flushdbCommand(redisClient *c);
685static void flushallCommand(redisClient *c);
686static void sortCommand(redisClient *c);
687static void lremCommand(redisClient *c);
0f5f7e9a 688static void rpoplpushcommand(redisClient *c);
ed9b544e 689static void infoCommand(redisClient *c);
70003d28 690static void mgetCommand(redisClient *c);
87eca727 691static void monitorCommand(redisClient *c);
3305306f 692static void expireCommand(redisClient *c);
802e8373 693static void expireatCommand(redisClient *c);
f6b141c5 694static void getsetCommand(redisClient *c);
fd88489a 695static void ttlCommand(redisClient *c);
321b0e13 696static void slaveofCommand(redisClient *c);
7f957c92 697static void debugCommand(redisClient *c);
f6b141c5 698static void msetCommand(redisClient *c);
699static void msetnxCommand(redisClient *c);
fd8ccf44 700static void zaddCommand(redisClient *c);
7db723ad 701static void zincrbyCommand(redisClient *c);
cc812361 702static void zrangeCommand(redisClient *c);
50c55df5 703static void zrangebyscoreCommand(redisClient *c);
f44dd428 704static void zcountCommand(redisClient *c);
e3870fab 705static void zrevrangeCommand(redisClient *c);
3c41331e 706static void zcardCommand(redisClient *c);
1b7106e7 707static void zremCommand(redisClient *c);
6e333bbe 708static void zscoreCommand(redisClient *c);
1807985b 709static void zremrangebyscoreCommand(redisClient *c);
6e469882 710static void multiCommand(redisClient *c);
711static void execCommand(redisClient *c);
18b6cb76 712static void discardCommand(redisClient *c);
4409877e 713static void blpopCommand(redisClient *c);
714static void brpopCommand(redisClient *c);
4b00bebd 715static void appendCommand(redisClient *c);
39191553 716static void substrCommand(redisClient *c);
69d95c3e 717static void zrankCommand(redisClient *c);
798d9e55 718static void zrevrankCommand(redisClient *c);
978c2c94 719static void hsetCommand(redisClient *c);
1f1c7695 720static void hsetnxCommand(redisClient *c);
978c2c94 721static void hgetCommand(redisClient *c);
09aeb579
PN
722static void hmsetCommand(redisClient *c);
723static void hmgetCommand(redisClient *c);
07efaf74 724static void hdelCommand(redisClient *c);
92b27fe9 725static void hlenCommand(redisClient *c);
9212eafd 726static void zremrangebyrankCommand(redisClient *c);
5d373da9 727static void zunionstoreCommand(redisClient *c);
728static void zinterstoreCommand(redisClient *c);
78409a0f 729static void hkeysCommand(redisClient *c);
730static void hvalsCommand(redisClient *c);
731static void hgetallCommand(redisClient *c);
a86f14b1 732static void hexistsCommand(redisClient *c);
500ece7c 733static void configCommand(redisClient *c);
01426b05 734static void hincrbyCommand(redisClient *c);
befec3cd 735static void subscribeCommand(redisClient *c);
736static void unsubscribeCommand(redisClient *c);
ffc6b7f8 737static void psubscribeCommand(redisClient *c);
738static void punsubscribeCommand(redisClient *c);
befec3cd 739static void publishCommand(redisClient *c);
f6b141c5 740
ed9b544e 741/*================================= Globals ================================= */
742
743/* Global vars */
744static struct redisServer server; /* server global state */
/* Table of every command the server understands, one row per command.
 * The row layout follows struct redisCommand (declared elsewhere in this
 * file). Reading the rows below, the columns look like:
 *   name, handler function, arity, REDIS_CMD_* flags, an optional callback
 *   (NULL for most rows; only zunionstore/zinterstore/exec set one),
 *   and three trailing integers.
 * NOTE(review): the trailing integers are presumably first key / last key /
 * key step, and a negative arity presumably means "at least N arguments" --
 * confirm against struct redisCommand and its users.
 * "smembers" deliberately reuses sinterCommand as its handler.
 * The final all-NULL/zero row terminates the table. */
745static struct redisCommand cmdTable[] = {
76583ea4
PN
746 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
748 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
526d00a5 749 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
750 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
751 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
753 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
754 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
755 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
756 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
757 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
758 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
759 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
760 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
761 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
762 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
763 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
765 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
766 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
767 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
768 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
769 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
770 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
771 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
772 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
773 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
774 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
778 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
779 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
780 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
781 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
782 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
783 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
784 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
785 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
786 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
787 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
5d373da9 789 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
790 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
791 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
792 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
793 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
794 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
795 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
796 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
798 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
799 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
1f1c7695 800 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 801 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
d33278d1 802 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 803 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
01426b05 804 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
805 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
806 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
807 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
808 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
809 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 810 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
811 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
812 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
813 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
814 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
815 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
816 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
818 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
819 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
820 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
821 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
823 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
824 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
825 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
826 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
828 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
831 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
834 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
3805e04f 835 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
836 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
837 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
838 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
840 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
841 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
842 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
843 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
844 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
845 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 846 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 847 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
848 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 849 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
850 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 851 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
76583ea4 852 {NULL,NULL,0,0,NULL,0,0,0}
ed9b544e 853};
bcfc686d 854
ed9b544e 855/*============================ Utility functions ============================ */
856
/* Glob-style pattern matching on length-delimited buffers.
 *
 * Supported syntax: '*' (any run of characters, possibly empty), '?' (exactly
 * one character), '[...]' character classes with optional leading '^'
 * negation, 'a-z' ranges and '\' escapes inside the class, and '\' to take
 * the next pattern character literally.
 *
 * pattern/patternLen: the pattern and its length in bytes.
 * string/stringLen:   the subject and its length in bytes.
 * nocase:             non-zero to compare case-insensitively (via tolower()).
 *
 * Returns 1 when the whole string matches the whole pattern, 0 otherwise.
 *
 * Neither buffer needs to be NUL terminated: every look-ahead is bounded by
 * patternLen / stringLen, so no byte outside the two buffers is read. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen > 0) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of '*' into one, without looking past the
             * end of the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing '*' matches everything left */
            /* Try to match the rest of the pattern at every suffix. */
            while(stringLen > 0) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one subject character. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = (patternLen > 0 && pattern[0] == '^');
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * advance after the switch lands exactly on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* Escape: compare the next pattern character literally. A lone
             * trailing backslash is compared as itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Subject exhausted: any remaining '*' still match the empty
             * string, but never skip beyond the end of the pattern. */
            while(patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
979
/* Convenience wrapper around stringmatchlen() for NUL terminated strings:
 * both lengths are taken with strlen(). Returns 1 on match, 0 otherwise. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
983
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-' followed by decimal digits and an optional
 * unit, matched case-insensitively:
 *   (none) or "b" -> 1
 *   "k"  -> 1000            "kb" -> 1024
 *   "m"  -> 1000*1000       "mb" -> 1024*1024
 *   "g"  -> 1000*1000*1000  "gb" -> 1024*1024*1024
 *
 * On parsing error, if err is not NULL, *err is set to 1, otherwise it's
 * set to 0. Errors are: an unknown unit (the plain number is still returned,
 * with a multiplier of 1), a digit string that does not fit the internal
 * buffer (LLONG_MAX is returned), and a result that does not fit a
 * long long (LLONG_MAX or LLONG_MIN is returned). */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. The cast is required because
     * passing a negative char value to isdigit() is undefined behavior. */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    /* mul is always >= 1: refuse products that would overflow instead of
     * performing an undefined signed multiplication. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1030
/* Convert a long long into its decimal string representation.
 *
 * s:     destination buffer, always NUL terminated on return when len > 0.
 * len:   size of the destination buffer in bytes.
 * value: number to convert.
 *
 * Returns the number of characters stored in s, not counting the NUL
 * terminator. If the buffer is too small the output is truncated to the
 * first len-1 characters of the representation. When len is 0 nothing is
 * written and 0 is returned. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Take the absolute value in the unsigned domain: negating LLONG_MIN as
     * a signed long long overflows, which is undefined behavior. */
    if (value < 0)
        v = 0ULL - (unsigned long long)value;
    else
        v = (unsigned long long)value;
    p = buf+31; /* point to the last character */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1054
56906eef 1055static void redisLog(int level, const char *fmt, ...) {
ed9b544e 1056 va_list ap;
1057 FILE *fp;
1058
1059 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1060 if (!fp) return;
1061
1062 va_start(ap, fmt);
1063 if (level >= server.verbosity) {
6766f45e 1064 char *c = ".-*#";
1904ecc1 1065 char buf[64];
1066 time_t now;
1067
1068 now = time(NULL);
6c9385e0 1069 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 1070 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 1071 vfprintf(fp, fmt, ap);
1072 fprintf(fp,"\n");
1073 fflush(fp);
1074 }
1075 va_end(ap);
1076
1077 if (server.logfile) fclose(fp);
1078}
1079
1080/*====================== Hash table type implementation ==================== */
1081
1082/* This is a hash table type that uses the SDS dynamic strings library as
1083 * keys and redis objects as values (objects can hold SDS strings,
1084 * lists, sets). */
1085
/* Dict destructor callback that releases the entry with a plain zfree().
 * The privdata argument is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    zfree(val);
}
1091
4409877e 1092static void dictListDestructor(void *privdata, void *val)
1093{
1094 DICT_NOTUSED(privdata);
1095 listRelease((list*)val);
1096}
1097
ed9b544e 1098static int sdsDictKeyCompare(void *privdata, const void *key1,
1099 const void *key2)
1100{
1101 int l1,l2;
1102 DICT_NOTUSED(privdata);
1103
1104 l1 = sdslen((sds)key1);
1105 l2 = sdslen((sds)key2);
1106 if (l1 != l2) return 0;
1107 return memcmp(key1, key2, l1) == 0;
1108}
1109
/* Dict destructor callback for redis objects: drops one reference from the
 * object. The privdata argument is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
1117
942a3961 1118static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1119 const void *key2)
1120{
1121 const robj *o1 = key1, *o2 = key2;
1122 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1123}
1124
942a3961 1125static unsigned int dictObjHash(const void *key) {
ed9b544e 1126 const robj *o = key;
1127 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1128}
1129
942a3961 1130static int dictEncObjKeyCompare(void *privdata, const void *key1,
1131 const void *key2)
1132{
9d65a1bb 1133 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1134 int cmp;
942a3961 1135
2a1198b4 1136 if (o1->encoding == REDIS_ENCODING_INT &&
dc05abde 1137 o2->encoding == REDIS_ENCODING_INT)
1138 return o1->ptr == o2->ptr;
2a1198b4 1139
9d65a1bb 1140 o1 = getDecodedObject(o1);
1141 o2 = getDecodedObject(o2);
1142 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1143 decrRefCount(o1);
1144 decrRefCount(o2);
1145 return cmp;
942a3961 1146}
1147
1148static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1149 robj *o = (robj*) key;
942a3961 1150
ed9e4966 1151 if (o->encoding == REDIS_ENCODING_RAW) {
1152 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1153 } else {
1154 if (o->encoding == REDIS_ENCODING_INT) {
1155 char buf[32];
1156 int len;
1157
ee14da56 1158 len = ll2string(buf,32,(long)o->ptr);
ed9e4966 1159 return dictGenHashFunction((unsigned char*)buf, len);
1160 } else {
1161 unsigned int hash;
1162
1163 o = getDecodedObject(o);
1164 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1165 decrRefCount(o);
1166 return hash;
1167 }
1168 }
942a3961 1169}
1170
/* The tables below bind hash, compare and destructor callbacks into the
 * dictType descriptors used in this file. Each table lists, in order:
 * hash function, key dup, val dup, key compare, key destructor and
 * val destructor; NULL means no callback is installed for that slot. */
f2d9f50f 1171/* Sets type and expires */
/* Encoding-aware hash/compare; keys are released as redis objects and
 * there is no value destructor. */
ed9b544e 1172static dictType setDictType = {
942a3961 1173 dictEncObjHash, /* hash function */
ed9b544e 1174 NULL, /* key dup */
1175 NULL, /* val dup */
942a3961 1176 dictEncObjKeyCompare, /* key compare */
ed9b544e 1177 dictRedisObjectDestructor, /* key destructor */
1178 NULL /* val destructor */
1179};
1180
f2d9f50f 1181/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1812e024 1182static dictType zsetDictType = {
1183 dictEncObjHash, /* hash function */
1184 NULL, /* key dup */
1185 NULL, /* val dup */
1186 dictEncObjKeyCompare, /* key compare */
1187 dictRedisObjectDestructor, /* key destructor */
da0a1620 1188 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1189};
1190
f2d9f50f 1191/* Db->dict */
/* Both keys and values are released as redis objects. */
5234952b 1192static dictType dbDictType = {
942a3961 1193 dictObjHash, /* hash function */
ed9b544e 1194 NULL, /* key dup */
1195 NULL, /* val dup */
942a3961 1196 dictObjKeyCompare, /* key compare */
ed9b544e 1197 dictRedisObjectDestructor, /* key destructor */
1198 dictRedisObjectDestructor /* val destructor */
1199};
1200
f2d9f50f 1201/* Db->expires */
/* Same callbacks as dbDictType except that values have no destructor. */
1202static dictType keyptrDictType = {
1203 dictObjHash, /* hash function */
1204 NULL, /* key dup */
1205 NULL, /* val dup */
1206 dictObjKeyCompare, /* key compare */
1207 dictRedisObjectDestructor, /* key destructor */
1208 NULL /* val destructor */
1209};
1210
5234952b 1211/* Hash type hash table (note that small hashes are represented with zipmaps) */
1212static dictType hashDictType = {
1213 dictEncObjHash, /* hash function */
1214 NULL, /* key dup */
1215 NULL, /* val dup */
1216 dictEncObjKeyCompare, /* key compare */
1217 dictRedisObjectDestructor, /* key destructor */
1218 dictRedisObjectDestructor /* val destructor */
1219};
1220
4409877e 1221/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1222 * lists as values. It's used for blocking operations (BLPOP) and to
1223 * map swapped keys to a list of clients waiting for these keys to be loaded. */
4409877e 1224static dictType keylistDictType = {
1225 dictObjHash, /* hash function */
1226 NULL, /* key dup */
1227 NULL, /* val dup */
1228 dictObjKeyCompare, /* key compare */
1229 dictRedisObjectDestructor, /* key destructor */
1230 dictListDestructor /* val destructor */
1231};
1232
42ab0172
AO
1233static void version();
1234
ed9b544e 1235/* ========================= Random utility functions ======================= */
1236
1237/* Redis generally does not try to recover from out of memory conditions
1238 * when allocating objects or strings, it is not clear if it will be possible
1239 * to report this condition to the client since the networking layer itself
1240 * is based on heap allocation for send buffers, so we simply abort.
1241 * At least the code will be simpler to read... */
1242static void oom(const char *msg) {
71c54b21 1243 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1244 sleep(1);
1245 abort();
1246}
1247
1248/* ====================== Redis server networking stuff ===================== */
56906eef 1249static void closeTimedoutClients(void) {
ed9b544e 1250 redisClient *c;
ed9b544e 1251 listNode *ln;
1252 time_t now = time(NULL);
c7df85a4 1253 listIter li;
ed9b544e 1254
c7df85a4 1255 listRewind(server.clients,&li);
1256 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1257 c = listNodeValue(ln);
f86a74e9 1258 if (server.maxidletime &&
1259 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1260 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1261 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1262 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1263 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1264 {
f870935d 1265 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1266 freeClient(c);
f86a74e9 1267 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1268 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1269 addReply(c,shared.nullmultibulk);
b0d8747d 1270 unblockClientWaitingData(c);
f86a74e9 1271 }
ed9b544e 1272 }
1273 }
ed9b544e 1274}
1275
12fea928 1276static int htNeedsResize(dict *dict) {
1277 long long size, used;
1278
1279 size = dictSlots(dict);
1280 used = dictSize(dict);
1281 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1282 (used*100/size < REDIS_HT_MINFILL));
1283}
1284
0bc03378 1285/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1286 * we resize the hash table to save memory */
56906eef 1287static void tryResizeHashTables(void) {
0bc03378 1288 int j;
1289
1290 for (j = 0; j < server.dbnum; j++) {
5413c40d 1291 if (htNeedsResize(server.db[j].dict))
0bc03378 1292 dictResize(server.db[j].dict);
12fea928 1293 if (htNeedsResize(server.db[j].expires))
1294 dictResize(server.db[j].expires);
0bc03378 1295 }
1296}
1297
8ca3e9d1 1298/* Our hash table implementation performs rehashing incrementally while
1299 * we write/read from the hash table. Still if the server is idle, the hash
1300 * table will use two tables for a long time. So we try to use 1 millisecond
1301 * of CPU time at every serverCron() loop in order to rehash some key. */
1302static void incrementallyRehash(void) {
1303 int j;
1304
1305 for (j = 0; j < server.dbnum; j++) {
1306 if (dictIsRehashing(server.db[j].dict)) {
1307 dictRehashMilliseconds(server.db[j].dict,1);
1308 break; /* already used our millisecond for this loop... */
1309 }
1310 }
1311}
1312
9d65a1bb 1313/* A background saving child (BGSAVE) terminated its work. Handle this. */
1314void backgroundSaveDoneHandler(int statloc) {
1315 int exitcode = WEXITSTATUS(statloc);
1316 int bysignal = WIFSIGNALED(statloc);
1317
1318 if (!bysignal && exitcode == 0) {
1319 redisLog(REDIS_NOTICE,
1320 "Background saving terminated with success");
1321 server.dirty = 0;
1322 server.lastsave = time(NULL);
1323 } else if (!bysignal && exitcode != 0) {
1324 redisLog(REDIS_WARNING, "Background saving error");
1325 } else {
1326 redisLog(REDIS_WARNING,
454eea7c 1327 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1328 rdbRemoveTempFile(server.bgsavechildpid);
1329 }
1330 server.bgsavechildpid = -1;
1331 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1332 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1333 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1334}
1335
1336/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1337 * Handle this. */
1338void backgroundRewriteDoneHandler(int statloc) {
1339 int exitcode = WEXITSTATUS(statloc);
1340 int bysignal = WIFSIGNALED(statloc);
1341
1342 if (!bysignal && exitcode == 0) {
1343 int fd;
1344 char tmpfile[256];
1345
1346 redisLog(REDIS_NOTICE,
1347 "Background append only file rewriting terminated with success");
1348 /* Now it's time to flush the differences accumulated by the parent */
1349 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1350 fd = open(tmpfile,O_WRONLY|O_APPEND);
1351 if (fd == -1) {
1352 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1353 goto cleanup;
1354 }
1355 /* Flush our data... */
1356 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1357 (signed) sdslen(server.bgrewritebuf)) {
1358 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1359 close(fd);
1360 goto cleanup;
1361 }
b32627cd 1362 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1363 /* Now our work is to rename the temp file into the stable file. And
1364 * switch the file descriptor used by the server for append only. */
1365 if (rename(tmpfile,server.appendfilename) == -1) {
1366 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1367 close(fd);
1368 goto cleanup;
1369 }
1370 /* Mission completed... almost */
1371 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1372 if (server.appendfd != -1) {
1373 /* If append only is actually enabled... */
1374 close(server.appendfd);
1375 server.appendfd = fd;
1376 fsync(fd);
85a83172 1377 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1378 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1379 } else {
1380 /* If append only is disabled we just generate a dump in this
1381 * format. Why not? */
1382 close(fd);
1383 }
1384 } else if (!bysignal && exitcode != 0) {
1385 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1386 } else {
1387 redisLog(REDIS_WARNING,
454eea7c 1388 "Background append only file rewriting terminated by signal %d",
1389 WTERMSIG(statloc));
9d65a1bb 1390 }
1391cleanup:
1392 sdsfree(server.bgrewritebuf);
1393 server.bgrewritebuf = sdsempty();
1394 aofRemoveTempFile(server.bgrewritechildpid);
1395 server.bgrewritechildpid = -1;
1396}
1397
884d4b39 1398/* This function is called once a background process of some kind terminates,
1399 * as we want to avoid resizing the hash tables when there is a child in order
1400 * to play well with copy-on-write (otherwise when a resize happens lots of
1401 * memory pages are copied). The goal of this function is to update the ability
1402 * for dict.c to resize the hash tables accordingly to the fact we have o not
1403 * running childs. */
1404static void updateDictResizePolicy(void) {
1405 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1406 dictEnableResize();
1407 else
1408 dictDisableResize();
1409}
1410
56906eef 1411static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1412 int j, loops = server.cronloops++;
ed9b544e 1413 REDIS_NOTUSED(eventLoop);
1414 REDIS_NOTUSED(id);
1415 REDIS_NOTUSED(clientData);
1416
3a66edc7 1417 /* We take a cached value of the unix time in the global state because
1418 * with virtual memory and aging there is to store the current time
1419 * in objects at every object access, and accuracy is not needed.
1420 * To access a global var is faster than calling time(NULL) */
1421 server.unixtime = time(NULL);
1422
0bc03378 1423 /* Show some info about non-empty databases */
ed9b544e 1424 for (j = 0; j < server.dbnum; j++) {
dec423d9 1425 long long size, used, vkeys;
94754ccc 1426
3305306f 1427 size = dictSlots(server.db[j].dict);
1428 used = dictSize(server.db[j].dict);
94754ccc 1429 vkeys = dictSize(server.db[j].expires);
1763929f 1430 if (!(loops % 50) && (used || vkeys)) {
f870935d 1431 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1432 /* dictPrintStats(server.dict); */
ed9b544e 1433 }
ed9b544e 1434 }
1435
0bc03378 1436 /* We don't want to resize the hash tables while a bacground saving
1437 * is in progress: the saving child is created using fork() that is
1438 * implemented with a copy-on-write semantic in most modern systems, so
1439 * if we resize the HT while there is the saving child at work actually
1440 * a lot of memory movements in the parent will cause a lot of pages
1441 * copied. */
8ca3e9d1 1442 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1443 if (!(loops % 10)) tryResizeHashTables();
1444 if (server.activerehashing) incrementallyRehash();
884d4b39 1445 }
0bc03378 1446
ed9b544e 1447 /* Show information about connected clients */
1763929f 1448 if (!(loops % 50)) {
bdcb92f2 1449 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1450 listLength(server.clients)-listLength(server.slaves),
1451 listLength(server.slaves),
bdcb92f2 1452 zmalloc_used_memory());
ed9b544e 1453 }
1454
1455 /* Close connections of timedout clients */
1763929f 1456 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1457 closeTimedoutClients();
1458
9d65a1bb 1459 /* Check if a background saving or AOF rewrite in progress terminated */
1460 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1461 int statloc;
9d65a1bb 1462 pid_t pid;
1463
1464 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1465 if (pid == server.bgsavechildpid) {
1466 backgroundSaveDoneHandler(statloc);
ed9b544e 1467 } else {
9d65a1bb 1468 backgroundRewriteDoneHandler(statloc);
ed9b544e 1469 }
884d4b39 1470 updateDictResizePolicy();
ed9b544e 1471 }
1472 } else {
1473 /* If there is not a background saving in progress check if
1474 * we have to save now */
1475 time_t now = time(NULL);
1476 for (j = 0; j < server.saveparamslen; j++) {
1477 struct saveparam *sp = server.saveparams+j;
1478
1479 if (server.dirty >= sp->changes &&
1480 now-server.lastsave > sp->seconds) {
1481 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1482 sp->changes, sp->seconds);
f78fd11b 1483 rdbSaveBackground(server.dbfilename);
ed9b544e 1484 break;
1485 }
1486 }
1487 }
94754ccc 1488
f2324293 1489 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1490 * will use few CPU cycles if there are few expiring keys, otherwise
1491 * it will get more aggressive to avoid that too much memory is used by
1492 * keys that can be removed from the keyspace. */
94754ccc 1493 for (j = 0; j < server.dbnum; j++) {
f2324293 1494 int expired;
94754ccc 1495 redisDb *db = server.db+j;
94754ccc 1496
f2324293 1497 /* Continue to expire if at the end of the cycle more than 25%
1498 * of the keys were expired. */
1499 do {
4ef8de8a 1500 long num = dictSize(db->expires);
94754ccc 1501 time_t now = time(NULL);
1502
f2324293 1503 expired = 0;
94754ccc 1504 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1505 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1506 while (num--) {
1507 dictEntry *de;
1508 time_t t;
1509
1510 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1511 t = (time_t) dictGetEntryVal(de);
1512 if (now > t) {
1513 deleteKey(db,dictGetEntryKey(de));
f2324293 1514 expired++;
2a6a2ed1 1515 server.stat_expiredkeys++;
94754ccc 1516 }
1517 }
f2324293 1518 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1519 }
1520
4ef8de8a 1521 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1522 * is enbled. Try to free objects from the free list first. */
7e69548d 1523 if (vmCanSwapOut()) {
1524 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1525 server.vm_max_memory)
1526 {
72e9fd40 1527 int retval;
1528
a5819310 1529 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1530 retval = (server.vm_max_threads == 0) ?
1531 vmSwapOneObjectBlocking() :
1532 vmSwapOneObjectThreaded();
1763929f 1533 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1534 zmalloc_used_memory() >
1535 (server.vm_max_memory+server.vm_max_memory/10))
1536 {
1537 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1538 }
72e9fd40 1539 /* Note that when using threade I/O we free just one object,
1540 * because anyway when the I/O thread in charge to swap this
1541 * object out will finish, the handler of completed jobs
1542 * will try to swap more objects if we are still out of memory. */
1543 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1544 }
1545 }
1546
ed9b544e 1547 /* Check if we should connect to a MASTER */
1763929f 1548 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1549 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1550 if (syncWithMaster() == REDIS_OK) {
1551 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
8f63ddca 1552 if (server.appendonly) rewriteAppendOnlyFileBackground();
ed9b544e 1553 }
1554 }
1763929f 1555 return 100;
ed9b544e 1556}
1557
d5d55fc3 1558/* This function gets called every time Redis is entering the
1559 * main loop of the event driven library, that is, before to sleep
1560 * for ready file descriptors. */
1561static void beforeSleep(struct aeEventLoop *eventLoop) {
1562 REDIS_NOTUSED(eventLoop);
1563
28ed1f33 1564 /* Awake clients that got all the swapped keys they requested */
d5d55fc3 1565 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1566 listIter li;
1567 listNode *ln;
1568
1569 listRewind(server.io_ready_clients,&li);
1570 while((ln = listNext(&li))) {
1571 redisClient *c = ln->value;
1572 struct redisCommand *cmd;
1573
1574 /* Resume the client. */
1575 listDelNode(server.io_ready_clients,ln);
1576 c->flags &= (~REDIS_IO_WAIT);
1577 server.vm_blocked_clients--;
1578 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1579 readQueryFromClient, c);
1580 cmd = lookupCommand(c->argv[0]->ptr);
1581 assert(cmd != NULL);
1582 call(c,cmd);
1583 resetClient(c);
1584 /* There may be more data to process in the input buffer. */
1585 if (c->querybuf && sdslen(c->querybuf) > 0)
1586 processInputBuffer(c);
1587 }
1588 }
28ed1f33 1589 /* Write the AOF buffer on disk */
1590 flushAppendOnlyFile();
d5d55fc3 1591}
1592
ed9b544e 1593static void createSharedObjects(void) {
05df7621 1594 int j;
1595
ed9b544e 1596 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1597 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1598 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1599 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1600 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1601 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1602 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1603 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1604 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1605 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1606 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1607 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1608 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1609 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1610 "-ERR no such key\r\n"));
ed9b544e 1611 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1612 "-ERR syntax error\r\n"));
c937aa89 1613 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1614 "-ERR source and destination objects are the same\r\n"));
1615 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1616 "-ERR index out of range\r\n"));
ed9b544e 1617 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1618 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1619 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1620 shared.select0 = createStringObject("select 0\r\n",10);
1621 shared.select1 = createStringObject("select 1\r\n",10);
1622 shared.select2 = createStringObject("select 2\r\n",10);
1623 shared.select3 = createStringObject("select 3\r\n",10);
1624 shared.select4 = createStringObject("select 4\r\n",10);
1625 shared.select5 = createStringObject("select 5\r\n",10);
1626 shared.select6 = createStringObject("select 6\r\n",10);
1627 shared.select7 = createStringObject("select 7\r\n",10);
1628 shared.select8 = createStringObject("select 8\r\n",10);
1629 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1630 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
c8d0ea0e 1631 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
befec3cd 1632 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1633 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1634 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1635 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1636 shared.mbulk3 = createStringObject("*3\r\n",4);
c8d0ea0e 1637 shared.mbulk4 = createStringObject("*4\r\n",4);
05df7621 1638 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1639 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1640 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1641 }
ed9b544e 1642}
1643
1644static void appendServerSaveParams(time_t seconds, int changes) {
1645 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1646 server.saveparams[server.saveparamslen].seconds = seconds;
1647 server.saveparams[server.saveparamslen].changes = changes;
1648 server.saveparamslen++;
1649}
1650
bcfc686d 1651static void resetServerSaveParams() {
ed9b544e 1652 zfree(server.saveparams);
1653 server.saveparams = NULL;
1654 server.saveparamslen = 0;
1655}
1656
1657static void initServerConfig() {
1658 server.dbnum = REDIS_DEFAULT_DBNUM;
1659 server.port = REDIS_SERVERPORT;
f870935d 1660 server.verbosity = REDIS_VERBOSE;
ed9b544e 1661 server.maxidletime = REDIS_MAXIDLETIME;
1662 server.saveparams = NULL;
1663 server.logfile = NULL; /* NULL = log on standard output */
1664 server.bindaddr = NULL;
1665 server.glueoutputbuf = 1;
1666 server.daemonize = 0;
44b38ef4 1667 server.appendonly = 0;
1b677732 1668 server.appendfsync = APPENDFSYNC_EVERYSEC;
48f0308a 1669 server.lastfsync = time(NULL);
44b38ef4 1670 server.appendfd = -1;
1671 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1672 server.pidfile = zstrdup("/var/run/redis.pid");
1673 server.dbfilename = zstrdup("dump.rdb");
1674 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1675 server.requirepass = NULL;
b0553789 1676 server.rdbcompression = 1;
8ca3e9d1 1677 server.activerehashing = 1;
285add55 1678 server.maxclients = 0;
d5d55fc3 1679 server.blpop_blocked_clients = 0;
3fd78bcd 1680 server.maxmemory = 0;
75680a3c 1681 server.vm_enabled = 0;
054e426d 1682 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1683 server.vm_page_size = 256; /* 256 bytes per page */
1684 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1685 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1686 server.vm_max_threads = 4;
d5d55fc3 1687 server.vm_blocked_clients = 0;
cbba7dd7 1688 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1689 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
75680a3c 1690
bcfc686d 1691 resetServerSaveParams();
ed9b544e 1692
1693 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1694 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1695 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1696 /* Replication related */
1697 server.isslave = 0;
d0ccebcf 1698 server.masterauth = NULL;
ed9b544e 1699 server.masterhost = NULL;
1700 server.masterport = 6379;
1701 server.master = NULL;
1702 server.replstate = REDIS_REPL_NONE;
a7866db6 1703
1704 /* Double constants initialization */
1705 R_Zero = 0.0;
1706 R_PosInf = 1.0/R_Zero;
1707 R_NegInf = -1.0/R_Zero;
1708 R_Nan = R_Zero/R_Zero;
ed9b544e 1709}
1710
1711static void initServer() {
1712 int j;
1713
1714 signal(SIGHUP, SIG_IGN);
1715 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1716 setupSigSegvAction();
ed9b544e 1717
b9bc0eef 1718 server.devnull = fopen("/dev/null","w");
1719 if (server.devnull == NULL) {
1720 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1721 exit(1);
1722 }
ed9b544e 1723 server.clients = listCreate();
1724 server.slaves = listCreate();
87eca727 1725 server.monitors = listCreate();
ed9b544e 1726 server.objfreelist = listCreate();
1727 createSharedObjects();
1728 server.el = aeCreateEventLoop();
3305306f 1729 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1730 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1731 if (server.fd == -1) {
1732 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1733 exit(1);
1734 }
3305306f 1735 for (j = 0; j < server.dbnum; j++) {
5234952b 1736 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1737 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
4409877e 1738 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1739 if (server.vm_enabled)
1740 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1741 server.db[j].id = j;
1742 }
ffc6b7f8 1743 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1744 server.pubsub_patterns = listCreate();
1745 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1746 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1747 server.cronloops = 0;
9f3c422c 1748 server.bgsavechildpid = -1;
9d65a1bb 1749 server.bgrewritechildpid = -1;
1750 server.bgrewritebuf = sdsempty();
28ed1f33 1751 server.aofbuf = sdsempty();
ed9b544e 1752 server.lastsave = time(NULL);
1753 server.dirty = 0;
ed9b544e 1754 server.stat_numcommands = 0;
1755 server.stat_numconnections = 0;
2a6a2ed1 1756 server.stat_expiredkeys = 0;
ed9b544e 1757 server.stat_starttime = time(NULL);
3a66edc7 1758 server.unixtime = time(NULL);
d8f8b666 1759 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1760 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1761 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1762
1763 if (server.appendonly) {
3bb225d6 1764 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1765 if (server.appendfd == -1) {
1766 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1767 strerror(errno));
1768 exit(1);
1769 }
1770 }
75680a3c 1771
1772 if (server.vm_enabled) vmInit();
ed9b544e 1773}
1774
1775/* Empty the whole database */
ca37e9cd 1776static long long emptyDb() {
ed9b544e 1777 int j;
ca37e9cd 1778 long long removed = 0;
ed9b544e 1779
3305306f 1780 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1781 removed += dictSize(server.db[j].dict);
3305306f 1782 dictEmpty(server.db[j].dict);
1783 dictEmpty(server.db[j].expires);
1784 }
ca37e9cd 1785 return removed;
ed9b544e 1786}
1787
/* Convert a case insensitive "yes" / "no" string into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1793
ed9b544e 1794/* I agree, this is a very rudimental way to load a configuration...
1795 will improve later if the config gets more complex */
1796static void loadServerConfig(char *filename) {
c9a111ac 1797 FILE *fp;
ed9b544e 1798 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1799 int linenum = 0;
1800 sds line = NULL;
c9a111ac 1801
1802 if (filename[0] == '-' && filename[1] == '\0')
1803 fp = stdin;
1804 else {
1805 if ((fp = fopen(filename,"r")) == NULL) {
9a22de82 1806 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
c9a111ac 1807 exit(1);
1808 }
ed9b544e 1809 }
c9a111ac 1810
ed9b544e 1811 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1812 sds *argv;
1813 int argc, j;
1814
1815 linenum++;
1816 line = sdsnew(buf);
1817 line = sdstrim(line," \t\r\n");
1818
1819 /* Skip comments and blank lines*/
1820 if (line[0] == '#' || line[0] == '\0') {
1821 sdsfree(line);
1822 continue;
1823 }
1824
1825 /* Split into arguments */
1826 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1827 sdstolower(argv[0]);
1828
1829 /* Execute config directives */
bb0b03a3 1830 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1831 server.maxidletime = atoi(argv[1]);
0150db36 1832 if (server.maxidletime < 0) {
ed9b544e 1833 err = "Invalid timeout value"; goto loaderr;
1834 }
bb0b03a3 1835 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1836 server.port = atoi(argv[1]);
1837 if (server.port < 1 || server.port > 65535) {
1838 err = "Invalid port"; goto loaderr;
1839 }
bb0b03a3 1840 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1841 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1842 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1843 int seconds = atoi(argv[1]);
1844 int changes = atoi(argv[2]);
1845 if (seconds < 1 || changes < 0) {
1846 err = "Invalid save parameters"; goto loaderr;
1847 }
1848 appendServerSaveParams(seconds,changes);
bb0b03a3 1849 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1850 if (chdir(argv[1]) == -1) {
1851 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1852 argv[1], strerror(errno));
1853 exit(1);
1854 }
bb0b03a3 1855 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1856 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1857 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1858 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1859 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1860 else {
1861 err = "Invalid log level. Must be one of debug, notice, warning";
1862 goto loaderr;
1863 }
bb0b03a3 1864 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1865 FILE *logfp;
ed9b544e 1866
1867 server.logfile = zstrdup(argv[1]);
bb0b03a3 1868 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1869 zfree(server.logfile);
1870 server.logfile = NULL;
1871 }
1872 if (server.logfile) {
1873 /* Test if we are able to open the file. The server will not
1874 * be able to abort just for this problem later... */
c9a111ac 1875 logfp = fopen(server.logfile,"a");
1876 if (logfp == NULL) {
ed9b544e 1877 err = sdscatprintf(sdsempty(),
1878 "Can't open the log file: %s", strerror(errno));
1879 goto loaderr;
1880 }
c9a111ac 1881 fclose(logfp);
ed9b544e 1882 }
bb0b03a3 1883 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1884 server.dbnum = atoi(argv[1]);
1885 if (server.dbnum < 1) {
1886 err = "Invalid number of databases"; goto loaderr;
1887 }
b3f83f12
JZ
1888 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1889 loadServerConfig(argv[1]);
285add55 1890 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1891 server.maxclients = atoi(argv[1]);
3fd78bcd 1892 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
2b619329 1893 server.maxmemory = memtoll(argv[1],NULL);
bb0b03a3 1894 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1895 server.masterhost = sdsnew(argv[1]);
1896 server.masterport = atoi(argv[2]);
1897 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1898 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1899 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1900 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1901 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1902 err = "argument must be 'yes' or 'no'"; goto loaderr;
1903 }
121f70cf 1904 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1905 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
8ca3e9d1 1906 err = "argument must be 'yes' or 'no'"; goto loaderr;
1907 }
1908 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1909 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
121f70cf 1910 err = "argument must be 'yes' or 'no'"; goto loaderr;
1911 }
bb0b03a3 1912 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1913 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1914 err = "argument must be 'yes' or 'no'"; goto loaderr;
1915 }
44b38ef4 1916 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1917 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1918 err = "argument must be 'yes' or 'no'"; goto loaderr;
1919 }
f3b52411
PN
1920 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1921 zfree(server.appendfilename);
1922 server.appendfilename = zstrdup(argv[1]);
48f0308a 1923 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1924 if (!strcasecmp(argv[1],"no")) {
48f0308a 1925 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1926 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1927 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1928 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1929 server.appendfsync = APPENDFSYNC_EVERYSEC;
1930 } else {
1931 err = "argument must be 'no', 'always' or 'everysec'";
1932 goto loaderr;
1933 }
bb0b03a3 1934 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1935 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1936 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 1937 zfree(server.pidfile);
054e426d 1938 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1939 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 1940 zfree(server.dbfilename);
054e426d 1941 server.dbfilename = zstrdup(argv[1]);
75680a3c 1942 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1943 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1944 err = "argument must be 'yes' or 'no'"; goto loaderr;
1945 }
054e426d 1946 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1947 zfree(server.vm_swap_file);
054e426d 1948 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1949 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2b619329 1950 server.vm_max_memory = memtoll(argv[1],NULL);
4ef8de8a 1951 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2b619329 1952 server.vm_page_size = memtoll(argv[1], NULL);
4ef8de8a 1953 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2b619329 1954 server.vm_pages = memtoll(argv[1], NULL);
92f8e882 1955 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1956 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 1957 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2b619329 1958 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
cbba7dd7 1959 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2b619329 1960 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
ed9b544e 1961 } else {
1962 err = "Bad directive or wrong number of arguments"; goto loaderr;
1963 }
1964 for (j = 0; j < argc; j++)
1965 sdsfree(argv[j]);
1966 zfree(argv);
1967 sdsfree(line);
1968 }
c9a111ac 1969 if (fp != stdin) fclose(fp);
ed9b544e 1970 return;
1971
1972loaderr:
1973 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1974 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1975 fprintf(stderr, ">>> '%s'\n", line);
1976 fprintf(stderr, "%s\n", err);
1977 exit(1);
1978}
1979
1980static void freeClientArgv(redisClient *c) {
1981 int j;
1982
1983 for (j = 0; j < c->argc; j++)
1984 decrRefCount(c->argv[j]);
e8a74421 1985 for (j = 0; j < c->mbargc; j++)
1986 decrRefCount(c->mbargv[j]);
ed9b544e 1987 c->argc = 0;
e8a74421 1988 c->mbargc = 0;
ed9b544e 1989}
1990
1991static void freeClient(redisClient *c) {
1992 listNode *ln;
1993
4409877e 1994 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 1995 * call, we have to set querybuf to NULL *before* to call
1996 * unblockClientWaitingData() to avoid processInputBuffer() will get
1997 * called. Also it is important to remove the file events after
1998 * this, because this call adds the READABLE event. */
4409877e 1999 sdsfree(c->querybuf);
2000 c->querybuf = NULL;
2001 if (c->flags & REDIS_BLOCKED)
b0d8747d 2002 unblockClientWaitingData(c);
4409877e 2003
ffc6b7f8 2004 /* Unsubscribe from all the pubsub channels */
2005 pubsubUnsubscribeAllChannels(c,0);
2006 pubsubUnsubscribeAllPatterns(c,0);
2007 dictRelease(c->pubsub_channels);
2008 listRelease(c->pubsub_patterns);
befec3cd 2009 /* Obvious cleanup */
ed9b544e 2010 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2011 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 2012 listRelease(c->reply);
2013 freeClientArgv(c);
2014 close(c->fd);
92f8e882 2015 /* Remove from the list of clients */
ed9b544e 2016 ln = listSearchKey(server.clients,c);
dfc5e96c 2017 redisAssert(ln != NULL);
ed9b544e 2018 listDelNode(server.clients,ln);
d5d55fc3 2019 /* Remove from the list of clients waiting for swapped keys */
2020 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2021 ln = listSearchKey(server.io_ready_clients,c);
2022 if (ln) {
2023 listDelNode(server.io_ready_clients,ln);
2024 server.vm_blocked_clients--;
2025 }
2026 }
2027 while (server.vm_enabled && listLength(c->io_keys)) {
2028 ln = listFirst(c->io_keys);
2029 dontWaitForSwappedKey(c,ln->value);
92f8e882 2030 }
b3e3d0d7 2031 listRelease(c->io_keys);
befec3cd 2032 /* Master/slave cleanup */
ed9b544e 2033 if (c->flags & REDIS_SLAVE) {
6208b3a7 2034 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2035 close(c->repldbfd);
87eca727 2036 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2037 ln = listSearchKey(l,c);
dfc5e96c 2038 redisAssert(ln != NULL);
87eca727 2039 listDelNode(l,ln);
ed9b544e 2040 }
2041 if (c->flags & REDIS_MASTER) {
2042 server.master = NULL;
2043 server.replstate = REDIS_REPL_CONNECT;
2044 }
befec3cd 2045 /* Release memory */
93ea3759 2046 zfree(c->argv);
e8a74421 2047 zfree(c->mbargv);
6e469882 2048 freeClientMultiState(c);
ed9b544e 2049 zfree(c);
2050}
2051
cc30e368 2052#define GLUEREPLY_UP_TO (1024)
ed9b544e 2053static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 2054 int copylen = 0;
2055 char buf[GLUEREPLY_UP_TO];
6208b3a7 2056 listNode *ln;
c7df85a4 2057 listIter li;
ed9b544e 2058 robj *o;
2059
c7df85a4 2060 listRewind(c->reply,&li);
2061 while((ln = listNext(&li))) {
c28b42ac 2062 int objlen;
2063
ed9b544e 2064 o = ln->value;
c28b42ac 2065 objlen = sdslen(o->ptr);
2066 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2067 memcpy(buf+copylen,o->ptr,objlen);
2068 copylen += objlen;
ed9b544e 2069 listDelNode(c->reply,ln);
c28b42ac 2070 } else {
2071 if (copylen == 0) return;
2072 break;
ed9b544e 2073 }
ed9b544e 2074 }
c28b42ac 2075 /* Now the output buffer is empty, add the new single element */
2076 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2077 listAddNodeHead(c->reply,o);
ed9b544e 2078}
2079
2080static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2081 redisClient *c = privdata;
2082 int nwritten = 0, totwritten = 0, objlen;
2083 robj *o;
2084 REDIS_NOTUSED(el);
2085 REDIS_NOTUSED(mask);
2086
2895e862 2087 /* Use writev() if we have enough buffers to send */
7ea870c0 2088 if (!server.glueoutputbuf &&
e0a62c7f 2089 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 2090 !(c->flags & REDIS_MASTER))
2895e862 2091 {
2092 sendReplyToClientWritev(el, fd, privdata, mask);
2093 return;
2094 }
2895e862 2095
ed9b544e 2096 while(listLength(c->reply)) {
c28b42ac 2097 if (server.glueoutputbuf && listLength(c->reply) > 1)
2098 glueReplyBuffersIfNeeded(c);
2099
ed9b544e 2100 o = listNodeValue(listFirst(c->reply));
2101 objlen = sdslen(o->ptr);
2102
2103 if (objlen == 0) {
2104 listDelNode(c->reply,listFirst(c->reply));
2105 continue;
2106 }
2107
2108 if (c->flags & REDIS_MASTER) {
6f376729 2109 /* Don't reply to a master */
ed9b544e 2110 nwritten = objlen - c->sentlen;
2111 } else {
a4d1ba9a 2112 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2113 if (nwritten <= 0) break;
2114 }
2115 c->sentlen += nwritten;
2116 totwritten += nwritten;
2117 /* If we fully sent the object on head go to the next one */
2118 if (c->sentlen == objlen) {
2119 listDelNode(c->reply,listFirst(c->reply));
2120 c->sentlen = 0;
2121 }
6f376729 2122 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2123 * bytes, in a single threaded server it's a good idea to serve
6f376729 2124 * other clients as well, even if a very large request comes from
2125 * super fast link that is always able to accept data (in real world
12f9d551 2126 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2127 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2128 }
2129 if (nwritten == -1) {
2130 if (errno == EAGAIN) {
2131 nwritten = 0;
2132 } else {
f870935d 2133 redisLog(REDIS_VERBOSE,
ed9b544e 2134 "Error writing to client: %s", strerror(errno));
2135 freeClient(c);
2136 return;
2137 }
2138 }
2139 if (totwritten > 0) c->lastinteraction = time(NULL);
2140 if (listLength(c->reply) == 0) {
2141 c->sentlen = 0;
2142 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2143 }
2144}
2145
2895e862 2146static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2147{
2148 redisClient *c = privdata;
2149 int nwritten = 0, totwritten = 0, objlen, willwrite;
2150 robj *o;
2151 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2152 int offset, ion = 0;
2153 REDIS_NOTUSED(el);
2154 REDIS_NOTUSED(mask);
2155
2156 listNode *node;
2157 while (listLength(c->reply)) {
2158 offset = c->sentlen;
2159 ion = 0;
2160 willwrite = 0;
2161
2162 /* fill-in the iov[] array */
2163 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2164 o = listNodeValue(node);
2165 objlen = sdslen(o->ptr);
2166
e0a62c7f 2167 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2168 break;
2169
2170 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2171 break; /* no more iovecs */
2172
2173 iov[ion].iov_base = ((char*)o->ptr) + offset;
2174 iov[ion].iov_len = objlen - offset;
2175 willwrite += objlen - offset;
2176 offset = 0; /* just for the first item */
2177 ion++;
2178 }
2179
2180 if(willwrite == 0)
2181 break;
2182
2183 /* write all collected blocks at once */
2184 if((nwritten = writev(fd, iov, ion)) < 0) {
2185 if (errno != EAGAIN) {
f870935d 2186 redisLog(REDIS_VERBOSE,
2895e862 2187 "Error writing to client: %s", strerror(errno));
2188 freeClient(c);
2189 return;
2190 }
2191 break;
2192 }
2193
2194 totwritten += nwritten;
2195 offset = c->sentlen;
2196
2197 /* remove written robjs from c->reply */
2198 while (nwritten && listLength(c->reply)) {
2199 o = listNodeValue(listFirst(c->reply));
2200 objlen = sdslen(o->ptr);
2201
2202 if(nwritten >= objlen - offset) {
2203 listDelNode(c->reply, listFirst(c->reply));
2204 nwritten -= objlen - offset;
2205 c->sentlen = 0;
2206 } else {
2207 /* partial write */
2208 c->sentlen += nwritten;
2209 break;
2210 }
2211 offset = 0;
2212 }
2213 }
2214
e0a62c7f 2215 if (totwritten > 0)
2895e862 2216 c->lastinteraction = time(NULL);
2217
2218 if (listLength(c->reply) == 0) {
2219 c->sentlen = 0;
2220 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2221 }
2222}
2223
ed9b544e 2224static struct redisCommand *lookupCommand(char *name) {
2225 int j = 0;
2226 while(cmdTable[j].name != NULL) {
bb0b03a3 2227 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 2228 j++;
2229 }
2230 return NULL;
2231}
2232
2233/* resetClient prepare the client to process the next command */
2234static void resetClient(redisClient *c) {
2235 freeClientArgv(c);
2236 c->bulklen = -1;
e8a74421 2237 c->multibulk = 0;
ed9b544e 2238}
2239
6e469882 2240/* Call() is the core of Redis execution of a command */
2241static void call(redisClient *c, struct redisCommand *cmd) {
2242 long long dirty;
2243
2244 dirty = server.dirty;
2245 cmd->proc(c);
4005fef1 2246 dirty = server.dirty-dirty;
2247
2248 if (server.appendonly && dirty)
6e469882 2249 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2250 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2251 listLength(server.slaves))
248ea310 2252 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2253 if (listLength(server.monitors))
dd142b9c 2254 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2255 server.stat_numcommands++;
2256}
2257
ed9b544e 2258/* If this function gets called we already read a whole
2259 * command, arguments are in the client argv/argc fields.
2260 * processCommand() executes the command or prepares the
2261 * server for a bulk read from the client.
2262 *
2263 * If 1 is returned the client is still alive and valid
2264 * and other operations can be performed by the caller. Otherwise
2265 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2266static int processCommand(redisClient *c) {
2267 struct redisCommand *cmd;
ed9b544e 2268
3fd78bcd 2269 /* Free some memory if needed (maxmemory setting) */
2270 if (server.maxmemory) freeMemoryIfNeeded();
2271
e8a74421 2272 /* Handle the multi bulk command type. This is an alternative protocol
2273 * supported by Redis in order to receive commands that are composed of
2274 * multiple binary-safe "bulk" arguments. The latency of processing is
2275 * a bit higher but this allows things like multi-sets, so if this
2276 * protocol is used only for MSET and similar commands this is a big win. */
2277 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2278 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2279 if (c->multibulk <= 0) {
2280 resetClient(c);
2281 return 1;
2282 } else {
2283 decrRefCount(c->argv[c->argc-1]);
2284 c->argc--;
2285 return 1;
2286 }
2287 } else if (c->multibulk) {
2288 if (c->bulklen == -1) {
2289 if (((char*)c->argv[0]->ptr)[0] != '$') {
2290 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2291 resetClient(c);
2292 return 1;
2293 } else {
2294 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2295 decrRefCount(c->argv[0]);
2296 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2297 c->argc--;
2298 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2299 resetClient(c);
2300 return 1;
2301 }
2302 c->argc--;
2303 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2304 return 1;
2305 }
2306 } else {
2307 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2308 c->mbargv[c->mbargc] = c->argv[0];
2309 c->mbargc++;
2310 c->argc--;
2311 c->multibulk--;
2312 if (c->multibulk == 0) {
2313 robj **auxargv;
2314 int auxargc;
2315
2316 /* Here we need to swap the multi-bulk argc/argv with the
2317 * normal argc/argv of the client structure. */
2318 auxargv = c->argv;
2319 c->argv = c->mbargv;
2320 c->mbargv = auxargv;
2321
2322 auxargc = c->argc;
2323 c->argc = c->mbargc;
2324 c->mbargc = auxargc;
2325
2326 /* We need to set bulklen to something different than -1
2327 * in order for the code below to process the command without
2328 * to try to read the last argument of a bulk command as
2329 * a special argument. */
2330 c->bulklen = 0;
2331 /* continue below and process the command */
2332 } else {
2333 c->bulklen = -1;
2334 return 1;
2335 }
2336 }
2337 }
2338 /* -- end of multi bulk commands processing -- */
2339
ed9b544e 2340 /* The QUIT command is handled as a special case. Normal command
2341 * procs are unable to close the client connection safely */
bb0b03a3 2342 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2343 freeClient(c);
2344 return 0;
2345 }
d5d55fc3 2346
2347 /* Now lookup the command and check ASAP about trivial error conditions
2348 * such wrong arity, bad command name and so forth. */
ed9b544e 2349 cmd = lookupCommand(c->argv[0]->ptr);
2350 if (!cmd) {
2c14807b 2351 addReplySds(c,
2352 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2353 (char*)c->argv[0]->ptr));
ed9b544e 2354 resetClient(c);
2355 return 1;
2356 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2357 (c->argc < -cmd->arity)) {
454d4e43 2358 addReplySds(c,
2359 sdscatprintf(sdsempty(),
2360 "-ERR wrong number of arguments for '%s' command\r\n",
2361 cmd->name));
ed9b544e 2362 resetClient(c);
2363 return 1;
2364 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2365 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2366 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2367
2368 decrRefCount(c->argv[c->argc-1]);
2369 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2370 c->argc--;
2371 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2372 resetClient(c);
2373 return 1;
2374 }
2375 c->argc--;
2376 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2377 /* It is possible that the bulk read is already in the
8d0490e7 2378 * buffer. Check this condition and handle it accordingly.
2379 * This is just a fast path, alternative to call processInputBuffer().
2380 * It's a good idea since the code is small and this condition
2381 * happens most of the times. */
ed9b544e 2382 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2383 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2384 c->argc++;
2385 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2386 } else {
d5d55fc3 2387 /* Otherwise return... there is to read the last argument
2388 * from the socket. */
ed9b544e 2389 return 1;
2390 }
2391 }
942a3961 2392 /* Let's try to encode the bulk object to save space. */
2393 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2394 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2395
e63943a4 2396 /* Check if the user is authenticated */
2397 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2398 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2399 resetClient(c);
2400 return 1;
2401 }
2402
b61a28fe 2403 /* Handle the maxmemory directive */
2404 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2405 zmalloc_used_memory() > server.maxmemory)
2406 {
2407 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2408 resetClient(c);
2409 return 1;
2410 }
2411
d6cc8867 2412 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
e6cca5db 2413 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2414 &&
ffc6b7f8 2415 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2416 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2417 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2418 resetClient(c);
2419 return 1;
2420 }
2421
ed9b544e 2422 /* Exec the command */
18b6cb76 2423 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
6e469882 2424 queueMultiCommand(c,cmd);
2425 addReply(c,shared.queued);
2426 } else {
d5d55fc3 2427 if (server.vm_enabled && server.vm_max_threads > 0 &&
0a6f3f0f 2428 blockClientOnSwappedKeys(c,cmd)) return 1;
6e469882 2429 call(c,cmd);
2430 }
ed9b544e 2431
2432 /* Prepare the client for the next command */
ed9b544e 2433 resetClient(c);
2434 return 1;
2435}
2436
248ea310 2437static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2438 listNode *ln;
c7df85a4 2439 listIter li;
ed9b544e 2440 int outc = 0, j;
93ea3759 2441 robj **outv;
248ea310 2442 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2443 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2444 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2445 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2446 robj *lenobj;
93ea3759 2447
2448 if (argc <= REDIS_STATIC_ARGS) {
2449 outv = static_outv;
2450 } else {
248ea310 2451 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2452 }
248ea310 2453
2454 lenobj = createObject(REDIS_STRING,
2455 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2456 lenobj->refcount = 0;
2457 outv[outc++] = lenobj;
ed9b544e 2458 for (j = 0; j < argc; j++) {
248ea310 2459 lenobj = createObject(REDIS_STRING,
2460 sdscatprintf(sdsempty(),"$%lu\r\n",
2461 (unsigned long) stringObjectLen(argv[j])));
2462 lenobj->refcount = 0;
2463 outv[outc++] = lenobj;
ed9b544e 2464 outv[outc++] = argv[j];
248ea310 2465 outv[outc++] = shared.crlf;
ed9b544e 2466 }
ed9b544e 2467
40d224a9 2468 /* Increment all the refcounts at start and decrement at end in order to
2469 * be sure to free objects if there is no slave in a replication state
2470 * able to be feed with commands */
2471 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2472 listRewind(slaves,&li);
2473 while((ln = listNext(&li))) {
ed9b544e 2474 redisClient *slave = ln->value;
40d224a9 2475
2476 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2477 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2478
2479 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2480 if (slave->slaveseldb != dictid) {
2481 robj *selectcmd;
2482
2483 switch(dictid) {
2484 case 0: selectcmd = shared.select0; break;
2485 case 1: selectcmd = shared.select1; break;
2486 case 2: selectcmd = shared.select2; break;
2487 case 3: selectcmd = shared.select3; break;
2488 case 4: selectcmd = shared.select4; break;
2489 case 5: selectcmd = shared.select5; break;
2490 case 6: selectcmd = shared.select6; break;
2491 case 7: selectcmd = shared.select7; break;
2492 case 8: selectcmd = shared.select8; break;
2493 case 9: selectcmd = shared.select9; break;
2494 default:
2495 selectcmd = createObject(REDIS_STRING,
2496 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2497 selectcmd->refcount = 0;
2498 break;
2499 }
2500 addReply(slave,selectcmd);
2501 slave->slaveseldb = dictid;
2502 }
2503 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2504 }
40d224a9 2505 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2506 if (outv != static_outv) zfree(outv);
ed9b544e 2507}
2508
dd142b9c 2509static sds sdscatrepr(sds s, char *p, size_t len) {
2510 s = sdscatlen(s,"\"",1);
2511 while(len--) {
2512 switch(*p) {
2513 case '\\':
2514 case '"':
2515 s = sdscatprintf(s,"\\%c",*p);
2516 break;
2517 case '\n': s = sdscatlen(s,"\\n",1); break;
2518 case '\r': s = sdscatlen(s,"\\r",1); break;
2519 case '\t': s = sdscatlen(s,"\\t",1); break;
2520 case '\a': s = sdscatlen(s,"\\a",1); break;
2521 case '\b': s = sdscatlen(s,"\\b",1); break;
2522 default:
2523 if (isprint(*p))
2524 s = sdscatprintf(s,"%c",*p);
2525 else
2526 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2527 break;
2528 }
2529 p++;
2530 }
2531 return sdscatlen(s,"\"",1);
2532}
2533
2534static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2535 listNode *ln;
2536 listIter li;
2537 int j;
2538 sds cmdrepr = sdsnew("+");
2539 robj *cmdobj;
2540 struct timeval tv;
2541
2542 gettimeofday(&tv,NULL);
2543 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2544 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2545
2546 for (j = 0; j < argc; j++) {
2547 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2548 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2549 } else {
2550 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2551 sdslen(argv[j]->ptr));
2552 }
2553 if (j != argc-1)
2554 cmdrepr = sdscatlen(cmdrepr," ",1);
2555 }
2556 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2557 cmdobj = createObject(REDIS_STRING,cmdrepr);
2558
2559 listRewind(monitors,&li);
2560 while((ln = listNext(&li))) {
2561 redisClient *monitor = ln->value;
2562 addReply(monitor,cmdobj);
2563 }
2564 decrRefCount(cmdobj);
2565}
2566
638e42ac 2567static void processInputBuffer(redisClient *c) {
ed9b544e 2568again:
4409877e 2569 /* Before to process the input buffer, make sure the client is not
2570 * waitig for a blocking operation such as BLPOP. Note that the first
2571 * iteration the client is never blocked, otherwise the processInputBuffer
2572 * would not be called at all, but after the execution of the first commands
2573 * in the input buffer the client may be blocked, and the "goto again"
2574 * will try to reiterate. The following line will make it return asap. */
92f8e882 2575 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2576 if (c->bulklen == -1) {
2577 /* Read the first line of the query */
2578 char *p = strchr(c->querybuf,'\n');
2579 size_t querylen;
644fafa3 2580
ed9b544e 2581 if (p) {
2582 sds query, *argv;
2583 int argc, j;
e0a62c7f 2584
ed9b544e 2585 query = c->querybuf;
2586 c->querybuf = sdsempty();
2587 querylen = 1+(p-(query));
2588 if (sdslen(query) > querylen) {
2589 /* leave data after the first line of the query in the buffer */
2590 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2591 }
2592 *p = '\0'; /* remove "\n" */
2593 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2594 sdsupdatelen(query);
2595
2596 /* Now we can split the query in arguments */
ed9b544e 2597 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2598 sdsfree(query);
2599
2600 if (c->argv) zfree(c->argv);
2601 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2602
2603 for (j = 0; j < argc; j++) {
ed9b544e 2604 if (sdslen(argv[j])) {
2605 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2606 c->argc++;
2607 } else {
2608 sdsfree(argv[j]);
2609 }
2610 }
2611 zfree(argv);
7c49733c 2612 if (c->argc) {
2613 /* Execute the command. If the client is still valid
2614 * after processCommand() return and there is something
2615 * on the query buffer try to process the next command. */
2616 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2617 } else {
2618 /* Nothing to process, argc == 0. Just process the query
2619 * buffer if it's not empty or return to the caller */
2620 if (sdslen(c->querybuf)) goto again;
2621 }
ed9b544e 2622 return;
644fafa3 2623 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2624 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2625 freeClient(c);
2626 return;
2627 }
2628 } else {
2629 /* Bulk read handling. Note that if we are at this point
2630 the client already sent a command terminated with a newline,
2631 we are reading the bulk data that is actually the last
2632 argument of the command. */
2633 int qbl = sdslen(c->querybuf);
2634
2635 if (c->bulklen <= qbl) {
2636 /* Copy everything but the final CRLF as final argument */
2637 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2638 c->argc++;
2639 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2640 /* Process the command. If the client is still valid after
2641 * the processing and there is more data in the buffer
2642 * try to parse it. */
2643 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2644 return;
2645 }
2646 }
2647}
2648
638e42ac 2649static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2650 redisClient *c = (redisClient*) privdata;
2651 char buf[REDIS_IOBUF_LEN];
2652 int nread;
2653 REDIS_NOTUSED(el);
2654 REDIS_NOTUSED(mask);
2655
2656 nread = read(fd, buf, REDIS_IOBUF_LEN);
2657 if (nread == -1) {
2658 if (errno == EAGAIN) {
2659 nread = 0;
2660 } else {
f870935d 2661 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2662 freeClient(c);
2663 return;
2664 }
2665 } else if (nread == 0) {
f870935d 2666 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2667 freeClient(c);
2668 return;
2669 }
2670 if (nread) {
2671 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2672 c->lastinteraction = time(NULL);
2673 } else {
2674 return;
2675 }
168ac5c6 2676 processInputBuffer(c);
638e42ac 2677}
2678
ed9b544e 2679static int selectDb(redisClient *c, int id) {
2680 if (id < 0 || id >= server.dbnum)
2681 return REDIS_ERR;
3305306f 2682 c->db = &server.db[id];
ed9b544e 2683 return REDIS_OK;
2684}
2685
40d224a9 2686static void *dupClientReplyValue(void *o) {
2687 incrRefCount((robj*)o);
12d090d2 2688 return o;
40d224a9 2689}
2690
/* Match method for lists of string objects: two values match when
 * equalStringObjects() considers them equal. */
static int listMatchObjects(void *a, void *b) {
    int same = equalStringObjects(a,b);

    return same;
}
2694
ed9b544e 2695static redisClient *createClient(int fd) {
2696 redisClient *c = zmalloc(sizeof(*c));
2697
2698 anetNonBlock(NULL,fd);
2699 anetTcpNoDelay(NULL,fd);
2700 if (!c) return NULL;
2701 selectDb(c,0);
2702 c->fd = fd;
2703 c->querybuf = sdsempty();
2704 c->argc = 0;
93ea3759 2705 c->argv = NULL;
ed9b544e 2706 c->bulklen = -1;
e8a74421 2707 c->multibulk = 0;
2708 c->mbargc = 0;
2709 c->mbargv = NULL;
ed9b544e 2710 c->sentlen = 0;
2711 c->flags = 0;
2712 c->lastinteraction = time(NULL);
abcb223e 2713 c->authenticated = 0;
40d224a9 2714 c->replstate = REDIS_REPL_NONE;
6b47e12e 2715 c->reply = listCreate();
ed9b544e 2716 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2717 listSetDupMethod(c->reply,dupClientReplyValue);
92f8e882 2718 c->blockingkeys = NULL;
2719 c->blockingkeysnum = 0;
2720 c->io_keys = listCreate();
2721 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2722 c->pubsub_channels = dictCreate(&setDictType,NULL);
2723 c->pubsub_patterns = listCreate();
2724 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2725 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2726 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2727 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2728 freeClient(c);
2729 return NULL;
2730 }
6b47e12e 2731 listAddNodeTail(server.clients,c);
6e469882 2732 initClientMultiState(c);
ed9b544e 2733 return c;
2734}
2735
2736static void addReply(redisClient *c, robj *obj) {
2737 if (listLength(c->reply) == 0 &&
6208b3a7 2738 (c->replstate == REDIS_REPL_NONE ||
2739 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2740 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2741 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2742
2743 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2744 obj = dupStringObject(obj);
2745 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2746 }
9d65a1bb 2747 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2748}
2749
2750static void addReplySds(redisClient *c, sds s) {
2751 robj *o = createObject(REDIS_STRING,s);
2752 addReply(c,o);
2753 decrRefCount(o);
2754}
2755
e2665397 2756static void addReplyDouble(redisClient *c, double d) {
2757 char buf[128];
2758
2759 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2760 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2761 (unsigned long) strlen(buf),buf));
e2665397 2762}
2763
aa7c2934
PN
2764static void addReplyLongLong(redisClient *c, long long ll) {
2765 char buf[128];
2766 size_t len;
2767
2768 if (ll == 0) {
2769 addReply(c,shared.czero);
2770 return;
2771 } else if (ll == 1) {
2772 addReply(c,shared.cone);
2773 return;
2774 }
482b672d 2775 buf[0] = ':';
2776 len = ll2string(buf+1,sizeof(buf)-1,ll);
2777 buf[len+1] = '\r';
2778 buf[len+2] = '\n';
2779 addReplySds(c,sdsnewlen(buf,len+3));
aa7c2934
PN
2780}
2781
92b27fe9 2782static void addReplyUlong(redisClient *c, unsigned long ul) {
2783 char buf[128];
2784 size_t len;
2785
dd88747b 2786 if (ul == 0) {
2787 addReply(c,shared.czero);
2788 return;
2789 } else if (ul == 1) {
2790 addReply(c,shared.cone);
2791 return;
2792 }
92b27fe9 2793 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2794 addReplySds(c,sdsnewlen(buf,len));
2795}
2796
942a3961 2797static void addReplyBulkLen(redisClient *c, robj *obj) {
482b672d 2798 size_t len, intlen;
2799 char buf[128];
942a3961 2800
2801 if (obj->encoding == REDIS_ENCODING_RAW) {
2802 len = sdslen(obj->ptr);
2803 } else {
2804 long n = (long)obj->ptr;
2805
e054afda 2806 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2807 len = 1;
2808 if (n < 0) {
2809 len++;
2810 n = -n;
2811 }
2812 while((n = n/10) != 0) {
2813 len++;
2814 }
2815 }
482b672d 2816 buf[0] = '$';
2817 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2818 buf[intlen+1] = '\r';
2819 buf[intlen+2] = '\n';
2820 addReplySds(c,sdsnewlen(buf,intlen+3));
942a3961 2821}
2822
dd88747b 2823static void addReplyBulk(redisClient *c, robj *obj) {
2824 addReplyBulkLen(c,obj);
2825 addReply(c,obj);
2826 addReply(c,shared.crlf);
2827}
2828
500ece7c 2829/* In the CONFIG command we need to add vanilla C string as bulk replies */
2830static void addReplyBulkCString(redisClient *c, char *s) {
2831 if (s == NULL) {
2832 addReply(c,shared.nullbulk);
2833 } else {
2834 robj *o = createStringObject(s,strlen(s));
2835 addReplyBulk(c,o);
2836 decrRefCount(o);
2837 }
2838}
2839
ed9b544e 2840static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2841 int cport, cfd;
2842 char cip[128];
285add55 2843 redisClient *c;
ed9b544e 2844 REDIS_NOTUSED(el);
2845 REDIS_NOTUSED(mask);
2846 REDIS_NOTUSED(privdata);
2847
2848 cfd = anetAccept(server.neterr, fd, cip, &cport);
2849 if (cfd == AE_ERR) {
f870935d 2850 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2851 return;
2852 }
f870935d 2853 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2854 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2855 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2856 close(cfd); /* May be already closed, just ingore errors */
2857 return;
2858 }
285add55 2859 /* If maxclient directive is set and this is one client more... close the
2860 * connection. Note that we create the client instead to check before
2861 * for this condition, since now the socket is already set in nonblocking
2862 * mode and we can send an error for free using the Kernel I/O */
2863 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2864 char *err = "-ERR max number of clients reached\r\n";
2865
2866 /* That's a best effort error message, don't check write errors */
fee803ba 2867 if (write(c->fd,err,strlen(err)) == -1) {
2868 /* Nothing to do, Just to avoid the warning... */
2869 }
285add55 2870 freeClient(c);
2871 return;
2872 }
ed9b544e 2873 server.stat_numconnections++;
2874}
2875
2876/* ======================= Redis objects implementation ===================== */
2877
2878static robj *createObject(int type, void *ptr) {
2879 robj *o;
2880
a5819310 2881 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2882 if (listLength(server.objfreelist)) {
2883 listNode *head = listFirst(server.objfreelist);
2884 o = listNodeValue(head);
2885 listDelNode(server.objfreelist,head);
a5819310 2886 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2887 } else {
75680a3c 2888 if (server.vm_enabled) {
a5819310 2889 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2890 o = zmalloc(sizeof(*o));
2891 } else {
2892 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2893 }
ed9b544e 2894 }
ed9b544e 2895 o->type = type;
942a3961 2896 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2897 o->ptr = ptr;
2898 o->refcount = 1;
3a66edc7 2899 if (server.vm_enabled) {
1064ef87 2900 /* Note that this code may run in the context of an I/O thread
2901 * and accessing to server.unixtime in theory is an error
2902 * (no locks). But in practice this is safe, and even if we read
2903 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2904 o->vm.atime = server.unixtime;
2905 o->storage = REDIS_VM_MEMORY;
2906 }
ed9b544e 2907 return o;
2908}
2909
2910static robj *createStringObject(char *ptr, size_t len) {
2911 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2912}
2913
3f973463
PN
2914static robj *createStringObjectFromLongLong(long long value) {
2915 robj *o;
2916 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2917 incrRefCount(shared.integers[value]);
2918 o = shared.integers[value];
2919 } else {
2920 o = createObject(REDIS_STRING, NULL);
2921 if (value >= LONG_MIN && value <= LONG_MAX) {
2922 o->encoding = REDIS_ENCODING_INT;
2923 o->ptr = (void*)((long)value);
2924 } else {
ee14da56 2925 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3f973463
PN
2926 }
2927 }
2928 return o;
2929}
2930
4ef8de8a 2931static robj *dupStringObject(robj *o) {
b9bc0eef 2932 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2933 return createStringObject(o->ptr,sdslen(o->ptr));
2934}
2935
ed9b544e 2936static robj *createListObject(void) {
2937 list *l = listCreate();
2938
ed9b544e 2939 listSetFreeMethod(l,decrRefCount);
2940 return createObject(REDIS_LIST,l);
2941}
2942
2943static robj *createSetObject(void) {
2944 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2945 return createObject(REDIS_SET,d);
2946}
2947
5234952b 2948static robj *createHashObject(void) {
2949 /* All the Hashes start as zipmaps. Will be automatically converted
2950 * into hash tables if there are enough elements or big elements
2951 * inside. */
2952 unsigned char *zm = zipmapNew();
2953 robj *o = createObject(REDIS_HASH,zm);
2954 o->encoding = REDIS_ENCODING_ZIPMAP;
2955 return o;
2956}
2957
1812e024 2958static robj *createZsetObject(void) {
6b47e12e 2959 zset *zs = zmalloc(sizeof(*zs));
2960
2961 zs->dict = dictCreate(&zsetDictType,NULL);
2962 zs->zsl = zslCreate();
2963 return createObject(REDIS_ZSET,zs);
1812e024 2964}
2965
ed9b544e 2966static void freeStringObject(robj *o) {
942a3961 2967 if (o->encoding == REDIS_ENCODING_RAW) {
2968 sdsfree(o->ptr);
2969 }
ed9b544e 2970}
2971
2972static void freeListObject(robj *o) {
2973 listRelease((list*) o->ptr);
2974}
2975
2976static void freeSetObject(robj *o) {
2977 dictRelease((dict*) o->ptr);
2978}
2979
fd8ccf44 2980static void freeZsetObject(robj *o) {
2981 zset *zs = o->ptr;
2982
2983 dictRelease(zs->dict);
2984 zslFree(zs->zsl);
2985 zfree(zs);
2986}
2987
ed9b544e 2988static void freeHashObject(robj *o) {
cbba7dd7 2989 switch (o->encoding) {
2990 case REDIS_ENCODING_HT:
2991 dictRelease((dict*) o->ptr);
2992 break;
2993 case REDIS_ENCODING_ZIPMAP:
2994 zfree(o->ptr);
2995 break;
2996 default:
f83c6cb5 2997 redisPanic("Unknown hash encoding type");
cbba7dd7 2998 break;
2999 }
ed9b544e 3000}
3001
3002static void incrRefCount(robj *o) {
3003 o->refcount++;
3004}
3005
3006static void decrRefCount(void *obj) {
3007 robj *o = obj;
94754ccc 3008
c651fd9e 3009 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
970e10bb 3010 /* Object is a key of a swapped out value, or in the process of being
3011 * loaded. */
996cb5f7 3012 if (server.vm_enabled &&
3013 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3014 {
996cb5f7 3015 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 3016 redisAssert(o->type == REDIS_STRING);
a35ddf12 3017 freeStringObject(o);
3018 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 3019 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 3020 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3021 !listAddNodeHead(server.objfreelist,o))
3022 zfree(o);
a5819310 3023 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 3024 server.vm_stats_swapped_objects--;
a35ddf12 3025 return;
3026 }
996cb5f7 3027 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 3028 if (--(o->refcount) == 0) {
996cb5f7 3029 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3030 vmCancelThreadedIOJob(obj);
ed9b544e 3031 switch(o->type) {
3032 case REDIS_STRING: freeStringObject(o); break;
3033 case REDIS_LIST: freeListObject(o); break;
3034 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 3035 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 3036 case REDIS_HASH: freeHashObject(o); break;
f83c6cb5 3037 default: redisPanic("Unknown object type"); break;
ed9b544e 3038 }
a5819310 3039 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 3040 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3041 !listAddNodeHead(server.objfreelist,o))
3042 zfree(o);
a5819310 3043 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3044 }
3045}
3046
942a3961 3047static robj *lookupKey(redisDb *db, robj *key) {
3048 dictEntry *de = dictFind(db->dict,key);
3a66edc7 3049 if (de) {
55cf8433 3050 robj *key = dictGetEntryKey(de);
3051 robj *val = dictGetEntryVal(de);
3a66edc7 3052
55cf8433 3053 if (server.vm_enabled) {
996cb5f7 3054 if (key->storage == REDIS_VM_MEMORY ||
3055 key->storage == REDIS_VM_SWAPPING)
3056 {
3057 /* If we were swapping the object out, stop it, this key
3058 * was requested. */
3059 if (key->storage == REDIS_VM_SWAPPING)
3060 vmCancelThreadedIOJob(key);
55cf8433 3061 /* Update the access time of the key for the aging algorithm. */
3062 key->vm.atime = server.unixtime;
3063 } else {
d5d55fc3 3064 int notify = (key->storage == REDIS_VM_LOADING);
3065
55cf8433 3066 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 3067 redisAssert(val == NULL);
55cf8433 3068 val = vmLoadObject(key);
3069 dictGetEntryVal(de) = val;
d5d55fc3 3070
3071 /* Clients blocked by the VM subsystem may be waiting for
3072 * this key... */
3073 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 3074 }
3075 }
3076 return val;
3a66edc7 3077 } else {
3078 return NULL;
3079 }
942a3961 3080}
3081
3082static robj *lookupKeyRead(redisDb *db, robj *key) {
3083 expireIfNeeded(db,key);
3084 return lookupKey(db,key);
3085}
3086
3087static robj *lookupKeyWrite(redisDb *db, robj *key) {
3088 deleteIfVolatile(db,key);
3089 return lookupKey(db,key);
3090}
3091
92b27fe9 3092static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3093 robj *o = lookupKeyRead(c->db, key);
3094 if (!o) addReply(c,reply);
3095 return o;
3096}
3097
3098static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3099 robj *o = lookupKeyWrite(c->db, key);
3100 if (!o) addReply(c,reply);
3101 return o;
3102}
3103
3104static int checkType(redisClient *c, robj *o, int type) {
3105 if (o->type != type) {
3106 addReply(c,shared.wrongtypeerr);
3107 return 1;
3108 }
3109 return 0;
3110}
3111
942a3961 3112static int deleteKey(redisDb *db, robj *key) {
3113 int retval;
3114
3115 /* We need to protect key from destruction: after the first dictDelete()
3116 * it may happen that 'key' is no longer valid if we don't increment
3117 * it's count. This may happen when we get the object reference directly
3118 * from the hash table with dictRandomKey() or dict iterators */
3119 incrRefCount(key);
3120 if (dictSize(db->expires)) dictDelete(db->expires,key);
3121 retval = dictDelete(db->dict,key);
3122 decrRefCount(key);
3123
3124 return retval == DICT_OK;
3125}
3126
724a51b1 3127/* Check if the nul-terminated string 's' can be represented by a long
3128 * (that is, is a number that fits into long without any other space or
3129 * character before or after the digits).
3130 *
3131 * If so, the function returns REDIS_OK and *longval is set to the value
3132 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 3133static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 3134 char buf[32], *endptr;
3135 long value;
3136 int slen;
e0a62c7f 3137
724a51b1 3138 value = strtol(s, &endptr, 10);
3139 if (endptr[0] != '\0') return REDIS_ERR;
ee14da56 3140 slen = ll2string(buf,32,value);
724a51b1 3141
3142 /* If the number converted back into a string is not identical
3143 * then it's not possible to encode the string as integer */
f69f2cba 3144 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 3145 if (longval) *longval = value;
3146 return REDIS_OK;
3147}
3148
942a3961 3149/* Try to encode a string object in order to save space */
05df7621 3150static robj *tryObjectEncoding(robj *o) {
942a3961 3151 long value;
942a3961 3152 sds s = o->ptr;
3305306f 3153
942a3961 3154 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 3155 return o; /* Already encoded */
3305306f 3156
05df7621 3157 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 3158 * everywhere in the "object space" of Redis. Encoded objects can only
3159 * appear as "values" (and not, for instance, as keys) */
05df7621 3160 if (o->refcount > 1) return o;
3305306f 3161
942a3961 3162 /* Currently we try to encode only strings */
dfc5e96c 3163 redisAssert(o->type == REDIS_STRING);
94754ccc 3164
724a51b1 3165 /* Check if we can represent this string as a long integer */
05df7621 3166 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 3167
3168 /* Ok, this object can be encoded */
05df7621 3169 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3170 decrRefCount(o);
3171 incrRefCount(shared.integers[value]);
3172 return shared.integers[value];
3173 } else {
3174 o->encoding = REDIS_ENCODING_INT;
3175 sdsfree(o->ptr);
3176 o->ptr = (void*) value;
3177 return o;
3178 }
942a3961 3179}
3180
9d65a1bb 3181/* Get a decoded version of an encoded object (returned as a new object).
3182 * If the object is already raw-encoded just increment the ref count. */
3183static robj *getDecodedObject(robj *o) {
942a3961 3184 robj *dec;
e0a62c7f 3185
9d65a1bb 3186 if (o->encoding == REDIS_ENCODING_RAW) {
3187 incrRefCount(o);
3188 return o;
3189 }
942a3961 3190 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3191 char buf[32];
3192
ee14da56 3193 ll2string(buf,32,(long)o->ptr);
942a3961 3194 dec = createStringObject(buf,strlen(buf));
3195 return dec;
3196 } else {
08ee9b57 3197 redisPanic("Unknown encoding type");
942a3961 3198 }
3305306f 3199}
3200
d7f43c08 3201/* Compare two string objects via strcmp() or alike.
3202 * Note that the objects may be integer-encoded. In such a case we
ee14da56 3203 * use ll2string() to get a string representation of the numbers on the stack
1fd9bc8a 3204 * and compare the strings, it's much faster than calling getDecodedObject().
3205 *
3206 * Important note: if objects are not integer encoded, but binary-safe strings,
3207 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3208 * binary safe. */
724a51b1 3209static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3210 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3211 char bufa[128], bufb[128], *astr, *bstr;
3212 int bothsds = 1;
724a51b1 3213
e197b441 3214 if (a == b) return 0;
d7f43c08 3215 if (a->encoding != REDIS_ENCODING_RAW) {
ee14da56 3216 ll2string(bufa,sizeof(bufa),(long) a->ptr);
d7f43c08 3217 astr = bufa;
3218 bothsds = 0;
724a51b1 3219 } else {
d7f43c08 3220 astr = a->ptr;
724a51b1 3221 }
d7f43c08 3222 if (b->encoding != REDIS_ENCODING_RAW) {
ee14da56 3223 ll2string(bufb,sizeof(bufb),(long) b->ptr);
d7f43c08 3224 bstr = bufb;
3225 bothsds = 0;
3226 } else {
3227 bstr = b->ptr;
3228 }
3229 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3230}
3231
bf028098 3232/* Equal string objects return 1 if the two objects are the same from the
3233 * point of view of a string comparison, otherwise 0 is returned. Note that
3234 * this function is faster then checking for (compareStringObject(a,b) == 0)
3235 * because it can perform some more optimization. */
3236static int equalStringObjects(robj *a, robj *b) {
3237 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3238 return a->ptr == b->ptr;
3239 } else {
3240 return compareStringObjects(a,b) == 0;
3241 }
3242}
3243
0ea663ea 3244static size_t stringObjectLen(robj *o) {
dfc5e96c 3245 redisAssert(o->type == REDIS_STRING);
0ea663ea 3246 if (o->encoding == REDIS_ENCODING_RAW) {
3247 return sdslen(o->ptr);
3248 } else {
3249 char buf[32];
3250
ee14da56 3251 return ll2string(buf,32,(long)o->ptr);
0ea663ea 3252 }
3253}
3254
bd79a6bd
PN
3255static int getDoubleFromObject(robj *o, double *target) {
3256 double value;
682c73e8 3257 char *eptr;
bbe025e0 3258
bd79a6bd
PN
3259 if (o == NULL) {
3260 value = 0;
3261 } else {
3262 redisAssert(o->type == REDIS_STRING);
3263 if (o->encoding == REDIS_ENCODING_RAW) {
3264 value = strtod(o->ptr, &eptr);
682c73e8 3265 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3266 } else if (o->encoding == REDIS_ENCODING_INT) {
3267 value = (long)o->ptr;
3268 } else {
946342c1 3269 redisPanic("Unknown string encoding");
bd79a6bd
PN
3270 }
3271 }
3272
bd79a6bd
PN
3273 *target = value;
3274 return REDIS_OK;
3275}
bbe025e0 3276
bd79a6bd
PN
3277static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3278 double value;
3279 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3280 if (msg != NULL) {
3281 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3282 } else {
3283 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3284 }
bbe025e0
AM
3285 return REDIS_ERR;
3286 }
3287
bd79a6bd 3288 *target = value;
bbe025e0
AM
3289 return REDIS_OK;
3290}
3291
bd79a6bd
PN
3292static int getLongLongFromObject(robj *o, long long *target) {
3293 long long value;
682c73e8 3294 char *eptr;
bbe025e0 3295
bd79a6bd
PN
3296 if (o == NULL) {
3297 value = 0;
3298 } else {
3299 redisAssert(o->type == REDIS_STRING);
3300 if (o->encoding == REDIS_ENCODING_RAW) {
3301 value = strtoll(o->ptr, &eptr, 10);
682c73e8 3302 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3303 } else if (o->encoding == REDIS_ENCODING_INT) {
3304 value = (long)o->ptr;
3305 } else {
946342c1 3306 redisPanic("Unknown string encoding");
bd79a6bd
PN
3307 }
3308 }
3309
bd79a6bd
PN
3310 *target = value;
3311 return REDIS_OK;
3312}
bbe025e0 3313
bd79a6bd
PN
3314static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3315 long long value;
3316 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3317 if (msg != NULL) {
3318 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3319 } else {
3320 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3321 }
bbe025e0
AM
3322 return REDIS_ERR;
3323 }
3324
bd79a6bd 3325 *target = value;
bbe025e0
AM
3326 return REDIS_OK;
3327}
3328
bd79a6bd
PN
3329static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3330 long long value;
bbe025e0 3331
bd79a6bd
PN
3332 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3333 if (value < LONG_MIN || value > LONG_MAX) {
3334 if (msg != NULL) {
3335 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3336 } else {
3337 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3338 }
bbe025e0
AM
3339 return REDIS_ERR;
3340 }
3341
bd79a6bd 3342 *target = value;
bbe025e0
AM
3343 return REDIS_OK;
3344}
3345
06233c45 3346/*============================ RDB saving/loading =========================== */
ed9b544e 3347
/* Write the one-byte 'type' to 'fp'. Returns 0 on success, -1 if the byte
 * could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3352
/* Write time 't' to 'fp' as a 4-byte int32_t, copied as it is laid out in
 * memory. Returns 0 on success, -1 if the write fails. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,4,1,fp) == 0) ? -1 : 0;
}
3358
e3566d4b 3359/* check rdbLoadLen() comments for more info */
f78fd11b 3360static int rdbSaveLen(FILE *fp, uint32_t len) {
3361 unsigned char buf[2];
3362
3363 if (len < (1<<6)) {
3364 /* Save a 6 bit len */
10c43610 3365 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3366 if (fwrite(buf,1,1,fp) == 0) return -1;
3367 } else if (len < (1<<14)) {
3368 /* Save a 14 bit len */
10c43610 3369 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3370 buf[1] = len&0xFF;
17be1a4a 3371 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3372 } else {
3373 /* Save a 32 bit len */
10c43610 3374 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3375 if (fwrite(buf,1,1,fp) == 0) return -1;
3376 len = htonl(len);
3377 if (fwrite(&len,4,1,fp) == 0) return -1;
3378 }
3379 return 0;
3380}
3381
32a66513 3382/* Encode 'value' as an integer if possible (if integer will fit the
3383 * supported range). If the function sucessful encoded the integer
3384 * then the (up to 5 bytes) encoded representation is written in the
3385 * string pointed by 'enc' and the length is returned. Otherwise
3386 * 0 is returned. */
3387static int rdbEncodeInteger(long long value, unsigned char *enc) {
e3566d4b 3388 /* Finally check if it fits in our ranges */
3389 if (value >= -(1<<7) && value <= (1<<7)-1) {
3390 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3391 enc[1] = value&0xFF;
3392 return 2;
3393 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3394 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3395 enc[1] = value&0xFF;
3396 enc[2] = (value>>8)&0xFF;
3397 return 3;
3398 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3399 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3400 enc[1] = value&0xFF;
3401 enc[2] = (value>>8)&0xFF;
3402 enc[3] = (value>>16)&0xFF;
3403 enc[4] = (value>>24)&0xFF;
3404 return 5;
3405 } else {
3406 return 0;
3407 }
3408}
3409
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes that are NOT required to be NUL terminated.
 * On success the encoded form is written to 'enc' and its length is
 * returned; 0 is returned when the string can't be encoded as an integer. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, buf[32], num[32];

    /* Work on a private, NUL terminated copy: strtoll() needs a terminator
     * and the input may lack one (rdbSaveRawString() forwards raw
     * pointer+length pairs, such as zipmap fields). Strings that don't fit
     * in the copy could never round-trip through ll2string() anyway. */
    if (len >= sizeof(num)) return 0;
    memcpy(num,s,len);
    num[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(num, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3428
b1befe6a 3429static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3430 size_t comprlen, outlen;
774e3047 3431 unsigned char byte;
3432 void *out;
3433
3434 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3435 if (len <= 4) return 0;
3436 outlen = len-4;
3a2694c4 3437 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3438 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3439 if (comprlen == 0) {
88e85998 3440 zfree(out);
774e3047 3441 return 0;
3442 }
3443 /* Data compressed! Let's save it on disk */
3444 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3445 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3446 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3447 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3448 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3449 zfree(out);
774e3047 3450 return comprlen;
3451
3452writeerr:
88e85998 3453 zfree(out);
774e3047 3454 return -1;
3455}
3456
e3566d4b 3457/* Save a string objet as [len][data] on disk. If the object is a string
3458 * representation of an integer value we try to safe it in a special form */
b1befe6a 3459static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3460 int enclen;
10c43610 3461
774e3047 3462 /* Try integer encoding */
e3566d4b 3463 if (len <= 11) {
3464 unsigned char buf[5];
b1befe6a 3465 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3466 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3467 return 0;
3468 }
3469 }
774e3047 3470
3471 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3472 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3473 if (server.rdbcompression && len > 20) {
774e3047 3474 int retval;
3475
b1befe6a 3476 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3477 if (retval == -1) return -1;
3478 if (retval > 0) return 0;
3479 /* retval == 0 means data can't be compressed, save the old way */
3480 }
3481
3482 /* Store verbatim */
10c43610 3483 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3484 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3485 return 0;
3486}
3487
942a3961 3488/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3489static int rdbSaveStringObject(FILE *fp, robj *obj) {
3490 int retval;
942a3961 3491
32a66513 3492 /* Avoid to decode the object, then encode it again, if the
3493 * object is alrady integer encoded. */
3494 if (obj->encoding == REDIS_ENCODING_INT) {
3495 long val = (long) obj->ptr;
3496 unsigned char buf[5];
3497 int enclen;
3498
3499 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3500 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3501 return 0;
3502 }
3503 /* otherwise... fall throught and continue with the usual
3504 * code path. */
3505 }
3506
f2d9f50f 3507 /* Avoid incr/decr ref count business when possible.
3508 * This plays well with copy-on-write given that we are probably
3509 * in a child process (BGSAVE). Also this makes sure key objects
3510 * of swapped objects are not incRefCount-ed (an assert does not allow
3511 * this in order to avoid bugs) */
3512 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3513 obj = getDecodedObject(obj);
b1befe6a 3514 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3515 decrRefCount(obj);
3516 } else {
b1befe6a 3517 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3518 }
9d65a1bb 3519 return retval;
942a3961 3520}
3521
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -(2^52)+1 */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so only sizeof(buf)-1 bytes are
             * available to the printing function. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3564
06233c45 3565/* Save a Redis object. */
3566static int rdbSaveObject(FILE *fp, robj *o) {
3567 if (o->type == REDIS_STRING) {
3568 /* Save a string value */
3569 if (rdbSaveStringObject(fp,o) == -1) return -1;
3570 } else if (o->type == REDIS_LIST) {
3571 /* Save a list value */
3572 list *list = o->ptr;
c7df85a4 3573 listIter li;
06233c45 3574 listNode *ln;
3575
06233c45 3576 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3577 listRewind(list,&li);
3578 while((ln = listNext(&li))) {
06233c45 3579 robj *eleobj = listNodeValue(ln);
3580
3581 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3582 }
3583 } else if (o->type == REDIS_SET) {
3584 /* Save a set value */
3585 dict *set = o->ptr;
3586 dictIterator *di = dictGetIterator(set);
3587 dictEntry *de;
3588
3589 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3590 while((de = dictNext(di)) != NULL) {
3591 robj *eleobj = dictGetEntryKey(de);
3592
3593 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3594 }
3595 dictReleaseIterator(di);
3596 } else if (o->type == REDIS_ZSET) {
3597 /* Save a set value */
3598 zset *zs = o->ptr;
3599 dictIterator *di = dictGetIterator(zs->dict);
3600 dictEntry *de;
3601
3602 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3603 while((de = dictNext(di)) != NULL) {
3604 robj *eleobj = dictGetEntryKey(de);
3605 double *score = dictGetEntryVal(de);
3606
3607 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3608 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3609 }
3610 dictReleaseIterator(di);
b1befe6a 3611 } else if (o->type == REDIS_HASH) {
3612 /* Save a hash value */
3613 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3614 unsigned char *p = zipmapRewind(o->ptr);
3615 unsigned int count = zipmapLen(o->ptr);
3616 unsigned char *key, *val;
3617 unsigned int klen, vlen;
3618
3619 if (rdbSaveLen(fp,count) == -1) return -1;
3620 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3621 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3622 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3623 }
3624 } else {
3625 dictIterator *di = dictGetIterator(o->ptr);
3626 dictEntry *de;
3627
3628 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3629 while((de = dictNext(di)) != NULL) {
3630 robj *key = dictGetEntryKey(de);
3631 robj *val = dictGetEntryVal(de);
3632
3633 if (rdbSaveStringObject(fp,key) == -1) return -1;
3634 if (rdbSaveStringObject(fp,val) == -1) return -1;
3635 }
3636 dictReleaseIterator(di);
3637 }
06233c45 3638 } else {
f83c6cb5 3639 redisPanic("Unknown object type");
06233c45 3640 }
3641 return 0;
3642}
3643
3644/* Return the length the object will have on disk if saved with
3645 * the rdbSaveObject() function. Currently we use a trick to get
3646 * this length with very little changes to the code. In the future
3647 * we could switch to a faster solution. */
b9bc0eef 3648static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3649 if (fp == NULL) fp = server.devnull;
06233c45 3650 rewind(fp);
3651 assert(rdbSaveObject(fp,o) != 1);
3652 return ftello(fp);
3653}
3654
06224fec 3655/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3656static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3657 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3658
06224fec 3659 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3660}
3661
ed9b544e 3662/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3663static int rdbSave(char *filename) {
ed9b544e 3664 dictIterator *di = NULL;
3665 dictEntry *de;
ed9b544e 3666 FILE *fp;
3667 char tmpfile[256];
3668 int j;
bb32ede5 3669 time_t now = time(NULL);
ed9b544e 3670
2316bb3b 3671 /* Wait for I/O therads to terminate, just in case this is a
3672 * foreground-saving, to avoid seeking the swap file descriptor at the
3673 * same time. */
3674 if (server.vm_enabled)
3675 waitEmptyIOJobsQueue();
3676
a3b21203 3677 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3678 fp = fopen(tmpfile,"w");
3679 if (!fp) {
3680 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3681 return REDIS_ERR;
3682 }
f78fd11b 3683 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3684 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3685 redisDb *db = server.db+j;
3686 dict *d = db->dict;
3305306f 3687 if (dictSize(d) == 0) continue;
ed9b544e 3688 di = dictGetIterator(d);
3689 if (!di) {
3690 fclose(fp);
3691 return REDIS_ERR;
3692 }
3693
3694 /* Write the SELECT DB opcode */
f78fd11b 3695 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3696 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3697
3698 /* Iterate this DB writing every entry */
3699 while((de = dictNext(di)) != NULL) {
3700 robj *key = dictGetEntryKey(de);
3701 robj *o = dictGetEntryVal(de);
bb32ede5 3702 time_t expiretime = getExpire(db,key);
3703
3704 /* Save the expire time */
3705 if (expiretime != -1) {
3706 /* If this key is already expired skip it */
3707 if (expiretime < now) continue;
3708 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3709 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3710 }
7e69548d 3711 /* Save the key and associated value. This requires special
3712 * handling if the value is swapped out. */
996cb5f7 3713 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3714 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3715 /* Save type, key, value */
3716 if (rdbSaveType(fp,o->type) == -1) goto werr;
3717 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3718 if (rdbSaveObject(fp,o) == -1) goto werr;
3719 } else {
996cb5f7 3720 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3721 robj *po;
7e69548d 3722 /* Get a preview of the object in memory */
3723 po = vmPreviewObject(key);
7e69548d 3724 /* Save type, key, value */
3725 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3726 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3727 if (rdbSaveObject(fp,po) == -1) goto werr;
3728 /* Remove the loaded object from memory */
3729 decrRefCount(po);
7e69548d 3730 }
ed9b544e 3731 }
3732 dictReleaseIterator(di);
3733 }
3734 /* EOF opcode */
f78fd11b 3735 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3736
3737 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3738 fflush(fp);
3739 fsync(fileno(fp));
3740 fclose(fp);
e0a62c7f 3741
ed9b544e 3742 /* Use RENAME to make sure the DB file is changed atomically only
3743 * if the generate DB file is ok. */
3744 if (rename(tmpfile,filename) == -1) {
325d1eb4 3745 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3746 unlink(tmpfile);
3747 return REDIS_ERR;
3748 }
3749 redisLog(REDIS_NOTICE,"DB saved on disk");
3750 server.dirty = 0;
3751 server.lastsave = time(NULL);
3752 return REDIS_OK;
3753
3754werr:
3755 fclose(fp);
3756 unlink(tmpfile);
3757 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3758 if (di) dictReleaseIterator(di);
3759 return REDIS_ERR;
3760}
3761
f78fd11b 3762static int rdbSaveBackground(char *filename) {
ed9b544e 3763 pid_t childpid;
3764
9d65a1bb 3765 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3766 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3767 if ((childpid = fork()) == 0) {
3768 /* Child */
054e426d 3769 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3770 close(server.fd);
f78fd11b 3771 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3772 _exit(0);
ed9b544e 3773 } else {
478c2c6f 3774 _exit(1);
ed9b544e 3775 }
3776 } else {
3777 /* Parent */
5a7c647e 3778 if (childpid == -1) {
3779 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3780 strerror(errno));
3781 return REDIS_ERR;
3782 }
ed9b544e 3783 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3784 server.bgsavechildpid = childpid;
884d4b39 3785 updateDictResizePolicy();
ed9b544e 3786 return REDIS_OK;
3787 }
3788 return REDIS_OK; /* unreached */
3789}
3790
/* Remove the temporary RDB file named after process id 'childpid'
 * ("temp-<pid>.rdb"). The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3797
/* Read a one-byte type from 'fp'. Returns the byte as a non-negative int,
 * or -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    return (fread(&type,1,1,fp) == 0) ? -1 : type;
}
3803
/* Read a 4-byte int32_t time value from 'fp' (the counterpart of
 * rdbSaveTime()). Returns the value converted to time_t, or -1 if the four
 * bytes could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,4,1,fp) == 0) return -1;
    return (time_t) stamp;
}
3809
e3566d4b 3810/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3811 * of this file for a description of how this are stored on disk.
3812 *
3813 * isencoded is set to 1 if the readed length is not actually a length but
3814 * an "encoding type", check the above comments for more info */
c78a8ccc 3815static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3816 unsigned char buf[2];
3817 uint32_t len;
c78a8ccc 3818 int type;
f78fd11b 3819
e3566d4b 3820 if (isencoded) *isencoded = 0;
c78a8ccc 3821 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3822 type = (buf[0]&0xC0)>>6;
3823 if (type == REDIS_RDB_6BITLEN) {
3824 /* Read a 6 bit len */
3825 return buf[0]&0x3F;
3826 } else if (type == REDIS_RDB_ENCVAL) {
3827 /* Read a 6 bit len encoding type */
3828 if (isencoded) *isencoded = 1;
3829 return buf[0]&0x3F;
3830 } else if (type == REDIS_RDB_14BITLEN) {
3831 /* Read a 14 bit len */
3832 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3833 return ((buf[0]&0x3F)<<8)|buf[1];
3834 } else {
3835 /* Read a 32 bit len */
f78fd11b 3836 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3837 return ntohl(len);
f78fd11b 3838 }
f78fd11b 3839}
3840
ad30aa60 3841/* Load an integer-encoded object from file 'fp', with the specified
3842 * encoding type 'enctype'. If encode is true the function may return
3843 * an integer-encoded object as reply, otherwise the returned object
3844 * will always be encoded as a raw string. */
3845static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
e3566d4b 3846 unsigned char enc[4];
3847 long long val;
3848
3849 if (enctype == REDIS_RDB_ENC_INT8) {
3850 if (fread(enc,1,1,fp) == 0) return NULL;
3851 val = (signed char)enc[0];
3852 } else if (enctype == REDIS_RDB_ENC_INT16) {
3853 uint16_t v;
3854 if (fread(enc,2,1,fp) == 0) return NULL;
3855 v = enc[0]|(enc[1]<<8);
3856 val = (int16_t)v;
3857 } else if (enctype == REDIS_RDB_ENC_INT32) {
3858 uint32_t v;
3859 if (fread(enc,4,1,fp) == 0) return NULL;
3860 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3861 val = (int32_t)v;
3862 } else {
3863 val = 0; /* anti-warning */
f83c6cb5 3864 redisPanic("Unknown RDB integer encoding type");
e3566d4b 3865 }
ad30aa60 3866 if (encode)
3867 return createStringObjectFromLongLong(val);
3868 else
3869 return createObject(REDIS_STRING,sdsfromlonglong(val));
e3566d4b 3870}
3871
c78a8ccc 3872static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3873 unsigned int len, clen;
3874 unsigned char *c = NULL;
3875 sds val = NULL;
3876
c78a8ccc 3877 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3878 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3879 if ((c = zmalloc(clen)) == NULL) goto err;
3880 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3881 if (fread(c,clen,1,fp) == 0) goto err;
3882 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3883 zfree(c);
88e85998 3884 return createObject(REDIS_STRING,val);
3885err:
3886 zfree(c);
3887 sdsfree(val);
3888 return NULL;
3889}
3890
ad30aa60 3891static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
e3566d4b 3892 int isencoded;
3893 uint32_t len;
f78fd11b 3894 sds val;
3895
c78a8ccc 3896 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3897 if (isencoded) {
3898 switch(len) {
3899 case REDIS_RDB_ENC_INT8:
3900 case REDIS_RDB_ENC_INT16:
3901 case REDIS_RDB_ENC_INT32:
ad30aa60 3902 return rdbLoadIntegerObject(fp,len,encode);
88e85998 3903 case REDIS_RDB_ENC_LZF:
bdcb92f2 3904 return rdbLoadLzfStringObject(fp);
e3566d4b 3905 default:
f83c6cb5 3906 redisPanic("Unknown RDB encoding type");
e3566d4b 3907 }
3908 }
3909
f78fd11b 3910 if (len == REDIS_RDB_LENERR) return NULL;
3911 val = sdsnewlen(NULL,len);
3912 if (len && fread(val,len,1,fp) == 0) {
3913 sdsfree(val);
3914 return NULL;
3915 }
bdcb92f2 3916 return createObject(REDIS_STRING,val);
f78fd11b 3917}
3918
ad30aa60 3919static robj *rdbLoadStringObject(FILE *fp) {
3920 return rdbGenericLoadStringObject(fp,0);
3921}
3922
3923static robj *rdbLoadEncodedStringObject(FILE *fp) {
3924 return rdbGenericLoadStringObject(fp,1);
3925}
3926
a7866db6 3927/* For information about double serialization check rdbSaveDoubleValue() */
3928static int rdbLoadDoubleValue(FILE *fp, double *val) {
3929 char buf[128];
3930 unsigned char len;
3931
3932 if (fread(&len,1,1,fp) == 0) return -1;
3933 switch(len) {
3934 case 255: *val = R_NegInf; return 0;
3935 case 254: *val = R_PosInf; return 0;
3936 case 253: *val = R_Nan; return 0;
3937 default:
3938 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3939 buf[len] = '\0';
a7866db6 3940 sscanf(buf, "%lg", val);
3941 return 0;
3942 }
3943}
3944
c78a8ccc 3945/* Load a Redis object of the specified type from the specified file.
3946 * On success a newly allocated object is returned, otherwise NULL. */
3947static robj *rdbLoadObject(int type, FILE *fp) {
3948 robj *o;
3949
bcd11906 3950 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 3951 if (type == REDIS_STRING) {
3952 /* Read string value */
ad30aa60 3953 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 3954 o = tryObjectEncoding(o);
c78a8ccc 3955 } else if (type == REDIS_LIST || type == REDIS_SET) {
3956 /* Read list/set value */
3957 uint32_t listlen;
3958
3959 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3960 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 3961 /* It's faster to expand the dict to the right size asap in order
3962 * to avoid rehashing */
3963 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3964 dictExpand(o->ptr,listlen);
c78a8ccc 3965 /* Load every single element of the list/set */
3966 while(listlen--) {
3967 robj *ele;
3968
ad30aa60 3969 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 3970 ele = tryObjectEncoding(ele);
c78a8ccc 3971 if (type == REDIS_LIST) {
3972 listAddNodeTail((list*)o->ptr,ele);
3973 } else {
3974 dictAdd((dict*)o->ptr,ele,NULL);
3975 }
3976 }
3977 } else if (type == REDIS_ZSET) {
3978 /* Read list/set value */
ada386b2 3979 size_t zsetlen;
c78a8ccc 3980 zset *zs;
3981
3982 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3983 o = createZsetObject();
3984 zs = o->ptr;
3985 /* Load every single element of the list/set */
3986 while(zsetlen--) {
3987 robj *ele;
3988 double *score = zmalloc(sizeof(double));
3989
ad30aa60 3990 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 3991 ele = tryObjectEncoding(ele);
c78a8ccc 3992 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3993 dictAdd(zs->dict,ele,score);
3994 zslInsert(zs->zsl,*score,ele);
3995 incrRefCount(ele); /* added to skiplist */
3996 }
ada386b2 3997 } else if (type == REDIS_HASH) {
3998 size_t hashlen;
3999
4000 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4001 o = createHashObject();
4002 /* Too many entries? Use an hash table. */
4003 if (hashlen > server.hash_max_zipmap_entries)
4004 convertToRealHash(o);
4005 /* Load every key/value, then set it into the zipmap or hash
4006 * table, as needed. */
4007 while(hashlen--) {
4008 robj *key, *val;
4009
4010 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4011 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4012 /* If we are using a zipmap and there are too big values
4013 * the object is converted to real hash table encoding. */
4014 if (o->encoding != REDIS_ENCODING_HT &&
4015 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4016 sdslen(val->ptr) > server.hash_max_zipmap_value))
4017 {
4018 convertToRealHash(o);
4019 }
4020
4021 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4022 unsigned char *zm = o->ptr;
4023
4024 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4025 val->ptr,sdslen(val->ptr),NULL);
4026 o->ptr = zm;
4027 decrRefCount(key);
4028 decrRefCount(val);
4029 } else {
05df7621 4030 key = tryObjectEncoding(key);
4031 val = tryObjectEncoding(val);
ada386b2 4032 dictAdd((dict*)o->ptr,key,val);
ada386b2 4033 }
4034 }
c78a8ccc 4035 } else {
f83c6cb5 4036 redisPanic("Unknown object type");
c78a8ccc 4037 }
4038 return o;
4039}
4040
f78fd11b 4041static int rdbLoad(char *filename) {
ed9b544e 4042 FILE *fp;
f78fd11b 4043 uint32_t dbid;
bb32ede5 4044 int type, retval, rdbver;
585af7e2 4045 int swap_all_values = 0;
3305306f 4046 dict *d = server.db[0].dict;
bb32ede5 4047 redisDb *db = server.db+0;
f78fd11b 4048 char buf[1024];
242a64f3 4049 time_t expiretime, now = time(NULL);
b492cf00 4050 long long loadedkeys = 0;
bb32ede5 4051
ed9b544e 4052 fp = fopen(filename,"r");
4053 if (!fp) return REDIS_ERR;
4054 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 4055 buf[9] = '\0';
4056 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 4057 fclose(fp);
4058 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4059 return REDIS_ERR;
4060 }
f78fd11b 4061 rdbver = atoi(buf+5);
c78a8ccc 4062 if (rdbver != 1) {
f78fd11b 4063 fclose(fp);
4064 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4065 return REDIS_ERR;
4066 }
ed9b544e 4067 while(1) {
585af7e2 4068 robj *key, *val;
ed9b544e 4069
585af7e2 4070 expiretime = -1;
ed9b544e 4071 /* Read type. */
f78fd11b 4072 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 4073 if (type == REDIS_EXPIRETIME) {
4074 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4075 /* We read the time so we need to read the object type again */
4076 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4077 }
ed9b544e 4078 if (type == REDIS_EOF) break;
4079 /* Handle SELECT DB opcode as a special case */
4080 if (type == REDIS_SELECTDB) {
c78a8ccc 4081 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 4082 goto eoferr;
ed9b544e 4083 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 4084 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 4085 exit(1);
4086 }
bb32ede5 4087 db = server.db+dbid;
4088 d = db->dict;
ed9b544e 4089 continue;
4090 }
4091 /* Read key */
585af7e2 4092 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
c78a8ccc 4093 /* Read value */
585af7e2 4094 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
89e689c5 4095 /* Check if the key already expired */
4096 if (expiretime != -1 && expiretime < now) {
4097 decrRefCount(key);
4098 decrRefCount(val);
4099 continue;
4100 }
ed9b544e 4101 /* Add the new object in the hash table */
585af7e2 4102 retval = dictAdd(d,key,val);
ed9b544e 4103 if (retval == DICT_ERR) {
585af7e2 4104 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
ed9b544e 4105 exit(1);
4106 }
242a64f3 4107 loadedkeys++;
bb32ede5 4108 /* Set the expire time if needed */
89e689c5 4109 if (expiretime != -1) setExpire(db,key,expiretime);
242a64f3 4110
b492cf00 4111 /* Handle swapping while loading big datasets when VM is on */
242a64f3 4112
4113 /* If we detecter we are hopeless about fitting something in memory
4114 * we just swap every new key on disk. Directly...
4115 * Note that's important to check for this condition before resorting
4116 * to random sampling, otherwise we may try to swap already
4117 * swapped keys. */
585af7e2 4118 if (swap_all_values) {
4119 dictEntry *de = dictFind(d,key);
242a64f3 4120
4121 /* de may be NULL since the key already expired */
4122 if (de) {
585af7e2 4123 key = dictGetEntryKey(de);
4124 val = dictGetEntryVal(de);
242a64f3 4125
585af7e2 4126 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
242a64f3 4127 dictGetEntryVal(de) = NULL;
4128 }
4129 }
4130 continue;
4131 }
4132
4133 /* If we have still some hope of having some value fitting memory
4134 * then we try random sampling. */
585af7e2 4135 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
b492cf00 4136 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 4137 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 4138 }
242a64f3 4139 if (zmalloc_used_memory() > server.vm_max_memory)
585af7e2 4140 swap_all_values = 1; /* We are already using too much mem */
b492cf00 4141 }
ed9b544e 4142 }
4143 fclose(fp);
4144 return REDIS_OK;
4145
4146eoferr: /* unexpected end of file is handled here with a fatal exit */
f80dff62 4147 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 4148 exit(1);
4149 return REDIS_ERR; /* Just to avoid warning */
4150}
4151
4152/*================================== Commands =============================== */
4153
abcb223e 4154static void authCommand(redisClient *c) {
2e77c2ee 4155 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
4156 c->authenticated = 1;
4157 addReply(c,shared.ok);
4158 } else {
4159 c->authenticated = 0;
fa4c0aba 4160 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
4161 }
4162}
4163
ed9b544e 4164static void pingCommand(redisClient *c) {
4165 addReply(c,shared.pong);
4166}
4167
4168static void echoCommand(redisClient *c) {
dd88747b 4169 addReplyBulk(c,c->argv[1]);
ed9b544e 4170}
4171
4172/*=================================== Strings =============================== */
4173
526d00a5 4174static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
ed9b544e 4175 int retval;
10ce1276 4176 long seconds = 0; /* initialized to avoid an harmness warning */
ed9b544e 4177
526d00a5 4178 if (expire) {
4179 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4180 return;
4181 if (seconds <= 0) {
4182 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4183 return;
4184 }
4185 }
4186
4187 if (nx) deleteIfVolatile(c->db,key);
4188 retval = dictAdd(c->db->dict,key,val);
ed9b544e 4189 if (retval == DICT_ERR) {
4190 if (!nx) {
1b03836c 4191 /* If the key is about a swapped value, we want a new key object
4192 * to overwrite the old. So we delete the old key in the database.
4193 * This will also make sure that swap pages about the old object
4194 * will be marked as free. */
526d00a5 4195 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4196 incrRefCount(key);
4197 dictReplace(c->db->dict,key,val);
4198 incrRefCount(val);
ed9b544e 4199 } else {
c937aa89 4200 addReply(c,shared.czero);
ed9b544e 4201 return;
4202 }
4203 } else {
526d00a5 4204 incrRefCount(key);
4205 incrRefCount(val);
ed9b544e 4206 }
4207 server.dirty++;
526d00a5 4208 removeExpire(c->db,key);
4209 if (expire) setExpire(c->db,key,time(NULL)+seconds);
c937aa89 4210 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 4211}
4212
4213static void setCommand(redisClient *c) {
526d00a5 4214 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
ed9b544e 4215}
4216
4217static void setnxCommand(redisClient *c) {
526d00a5 4218 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4219}
4220
4221static void setexCommand(redisClient *c) {
4222 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
ed9b544e 4223}
4224
322fc7d8 4225static int getGenericCommand(redisClient *c) {
dd88747b 4226 robj *o;
e0a62c7f 4227
dd88747b 4228 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 4229 return REDIS_OK;
dd88747b 4230
4231 if (o->type != REDIS_STRING) {
4232 addReply(c,shared.wrongtypeerr);
4233 return REDIS_ERR;
ed9b544e 4234 } else {
dd88747b 4235 addReplyBulk(c,o);
4236 return REDIS_OK;
ed9b544e 4237 }
4238}
4239
322fc7d8 4240static void getCommand(redisClient *c) {
4241 getGenericCommand(c);
4242}
4243
f6b141c5 4244static void getsetCommand(redisClient *c) {
322fc7d8 4245 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 4246 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4247 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4248 } else {
4249 incrRefCount(c->argv[1]);
4250 }
4251 incrRefCount(c->argv[2]);
4252 server.dirty++;
4253 removeExpire(c->db,c->argv[1]);
4254}
4255
70003d28 4256static void mgetCommand(redisClient *c) {
70003d28 4257 int j;
e0a62c7f 4258
c937aa89 4259 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 4260 for (j = 1; j < c->argc; j++) {
3305306f 4261 robj *o = lookupKeyRead(c->db,c->argv[j]);
4262 if (o == NULL) {
c937aa89 4263 addReply(c,shared.nullbulk);
70003d28 4264 } else {
70003d28 4265 if (o->type != REDIS_STRING) {
c937aa89 4266 addReply(c,shared.nullbulk);
70003d28 4267 } else {
dd88747b 4268 addReplyBulk(c,o);
70003d28 4269 }
4270 }
4271 }
4272}
4273
6c446631 4274static void msetGenericCommand(redisClient *c, int nx) {
906573e7 4275 int j, busykeys = 0;
6c446631 4276
4277 if ((c->argc % 2) == 0) {
454d4e43 4278 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 4279 return;
4280 }
4281 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4282 * set nothing at all if at least one already key exists. */
4283 if (nx) {
4284 for (j = 1; j < c->argc; j += 2) {
906573e7 4285 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4286 busykeys++;
6c446631 4287 }
4288 }
4289 }
906573e7 4290 if (busykeys) {
4291 addReply(c, shared.czero);
4292 return;
4293 }
6c446631 4294
4295 for (j = 1; j < c->argc; j += 2) {
4296 int retval;
4297
05df7621 4298 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
6c446631 4299 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4300 if (retval == DICT_ERR) {
4301 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4302 incrRefCount(c->argv[j+1]);
4303 } else {
4304 incrRefCount(c->argv[j]);
4305 incrRefCount(c->argv[j+1]);
4306 }
4307 removeExpire(c->db,c->argv[j]);
4308 }
4309 server.dirty += (c->argc-1)/2;
4310 addReply(c, nx ? shared.cone : shared.ok);
4311}
4312
4313static void msetCommand(redisClient *c) {
4314 msetGenericCommand(c,0);
4315}
4316
4317static void msetnxCommand(redisClient *c) {
4318 msetGenericCommand(c,1);
4319}
4320
d68ed120 4321static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4322 long long value;
4323 int retval;
4324 robj *o;
e0a62c7f 4325
3305306f 4326 o = lookupKeyWrite(c->db,c->argv[1]);
6485f293
PN
4327 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4328 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
ed9b544e 4329
4330 value += incr;
d6f4c262 4331 o = createStringObjectFromLongLong(value);
3305306f 4332 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 4333 if (retval == DICT_ERR) {
3305306f 4334 dictReplace(c->db->dict,c->argv[1],o);
4335 removeExpire(c->db,c->argv[1]);
ed9b544e 4336 } else {
4337 incrRefCount(c->argv[1]);
4338 }
4339 server.dirty++;
c937aa89 4340 addReply(c,shared.colon);
ed9b544e 4341 addReply(c,o);
4342 addReply(c,shared.crlf);
4343}
4344
4345static void incrCommand(redisClient *c) {
a4d1ba9a 4346 incrDecrCommand(c,1);
ed9b544e 4347}
4348
4349static void decrCommand(redisClient *c) {
a4d1ba9a 4350 incrDecrCommand(c,-1);
ed9b544e 4351}
4352
4353static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4354 long long incr;
4355
bd79a6bd 4356 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4357 incrDecrCommand(c,incr);
ed9b544e 4358}
4359
4360static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4361 long long incr;
4362
bd79a6bd 4363 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4364 incrDecrCommand(c,-incr);
ed9b544e 4365}
4366
4b00bebd 4367static void appendCommand(redisClient *c) {
4368 int retval;
4369 size_t totlen;
4370 robj *o;
4371
4372 o = lookupKeyWrite(c->db,c->argv[1]);
4373 if (o == NULL) {
4374 /* Create the key */
4375 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4376 incrRefCount(c->argv[1]);
4377 incrRefCount(c->argv[2]);
4378 totlen = stringObjectLen(c->argv[2]);
4379 } else {
4380 dictEntry *de;
e0a62c7f 4381
4b00bebd 4382 de = dictFind(c->db->dict,c->argv[1]);
4383 assert(de != NULL);
4384
4385 o = dictGetEntryVal(de);
4386 if (o->type != REDIS_STRING) {
4387 addReply(c,shared.wrongtypeerr);
4388 return;
4389 }
4390 /* If the object is specially encoded or shared we have to make
4391 * a copy */
4392 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4393 robj *decoded = getDecodedObject(o);
4394
4395 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4396 decrRefCount(decoded);
4397 dictReplace(c->db->dict,c->argv[1],o);
4398 }
4399 /* APPEND! */
4400 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4401 o->ptr = sdscatlen(o->ptr,
4402 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4403 } else {
4404 o->ptr = sdscatprintf(o->ptr, "%ld",
4405 (unsigned long) c->argv[2]->ptr);
4406 }
4407 totlen = sdslen(o->ptr);
4408 }
4409 server.dirty++;
4410 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4411}
4412
39191553 4413static void substrCommand(redisClient *c) {
4414 robj *o;
4415 long start = atoi(c->argv[2]->ptr);
4416 long end = atoi(c->argv[3]->ptr);
dd88747b 4417 size_t rangelen, strlen;
4418 sds range;
39191553 4419
dd88747b 4420 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4421 checkType(c,o,REDIS_STRING)) return;
39191553 4422
dd88747b 4423 o = getDecodedObject(o);
4424 strlen = sdslen(o->ptr);
8fe7fad7 4425
dd88747b 4426 /* convert negative indexes */
4427 if (start < 0) start = strlen+start;
4428 if (end < 0) end = strlen+end;
4429 if (start < 0) start = 0;
4430 if (end < 0) end = 0;
39191553 4431
dd88747b 4432 /* indexes sanity checks */
4433 if (start > end || (size_t)start >= strlen) {
4434 /* Out of range start or start > end result in null reply */
4435 addReply(c,shared.nullbulk);
4436 decrRefCount(o);
4437 return;
39191553 4438 }
dd88747b 4439 if ((size_t)end >= strlen) end = strlen-1;
4440 rangelen = (end-start)+1;
4441
4442 /* Return the result */
4443 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4444 range = sdsnewlen((char*)o->ptr+start,rangelen);
4445 addReplySds(c,range);
4446 addReply(c,shared.crlf);
4447 decrRefCount(o);
39191553 4448}
4449
ed9b544e 4450/* ========================= Type agnostic commands ========================= */
4451
4452static void delCommand(redisClient *c) {
5109cdff 4453 int deleted = 0, j;
4454
4455 for (j = 1; j < c->argc; j++) {
4456 if (deleteKey(c->db,c->argv[j])) {
4457 server.dirty++;
4458 deleted++;
4459 }
4460 }
482b672d 4461 addReplyLongLong(c,deleted);
ed9b544e 4462}
4463
4464static void existsCommand(redisClient *c) {
f4f06efc
PN
4465 expireIfNeeded(c->db,c->argv[1]);
4466 if (dictFind(c->db->dict,c->argv[1])) {
4467 addReply(c, shared.cone);
4468 } else {
4469 addReply(c, shared.czero);
4470 }
ed9b544e 4471}
4472
4473static void selectCommand(redisClient *c) {
4474 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4475
ed9b544e 4476 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4477 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4478 } else {
4479 addReply(c,shared.ok);
4480 }
4481}
4482
4483static void randomkeyCommand(redisClient *c) {
4484 dictEntry *de;
dc4be23e 4485 robj *key;
e0a62c7f 4486
3305306f 4487 while(1) {
4488 de = dictGetRandomKey(c->db->dict);
ce7bef07 4489 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4490 }
2b619329 4491
ed9b544e 4492 if (de == NULL) {
dc4be23e 4493 addReply(c,shared.nullbulk);
4494 return;
4495 }
4496
4497 key = dictGetEntryKey(de);
4498 if (server.vm_enabled) {
4499 key = dupStringObject(key);
4500 addReplyBulk(c,key);
4501 decrRefCount(key);
ed9b544e 4502 } else {
dc4be23e 4503 addReplyBulk(c,key);
ed9b544e 4504 }
4505}
4506
4507static void keysCommand(redisClient *c) {
4508 dictIterator *di;
4509 dictEntry *de;
4510 sds pattern = c->argv[1]->ptr;
4511 int plen = sdslen(pattern);
a3f9eec2 4512 unsigned long numkeys = 0;
ed9b544e 4513 robj *lenobj = createObject(REDIS_STRING,NULL);
4514
3305306f 4515 di = dictGetIterator(c->db->dict);
ed9b544e 4516 addReply(c,lenobj);
4517 decrRefCount(lenobj);
4518 while((de = dictNext(di)) != NULL) {
4519 robj *keyobj = dictGetEntryKey(de);
3305306f 4520
ed9b544e 4521 sds key = keyobj->ptr;
4522 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4523 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4524 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4525 addReplyBulk(c,keyobj);
3305306f 4526 numkeys++;
3305306f 4527 }
ed9b544e 4528 }
4529 }
4530 dictReleaseIterator(di);
a3f9eec2 4531 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4532}
4533
4534static void dbsizeCommand(redisClient *c) {
4535 addReplySds(c,
3305306f 4536 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4537}
4538
4539static void lastsaveCommand(redisClient *c) {
4540 addReplySds(c,
c937aa89 4541 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4542}
4543
4544static void typeCommand(redisClient *c) {
3305306f 4545 robj *o;
ed9b544e 4546 char *type;
3305306f 4547
4548 o = lookupKeyRead(c->db,c->argv[1]);
4549 if (o == NULL) {
c937aa89 4550 type = "+none";
ed9b544e 4551 } else {
ed9b544e 4552 switch(o->type) {
c937aa89 4553 case REDIS_STRING: type = "+string"; break;
4554 case REDIS_LIST: type = "+list"; break;
4555 case REDIS_SET: type = "+set"; break;
412a8bce 4556 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4557 case REDIS_HASH: type = "+hash"; break;
4558 default: type = "+unknown"; break;
ed9b544e 4559 }
4560 }
4561 addReplySds(c,sdsnew(type));
4562 addReply(c,shared.crlf);
4563}
4564
4565static void saveCommand(redisClient *c) {
9d65a1bb 4566 if (server.bgsavechildpid != -1) {
05557f6d 4567 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4568 return;
4569 }
f78fd11b 4570 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4571 addReply(c,shared.ok);
4572 } else {
4573 addReply(c,shared.err);
4574 }
4575}
4576
4577static void bgsaveCommand(redisClient *c) {
9d65a1bb 4578 if (server.bgsavechildpid != -1) {
ed9b544e 4579 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4580 return;
4581 }
f78fd11b 4582 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4583 char *status = "+Background saving started\r\n";
4584 addReplySds(c,sdsnew(status));
ed9b544e 4585 } else {
4586 addReply(c,shared.err);
4587 }
4588}
4589
4590static void shutdownCommand(redisClient *c) {
4591 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
a3b21203 4592 /* Kill the saving child if there is a background saving in progress.
4593 We want to avoid race conditions, for instance our saving child may
4594 overwrite the synchronous saving did by SHUTDOWN. */
9d65a1bb 4595 if (server.bgsavechildpid != -1) {
9f3c422c 4596 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4597 kill(server.bgsavechildpid,SIGKILL);
a3b21203 4598 rdbRemoveTempFile(server.bgsavechildpid);
9f3c422c 4599 }
ac945e2d 4600 if (server.appendonly) {
4601 /* Append only file: fsync() the AOF and exit */
4602 fsync(server.appendfd);
054e426d 4603 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4604 exit(0);
ed9b544e 4605 } else {
ac945e2d 4606 /* Snapshotting. Perform a SYNC SAVE and exit */
4607 if (rdbSave(server.dbfilename) == REDIS_OK) {
4608 if (server.daemonize)
4609 unlink(server.pidfile);
4610 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4611 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4612 exit(0);
4613 } else {
dd88747b 4614 /* Ooops.. error saving! The best we can do is to continue
4615 * operating. Note that if there was a background saving process,
4616 * in the next cron() Redis will be notified that the background
4617 * saving aborted, handling special stuff like slaves pending for
4618 * synchronization... */
e0a62c7f 4619 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
dd88747b 4620 addReplySds(c,
4621 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
ac945e2d 4622 }
ed9b544e 4623 }
4624}
4625
4626static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4627 robj *o;
4628
4629 /* To use the same key as src and dst is probably an error */
4630 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4631 addReply(c,shared.sameobjecterr);
ed9b544e 4632 return;
4633 }
4634
dd88747b 4635 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4636 return;
dd88747b 4637
ed9b544e 4638 incrRefCount(o);
3305306f 4639 deleteIfVolatile(c->db,c->argv[2]);
4640 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4641 if (nx) {
4642 decrRefCount(o);
c937aa89 4643 addReply(c,shared.czero);
ed9b544e 4644 return;
4645 }
3305306f 4646 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4647 } else {
4648 incrRefCount(c->argv[2]);
4649 }
3305306f 4650 deleteKey(c->db,c->argv[1]);
ed9b544e 4651 server.dirty++;
c937aa89 4652 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4653}
4654
4655static void renameCommand(redisClient *c) {
4656 renameGenericCommand(c,0);
4657}
4658
4659static void renamenxCommand(redisClient *c) {
4660 renameGenericCommand(c,1);
4661}
4662
4663static void moveCommand(redisClient *c) {
3305306f 4664 robj *o;
4665 redisDb *src, *dst;
ed9b544e 4666 int srcid;
4667
4668 /* Obtain source and target DB pointers */
3305306f 4669 src = c->db;
4670 srcid = c->db->id;
ed9b544e 4671 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4672 addReply(c,shared.outofrangeerr);
ed9b544e 4673 return;
4674 }
3305306f 4675 dst = c->db;
4676 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4677
4678 /* If the user is moving using as target the same
4679 * DB as the source DB it is probably an error. */
4680 if (src == dst) {
c937aa89 4681 addReply(c,shared.sameobjecterr);
ed9b544e 4682 return;
4683 }
4684
4685 /* Check if the element exists and get a reference */
3305306f 4686 o = lookupKeyWrite(c->db,c->argv[1]);
4687 if (!o) {
c937aa89 4688 addReply(c,shared.czero);
ed9b544e 4689 return;
4690 }
4691
4692 /* Try to add the element to the target DB */
3305306f 4693 deleteIfVolatile(dst,c->argv[1]);
4694 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4695 addReply(c,shared.czero);
ed9b544e 4696 return;
4697 }
3305306f 4698 incrRefCount(c->argv[1]);
ed9b544e 4699 incrRefCount(o);
4700
4701 /* OK! key moved, free the entry in the source DB */
3305306f 4702 deleteKey(src,c->argv[1]);
ed9b544e 4703 server.dirty++;
c937aa89 4704 addReply(c,shared.cone);
ed9b544e 4705}
4706
4707/* =================================== Lists ================================ */
4708static void pushGenericCommand(redisClient *c, int where) {
4709 robj *lobj;
ed9b544e 4710 list *list;
3305306f 4711
4712 lobj = lookupKeyWrite(c->db,c->argv[1]);
4713 if (lobj == NULL) {
95242ab5 4714 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4715 addReply(c,shared.cone);
95242ab5 4716 return;
4717 }
ed9b544e 4718 lobj = createListObject();
4719 list = lobj->ptr;
4720 if (where == REDIS_HEAD) {
6b47e12e 4721 listAddNodeHead(list,c->argv[2]);
ed9b544e 4722 } else {
6b47e12e 4723 listAddNodeTail(list,c->argv[2]);
ed9b544e 4724 }
3305306f 4725 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4726 incrRefCount(c->argv[1]);
4727 incrRefCount(c->argv[2]);
4728 } else {
ed9b544e 4729 if (lobj->type != REDIS_LIST) {
4730 addReply(c,shared.wrongtypeerr);
4731 return;
4732 }
95242ab5 4733 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4734 addReply(c,shared.cone);
95242ab5 4735 return;
4736 }
ed9b544e 4737 list = lobj->ptr;
4738 if (where == REDIS_HEAD) {
6b47e12e 4739 listAddNodeHead(list,c->argv[2]);
ed9b544e 4740 } else {
6b47e12e 4741 listAddNodeTail(list,c->argv[2]);
ed9b544e 4742 }
4743 incrRefCount(c->argv[2]);
4744 }
4745 server.dirty++;
482b672d 4746 addReplyLongLong(c,listLength(list));
ed9b544e 4747}
4748
4749static void lpushCommand(redisClient *c) {
4750 pushGenericCommand(c,REDIS_HEAD);
4751}
4752
4753static void rpushCommand(redisClient *c) {
4754 pushGenericCommand(c,REDIS_TAIL);
4755}
4756
4757static void llenCommand(redisClient *c) {
3305306f 4758 robj *o;
ed9b544e 4759 list *l;
dd88747b 4760
4761 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4762 checkType(c,o,REDIS_LIST)) return;
e0a62c7f 4763
dd88747b 4764 l = o->ptr;
4765 addReplyUlong(c,listLength(l));
ed9b544e 4766}
4767
4768static void lindexCommand(redisClient *c) {
3305306f 4769 robj *o;
ed9b544e 4770 int index = atoi(c->argv[2]->ptr);
dd88747b 4771 list *list;
4772 listNode *ln;
4773
4774 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4775 checkType(c,o,REDIS_LIST)) return;
4776 list = o->ptr;
4777
4778 ln = listIndex(list, index);
4779 if (ln == NULL) {
c937aa89 4780 addReply(c,shared.nullbulk);
ed9b544e 4781 } else {
dd88747b 4782 robj *ele = listNodeValue(ln);
4783 addReplyBulk(c,ele);
ed9b544e 4784 }
4785}
4786
4787static void lsetCommand(redisClient *c) {
3305306f 4788 robj *o;
ed9b544e 4789 int index = atoi(c->argv[2]->ptr);
dd88747b 4790 list *list;
4791 listNode *ln;
4792
4793 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4794 checkType(c,o,REDIS_LIST)) return;
4795 list = o->ptr;
4796
4797 ln = listIndex(list, index);
4798 if (ln == NULL) {
4799 addReply(c,shared.outofrangeerr);
ed9b544e 4800 } else {
dd88747b 4801 robj *ele = listNodeValue(ln);
ed9b544e 4802
dd88747b 4803 decrRefCount(ele);
4804 listNodeValue(ln) = c->argv[3];
4805 incrRefCount(c->argv[3]);
4806 addReply(c,shared.ok);
4807 server.dirty++;
ed9b544e 4808 }
4809}
4810
4811static void popGenericCommand(redisClient *c, int where) {
3305306f 4812 robj *o;
dd88747b 4813 list *list;
4814 listNode *ln;
3305306f 4815
dd88747b 4816 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4817 checkType(c,o,REDIS_LIST)) return;
4818 list = o->ptr;
ed9b544e 4819
dd88747b 4820 if (where == REDIS_HEAD)
4821 ln = listFirst(list);
4822 else
4823 ln = listLast(list);
ed9b544e 4824
dd88747b 4825 if (ln == NULL) {
4826 addReply(c,shared.nullbulk);
4827 } else {
4828 robj *ele = listNodeValue(ln);
4829 addReplyBulk(c,ele);
4830 listDelNode(list,ln);
3ea27d37 4831 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4832 server.dirty++;
ed9b544e 4833 }
4834}
4835
4836static void lpopCommand(redisClient *c) {
4837 popGenericCommand(c,REDIS_HEAD);
4838}
4839
4840static void rpopCommand(redisClient *c) {
4841 popGenericCommand(c,REDIS_TAIL);
4842}
4843
4844static void lrangeCommand(redisClient *c) {
3305306f 4845 robj *o;
ed9b544e 4846 int start = atoi(c->argv[2]->ptr);
4847 int end = atoi(c->argv[3]->ptr);
dd88747b 4848 int llen;
4849 int rangelen, j;
4850 list *list;
4851 listNode *ln;
4852 robj *ele;
4853
4e27f268 4854 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4855 || checkType(c,o,REDIS_LIST)) return;
dd88747b 4856 list = o->ptr;
4857 llen = listLength(list);
4858
4859 /* convert negative indexes */
4860 if (start < 0) start = llen+start;
4861 if (end < 0) end = llen+end;
4862 if (start < 0) start = 0;
4863 if (end < 0) end = 0;
4864
4865 /* indexes sanity checks */
4866 if (start > end || start >= llen) {
4867 /* Out of range start or start > end result in empty list */
4868 addReply(c,shared.emptymultibulk);
4869 return;
4870 }
4871 if (end >= llen) end = llen-1;
4872 rangelen = (end-start)+1;
3305306f 4873
dd88747b 4874 /* Return the result in form of a multi-bulk reply */
4875 ln = listIndex(list, start);
4876 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4877 for (j = 0; j < rangelen; j++) {
4878 ele = listNodeValue(ln);
4879 addReplyBulk(c,ele);
4880 ln = ln->next;
ed9b544e 4881 }
4882}
4883
4884static void ltrimCommand(redisClient *c) {
3305306f 4885 robj *o;
ed9b544e 4886 int start = atoi(c->argv[2]->ptr);
4887 int end = atoi(c->argv[3]->ptr);
dd88747b 4888 int llen;
4889 int j, ltrim, rtrim;
4890 list *list;
4891 listNode *ln;
4892
4893 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4894 checkType(c,o,REDIS_LIST)) return;
4895 list = o->ptr;
4896 llen = listLength(list);
4897
4898 /* convert negative indexes */
4899 if (start < 0) start = llen+start;
4900 if (end < 0) end = llen+end;
4901 if (start < 0) start = 0;
4902 if (end < 0) end = 0;
4903
4904 /* indexes sanity checks */
4905 if (start > end || start >= llen) {
4906 /* Out of range start or start > end result in empty list */
4907 ltrim = llen;
4908 rtrim = 0;
ed9b544e 4909 } else {
dd88747b 4910 if (end >= llen) end = llen-1;
4911 ltrim = start;
4912 rtrim = llen-end-1;
4913 }
ed9b544e 4914
dd88747b 4915 /* Remove list elements to perform the trim */
4916 for (j = 0; j < ltrim; j++) {
4917 ln = listFirst(list);
4918 listDelNode(list,ln);
4919 }
4920 for (j = 0; j < rtrim; j++) {
4921 ln = listLast(list);
4922 listDelNode(list,ln);
ed9b544e 4923 }
3ea27d37 4924 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4925 server.dirty++;
4926 addReply(c,shared.ok);
ed9b544e 4927}
4928
4929static void lremCommand(redisClient *c) {
3305306f 4930 robj *o;
dd88747b 4931 list *list;
4932 listNode *ln, *next;
4933 int toremove = atoi(c->argv[2]->ptr);
4934 int removed = 0;
4935 int fromtail = 0;
a4d1ba9a 4936
dd88747b 4937 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4938 checkType(c,o,REDIS_LIST)) return;
4939 list = o->ptr;
4940
4941 if (toremove < 0) {
4942 toremove = -toremove;
4943 fromtail = 1;
4944 }
4945 ln = fromtail ? list->tail : list->head;
4946 while (ln) {
4947 robj *ele = listNodeValue(ln);
4948
4949 next = fromtail ? ln->prev : ln->next;
bf028098 4950 if (equalStringObjects(ele,c->argv[3])) {
dd88747b 4951 listDelNode(list,ln);
4952 server.dirty++;
4953 removed++;
4954 if (toremove && removed == toremove) break;
ed9b544e 4955 }
dd88747b 4956 ln = next;
ed9b544e 4957 }
3ea27d37 4958 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4959 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 4960}
4961
12f9d551 4962/* This is the semantic of this command:
0f5f7e9a 4963 * RPOPLPUSH srclist dstlist:
12f9d551 4964 * IF LLEN(srclist) > 0
4965 * element = RPOP srclist
4966 * LPUSH dstlist element
4967 * RETURN element
4968 * ELSE
4969 * RETURN nil
4970 * END
4971 * END
4972 *
4973 * The idea is to be able to get an element from a list in a reliable way
4974 * since the element is not just returned but pushed against another list
4975 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4976 */
0f5f7e9a 4977static void rpoplpushcommand(redisClient *c) {
12f9d551 4978 robj *sobj;
dd88747b 4979 list *srclist;
4980 listNode *ln;
4981
4982 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4983 checkType(c,sobj,REDIS_LIST)) return;
4984 srclist = sobj->ptr;
4985 ln = listLast(srclist);
12f9d551 4986
dd88747b 4987 if (ln == NULL) {
12f9d551 4988 addReply(c,shared.nullbulk);
4989 } else {
dd88747b 4990 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4991 robj *ele = listNodeValue(ln);
4992 list *dstlist;
e20fb74f 4993
dd88747b 4994 if (dobj && dobj->type != REDIS_LIST) {
4995 addReply(c,shared.wrongtypeerr);
4996 return;
4997 }
12f9d551 4998
dd88747b 4999 /* Add the element to the target list (unless it's directly
5000 * passed to some BLPOP-ing client */
5001 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
5002 if (dobj == NULL) {
5003 /* Create the list if the key does not exist */
5004 dobj = createListObject();
5005 dictAdd(c->db->dict,c->argv[2],dobj);
5006 incrRefCount(c->argv[2]);
12f9d551 5007 }
dd88747b 5008 dstlist = dobj->ptr;
5009 listAddNodeHead(dstlist,ele);
5010 incrRefCount(ele);
12f9d551 5011 }
dd88747b 5012
5013 /* Send the element to the client as reply as well */
5014 addReplyBulk(c,ele);
5015
5016 /* Finally remove the element from the source list */
5017 listDelNode(srclist,ln);
3ea27d37 5018 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5019 server.dirty++;
12f9d551 5020 }
5021}
5022
ed9b544e 5023/* ==================================== Sets ================================ */
5024
5025static void saddCommand(redisClient *c) {
ed9b544e 5026 robj *set;
5027
3305306f 5028 set = lookupKeyWrite(c->db,c->argv[1]);
5029 if (set == NULL) {
ed9b544e 5030 set = createSetObject();
3305306f 5031 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 5032 incrRefCount(c->argv[1]);
5033 } else {
ed9b544e 5034 if (set->type != REDIS_SET) {
c937aa89 5035 addReply(c,shared.wrongtypeerr);
ed9b544e 5036 return;
5037 }
5038 }
5039 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5040 incrRefCount(c->argv[2]);
5041 server.dirty++;
c937aa89 5042 addReply(c,shared.cone);
ed9b544e 5043 } else {
c937aa89 5044 addReply(c,shared.czero);
ed9b544e 5045 }
5046}
5047
5048static void sremCommand(redisClient *c) {
3305306f 5049 robj *set;
ed9b544e 5050
dd88747b 5051 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5052 checkType(c,set,REDIS_SET)) return;
5053
5054 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5055 server.dirty++;
5056 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5057 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5058 addReply(c,shared.cone);
ed9b544e 5059 } else {
dd88747b 5060 addReply(c,shared.czero);
ed9b544e 5061 }
5062}
5063
a4460ef4 5064static void smoveCommand(redisClient *c) {
5065 robj *srcset, *dstset;
5066
5067 srcset = lookupKeyWrite(c->db,c->argv[1]);
5068 dstset = lookupKeyWrite(c->db,c->argv[2]);
5069
5070 /* If the source key does not exist return 0, if it's of the wrong type
5071 * raise an error */
5072 if (srcset == NULL || srcset->type != REDIS_SET) {
5073 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5074 return;
5075 }
5076 /* Error if the destination key is not a set as well */
5077 if (dstset && dstset->type != REDIS_SET) {
5078 addReply(c,shared.wrongtypeerr);
5079 return;
5080 }
5081 /* Remove the element from the source set */
5082 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5083 /* Key not found in the src set! return zero */
5084 addReply(c,shared.czero);
5085 return;
5086 }
3ea27d37 5087 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5088 deleteKey(c->db,c->argv[1]);
a4460ef4 5089 server.dirty++;
5090 /* Add the element to the destination set */
5091 if (!dstset) {
5092 dstset = createSetObject();
5093 dictAdd(c->db->dict,c->argv[2],dstset);
5094 incrRefCount(c->argv[2]);
5095 }
5096 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5097 incrRefCount(c->argv[3]);
5098 addReply(c,shared.cone);
5099}
5100
ed9b544e 5101static void sismemberCommand(redisClient *c) {
3305306f 5102 robj *set;
ed9b544e 5103
dd88747b 5104 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5105 checkType(c,set,REDIS_SET)) return;
5106
5107 if (dictFind(set->ptr,c->argv[2]))
5108 addReply(c,shared.cone);
5109 else
c937aa89 5110 addReply(c,shared.czero);
ed9b544e 5111}
5112
5113static void scardCommand(redisClient *c) {
3305306f 5114 robj *o;
ed9b544e 5115 dict *s;
dd88747b 5116
5117 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5118 checkType(c,o,REDIS_SET)) return;
e0a62c7f 5119
dd88747b 5120 s = o->ptr;
5121 addReplyUlong(c,dictSize(s));
ed9b544e 5122}
5123
12fea928 5124static void spopCommand(redisClient *c) {
5125 robj *set;
5126 dictEntry *de;
5127
dd88747b 5128 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5129 checkType(c,set,REDIS_SET)) return;
5130
5131 de = dictGetRandomKey(set->ptr);
5132 if (de == NULL) {
12fea928 5133 addReply(c,shared.nullbulk);
5134 } else {
dd88747b 5135 robj *ele = dictGetEntryKey(de);
12fea928 5136
dd88747b 5137 addReplyBulk(c,ele);
5138 dictDelete(set->ptr,ele);
5139 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5140 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5141 server.dirty++;
12fea928 5142 }
5143}
5144
2abb95a9 5145static void srandmemberCommand(redisClient *c) {
5146 robj *set;
5147 dictEntry *de;
5148
dd88747b 5149 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5150 checkType(c,set,REDIS_SET)) return;
5151
5152 de = dictGetRandomKey(set->ptr);
5153 if (de == NULL) {
2abb95a9 5154 addReply(c,shared.nullbulk);
5155 } else {
dd88747b 5156 robj *ele = dictGetEntryKey(de);
2abb95a9 5157
dd88747b 5158 addReplyBulk(c,ele);
2abb95a9 5159 }
5160}
5161
ed9b544e 5162static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5163 dict **d1 = (void*) s1, **d2 = (void*) s2;
5164
3305306f 5165 return dictSize(*d1)-dictSize(*d2);
ed9b544e 5166}
5167
682ac724 5168static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 5169 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5170 dictIterator *di;
5171 dictEntry *de;
5172 robj *lenobj = NULL, *dstset = NULL;
682ac724 5173 unsigned long j, cardinality = 0;
ed9b544e 5174
ed9b544e 5175 for (j = 0; j < setsnum; j++) {
5176 robj *setobj;
3305306f 5177
5178 setobj = dstkey ?
5179 lookupKeyWrite(c->db,setskeys[j]) :
5180 lookupKeyRead(c->db,setskeys[j]);
5181 if (!setobj) {
ed9b544e 5182 zfree(dv);
5faa6025 5183 if (dstkey) {
fdcaae84 5184 if (deleteKey(c->db,dstkey))
5185 server.dirty++;
0d36ded0 5186 addReply(c,shared.czero);
5faa6025 5187 } else {
4e27f268 5188 addReply(c,shared.emptymultibulk);
5faa6025 5189 }
ed9b544e 5190 return;
5191 }
ed9b544e 5192 if (setobj->type != REDIS_SET) {
5193 zfree(dv);
c937aa89 5194 addReply(c,shared.wrongtypeerr);
ed9b544e 5195 return;
5196 }
5197 dv[j] = setobj->ptr;
5198 }
5199 /* Sort sets from the smallest to largest, this will improve our
5200 * algorithm's performace */
5201 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5202
5203 /* The first thing we should output is the total number of elements...
5204 * since this is a multi-bulk write, but at this stage we don't know
5205 * the intersection set size, so we use a trick, append an empty object
5206 * to the output list and save the pointer to later modify it with the
5207 * right length */
5208 if (!dstkey) {
5209 lenobj = createObject(REDIS_STRING,NULL);
5210 addReply(c,lenobj);
5211 decrRefCount(lenobj);
5212 } else {
5213 /* If we have a target key where to store the resulting set
5214 * create this key with an empty set inside */
5215 dstset = createSetObject();
ed9b544e 5216 }
5217
5218 /* Iterate all the elements of the first (smallest) set, and test
5219 * the element against all the other sets, if at least one set does
5220 * not include the element it is discarded */
5221 di = dictGetIterator(dv[0]);
ed9b544e 5222
5223 while((de = dictNext(di)) != NULL) {
5224 robj *ele;
5225
5226 for (j = 1; j < setsnum; j++)
5227 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5228 if (j != setsnum)
5229 continue; /* at least one set does not contain the member */
5230 ele = dictGetEntryKey(de);
5231 if (!dstkey) {
dd88747b 5232 addReplyBulk(c,ele);
ed9b544e 5233 cardinality++;
5234 } else {
5235 dictAdd(dstset->ptr,ele,NULL);
5236 incrRefCount(ele);
5237 }
5238 }
5239 dictReleaseIterator(di);
5240
83cdfe18 5241 if (dstkey) {
3ea27d37 5242 /* Store the resulting set into the target, if the intersection
5243 * is not an empty set. */
83cdfe18 5244 deleteKey(c->db,dstkey);
3ea27d37 5245 if (dictSize((dict*)dstset->ptr) > 0) {
5246 dictAdd(c->db->dict,dstkey,dstset);
5247 incrRefCount(dstkey);
482b672d 5248 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5249 } else {
5250 decrRefCount(dstset);
d36c4e97 5251 addReply(c,shared.czero);
3ea27d37 5252 }
40d224a9 5253 server.dirty++;
d36c4e97 5254 } else {
5255 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 5256 }
ed9b544e 5257 zfree(dv);
5258}
5259
5260static void sinterCommand(redisClient *c) {
5261 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5262}
5263
5264static void sinterstoreCommand(redisClient *c) {
5265 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5266}
5267
/* Operation selectors passed as the 'op' argument of the generic set and
 * sorted set commands. */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
f4f56e1d 5271
5272static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 5273 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5274 dictIterator *di;
5275 dictEntry *de;
f4f56e1d 5276 robj *dstset = NULL;
40d224a9 5277 int j, cardinality = 0;
5278
40d224a9 5279 for (j = 0; j < setsnum; j++) {
5280 robj *setobj;
5281
5282 setobj = dstkey ?
5283 lookupKeyWrite(c->db,setskeys[j]) :
5284 lookupKeyRead(c->db,setskeys[j]);
5285 if (!setobj) {
5286 dv[j] = NULL;
5287 continue;
5288 }
5289 if (setobj->type != REDIS_SET) {
5290 zfree(dv);
5291 addReply(c,shared.wrongtypeerr);
5292 return;
5293 }
5294 dv[j] = setobj->ptr;
5295 }
5296
5297 /* We need a temp set object to store our union. If the dstkey
5298 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5299 * this set object will be the resulting object to set into the target key*/
5300 dstset = createSetObject();
5301
40d224a9 5302 /* Iterate all the elements of all the sets, add every element a single
5303 * time to the result set */
5304 for (j = 0; j < setsnum; j++) {
51829ed3 5305 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 5306 if (!dv[j]) continue; /* non existing keys are like empty sets */
5307
5308 di = dictGetIterator(dv[j]);
40d224a9 5309
5310 while((de = dictNext(di)) != NULL) {
5311 robj *ele;
5312
5313 /* dictAdd will not add the same element multiple times */
5314 ele = dictGetEntryKey(de);
f4f56e1d 5315 if (op == REDIS_OP_UNION || j == 0) {
5316 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5317 incrRefCount(ele);
40d224a9 5318 cardinality++;
5319 }
f4f56e1d 5320 } else if (op == REDIS_OP_DIFF) {
5321 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5322 cardinality--;
5323 }
40d224a9 5324 }
5325 }
5326 dictReleaseIterator(di);
51829ed3 5327
d36c4e97 5328 /* result set is empty? Exit asap. */
5329 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5330 }
5331
f4f56e1d 5332 /* Output the content of the resulting set, if not in STORE mode */
5333 if (!dstkey) {
5334 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5335 di = dictGetIterator(dstset->ptr);
f4f56e1d 5336 while((de = dictNext(di)) != NULL) {
5337 robj *ele;
5338
5339 ele = dictGetEntryKey(de);
dd88747b 5340 addReplyBulk(c,ele);
f4f56e1d 5341 }
5342 dictReleaseIterator(di);
d36c4e97 5343 decrRefCount(dstset);
83cdfe18
AG
5344 } else {
5345 /* If we have a target key where to store the resulting set
5346 * create this key with the result set inside */
5347 deleteKey(c->db,dstkey);
3ea27d37 5348 if (dictSize((dict*)dstset->ptr) > 0) {
5349 dictAdd(c->db->dict,dstkey,dstset);
5350 incrRefCount(dstkey);
482b672d 5351 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5352 } else {
5353 decrRefCount(dstset);
d36c4e97 5354 addReply(c,shared.czero);
3ea27d37 5355 }
40d224a9 5356 server.dirty++;
5357 }
5358 zfree(dv);
5359}
5360
5361static void sunionCommand(redisClient *c) {
f4f56e1d 5362 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 5363}
5364
5365static void sunionstoreCommand(redisClient *c) {
f4f56e1d 5366 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5367}
5368
5369static void sdiffCommand(redisClient *c) {
5370 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5371}
5372
5373static void sdiffstoreCommand(redisClient *c) {
5374 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 5375}
5376
6b47e12e 5377/* ==================================== ZSets =============================== */
5378
5379/* ZSETs are ordered sets using two data structures to hold the same elements
5380 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5381 * data structure.
5382 *
5383 * The elements are added to a hash table mapping Redis objects to scores.
5384 * At the same time the elements are added to a skip list mapping scores
5385 * to Redis objects (so objects are sorted by scores in this "view"). */
5386
5387/* This skiplist implementation is almost a C translation of the original
5388 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5389 * Alternative to Balanced Trees", modified in three ways:
5390 * a) this implementation allows for repeated values.
5391 * b) the comparison is not just by key (our 'score') but by satellite data.
5392 * c) there is a back pointer, so it's a doubly linked list with the back
5393 * pointers being only at "level 1". This allows to traverse the list
5394 * from tail to head, useful for ZREVRANGE. */
5395
5396static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5397 zskiplistNode *zn = zmalloc(sizeof(*zn));
5398
5399 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2b37892e
PN
5400 if (level > 0)
5401 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
6b47e12e 5402 zn->score = score;
5403 zn->obj = obj;
5404 return zn;
5405}
5406
5407static zskiplist *zslCreate(void) {
5408 int j;
5409 zskiplist *zsl;
e0a62c7f 5410
6b47e12e 5411 zsl = zmalloc(sizeof(*zsl));
5412 zsl->level = 1;
cc812361 5413 zsl->length = 0;
6b47e12e 5414 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5415 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5416 zsl->header->forward[j] = NULL;
94e543b5 5417
5418 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5419 if (j < ZSKIPLIST_MAXLEVEL-1)
5420 zsl->header->span[j] = 0;
69d95c3e 5421 }
e3870fab 5422 zsl->header->backward = NULL;
5423 zsl->tail = NULL;
6b47e12e 5424 return zsl;
5425}
5426
fd8ccf44 5427static void zslFreeNode(zskiplistNode *node) {
5428 decrRefCount(node->obj);
ad807e6f 5429 zfree(node->forward);
69d95c3e 5430 zfree(node->span);
fd8ccf44 5431 zfree(node);
5432}
5433
5434static void zslFree(zskiplist *zsl) {
ad807e6f 5435 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5436
ad807e6f 5437 zfree(zsl->header->forward);
69d95c3e 5438 zfree(zsl->header->span);
ad807e6f 5439 zfree(zsl->header);
fd8ccf44 5440 while(node) {
599379dd 5441 next = node->forward[0];
fd8ccf44 5442 zslFreeNode(node);
5443 node = next;
5444 }
ad807e6f 5445 zfree(zsl);
fd8ccf44 5446}
5447
6b47e12e 5448static int zslRandomLevel(void) {
5449 int level = 1;
5450 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5451 level += 1;
10c2baa5 5452 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5453}
5454
5455static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5456 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5457 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5458 int i, level;
5459
5460 x = zsl->header;
5461 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5462 /* store rank that is crossed to reach the insert position */
5463 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5464
9d60e6e4 5465 while (x->forward[i] &&
5466 (x->forward[i]->score < score ||
5467 (x->forward[i]->score == score &&
69d95c3e 5468 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5469 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5470 x = x->forward[i];
69d95c3e 5471 }
6b47e12e 5472 update[i] = x;
5473 }
6b47e12e 5474 /* we assume the key is not already inside, since we allow duplicated
5475 * scores, and the re-insertion of score and redis object should never
5476 * happpen since the caller of zslInsert() should test in the hash table
5477 * if the element is already inside or not. */
5478 level = zslRandomLevel();
5479 if (level > zsl->level) {
69d95c3e 5480 for (i = zsl->level; i < level; i++) {
2b37892e 5481 rank[i] = 0;
6b47e12e 5482 update[i] = zsl->header;
2b37892e 5483 update[i]->span[i-1] = zsl->length;
69d95c3e 5484 }
6b47e12e 5485 zsl->level = level;
5486 }
5487 x = zslCreateNode(level,score,obj);
5488 for (i = 0; i < level; i++) {
5489 x->forward[i] = update[i]->forward[i];
5490 update[i]->forward[i] = x;
69d95c3e
PN
5491
5492 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5493 if (i > 0) {
5494 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5495 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5496 }
6b47e12e 5497 }
69d95c3e
PN
5498
5499 /* increment span for untouched levels */
5500 for (i = level; i < zsl->level; i++) {
2b37892e 5501 update[i]->span[i-1]++;
69d95c3e
PN
5502 }
5503
bb975144 5504 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5505 if (x->forward[0])
5506 x->forward[0]->backward = x;
5507 else
5508 zsl->tail = x;
cc812361 5509 zsl->length++;
6b47e12e 5510}
5511
84105336
PN
5512/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5513void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5514 int i;
5515 for (i = 0; i < zsl->level; i++) {
5516 if (update[i]->forward[i] == x) {
5517 if (i > 0) {
5518 update[i]->span[i-1] += x->span[i-1] - 1;
5519 }
5520 update[i]->forward[i] = x->forward[i];
5521 } else {
5522 /* invariant: i > 0, because update[0]->forward[0]
5523 * is always equal to x */
5524 update[i]->span[i-1] -= 1;
5525 }
5526 }
5527 if (x->forward[0]) {
5528 x->forward[0]->backward = x->backward;
5529 } else {
5530 zsl->tail = x->backward;
5531 }
5532 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5533 zsl->level--;
5534 zsl->length--;
5535}
5536
50c55df5 5537/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5538static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5539 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5540 int i;
5541
5542 x = zsl->header;
5543 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5544 while (x->forward[i] &&
5545 (x->forward[i]->score < score ||
5546 (x->forward[i]->score == score &&
5547 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5548 x = x->forward[i];
5549 update[i] = x;
5550 }
5551 /* We may have multiple elements with the same score, what we need
5552 * is to find the element with both the right score and object. */
5553 x = x->forward[0];
bf028098 5554 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
84105336 5555 zslDeleteNode(zsl, x, update);
9d60e6e4 5556 zslFreeNode(x);
9d60e6e4 5557 return 1;
5558 } else {
5559 return 0; /* not found */
e197b441 5560 }
5561 return 0; /* not found */
fd8ccf44 5562}
5563
1807985b 5564/* Delete all the elements with score between min and max from the skiplist.
5565 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5566 * Note that this function takes the reference to the hash table view of the
5567 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5568static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5569 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5570 unsigned long removed = 0;
5571 int i;
5572
5573 x = zsl->header;
5574 for (i = zsl->level-1; i >= 0; i--) {
5575 while (x->forward[i] && x->forward[i]->score < min)
5576 x = x->forward[i];
5577 update[i] = x;
5578 }
5579 /* We may have multiple elements with the same score, what we need
5580 * is to find the element with both the right score and object. */
5581 x = x->forward[0];
5582 while (x && x->score <= max) {
84105336
PN
5583 zskiplistNode *next = x->forward[0];
5584 zslDeleteNode(zsl, x, update);
1807985b 5585 dictDelete(dict,x->obj);
5586 zslFreeNode(x);
1807985b 5587 removed++;
5588 x = next;
5589 }
5590 return removed; /* not found */
5591}
1807985b 5592
9212eafd 5593/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5594 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5595static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5596 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5597 unsigned long traversed = 0, removed = 0;
5598 int i;
5599
9212eafd
PN
5600 x = zsl->header;
5601 for (i = zsl->level-1; i >= 0; i--) {
5602 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5603 traversed += i > 0 ? x->span[i-1] : 1;
5604 x = x->forward[i];
1807985b 5605 }
9212eafd
PN
5606 update[i] = x;
5607 }
5608
5609 traversed++;
5610 x = x->forward[0];
5611 while (x && traversed <= end) {
84105336
PN
5612 zskiplistNode *next = x->forward[0];
5613 zslDeleteNode(zsl, x, update);
1807985b 5614 dictDelete(dict,x->obj);
5615 zslFreeNode(x);
1807985b 5616 removed++;
9212eafd 5617 traversed++;
1807985b 5618 x = next;
5619 }
9212eafd 5620 return removed;
1807985b 5621}
5622
50c55df5 5623/* Find the first node having a score equal or greater than the specified one.
5624 * Returns NULL if there is no match. */
5625static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5626 zskiplistNode *x;
5627 int i;
5628
5629 x = zsl->header;
5630 for (i = zsl->level-1; i >= 0; i--) {
5631 while (x->forward[i] && x->forward[i]->score < score)
5632 x = x->forward[i];
5633 }
5634 /* We may have multiple elements with the same score, what we need
5635 * is to find the element with both the right score and object. */
5636 return x->forward[0];
5637}
5638
27b0ccca
PN
5639/* Find the rank for an element by both score and key.
5640 * Returns 0 when the element cannot be found, rank otherwise.
5641 * Note that the rank is 1-based due to the span of zsl->header to the
5642 * first element. */
5643static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5644 zskiplistNode *x;
5645 unsigned long rank = 0;
5646 int i;
5647
5648 x = zsl->header;
5649 for (i = zsl->level-1; i >= 0; i--) {
5650 while (x->forward[i] &&
5651 (x->forward[i]->score < score ||
5652 (x->forward[i]->score == score &&
5653 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5654 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5655 x = x->forward[i];
5656 }
5657
5658 /* x might be equal to zsl->header, so test if obj is non-NULL */
bf028098 5659 if (x->obj && equalStringObjects(x->obj,o)) {
27b0ccca
PN
5660 return rank;
5661 }
5662 }
5663 return 0;
5664}
5665
e74825c2
PN
5666/* Finds an element by its rank. The rank argument needs to be 1-based. */
5667zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5668 zskiplistNode *x;
5669 unsigned long traversed = 0;
5670 int i;
5671
5672 x = zsl->header;
5673 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 5674 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5675 {
a50ea45c 5676 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5677 x = x->forward[i];
5678 }
e74825c2
PN
5679 if (traversed == rank) {
5680 return x;
5681 }
5682 }
5683 return NULL;
5684}
5685
fd8ccf44 5686/* The actual Z-commands implementations */
5687
7db723ad 5688/* This generic command implements both ZADD and ZINCRBY.
e2665397 5689 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5690 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5691static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5692 robj *zsetobj;
5693 zset *zs;
5694 double *score;
5695
e2665397 5696 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5697 if (zsetobj == NULL) {
5698 zsetobj = createZsetObject();
e2665397 5699 dictAdd(c->db->dict,key,zsetobj);
5700 incrRefCount(key);
fd8ccf44 5701 } else {
5702 if (zsetobj->type != REDIS_ZSET) {
5703 addReply(c,shared.wrongtypeerr);
5704 return;
5705 }
5706 }
fd8ccf44 5707 zs = zsetobj->ptr;
e2665397 5708
7db723ad 5709 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5710 * needs to handle the two different conditions. It's all about setting
5711 * '*score', that is, the new score to set, to the right value. */
5712 score = zmalloc(sizeof(double));
5713 if (doincrement) {
5714 dictEntry *de;
5715
5716 /* Read the old score. If the element was not present starts from 0 */
5717 de = dictFind(zs->dict,ele);
5718 if (de) {
5719 double *oldscore = dictGetEntryVal(de);
5720 *score = *oldscore + scoreval;
5721 } else {
5722 *score = scoreval;
5723 }
5724 } else {
5725 *score = scoreval;
5726 }
5727
5728 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5729 * to both ZADD and ZINCRBY... */
e2665397 5730 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5731 /* case 1: New element */
e2665397 5732 incrRefCount(ele); /* added to hash */
5733 zslInsert(zs->zsl,*score,ele);
5734 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5735 server.dirty++;
e2665397 5736 if (doincrement)
e2665397 5737 addReplyDouble(c,*score);
91d71bfc 5738 else
5739 addReply(c,shared.cone);
fd8ccf44 5740 } else {
5741 dictEntry *de;
5742 double *oldscore;
e0a62c7f 5743
fd8ccf44 5744 /* case 2: Score update operation */
e2665397 5745 de = dictFind(zs->dict,ele);
dfc5e96c 5746 redisAssert(de != NULL);
fd8ccf44 5747 oldscore = dictGetEntryVal(de);
5748 if (*score != *oldscore) {
5749 int deleted;
5750
e2665397 5751 /* Remove and insert the element in the skip list with new score */
5752 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5753 redisAssert(deleted != 0);
e2665397 5754 zslInsert(zs->zsl,*score,ele);
5755 incrRefCount(ele);
5756 /* Update the score in the hash table */
5757 dictReplace(zs->dict,ele,score);
fd8ccf44 5758 server.dirty++;
2161a965 5759 } else {
5760 zfree(score);
fd8ccf44 5761 }
e2665397 5762 if (doincrement)
5763 addReplyDouble(c,*score);
5764 else
5765 addReply(c,shared.czero);
fd8ccf44 5766 }
5767}
5768
e2665397 5769static void zaddCommand(redisClient *c) {
5770 double scoreval;
5771
bd79a6bd 5772 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5773 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5774}
5775
7db723ad 5776static void zincrbyCommand(redisClient *c) {
e2665397 5777 double scoreval;
5778
bd79a6bd 5779 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5780 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5781}
5782
1b7106e7 5783static void zremCommand(redisClient *c) {
5784 robj *zsetobj;
5785 zset *zs;
dd88747b 5786 dictEntry *de;
5787 double *oldscore;
5788 int deleted;
1b7106e7 5789
dd88747b 5790 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5791 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 5792
dd88747b 5793 zs = zsetobj->ptr;
5794 de = dictFind(zs->dict,c->argv[2]);
5795 if (de == NULL) {
5796 addReply(c,shared.czero);
5797 return;
1b7106e7 5798 }
dd88747b 5799 /* Delete from the skiplist */
5800 oldscore = dictGetEntryVal(de);
5801 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5802 redisAssert(deleted != 0);
5803
5804 /* Delete from the hash table */
5805 dictDelete(zs->dict,c->argv[2]);
5806 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5807 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5808 server.dirty++;
5809 addReply(c,shared.cone);
1b7106e7 5810}
5811
1807985b 5812static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
5813 double min;
5814 double max;
dd88747b 5815 long deleted;
1807985b 5816 robj *zsetobj;
5817 zset *zs;
5818
bd79a6bd
PN
5819 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5820 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
bbe025e0 5821
dd88747b 5822 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5823 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 5824
dd88747b 5825 zs = zsetobj->ptr;
5826 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5827 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5828 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5829 server.dirty += deleted;
482b672d 5830 addReplyLongLong(c,deleted);
1807985b 5831}
5832
9212eafd 5833static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
5834 long start;
5835 long end;
dd88747b 5836 int llen;
5837 long deleted;
9212eafd
PN
5838 robj *zsetobj;
5839 zset *zs;
5840
bd79a6bd
PN
5841 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5842 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 5843
dd88747b 5844 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5845 checkType(c,zsetobj,REDIS_ZSET)) return;
5846 zs = zsetobj->ptr;
5847 llen = zs->zsl->length;
9212eafd 5848
dd88747b 5849 /* convert negative indexes */
5850 if (start < 0) start = llen+start;
5851 if (end < 0) end = llen+end;
5852 if (start < 0) start = 0;
5853 if (end < 0) end = 0;
9212eafd 5854
dd88747b 5855 /* indexes sanity checks */
5856 if (start > end || start >= llen) {
5857 addReply(c,shared.czero);
5858 return;
9212eafd 5859 }
dd88747b 5860 if (end >= llen) end = llen-1;
5861
5862 /* increment start and end because zsl*Rank functions
5863 * use 1-based rank */
5864 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5865 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5866 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5867 server.dirty += deleted;
482b672d 5868 addReplyLongLong(c, deleted);
9212eafd
PN
5869}
5870
8f92e768
PN
5871typedef struct {
5872 dict *dict;
5873 double weight;
5874} zsetopsrc;
5875
5876static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5877 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5878 unsigned long size1, size2;
5879 size1 = d1->dict ? dictSize(d1->dict) : 0;
5880 size2 = d2->dict ? dictSize(d2->dict) : 0;
5881 return size1 - size2;
5882}
5883
d2764cd6
PN
/* Aggregation modes of ZUNIONSTORE/ZINTERSTORE: how the scores of an
 * element found in several inputs are combined. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
5887
5888inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5889 if (aggregate == REDIS_AGGR_SUM) {
5890 *target = *target + val;
5891 } else if (aggregate == REDIS_AGGR_MIN) {
5892 *target = val < *target ? val : *target;
5893 } else if (aggregate == REDIS_AGGR_MAX) {
5894 *target = val > *target ? val : *target;
5895 } else {
5896 /* safety net */
f83c6cb5 5897 redisPanic("Unknown ZUNION/INTER aggregate type");
d2764cd6
PN
5898 }
5899}
5900
2830ca53 5901static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
8f92e768 5902 int i, j, zsetnum;
d2764cd6 5903 int aggregate = REDIS_AGGR_SUM;
8f92e768 5904 zsetopsrc *src;
2830ca53
PN
5905 robj *dstobj;
5906 zset *dstzset;
b287c9bb
PN
5907 dictIterator *di;
5908 dictEntry *de;
5909
2830ca53
PN
5910 /* expect zsetnum input keys to be given */
5911 zsetnum = atoi(c->argv[2]->ptr);
5912 if (zsetnum < 1) {
5d373da9 5913 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
2830ca53 5914 return;
b287c9bb 5915 }
2830ca53
PN
5916
5917 /* test if the expected number of keys would overflow */
5918 if (3+zsetnum > c->argc) {
b287c9bb
PN
5919 addReply(c,shared.syntaxerr);
5920 return;
5921 }
5922
2830ca53 5923 /* read keys to be used for input */
b9eed483 5924 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
2830ca53 5925 for (i = 0, j = 3; i < zsetnum; i++, j++) {
b287c9bb
PN
5926 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5927 if (!zsetobj) {
8f92e768 5928 src[i].dict = NULL;
b287c9bb
PN
5929 } else {
5930 if (zsetobj->type != REDIS_ZSET) {
8f92e768 5931 zfree(src);
b287c9bb
PN
5932 addReply(c,shared.wrongtypeerr);
5933 return;
5934 }
8f92e768 5935 src[i].dict = ((zset*)zsetobj->ptr)->dict;
b287c9bb 5936 }
2830ca53
PN
5937
5938 /* default all weights to 1 */
8f92e768 5939 src[i].weight = 1.0;
b287c9bb
PN
5940 }
5941
2830ca53
PN
5942 /* parse optional extra arguments */
5943 if (j < c->argc) {
d2764cd6 5944 int remaining = c->argc - j;
b287c9bb 5945
2830ca53 5946 while (remaining) {
d2764cd6 5947 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 5948 j++; remaining--;
2830ca53 5949 for (i = 0; i < zsetnum; i++, j++, remaining--) {
bd79a6bd 5950 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
bbe025e0 5951 return;
2830ca53 5952 }
d2764cd6
PN
5953 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5954 j++; remaining--;
5955 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5956 aggregate = REDIS_AGGR_SUM;
5957 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5958 aggregate = REDIS_AGGR_MIN;
5959 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5960 aggregate = REDIS_AGGR_MAX;
5961 } else {
5962 zfree(src);
5963 addReply(c,shared.syntaxerr);
5964 return;
5965 }
5966 j++; remaining--;
2830ca53 5967 } else {
8f92e768 5968 zfree(src);
2830ca53
PN
5969 addReply(c,shared.syntaxerr);
5970 return;
5971 }
5972 }
5973 }
b287c9bb 5974
d2764cd6
PN
5975 /* sort sets from the smallest to largest, this will improve our
5976 * algorithm's performance */
5977 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5978
2830ca53
PN
5979 dstobj = createZsetObject();
5980 dstzset = dstobj->ptr;
5981
5982 if (op == REDIS_OP_INTER) {
8f92e768
PN
5983 /* skip going over all entries if the smallest zset is NULL or empty */
5984 if (src[0].dict && dictSize(src[0].dict) > 0) {
5985 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5986 * from small to large, all src[i > 0].dict are non-empty too */
5987 di = dictGetIterator(src[0].dict);
2830ca53 5988 while((de = dictNext(di)) != NULL) {
d2764cd6
PN
5989 double *score = zmalloc(sizeof(double)), value;
5990 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5991
d2764cd6
PN
5992 for (j = 1; j < zsetnum; j++) {
5993 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5994 if (other) {
d2764cd6
PN
5995 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5996 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5997 } else {
5998 break;
5999 }
6000 }
b287c9bb 6001
2830ca53 6002 /* skip entry when not present in every source dict */
8f92e768 6003 if (j != zsetnum) {
2830ca53
PN
6004 zfree(score);
6005 } else {
6006 robj *o = dictGetEntryKey(de);
6007 dictAdd(dstzset->dict,o,score);
6008 incrRefCount(o); /* added to dictionary */
6009 zslInsert(dstzset->zsl,*score,o);
6010 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
6011 }
6012 }
2830ca53
PN
6013 dictReleaseIterator(di);
6014 }
6015 } else if (op == REDIS_OP_UNION) {
6016 for (i = 0; i < zsetnum; i++) {
8f92e768 6017 if (!src[i].dict) continue;
2830ca53 6018
8f92e768 6019 di = dictGetIterator(src[i].dict);
2830ca53
PN
6020 while((de = dictNext(di)) != NULL) {
6021 /* skip key when already processed */
6022 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6023
d2764cd6
PN
6024 double *score = zmalloc(sizeof(double)), value;
6025 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
2830ca53 6026
d2764cd6
PN
6027 /* because the zsets are sorted by size, its only possible
6028 * for sets at larger indices to hold this entry */
6029 for (j = (i+1); j < zsetnum; j++) {
6030 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6031 if (other) {
d2764cd6
PN
6032 value = src[j].weight * (*(double*)dictGetEntryVal(other));
6033 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6034 }
6035 }
b287c9bb 6036
2830ca53
PN
6037 robj *o = dictGetEntryKey(de);
6038 dictAdd(dstzset->dict,o,score);
6039 incrRefCount(o); /* added to dictionary */
6040 zslInsert(dstzset->zsl,*score,o);
6041 incrRefCount(o); /* added to skiplist */
6042 }
6043 dictReleaseIterator(di);
b287c9bb 6044 }
2830ca53
PN
6045 } else {
6046 /* unknown operator */
6047 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
6048 }
6049
6050 deleteKey(c->db,dstkey);
3ea27d37 6051 if (dstzset->zsl->length) {
6052 dictAdd(c->db->dict,dstkey,dstobj);
6053 incrRefCount(dstkey);
482b672d 6054 addReplyLongLong(c, dstzset->zsl->length);
3ea27d37 6055 server.dirty++;
6056 } else {
8bca8773 6057 decrRefCount(dstobj);
3ea27d37 6058 addReply(c, shared.czero);
6059 }
8f92e768 6060 zfree(src);
b287c9bb
PN
6061}
6062
5d373da9 6063static void zunionstoreCommand(redisClient *c) {
2830ca53 6064 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
6065}
6066
5d373da9 6067static void zinterstoreCommand(redisClient *c) {
2830ca53 6068 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
6069}
6070
e3870fab 6071static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 6072 robj *o;
bbe025e0
AM
6073 long start;
6074 long end;
752da584 6075 int withscores = 0;
dd88747b 6076 int llen;
6077 int rangelen, j;
6078 zset *zsetobj;
6079 zskiplist *zsl;
6080 zskiplistNode *ln;
6081 robj *ele;
752da584 6082
bd79a6bd
PN
6083 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6084 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6085
752da584 6086 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6087 withscores = 1;
6088 } else if (c->argc >= 5) {
6089 addReply(c,shared.syntaxerr);
6090 return;
6091 }
cc812361 6092
4e27f268 6093 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6094 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 6095 zsetobj = o->ptr;
6096 zsl = zsetobj->zsl;
6097 llen = zsl->length;
cc812361 6098
dd88747b 6099 /* convert negative indexes */
6100 if (start < 0) start = llen+start;
6101 if (end < 0) end = llen+end;
6102 if (start < 0) start = 0;
6103 if (end < 0) end = 0;
cc812361 6104
dd88747b 6105 /* indexes sanity checks */
6106 if (start > end || start >= llen) {
6107 /* Out of range start or start > end result in empty list */
6108 addReply(c,shared.emptymultibulk);
6109 return;
6110 }
6111 if (end >= llen) end = llen-1;
6112 rangelen = (end-start)+1;
cc812361 6113
dd88747b 6114 /* check if starting point is trivial, before searching
6115 * the element in log(N) time */
6116 if (reverse) {
6117 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6118 } else {
6119 ln = start == 0 ?
6120 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6121 }
cc812361 6122
dd88747b 6123 /* Return the result in form of a multi-bulk reply */
6124 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6125 withscores ? (rangelen*2) : rangelen));
6126 for (j = 0; j < rangelen; j++) {
6127 ele = ln->obj;
6128 addReplyBulk(c,ele);
6129 if (withscores)
6130 addReplyDouble(c,ln->score);
6131 ln = reverse ? ln->backward : ln->forward[0];
cc812361 6132 }
6133}
6134
e3870fab 6135static void zrangeCommand(redisClient *c) {
6136 zrangeGenericCommand(c,0);
6137}
6138
6139static void zrevrangeCommand(redisClient *c) {
6140 zrangeGenericCommand(c,1);
6141}
6142
f44dd428 6143/* This command implements both ZRANGEBYSCORE and ZCOUNT.
6144 * If justcount is non-zero, just the count is returned. */
6145static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 6146 robj *o;
f44dd428 6147 double min, max;
6148 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 6149 int offset = 0, limit = -1;
0500ef27
SH
6150 int withscores = 0;
6151 int badsyntax = 0;
6152
f44dd428 6153 /* Parse the min-max interval. If one of the values is prefixed
6154 * by the "(" character, it's considered "open". For instance
6155 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6156 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6157 if (((char*)c->argv[2]->ptr)[0] == '(') {
6158 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6159 minex = 1;
6160 } else {
6161 min = strtod(c->argv[2]->ptr,NULL);
6162 }
6163 if (((char*)c->argv[3]->ptr)[0] == '(') {
6164 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6165 maxex = 1;
6166 } else {
6167 max = strtod(c->argv[3]->ptr,NULL);
6168 }
6169
6170 /* Parse "WITHSCORES": note that if the command was called with
6171 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6172 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 6173 if (c->argc == 5 || c->argc == 8) {
3a3978b1 6174 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6175 withscores = 1;
6176 else
6177 badsyntax = 1;
0500ef27 6178 }
3a3978b1 6179 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 6180 badsyntax = 1;
0500ef27 6181 if (badsyntax) {
454d4e43 6182 addReplySds(c,
6183 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 6184 return;
0500ef27
SH
6185 }
6186
f44dd428 6187 /* Parse "LIMIT" */
0500ef27 6188 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 6189 addReply(c,shared.syntaxerr);
6190 return;
0500ef27 6191 } else if (c->argc == (7 + withscores)) {
80181f78 6192 offset = atoi(c->argv[5]->ptr);
6193 limit = atoi(c->argv[6]->ptr);
0b13687c 6194 if (offset < 0) offset = 0;
80181f78 6195 }
50c55df5 6196
f44dd428 6197 /* Ok, lookup the key and get the range */
50c55df5 6198 o = lookupKeyRead(c->db,c->argv[1]);
6199 if (o == NULL) {
4e27f268 6200 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6201 } else {
6202 if (o->type != REDIS_ZSET) {
6203 addReply(c,shared.wrongtypeerr);
6204 } else {
6205 zset *zsetobj = o->ptr;
6206 zskiplist *zsl = zsetobj->zsl;
6207 zskiplistNode *ln;
f44dd428 6208 robj *ele, *lenobj = NULL;
6209 unsigned long rangelen = 0;
50c55df5 6210
f44dd428 6211 /* Get the first node with the score >= min, or with
6212 * score > min if 'minex' is true. */
50c55df5 6213 ln = zslFirstWithScore(zsl,min);
f44dd428 6214 while (minex && ln && ln->score == min) ln = ln->forward[0];
6215
50c55df5 6216 if (ln == NULL) {
6217 /* No element matching the speciifed interval */
f44dd428 6218 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6219 return;
6220 }
6221
6222 /* We don't know in advance how many matching elements there
6223 * are in the list, so we push this object that will represent
6224 * the multi-bulk length in the output buffer, and will "fix"
6225 * it later */
f44dd428 6226 if (!justcount) {
6227 lenobj = createObject(REDIS_STRING,NULL);
6228 addReply(c,lenobj);
6229 decrRefCount(lenobj);
6230 }
50c55df5 6231
f44dd428 6232 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 6233 if (offset) {
6234 offset--;
6235 ln = ln->forward[0];
6236 continue;
6237 }
6238 if (limit == 0) break;
f44dd428 6239 if (!justcount) {
6240 ele = ln->obj;
dd88747b 6241 addReplyBulk(c,ele);
f44dd428 6242 if (withscores)
6243 addReplyDouble(c,ln->score);
6244 }
50c55df5 6245 ln = ln->forward[0];
6246 rangelen++;
80181f78 6247 if (limit > 0) limit--;
50c55df5 6248 }
f44dd428 6249 if (justcount) {
482b672d 6250 addReplyLongLong(c,(long)rangelen);
f44dd428 6251 } else {
6252 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6253 withscores ? (rangelen*2) : rangelen);
6254 }
50c55df5 6255 }
6256 }
6257}
6258
f44dd428 6259static void zrangebyscoreCommand(redisClient *c) {
6260 genericZrangebyscoreCommand(c,0);
6261}
6262
6263static void zcountCommand(redisClient *c) {
6264 genericZrangebyscoreCommand(c,1);
6265}
6266
3c41331e 6267static void zcardCommand(redisClient *c) {
e197b441 6268 robj *o;
6269 zset *zs;
dd88747b 6270
6271 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6272 checkType(c,o,REDIS_ZSET)) return;
6273
6274 zs = o->ptr;
6275 addReplyUlong(c,zs->zsl->length);
e197b441 6276}
6277
6e333bbe 6278static void zscoreCommand(redisClient *c) {
6279 robj *o;
6280 zset *zs;
dd88747b 6281 dictEntry *de;
6282
6283 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6284 checkType(c,o,REDIS_ZSET)) return;
6285
6286 zs = o->ptr;
6287 de = dictFind(zs->dict,c->argv[2]);
6288 if (!de) {
96d8b4ee 6289 addReply(c,shared.nullbulk);
6e333bbe 6290 } else {
dd88747b 6291 double *score = dictGetEntryVal(de);
6e333bbe 6292
dd88747b 6293 addReplyDouble(c,*score);
6e333bbe 6294 }
6295}
6296
798d9e55 6297static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 6298 robj *o;
dd88747b 6299 zset *zs;
6300 zskiplist *zsl;
6301 dictEntry *de;
6302 unsigned long rank;
6303 double *score;
6304
6305 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6306 checkType(c,o,REDIS_ZSET)) return;
6307
6308 zs = o->ptr;
6309 zsl = zs->zsl;
6310 de = dictFind(zs->dict,c->argv[2]);
6311 if (!de) {
69d95c3e
PN
6312 addReply(c,shared.nullbulk);
6313 return;
6314 }
69d95c3e 6315
dd88747b 6316 score = dictGetEntryVal(de);
6317 rank = zslGetRank(zsl, *score, c->argv[2]);
6318 if (rank) {
6319 if (reverse) {
482b672d 6320 addReplyLongLong(c, zsl->length - rank);
27b0ccca 6321 } else {
482b672d 6322 addReplyLongLong(c, rank-1);
69d95c3e 6323 }
dd88747b 6324 } else {
6325 addReply(c,shared.nullbulk);
978c2c94 6326 }
6327}
6328
798d9e55
PN
6329static void zrankCommand(redisClient *c) {
6330 zrankGenericCommand(c, 0);
6331}
6332
6333static void zrevrankCommand(redisClient *c) {
6334 zrankGenericCommand(c, 1);
6335}
6336
7fb16bac
PN
6337/* ========================= Hashes utility functions ======================= */
6338#define REDIS_HASH_KEY 1
6339#define REDIS_HASH_VALUE 2
978c2c94 6340
7fb16bac
PN
6341/* Check the length of a number of objects to see if we need to convert a
6342 * zipmap to a real hash. Note that we only check string encoded objects
6343 * as their string length can be queried in constant time. */
6344static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6345 int i;
6346 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
978c2c94 6347
7fb16bac
PN
6348 for (i = start; i <= end; i++) {
6349 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6350 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6351 {
6352 convertToRealHash(subject);
978c2c94 6353 return;
6354 }
6355 }
7fb16bac 6356}
bae2c7ec 6357
97224de7
PN
6358/* Encode given objects in-place when the hash uses a dict. */
6359static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6360 if (subject->encoding == REDIS_ENCODING_HT) {
3f973463
PN
6361 if (o1) *o1 = tryObjectEncoding(*o1);
6362 if (o2) *o2 = tryObjectEncoding(*o2);
97224de7
PN
6363 }
6364}
6365
7fb16bac 6366/* Get the value from a hash identified by key. Returns either a string
a3f3af86
PN
6367 * object or NULL if the value cannot be found. The refcount of the object
6368 * is always increased by 1 when the value was found. */
7fb16bac
PN
6369static robj *hashGet(robj *o, robj *key) {
6370 robj *value = NULL;
978c2c94 6371 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7fb16bac
PN
6372 unsigned char *v;
6373 unsigned int vlen;
6374 key = getDecodedObject(key);
6375 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6376 value = createStringObject((char*)v,vlen);
6377 }
6378 decrRefCount(key);
6379 } else {
6380 dictEntry *de = dictFind(o->ptr,key);
6381 if (de != NULL) {
6382 value = dictGetEntryVal(de);
a3f3af86 6383 incrRefCount(value);
7fb16bac
PN
6384 }
6385 }
6386 return value;
6387}
978c2c94 6388
7fb16bac
PN
6389/* Test if the key exists in the given hash. Returns 1 if the key
6390 * exists and 0 when it doesn't. */
6391static int hashExists(robj *o, robj *key) {
6392 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6393 key = getDecodedObject(key);
6394 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6395 decrRefCount(key);
6396 return 1;
6397 }
6398 decrRefCount(key);
6399 } else {
6400 if (dictFind(o->ptr,key) != NULL) {
6401 return 1;
6402 }
6403 }
6404 return 0;
6405}
bae2c7ec 6406
7fb16bac
PN
6407/* Add an element, discard the old if the key already exists.
6408 * Return 0 on insert and 1 on update. */
feb8d7e6 6409static int hashSet(robj *o, robj *key, robj *value) {
7fb16bac
PN
6410 int update = 0;
6411 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6412 key = getDecodedObject(key);
6413 value = getDecodedObject(value);
6414 o->ptr = zipmapSet(o->ptr,
6415 key->ptr,sdslen(key->ptr),
6416 value->ptr,sdslen(value->ptr), &update);
6417 decrRefCount(key);
6418 decrRefCount(value);
6419
6420 /* Check if the zipmap needs to be upgraded to a real hash table */
6421 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
bae2c7ec 6422 convertToRealHash(o);
978c2c94 6423 } else {
7fb16bac
PN
6424 if (dictReplace(o->ptr,key,value)) {
6425 /* Insert */
6426 incrRefCount(key);
978c2c94 6427 } else {
7fb16bac 6428 /* Update */
978c2c94 6429 update = 1;
6430 }
7fb16bac 6431 incrRefCount(value);
978c2c94 6432 }
7fb16bac 6433 return update;
978c2c94 6434}
6435
7fb16bac
PN
6436/* Delete an element from a hash.
6437 * Return 1 on deleted and 0 on not found. */
6438static int hashDelete(robj *o, robj *key) {
6439 int deleted = 0;
6440 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6441 key = getDecodedObject(key);
6442 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6443 decrRefCount(key);
6444 } else {
6445 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6446 /* Always check if the dictionary needs a resize after a delete. */
6447 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
d33278d1 6448 }
7fb16bac
PN
6449 return deleted;
6450}
d33278d1 6451
7fb16bac 6452/* Return the number of elements in a hash. */
c811bb38 6453static unsigned long hashLength(robj *o) {
7fb16bac
PN
6454 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6455 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6456}
6457
6458/* Structure to hold hash iteration abstration. Note that iteration over
6459 * hashes involves both fields and values. Because it is possible that
6460 * not both are required, store pointers in the iterator to avoid
6461 * unnecessary memory allocation for fields/values. */
6462typedef struct {
6463 int encoding;
6464 unsigned char *zi;
6465 unsigned char *zk, *zv;
6466 unsigned int zklen, zvlen;
6467
6468 dictIterator *di;
6469 dictEntry *de;
6470} hashIterator;
6471
c44d3b56
PN
6472static hashIterator *hashInitIterator(robj *subject) {
6473 hashIterator *hi = zmalloc(sizeof(hashIterator));
7fb16bac
PN
6474 hi->encoding = subject->encoding;
6475 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6476 hi->zi = zipmapRewind(subject->ptr);
6477 } else if (hi->encoding == REDIS_ENCODING_HT) {
6478 hi->di = dictGetIterator(subject->ptr);
d33278d1 6479 } else {
7fb16bac 6480 redisAssert(NULL);
d33278d1 6481 }
c44d3b56 6482 return hi;
7fb16bac 6483}
d33278d1 6484
7fb16bac
PN
6485static void hashReleaseIterator(hashIterator *hi) {
6486 if (hi->encoding == REDIS_ENCODING_HT) {
6487 dictReleaseIterator(hi->di);
d33278d1 6488 }
c44d3b56 6489 zfree(hi);
7fb16bac 6490}
d33278d1 6491
7fb16bac
PN
6492/* Move to the next entry in the hash. Return REDIS_OK when the next entry
6493 * could be found and REDIS_ERR when the iterator reaches the end. */
c811bb38 6494static int hashNext(hashIterator *hi) {
7fb16bac
PN
6495 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6496 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6497 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6498 } else {
6499 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6500 }
6501 return REDIS_OK;
6502}
d33278d1 6503
0c390abc 6504/* Get key or value object at current iteration position.
a3f3af86 6505 * This increases the refcount of the field object by 1. */
c811bb38 6506static robj *hashCurrent(hashIterator *hi, int what) {
7fb16bac
PN
6507 robj *o;
6508 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6509 if (what & REDIS_HASH_KEY) {
6510 o = createStringObject((char*)hi->zk,hi->zklen);
6511 } else {
6512 o = createStringObject((char*)hi->zv,hi->zvlen);
d33278d1 6513 }
d33278d1 6514 } else {
7fb16bac
PN
6515 if (what & REDIS_HASH_KEY) {
6516 o = dictGetEntryKey(hi->de);
6517 } else {
6518 o = dictGetEntryVal(hi->de);
d33278d1 6519 }
a3f3af86 6520 incrRefCount(o);
d33278d1 6521 }
7fb16bac 6522 return o;
d33278d1
PN
6523}
6524
7fb16bac
PN
6525static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6526 robj *o = lookupKeyWrite(c->db,key);
01426b05
PN
6527 if (o == NULL) {
6528 o = createHashObject();
7fb16bac
PN
6529 dictAdd(c->db->dict,key,o);
6530 incrRefCount(key);
01426b05
PN
6531 } else {
6532 if (o->type != REDIS_HASH) {
6533 addReply(c,shared.wrongtypeerr);
7fb16bac 6534 return NULL;
01426b05
PN
6535 }
6536 }
7fb16bac
PN
6537 return o;
6538}
01426b05 6539
7fb16bac
PN
6540/* ============================= Hash commands ============================== */
6541static void hsetCommand(redisClient *c) {
6e9e463f 6542 int update;
7fb16bac 6543 robj *o;
bbe025e0 6544
7fb16bac
PN
6545 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6546 hashTryConversion(o,c->argv,2,3);
97224de7 6547 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6548 update = hashSet(o,c->argv[2],c->argv[3]);
6e9e463f 6549 addReply(c, update ? shared.czero : shared.cone);
7fb16bac
PN
6550 server.dirty++;
6551}
01426b05 6552
1f1c7695
PN
6553static void hsetnxCommand(redisClient *c) {
6554 robj *o;
6555 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6556 hashTryConversion(o,c->argv,2,3);
6557
6558 if (hashExists(o, c->argv[2])) {
6559 addReply(c, shared.czero);
01426b05 6560 } else {
97224de7 6561 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6562 hashSet(o,c->argv[2],c->argv[3]);
1f1c7695
PN
6563 addReply(c, shared.cone);
6564 server.dirty++;
6565 }
6566}
01426b05 6567
7fb16bac
PN
6568static void hmsetCommand(redisClient *c) {
6569 int i;
6570 robj *o;
01426b05 6571
7fb16bac
PN
6572 if ((c->argc % 2) == 1) {
6573 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6574 return;
6575 }
01426b05 6576
7fb16bac
PN
6577 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6578 hashTryConversion(o,c->argv,2,c->argc-1);
6579 for (i = 2; i < c->argc; i += 2) {
97224de7 6580 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
feb8d7e6 6581 hashSet(o,c->argv[i],c->argv[i+1]);
7fb16bac
PN
6582 }
6583 addReply(c, shared.ok);
edc2f63a 6584 server.dirty++;
7fb16bac
PN
6585}
6586
6587static void hincrbyCommand(redisClient *c) {
6588 long long value, incr;
6589 robj *o, *current, *new;
6590
bd79a6bd 6591 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7fb16bac
PN
6592 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6593 if ((current = hashGet(o,c->argv[2])) != NULL) {
946342c1
PN
6594 if (getLongLongFromObjectOrReply(c,current,&value,
6595 "hash value is not an integer") != REDIS_OK) {
6596 decrRefCount(current);
6597 return;
6598 }
a3f3af86 6599 decrRefCount(current);
7fb16bac
PN
6600 } else {
6601 value = 0;
01426b05
PN
6602 }
6603
7fb16bac 6604 value += incr;
3f973463
PN
6605 new = createStringObjectFromLongLong(value);
6606 hashTryObjectEncoding(o,&c->argv[2],NULL);
feb8d7e6 6607 hashSet(o,c->argv[2],new);
7fb16bac
PN
6608 decrRefCount(new);
6609 addReplyLongLong(c,value);
01426b05 6610 server.dirty++;
01426b05
PN
6611}
6612
978c2c94 6613static void hgetCommand(redisClient *c) {
7fb16bac 6614 robj *o, *value;
dd88747b 6615 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6616 checkType(c,o,REDIS_HASH)) return;
6617
7fb16bac
PN
6618 if ((value = hashGet(o,c->argv[2])) != NULL) {
6619 addReplyBulk(c,value);
a3f3af86 6620 decrRefCount(value);
dd88747b 6621 } else {
7fb16bac 6622 addReply(c,shared.nullbulk);
69d95c3e 6623 }
69d95c3e
PN
6624}
6625
09aeb579
PN
6626static void hmgetCommand(redisClient *c) {
6627 int i;
7fb16bac
PN
6628 robj *o, *value;
6629 o = lookupKeyRead(c->db,c->argv[1]);
6630 if (o != NULL && o->type != REDIS_HASH) {
6631 addReply(c,shared.wrongtypeerr);
09aeb579
PN
6632 }
6633
7fb16bac
PN
6634 /* Note the check for o != NULL happens inside the loop. This is
6635 * done because objects that cannot be found are considered to be
6636 * an empty hash. The reply should then be a series of NULLs. */
09aeb579 6637 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7fb16bac
PN
6638 for (i = 2; i < c->argc; i++) {
6639 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6640 addReplyBulk(c,value);
a3f3af86 6641 decrRefCount(value);
7fb16bac
PN
6642 } else {
6643 addReply(c,shared.nullbulk);
09aeb579
PN
6644 }
6645 }
6646}
6647
07efaf74 6648static void hdelCommand(redisClient *c) {
dd88747b 6649 robj *o;
dd88747b 6650 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6651 checkType(c,o,REDIS_HASH)) return;
07efaf74 6652
7fb16bac
PN
6653 if (hashDelete(o,c->argv[2])) {
6654 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6655 addReply(c,shared.cone);
6656 server.dirty++;
dd88747b 6657 } else {
7fb16bac 6658 addReply(c,shared.czero);
07efaf74 6659 }
6660}
6661
92b27fe9 6662static void hlenCommand(redisClient *c) {
6663 robj *o;
dd88747b 6664 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 6665 checkType(c,o,REDIS_HASH)) return;
6666
7fb16bac 6667 addReplyUlong(c,hashLength(o));
92b27fe9 6668}
6669
78409a0f 6670static void genericHgetallCommand(redisClient *c, int flags) {
7fb16bac 6671 robj *o, *lenobj, *obj;
78409a0f 6672 unsigned long count = 0;
c44d3b56 6673 hashIterator *hi;
78409a0f 6674
4e27f268 6675 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 6676 || checkType(c,o,REDIS_HASH)) return;
6677
6678 lenobj = createObject(REDIS_STRING,NULL);
6679 addReply(c,lenobj);
6680 decrRefCount(lenobj);
6681
c44d3b56
PN
6682 hi = hashInitIterator(o);
6683 while (hashNext(hi) != REDIS_ERR) {
7fb16bac 6684 if (flags & REDIS_HASH_KEY) {
c44d3b56 6685 obj = hashCurrent(hi,REDIS_HASH_KEY);
7fb16bac 6686 addReplyBulk(c,obj);
a3f3af86 6687 decrRefCount(obj);
7fb16bac 6688 count++;
78409a0f 6689 }
7fb16bac 6690 if (flags & REDIS_HASH_VALUE) {
c44d3b56 6691 obj = hashCurrent(hi,REDIS_HASH_VALUE);
7fb16bac 6692 addReplyBulk(c,obj);
a3f3af86 6693 decrRefCount(obj);
7fb16bac 6694 count++;
78409a0f 6695 }
78409a0f 6696 }
c44d3b56 6697 hashReleaseIterator(hi);
7fb16bac 6698
78409a0f 6699 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6700}
6701
6702static void hkeysCommand(redisClient *c) {
7fb16bac 6703 genericHgetallCommand(c,REDIS_HASH_KEY);
78409a0f 6704}
6705
6706static void hvalsCommand(redisClient *c) {
7fb16bac 6707 genericHgetallCommand(c,REDIS_HASH_VALUE);
78409a0f 6708}
6709
6710static void hgetallCommand(redisClient *c) {
7fb16bac 6711 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
78409a0f 6712}
6713
a86f14b1 6714static void hexistsCommand(redisClient *c) {
6715 robj *o;
a86f14b1 6716 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6717 checkType(c,o,REDIS_HASH)) return;
6718
7fb16bac 6719 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
a86f14b1 6720}
6721
ada386b2 6722static void convertToRealHash(robj *o) {
6723 unsigned char *key, *val, *p, *zm = o->ptr;
6724 unsigned int klen, vlen;
6725 dict *dict = dictCreate(&hashDictType,NULL);
6726
6727 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6728 p = zipmapRewind(zm);
6729 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6730 robj *keyobj, *valobj;
6731
6732 keyobj = createStringObject((char*)key,klen);
6733 valobj = createStringObject((char*)val,vlen);
05df7621 6734 keyobj = tryObjectEncoding(keyobj);
6735 valobj = tryObjectEncoding(valobj);
ada386b2 6736 dictAdd(dict,keyobj,valobj);
6737 }
6738 o->encoding = REDIS_ENCODING_HT;
6739 o->ptr = dict;
6740 zfree(zm);
6741}
6742
6b47e12e 6743/* ========================= Non type-specific commands ==================== */
6744
ed9b544e 6745static void flushdbCommand(redisClient *c) {
ca37e9cd 6746 server.dirty += dictSize(c->db->dict);
3305306f 6747 dictEmpty(c->db->dict);
6748 dictEmpty(c->db->expires);
ed9b544e 6749 addReply(c,shared.ok);
ed9b544e 6750}
6751
6752static void flushallCommand(redisClient *c) {
ca37e9cd 6753 server.dirty += emptyDb();
ed9b544e 6754 addReply(c,shared.ok);
500ece7c 6755 if (server.bgsavechildpid != -1) {
6756 kill(server.bgsavechildpid,SIGKILL);
6757 rdbRemoveTempFile(server.bgsavechildpid);
6758 }
f78fd11b 6759 rdbSave(server.dbfilename);
ca37e9cd 6760 server.dirty++;
ed9b544e 6761}
6762
56906eef 6763static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 6764 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 6765 so->type = type;
6766 so->pattern = pattern;
6767 return so;
6768}
6769
6770/* Return the value associated to the key with a name obtained
55017f9d
PN
6771 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6772 * The returned object will always have its refcount increased by 1
6773 * when it is non-NULL. */
56906eef 6774static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6d7d1370 6775 char *p, *f;
ed9b544e 6776 sds spat, ssub;
6d7d1370
PN
6777 robj keyobj, fieldobj, *o;
6778 int prefixlen, sublen, postfixlen, fieldlen;
ed9b544e 6779 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6780 struct {
f1017b3f 6781 long len;
6782 long free;
ed9b544e 6783 char buf[REDIS_SORTKEY_MAX+1];
6d7d1370 6784 } keyname, fieldname;
ed9b544e 6785
28173a49 6786 /* If the pattern is "#" return the substitution object itself in order
6787 * to implement the "SORT ... GET #" feature. */
6788 spat = pattern->ptr;
6789 if (spat[0] == '#' && spat[1] == '\0') {
55017f9d 6790 incrRefCount(subst);
28173a49 6791 return subst;
6792 }
6793
6794 /* The substitution object may be specially encoded. If so we create
9d65a1bb 6795 * a decoded object on the fly. Otherwise getDecodedObject will just
6796 * increment the ref count, that we'll decrement later. */
6797 subst = getDecodedObject(subst);
942a3961 6798
ed9b544e 6799 ssub = subst->ptr;
6800 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6801 p = strchr(spat,'*');
ed5a857a 6802 if (!p) {
6803 decrRefCount(subst);
6804 return NULL;
6805 }
ed9b544e 6806
6d7d1370
PN
6807 /* Find out if we're dealing with a hash dereference. */
6808 if ((f = strstr(p+1, "->")) != NULL) {
6809 fieldlen = sdslen(spat)-(f-spat);
6810 /* this also copies \0 character */
6811 memcpy(fieldname.buf,f+2,fieldlen-1);
6812 fieldname.len = fieldlen-2;
6813 } else {
6814 fieldlen = 0;
6815 }
6816
ed9b544e 6817 prefixlen = p-spat;
6818 sublen = sdslen(ssub);
6d7d1370 6819 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
ed9b544e 6820 memcpy(keyname.buf,spat,prefixlen);
6821 memcpy(keyname.buf+prefixlen,ssub,sublen);
6822 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6823 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6824 keyname.len = prefixlen+sublen+postfixlen;
942a3961 6825 decrRefCount(subst);
6826
6d7d1370
PN
6827 /* Lookup substituted key */
6828 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6829 o = lookupKeyRead(db,&keyobj);
55017f9d
PN
6830 if (o == NULL) return NULL;
6831
6832 if (fieldlen > 0) {
6833 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6d7d1370 6834
705dad38
PN
6835 /* Retrieve value from hash by the field name. This operation
6836 * already increases the refcount of the returned object. */
6d7d1370
PN
6837 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6838 o = hashGet(o, &fieldobj);
705dad38 6839 } else {
55017f9d 6840 if (o->type != REDIS_STRING) return NULL;
b6f07345 6841
705dad38
PN
6842 /* Every object that this function returns needs to have its refcount
6843 * increased. sortCommand decreases it again. */
6844 incrRefCount(o);
6d7d1370
PN
6845 }
6846
6847 return o;
ed9b544e 6848}
6849
6850/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6851 * the additional parameter is not standard but a BSD-specific we have to
6852 * pass sorting parameters via the global 'server' structure */
6853static int sortCompare(const void *s1, const void *s2) {
6854 const redisSortObject *so1 = s1, *so2 = s2;
6855 int cmp;
6856
6857 if (!server.sort_alpha) {
6858 /* Numeric sorting. Here it's trivial as we precomputed scores */
6859 if (so1->u.score > so2->u.score) {
6860 cmp = 1;
6861 } else if (so1->u.score < so2->u.score) {
6862 cmp = -1;
6863 } else {
6864 cmp = 0;
6865 }
6866 } else {
6867 /* Alphanumeric sorting */
6868 if (server.sort_bypattern) {
6869 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6870 /* At least one compare object is NULL */
6871 if (so1->u.cmpobj == so2->u.cmpobj)
6872 cmp = 0;
6873 else if (so1->u.cmpobj == NULL)
6874 cmp = -1;
6875 else
6876 cmp = 1;
6877 } else {
6878 /* We have both the objects, use strcoll */
6879 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6880 }
6881 } else {
08ee9b57 6882 /* Compare elements directly. */
6883 cmp = compareStringObjects(so1->obj,so2->obj);
ed9b544e 6884 }
6885 }
6886 return server.sort_desc ? -cmp : cmp;
6887}
6888
6889/* The SORT command is the most complex command in Redis. Warning: this code
6890 * is optimized for speed and a bit less for readability */
6891static void sortCommand(redisClient *c) {
ed9b544e 6892 list *operations;
6893 int outputlen = 0;
6894 int desc = 0, alpha = 0;
6895 int limit_start = 0, limit_count = -1, start, end;
6896 int j, dontsort = 0, vectorlen;
6897 int getop = 0; /* GET operation counter */
443c6409 6898 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 6899 redisSortObject *vector; /* Resulting vector to sort */
6900
6901 /* Lookup the key to sort. It must be of the right types */
3305306f 6902 sortval = lookupKeyRead(c->db,c->argv[1]);
6903 if (sortval == NULL) {
4e27f268 6904 addReply(c,shared.emptymultibulk);
ed9b544e 6905 return;
6906 }
a5eb649b 6907 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6908 sortval->type != REDIS_ZSET)
6909 {
c937aa89 6910 addReply(c,shared.wrongtypeerr);
ed9b544e 6911 return;
6912 }
6913
6914 /* Create a list of operations to perform for every sorted element.
6915 * Operations can be GET/DEL/INCR/DECR */
6916 operations = listCreate();
092dac2a 6917 listSetFreeMethod(operations,zfree);
ed9b544e 6918 j = 2;
6919
6920 /* Now we need to protect sortval incrementing its count, in the future
6921 * SORT may have options able to overwrite/delete keys during the sorting
6922 * and the sorted key itself may get destroied */
6923 incrRefCount(sortval);
6924
6925 /* The SORT command has an SQL-alike syntax, parse it */
6926 while(j < c->argc) {
6927 int leftargs = c->argc-j-1;
6928 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6929 desc = 0;
6930 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6931 desc = 1;
6932 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6933 alpha = 1;
6934 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6935 limit_start = atoi(c->argv[j+1]->ptr);
6936 limit_count = atoi(c->argv[j+2]->ptr);
6937 j+=2;
443c6409 6938 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6939 storekey = c->argv[j+1];
6940 j++;
ed9b544e 6941 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6942 sortby = c->argv[j+1];
6943 /* If the BY pattern does not contain '*', i.e. it is constant,
6944 * we don't need to sort nor to lookup the weight keys. */
6945 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6946 j++;
6947 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6948 listAddNodeTail(operations,createSortOperation(
6949 REDIS_SORT_GET,c->argv[j+1]));
6950 getop++;
6951 j++;
ed9b544e 6952 } else {
6953 decrRefCount(sortval);
6954 listRelease(operations);
c937aa89 6955 addReply(c,shared.syntaxerr);
ed9b544e 6956 return;
6957 }
6958 j++;
6959 }
6960
6961 /* Load the sorting vector with all the objects to sort */
a5eb649b 6962 switch(sortval->type) {
6963 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6964 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6965 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
f83c6cb5 6966 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
a5eb649b 6967 }
ed9b544e 6968 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 6969 j = 0;
a5eb649b 6970
ed9b544e 6971 if (sortval->type == REDIS_LIST) {
6972 list *list = sortval->ptr;
6208b3a7 6973 listNode *ln;
c7df85a4 6974 listIter li;
6208b3a7 6975
c7df85a4 6976 listRewind(list,&li);
6977 while((ln = listNext(&li))) {
ed9b544e 6978 robj *ele = ln->value;
6979 vector[j].obj = ele;
6980 vector[j].u.score = 0;
6981 vector[j].u.cmpobj = NULL;
ed9b544e 6982 j++;
6983 }
6984 } else {
a5eb649b 6985 dict *set;
ed9b544e 6986 dictIterator *di;
6987 dictEntry *setele;
6988
a5eb649b 6989 if (sortval->type == REDIS_SET) {
6990 set = sortval->ptr;
6991 } else {
6992 zset *zs = sortval->ptr;
6993 set = zs->dict;
6994 }
6995
ed9b544e 6996 di = dictGetIterator(set);
ed9b544e 6997 while((setele = dictNext(di)) != NULL) {
6998 vector[j].obj = dictGetEntryKey(setele);
6999 vector[j].u.score = 0;
7000 vector[j].u.cmpobj = NULL;
7001 j++;
7002 }
7003 dictReleaseIterator(di);
7004 }
dfc5e96c 7005 redisAssert(j == vectorlen);
ed9b544e 7006
7007 /* Now it's time to load the right scores in the sorting vector */
7008 if (dontsort == 0) {
7009 for (j = 0; j < vectorlen; j++) {
6d7d1370 7010 robj *byval;
ed9b544e 7011 if (sortby) {
6d7d1370 7012 /* lookup value to sort by */
3305306f 7013 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
705dad38 7014 if (!byval) continue;
ed9b544e 7015 } else {
6d7d1370
PN
7016 /* use object itself to sort by */
7017 byval = vector[j].obj;
7018 }
7019
7020 if (alpha) {
08ee9b57 7021 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6d7d1370
PN
7022 } else {
7023 if (byval->encoding == REDIS_ENCODING_RAW) {
7024 vector[j].u.score = strtod(byval->ptr,NULL);
16fa22f1 7025 } else if (byval->encoding == REDIS_ENCODING_INT) {
6d7d1370
PN
7026 /* Don't need to decode the object if it's
7027 * integer-encoded (the only encoding supported) so
7028 * far. We can just cast it */
16fa22f1
PN
7029 vector[j].u.score = (long)byval->ptr;
7030 } else {
7031 redisAssert(1 != 1);
942a3961 7032 }
ed9b544e 7033 }
6d7d1370 7034
705dad38
PN
7035 /* when the object was retrieved using lookupKeyByPattern,
7036 * its refcount needs to be decreased. */
7037 if (sortby) {
7038 decrRefCount(byval);
ed9b544e 7039 }
7040 }
7041 }
7042
7043 /* We are ready to sort the vector... perform a bit of sanity check
7044 * on the LIMIT option too. We'll use a partial version of quicksort. */
7045 start = (limit_start < 0) ? 0 : limit_start;
7046 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7047 if (start >= vectorlen) {
7048 start = vectorlen-1;
7049 end = vectorlen-2;
7050 }
7051 if (end >= vectorlen) end = vectorlen-1;
7052
7053 if (dontsort == 0) {
7054 server.sort_desc = desc;
7055 server.sort_alpha = alpha;
7056 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 7057 if (sortby && (start != 0 || end != vectorlen-1))
7058 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7059 else
7060 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 7061 }
7062
7063 /* Send command output to the output buffer, performing the specified
7064 * GET/DEL/INCR/DECR operations if any. */
7065 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 7066 if (storekey == NULL) {
7067 /* STORE option not specified, sent the sorting result to client */
7068 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7069 for (j = start; j <= end; j++) {
7070 listNode *ln;
c7df85a4 7071 listIter li;
7072
dd88747b 7073 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 7074 listRewind(operations,&li);
7075 while((ln = listNext(&li))) {
443c6409 7076 redisSortOperation *sop = ln->value;
7077 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7078 vector[j].obj);
7079
7080 if (sop->type == REDIS_SORT_GET) {
55017f9d 7081 if (!val) {
443c6409 7082 addReply(c,shared.nullbulk);
7083 } else {
dd88747b 7084 addReplyBulk(c,val);
55017f9d 7085 decrRefCount(val);
443c6409 7086 }
7087 } else {
dfc5e96c 7088 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 7089 }
7090 }
ed9b544e 7091 }
443c6409 7092 } else {
7093 robj *listObject = createListObject();
7094 list *listPtr = (list*) listObject->ptr;
7095
7096 /* STORE option specified, set the sorting result as a List object */
7097 for (j = start; j <= end; j++) {
7098 listNode *ln;
c7df85a4 7099 listIter li;
7100
443c6409 7101 if (!getop) {
7102 listAddNodeTail(listPtr,vector[j].obj);
7103 incrRefCount(vector[j].obj);
7104 }
c7df85a4 7105 listRewind(operations,&li);
7106 while((ln = listNext(&li))) {
443c6409 7107 redisSortOperation *sop = ln->value;
7108 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7109 vector[j].obj);
7110
7111 if (sop->type == REDIS_SORT_GET) {
55017f9d 7112 if (!val) {
443c6409 7113 listAddNodeTail(listPtr,createStringObject("",0));
7114 } else {
55017f9d
PN
7115 /* We should do a incrRefCount on val because it is
7116 * added to the list, but also a decrRefCount because
7117 * it is returned by lookupKeyByPattern. This results
7118 * in doing nothing at all. */
443c6409 7119 listAddNodeTail(listPtr,val);
443c6409 7120 }
ed9b544e 7121 } else {
dfc5e96c 7122 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 7123 }
ed9b544e 7124 }
ed9b544e 7125 }
121796f7 7126 if (dictReplace(c->db->dict,storekey,listObject)) {
7127 incrRefCount(storekey);
7128 }
443c6409 7129 /* Note: we add 1 because the DB is dirty anyway since even if the
7130 * SORT result is empty a new key is set and maybe the old content
7131 * replaced. */
7132 server.dirty += 1+outputlen;
7133 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 7134 }
7135
7136 /* Cleanup */
7137 decrRefCount(sortval);
7138 listRelease(operations);
7139 for (j = 0; j < vectorlen; j++) {
16fa22f1 7140 if (alpha && vector[j].u.cmpobj)
ed9b544e 7141 decrRefCount(vector[j].u.cmpobj);
7142 }
7143 zfree(vector);
7144}
7145
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer: it always receives a nul terminated string.
 * The longest possible output is "16777216.00T", so 13 bytes are enough.
 * 'n' is the number of bytes to format.
 *
 * Values below 1024 are printed as an integer followed by 'B', larger values
 * are scaled by powers of 1024 and printed with two decimals followed by
 * K, M, G or T. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* One terabyte or more. Without this branch 's' was left untouched
         * and the caller ended up printing an uninitialized buffer. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
7166
1c85b79f 7167/* Create the string returned by the INFO command. This is decoupled
7168 * by the INFO command itself as we need to report the same information
7169 * on memory corruption problems. */
7170static sds genRedisInfoString(void) {
ed9b544e 7171 sds info;
7172 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 7173 int j;
ec6c7a1d 7174 char hmem[64];
55a8298f 7175
b72f6a4b 7176 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 7177 info = sdscatprintf(sdsempty(),
7178 "redis_version:%s\r\n"
5436146c
PN
7179 "redis_git_sha1:%s\r\n"
7180 "redis_git_dirty:%d\r\n"
f1017b3f 7181 "arch_bits:%s\r\n"
7a932b74 7182 "multiplexing_api:%s\r\n"
0d7170a4 7183 "process_id:%ld\r\n"
682ac724 7184 "uptime_in_seconds:%ld\r\n"
7185 "uptime_in_days:%ld\r\n"
ed9b544e 7186 "connected_clients:%d\r\n"
7187 "connected_slaves:%d\r\n"
f86a74e9 7188 "blocked_clients:%d\r\n"
5fba9f71 7189 "used_memory:%zu\r\n"
ec6c7a1d 7190 "used_memory_human:%s\r\n"
ed9b544e 7191 "changes_since_last_save:%lld\r\n"
be2bb6b0 7192 "bgsave_in_progress:%d\r\n"
682ac724 7193 "last_save_time:%ld\r\n"
b3fad521 7194 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 7195 "total_connections_received:%lld\r\n"
7196 "total_commands_processed:%lld\r\n"
2a6a2ed1 7197 "expired_keys:%lld\r\n"
3be2c9d7 7198 "hash_max_zipmap_entries:%zu\r\n"
7199 "hash_max_zipmap_value:%zu\r\n"
ffc6b7f8 7200 "pubsub_channels:%ld\r\n"
7201 "pubsub_patterns:%u\r\n"
7d98e08c 7202 "vm_enabled:%d\r\n"
a0f643ea 7203 "role:%s\r\n"
ed9b544e 7204 ,REDIS_VERSION,
5436146c 7205 REDIS_GIT_SHA1,
274e45e3 7206 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
f1017b3f 7207 (sizeof(long) == 8) ? "64" : "32",
7a932b74 7208 aeGetApiName(),
0d7170a4 7209 (long) getpid(),
a0f643ea 7210 uptime,
7211 uptime/(3600*24),
ed9b544e 7212 listLength(server.clients)-listLength(server.slaves),
7213 listLength(server.slaves),
d5d55fc3 7214 server.blpop_blocked_clients,
b72f6a4b 7215 zmalloc_used_memory(),
ec6c7a1d 7216 hmem,
ed9b544e 7217 server.dirty,
9d65a1bb 7218 server.bgsavechildpid != -1,
ed9b544e 7219 server.lastsave,
b3fad521 7220 server.bgrewritechildpid != -1,
ed9b544e 7221 server.stat_numconnections,
7222 server.stat_numcommands,
2a6a2ed1 7223 server.stat_expiredkeys,
55a8298f 7224 server.hash_max_zipmap_entries,
7225 server.hash_max_zipmap_value,
ffc6b7f8 7226 dictSize(server.pubsub_channels),
7227 listLength(server.pubsub_patterns),
7d98e08c 7228 server.vm_enabled != 0,
a0f643ea 7229 server.masterhost == NULL ? "master" : "slave"
ed9b544e 7230 );
a0f643ea 7231 if (server.masterhost) {
7232 info = sdscatprintf(info,
7233 "master_host:%s\r\n"
7234 "master_port:%d\r\n"
7235 "master_link_status:%s\r\n"
7236 "master_last_io_seconds_ago:%d\r\n"
7237 ,server.masterhost,
7238 server.masterport,
7239 (server.replstate == REDIS_REPL_CONNECTED) ?
7240 "up" : "down",
f72b934d 7241 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 7242 );
7243 }
7d98e08c 7244 if (server.vm_enabled) {
1064ef87 7245 lockThreadedIO();
7d98e08c 7246 info = sdscatprintf(info,
7247 "vm_conf_max_memory:%llu\r\n"
7248 "vm_conf_page_size:%llu\r\n"
7249 "vm_conf_pages:%llu\r\n"
7250 "vm_stats_used_pages:%llu\r\n"
7251 "vm_stats_swapped_objects:%llu\r\n"
7252 "vm_stats_swappin_count:%llu\r\n"
7253 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 7254 "vm_stats_io_newjobs_len:%lu\r\n"
7255 "vm_stats_io_processing_len:%lu\r\n"
7256 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 7257 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 7258 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 7259 ,(unsigned long long) server.vm_max_memory,
7260 (unsigned long long) server.vm_page_size,
7261 (unsigned long long) server.vm_pages,
7262 (unsigned long long) server.vm_stats_used_pages,
7263 (unsigned long long) server.vm_stats_swapped_objects,
7264 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 7265 (unsigned long long) server.vm_stats_swapouts,
7266 (unsigned long) listLength(server.io_newjobs),
7267 (unsigned long) listLength(server.io_processing),
7268 (unsigned long) listLength(server.io_processed),
d5d55fc3 7269 (unsigned long) server.io_active_threads,
7270 (unsigned long) server.vm_blocked_clients
7d98e08c 7271 );
1064ef87 7272 unlockThreadedIO();
7d98e08c 7273 }
c3cb078d 7274 for (j = 0; j < server.dbnum; j++) {
7275 long long keys, vkeys;
7276
7277 keys = dictSize(server.db[j].dict);
7278 vkeys = dictSize(server.db[j].expires);
7279 if (keys || vkeys) {
9d65a1bb 7280 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 7281 j, keys, vkeys);
7282 }
7283 }
1c85b79f 7284 return info;
7285}
7286
7287static void infoCommand(redisClient *c) {
7288 sds info = genRedisInfoString();
83c6a618 7289 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7290 (unsigned long)sdslen(info)));
ed9b544e 7291 addReplySds(c,info);
70003d28 7292 addReply(c,shared.crlf);
ed9b544e 7293}
7294
3305306f 7295static void monitorCommand(redisClient *c) {
7296 /* ignore MONITOR if aleady slave or in monitor mode */
7297 if (c->flags & REDIS_SLAVE) return;
7298
7299 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7300 c->slaveseldb = 0;
6b47e12e 7301 listAddNodeTail(server.monitors,c);
3305306f 7302 addReply(c,shared.ok);
7303}
7304
7305/* ================================= Expire ================================= */
7306static int removeExpire(redisDb *db, robj *key) {
7307 if (dictDelete(db->expires,key) == DICT_OK) {
7308 return 1;
7309 } else {
7310 return 0;
7311 }
7312}
7313
7314static int setExpire(redisDb *db, robj *key, time_t when) {
7315 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7316 return 0;
7317 } else {
7318 incrRefCount(key);
7319 return 1;
7320 }
7321}
7322
bb32ede5 7323/* Return the expire time of the specified key, or -1 if no expire
7324 * is associated with this key (i.e. the key is non volatile) */
7325static time_t getExpire(redisDb *db, robj *key) {
7326 dictEntry *de;
7327
7328 /* No expire? return ASAP */
7329 if (dictSize(db->expires) == 0 ||
7330 (de = dictFind(db->expires,key)) == NULL) return -1;
7331
7332 return (time_t) dictGetEntryVal(de);
7333}
7334
3305306f 7335static int expireIfNeeded(redisDb *db, robj *key) {
7336 time_t when;
7337 dictEntry *de;
7338
7339 /* No expire? return ASAP */
7340 if (dictSize(db->expires) == 0 ||
7341 (de = dictFind(db->expires,key)) == NULL) return 0;
7342
7343 /* Lookup the expire */
7344 when = (time_t) dictGetEntryVal(de);
7345 if (time(NULL) <= when) return 0;
7346
7347 /* Delete the key */
7348 dictDelete(db->expires,key);
2a6a2ed1 7349 server.stat_expiredkeys++;
3305306f 7350 return dictDelete(db->dict,key) == DICT_OK;
7351}
7352
7353static int deleteIfVolatile(redisDb *db, robj *key) {
7354 dictEntry *de;
7355
7356 /* No expire? return ASAP */
7357 if (dictSize(db->expires) == 0 ||
7358 (de = dictFind(db->expires,key)) == NULL) return 0;
7359
7360 /* Delete the key */
0c66a471 7361 server.dirty++;
2a6a2ed1 7362 server.stat_expiredkeys++;
3305306f 7363 dictDelete(db->expires,key);
7364 return dictDelete(db->dict,key) == DICT_OK;
7365}
7366
bbe025e0 7367static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 7368 dictEntry *de;
bbe025e0
AM
7369 time_t seconds;
7370
bd79a6bd 7371 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
bbe025e0
AM
7372
7373 seconds -= offset;
3305306f 7374
802e8373 7375 de = dictFind(c->db->dict,key);
3305306f 7376 if (de == NULL) {
7377 addReply(c,shared.czero);
7378 return;
7379 }
d4dd6556 7380 if (seconds <= 0) {
43e5ccdf 7381 if (deleteKey(c->db,key)) server.dirty++;
7382 addReply(c, shared.cone);
3305306f 7383 return;
7384 } else {
7385 time_t when = time(NULL)+seconds;
802e8373 7386 if (setExpire(c->db,key,when)) {
3305306f 7387 addReply(c,shared.cone);
77423026 7388 server.dirty++;
7389 } else {
3305306f 7390 addReply(c,shared.czero);
77423026 7391 }
3305306f 7392 return;
7393 }
7394}
7395
802e8373 7396static void expireCommand(redisClient *c) {
bbe025e0 7397 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 7398}
7399
7400static void expireatCommand(redisClient *c) {
bbe025e0 7401 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 7402}
7403
fd88489a 7404static void ttlCommand(redisClient *c) {
7405 time_t expire;
7406 int ttl = -1;
7407
7408 expire = getExpire(c->db,c->argv[1]);
7409 if (expire != -1) {
7410 ttl = (int) (expire-time(NULL));
7411 if (ttl < 0) ttl = -1;
7412 }
7413 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7414}
7415
6e469882 7416/* ================================ MULTI/EXEC ============================== */
7417
7418/* Client state initialization for MULTI/EXEC */
7419static void initClientMultiState(redisClient *c) {
7420 c->mstate.commands = NULL;
7421 c->mstate.count = 0;
7422}
7423
7424/* Release all the resources associated with MULTI/EXEC state */
7425static void freeClientMultiState(redisClient *c) {
7426 int j;
7427
7428 for (j = 0; j < c->mstate.count; j++) {
7429 int i;
7430 multiCmd *mc = c->mstate.commands+j;
7431
7432 for (i = 0; i < mc->argc; i++)
7433 decrRefCount(mc->argv[i]);
7434 zfree(mc->argv);
7435 }
7436 zfree(c->mstate.commands);
7437}
7438
7439/* Add a new command into the MULTI commands queue */
7440static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7441 multiCmd *mc;
7442 int j;
7443
7444 c->mstate.commands = zrealloc(c->mstate.commands,
7445 sizeof(multiCmd)*(c->mstate.count+1));
7446 mc = c->mstate.commands+c->mstate.count;
7447 mc->cmd = cmd;
7448 mc->argc = c->argc;
7449 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7450 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7451 for (j = 0; j < c->argc; j++)
7452 incrRefCount(mc->argv[j]);
7453 c->mstate.count++;
7454}
7455
7456static void multiCommand(redisClient *c) {
7457 c->flags |= REDIS_MULTI;
36c548f0 7458 addReply(c,shared.ok);
6e469882 7459}
7460
18b6cb76
DJ
7461static void discardCommand(redisClient *c) {
7462 if (!(c->flags & REDIS_MULTI)) {
7463 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7464 return;
7465 }
7466
7467 freeClientMultiState(c);
7468 initClientMultiState(c);
7469 c->flags &= (~REDIS_MULTI);
7470 addReply(c,shared.ok);
7471}
7472
66c8853f 7473/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7474 * implememntation for more information. */
7475static void execCommandReplicateMulti(redisClient *c) {
7476 struct redisCommand *cmd;
7477 robj *multistring = createStringObject("MULTI",5);
7478
7479 cmd = lookupCommand("multi");
7480 if (server.appendonly)
7481 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7482 if (listLength(server.slaves))
7483 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7484 decrRefCount(multistring);
7485}
7486
6e469882 7487static void execCommand(redisClient *c) {
7488 int j;
7489 robj **orig_argv;
7490 int orig_argc;
7491
7492 if (!(c->flags & REDIS_MULTI)) {
7493 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7494 return;
7495 }
7496
66c8853f 7497 /* Replicate a MULTI request now that we are sure the block is executed.
7498 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7499 * both the AOF and the replication link will have the same consistency
7500 * and atomicity guarantees. */
7501 execCommandReplicateMulti(c);
7502
7503 /* Exec all the queued commands */
6e469882 7504 orig_argv = c->argv;
7505 orig_argc = c->argc;
7506 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7507 for (j = 0; j < c->mstate.count; j++) {
7508 c->argc = c->mstate.commands[j].argc;
7509 c->argv = c->mstate.commands[j].argv;
7510 call(c,c->mstate.commands[j].cmd);
7511 }
7512 c->argv = orig_argv;
7513 c->argc = orig_argc;
7514 freeClientMultiState(c);
7515 initClientMultiState(c);
7516 c->flags &= (~REDIS_MULTI);
66c8853f 7517 /* Make sure the EXEC command is always replicated / AOF, since we
7518 * always send the MULTI command (we can't know beforehand if the
7519 * next operations will contain at least a modification to the DB). */
7520 server.dirty++;
6e469882 7521}
7522
4409877e 7523/* =========================== Blocking Operations ========================= */
7524
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
7553
7554/* Set a client in blocking mode for the specified key, with the specified
7555 * timeout */
b177fd30 7556static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 7557 dictEntry *de;
7558 list *l;
b177fd30 7559 int j;
4409877e 7560
b177fd30 7561 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7562 c->blockingkeysnum = numkeys;
4409877e 7563 c->blockingto = timeout;
b177fd30 7564 for (j = 0; j < numkeys; j++) {
7565 /* Add the key in the client structure, to map clients -> keys */
7566 c->blockingkeys[j] = keys[j];
7567 incrRefCount(keys[j]);
4409877e 7568
b177fd30 7569 /* And in the other "side", to map keys -> clients */
7570 de = dictFind(c->db->blockingkeys,keys[j]);
7571 if (de == NULL) {
7572 int retval;
7573
7574 /* For every key we take a list of clients blocked for it */
7575 l = listCreate();
7576 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7577 incrRefCount(keys[j]);
7578 assert(retval == DICT_OK);
7579 } else {
7580 l = dictGetEntryVal(de);
7581 }
7582 listAddNodeTail(l,c);
4409877e 7583 }
b177fd30 7584 /* Mark the client as a blocked client */
4409877e 7585 c->flags |= REDIS_BLOCKED;
d5d55fc3 7586 server.blpop_blocked_clients++;
4409877e 7587}
7588
7589/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 7590static void unblockClientWaitingData(redisClient *c) {
4409877e 7591 dictEntry *de;
7592 list *l;
b177fd30 7593 int j;
4409877e 7594
b177fd30 7595 assert(c->blockingkeys != NULL);
7596 /* The client may wait for multiple keys, so unblock it for every key. */
7597 for (j = 0; j < c->blockingkeysnum; j++) {
7598 /* Remove this client from the list of clients waiting for this key. */
7599 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7600 assert(de != NULL);
7601 l = dictGetEntryVal(de);
7602 listDelNode(l,listSearchKey(l,c));
7603 /* If the list is empty we need to remove it to avoid wasting memory */
7604 if (listLength(l) == 0)
7605 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7606 decrRefCount(c->blockingkeys[j]);
7607 }
7608 /* Cleanup the client structure */
7609 zfree(c->blockingkeys);
7610 c->blockingkeys = NULL;
4409877e 7611 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 7612 server.blpop_blocked_clients--;
5921aa36 7613 /* We want to process data if there is some command waiting
b0d8747d 7614 * in the input buffer. Note that this is safe even if
7615 * unblockClientWaitingData() gets called from freeClient() because
7616 * freeClient() will be smart enough to call this function
7617 * *after* c->querybuf was set to NULL. */
4409877e 7618 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7619}
7620
7621/* This should be called from any function PUSHing into lists.
7622 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7623 * 'ele' is the element pushed.
7624 *
7625 * If the function returns 0 there was no client waiting for a list push
7626 * against this key.
7627 *
7628 * If the function returns 1 there was a client waiting for a list push
7629 * against this key, the element was passed to this client thus it's not
7630 * needed to actually add it to the list and the caller should return asap. */
7631static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7632 struct dictEntry *de;
7633 redisClient *receiver;
7634 list *l;
7635 listNode *ln;
7636
7637 de = dictFind(c->db->blockingkeys,key);
7638 if (de == NULL) return 0;
7639 l = dictGetEntryVal(de);
7640 ln = listFirst(l);
7641 assert(ln != NULL);
7642 receiver = ln->value;
4409877e 7643
b177fd30 7644 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 7645 addReplyBulk(receiver,key);
7646 addReplyBulk(receiver,ele);
b0d8747d 7647 unblockClientWaitingData(receiver);
4409877e 7648 return 1;
7649}
7650
7651/* Blocking RPOP/LPOP */
7652static void blockingPopGenericCommand(redisClient *c, int where) {
7653 robj *o;
7654 time_t timeout;
b177fd30 7655 int j;
4409877e 7656
b177fd30 7657 for (j = 1; j < c->argc-1; j++) {
7658 o = lookupKeyWrite(c->db,c->argv[j]);
7659 if (o != NULL) {
7660 if (o->type != REDIS_LIST) {
7661 addReply(c,shared.wrongtypeerr);
4409877e 7662 return;
b177fd30 7663 } else {
7664 list *list = o->ptr;
7665 if (listLength(list) != 0) {
7666 /* If the list contains elements fall back to the usual
7667 * non-blocking POP operation */
7668 robj *argv[2], **orig_argv;
7669 int orig_argc;
e0a62c7f 7670
b177fd30 7671 /* We need to alter the command arguments before to call
7672 * popGenericCommand() as the command takes a single key. */
7673 orig_argv = c->argv;
7674 orig_argc = c->argc;
7675 argv[1] = c->argv[j];
7676 c->argv = argv;
7677 c->argc = 2;
7678
7679 /* Also the return value is different, we need to output
7680 * the multi bulk reply header and the key name. The
7681 * "real" command will add the last element (the value)
7682 * for us. If this souds like an hack to you it's just
7683 * because it is... */
7684 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 7685 addReplyBulk(c,argv[1]);
b177fd30 7686 popGenericCommand(c,where);
7687
7688 /* Fix the client structure with the original stuff */
7689 c->argv = orig_argv;
7690 c->argc = orig_argc;
7691 return;
7692 }
4409877e 7693 }
7694 }
7695 }
7696 /* If the list is empty or the key does not exists we must block */
b177fd30 7697 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 7698 if (timeout > 0) timeout += time(NULL);
b177fd30 7699 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 7700}
7701
7702static void blpopCommand(redisClient *c) {
7703 blockingPopGenericCommand(c,REDIS_HEAD);
7704}
7705
7706static void brpopCommand(redisClient *c) {
7707 blockingPopGenericCommand(c,REDIS_TAIL);
7708}
7709
ed9b544e 7710/* =============================== Replication ============================= */
7711
a4d1ba9a 7712static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7713 ssize_t nwritten, ret = size;
7714 time_t start = time(NULL);
7715
7716 timeout++;
7717 while(size) {
7718 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7719 nwritten = write(fd,ptr,size);
7720 if (nwritten == -1) return -1;
7721 ptr += nwritten;
7722 size -= nwritten;
7723 }
7724 if ((time(NULL)-start) > timeout) {
7725 errno = ETIMEDOUT;
7726 return -1;
7727 }
7728 }
7729 return ret;
7730}
7731
a4d1ba9a 7732static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7733 ssize_t nread, totread = 0;
7734 time_t start = time(NULL);
7735
7736 timeout++;
7737 while(size) {
7738 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7739 nread = read(fd,ptr,size);
7740 if (nread == -1) return -1;
7741 ptr += nread;
7742 size -= nread;
7743 totread += nread;
7744 }
7745 if ((time(NULL)-start) > timeout) {
7746 errno = ETIMEDOUT;
7747 return -1;
7748 }
7749 }
7750 return totread;
7751}
7752
/* Read a line from 'fd' one byte at a time using syncRead(), storing it
 * nul terminated into the 'size' bytes buffer pointed by 'ptr'.
 *
 * Reading stops at the first '\n', that is not stored. A '\r' right before
 * it is replaced with a nul byte, but it is still counted in the return
 * value. Reading also stops once the buffer is full ('size'-1 bytes stored
 * plus the terminator): the rest of the line is left unread.
 *
 * 'timeout' is passed to syncRead() for every single byte read.
 *
 * Returns the number of bytes stored before the terminator, or -1 if
 * syncRead() failed. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    /* Always hand back a terminated string, even if no byte fits */
    if (size > 0) *ptr = '\0';

    size--; /* Reserve room for the nul terminator */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* One byte less available: without this the loop never stopped
             * on long lines and wrote past the end of the buffer. */
            size--;
        }
    }
    return nread;
}
7773
7774static void syncCommand(redisClient *c) {
40d224a9 7775 /* ignore SYNC if aleady slave or in monitor mode */
7776 if (c->flags & REDIS_SLAVE) return;
7777
7778 /* SYNC can't be issued when the server has pending data to send to
7779 * the client about already issued commands. We need a fresh reply
7780 * buffer registering the differences between the BGSAVE and the current
7781 * dataset, so that we can copy to other slaves if needed. */
7782 if (listLength(c->reply) != 0) {
7783 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7784 return;
7785 }
7786
7787 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7788 /* Here we need to check if there is a background saving operation
7789 * in progress, or if it is required to start one */
9d65a1bb 7790 if (server.bgsavechildpid != -1) {
40d224a9 7791 /* Ok a background save is in progress. Let's check if it is a good
7792 * one for replication, i.e. if there is another slave that is
7793 * registering differences since the server forked to save */
7794 redisClient *slave;
7795 listNode *ln;
c7df85a4 7796 listIter li;
40d224a9 7797
c7df85a4 7798 listRewind(server.slaves,&li);
7799 while((ln = listNext(&li))) {
40d224a9 7800 slave = ln->value;
7801 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 7802 }
7803 if (ln) {
7804 /* Perfect, the server is already registering differences for
7805 * another slave. Set the right state, and copy the buffer. */
7806 listRelease(c->reply);
7807 c->reply = listDup(slave->reply);
40d224a9 7808 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7809 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7810 } else {
7811 /* No way, we need to wait for the next BGSAVE in order to
7812 * register differences */
7813 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7814 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7815 }
7816 } else {
7817 /* Ok we don't have a BGSAVE in progress, let's start one */
7818 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7819 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7820 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7821 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7822 return;
7823 }
7824 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7825 }
6208b3a7 7826 c->repldbfd = -1;
40d224a9 7827 c->flags |= REDIS_SLAVE;
7828 c->slaveseldb = 0;
6b47e12e 7829 listAddNodeTail(server.slaves,c);
40d224a9 7830 return;
7831}
7832
6208b3a7 7833static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7834 redisClient *slave = privdata;
7835 REDIS_NOTUSED(el);
7836 REDIS_NOTUSED(mask);
7837 char buf[REDIS_IOBUF_LEN];
7838 ssize_t nwritten, buflen;
7839
7840 if (slave->repldboff == 0) {
7841 /* Write the bulk write count before to transfer the DB. In theory here
7842 * we don't know how much room there is in the output buffer of the
7843 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7844 * operations) will never be smaller than the few bytes we need. */
7845 sds bulkcount;
7846
7847 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7848 slave->repldbsize);
7849 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7850 {
7851 sdsfree(bulkcount);
7852 freeClient(slave);
7853 return;
7854 }
7855 sdsfree(bulkcount);
7856 }
7857 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7858 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7859 if (buflen <= 0) {
7860 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7861 (buflen == 0) ? "premature EOF" : strerror(errno));
7862 freeClient(slave);
7863 return;
7864 }
7865 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 7866 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 7867 strerror(errno));
7868 freeClient(slave);
7869 return;
7870 }
7871 slave->repldboff += nwritten;
7872 if (slave->repldboff == slave->repldbsize) {
7873 close(slave->repldbfd);
7874 slave->repldbfd = -1;
7875 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7876 slave->replstate = REDIS_REPL_ONLINE;
7877 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 7878 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 7879 freeClient(slave);
7880 return;
7881 }
7882 addReplySds(slave,sdsempty());
7883 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7884 }
7885}
ed9b544e 7886
a3b21203 7887/* This function is called at the end of every backgrond saving.
7888 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7889 * otherwise REDIS_ERR is passed to the function.
7890 *
7891 * The goal of this function is to handle slaves waiting for a successful
7892 * background saving in order to perform non-blocking synchronization. */
7893static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 7894 listNode *ln;
7895 int startbgsave = 0;
c7df85a4 7896 listIter li;
ed9b544e 7897
c7df85a4 7898 listRewind(server.slaves,&li);
7899 while((ln = listNext(&li))) {
6208b3a7 7900 redisClient *slave = ln->value;
ed9b544e 7901
6208b3a7 7902 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7903 startbgsave = 1;
7904 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7905 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 7906 struct redis_stat buf;
e0a62c7f 7907
6208b3a7 7908 if (bgsaveerr != REDIS_OK) {
7909 freeClient(slave);
7910 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7911 continue;
7912 }
7913 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 7914 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 7915 freeClient(slave);
7916 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7917 continue;
7918 }
7919 slave->repldboff = 0;
7920 slave->repldbsize = buf.st_size;
7921 slave->replstate = REDIS_REPL_SEND_BULK;
7922 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 7923 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 7924 freeClient(slave);
7925 continue;
7926 }
7927 }
ed9b544e 7928 }
6208b3a7 7929 if (startbgsave) {
7930 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 7931 listIter li;
7932
7933 listRewind(server.slaves,&li);
6208b3a7 7934 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 7935 while((ln = listNext(&li))) {
6208b3a7 7936 redisClient *slave = ln->value;
ed9b544e 7937
6208b3a7 7938 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7939 freeClient(slave);
7940 }
7941 }
7942 }
ed9b544e 7943}
7944
7945static int syncWithMaster(void) {
d0ccebcf 7946 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 7947 long dumpsize;
ed9b544e 7948 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 7949 int dfd, maxtries = 5;
ed9b544e 7950
7951 if (fd == -1) {
7952 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7953 strerror(errno));
7954 return REDIS_ERR;
7955 }
d0ccebcf 7956
7957 /* AUTH with the master if required. */
7958 if(server.masterauth) {
7959 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7960 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7961 close(fd);
7962 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7963 strerror(errno));
7964 return REDIS_ERR;
7965 }
7966 /* Read the AUTH result. */
7967 if (syncReadLine(fd,buf,1024,3600) == -1) {
7968 close(fd);
7969 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7970 strerror(errno));
7971 return REDIS_ERR;
7972 }
7973 if (buf[0] != '+') {
7974 close(fd);
7975 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7976 return REDIS_ERR;
7977 }
7978 }
7979
ed9b544e 7980 /* Issue the SYNC command */
7981 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7982 close(fd);
7983 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7984 strerror(errno));
7985 return REDIS_ERR;
7986 }
7987 /* Read the bulk write count */
8c4d91fc 7988 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 7989 close(fd);
7990 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7991 strerror(errno));
7992 return REDIS_ERR;
7993 }
4aa701c1 7994 if (buf[0] != '$') {
7995 close(fd);
7996 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7997 return REDIS_ERR;
7998 }
18e61fa2 7999 dumpsize = strtol(buf+1,NULL,10);
8000 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 8001 /* Read the bulk write data on a temp file */
8c5abee8 8002 while(maxtries--) {
8003 snprintf(tmpfile,256,
8004 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8005 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8006 if (dfd != -1) break;
5de9ad7c 8007 sleep(1);
8c5abee8 8008 }
ed9b544e 8009 if (dfd == -1) {
8010 close(fd);
8011 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8012 return REDIS_ERR;
8013 }
8014 while(dumpsize) {
8015 int nread, nwritten;
8016
8017 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8018 if (nread == -1) {
8019 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8020 strerror(errno));
8021 close(fd);
8022 close(dfd);
8023 return REDIS_ERR;
8024 }
8025 nwritten = write(dfd,buf,nread);
8026 if (nwritten == -1) {
8027 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8028 close(fd);
8029 close(dfd);
8030 return REDIS_ERR;
8031 }
8032 dumpsize -= nread;
8033 }
8034 close(dfd);
8035 if (rename(tmpfile,server.dbfilename) == -1) {
8036 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8037 unlink(tmpfile);
8038 close(fd);
8039 return REDIS_ERR;
8040 }
8041 emptyDb();
f78fd11b 8042 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 8043 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8044 close(fd);
8045 return REDIS_ERR;
8046 }
8047 server.master = createClient(fd);
8048 server.master->flags |= REDIS_MASTER;
179b3952 8049 server.master->authenticated = 1;
ed9b544e 8050 server.replstate = REDIS_REPL_CONNECTED;
8051 return REDIS_OK;
8052}
8053
321b0e13 8054static void slaveofCommand(redisClient *c) {
8055 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8056 !strcasecmp(c->argv[2]->ptr,"one")) {
8057 if (server.masterhost) {
8058 sdsfree(server.masterhost);
8059 server.masterhost = NULL;
8060 if (server.master) freeClient(server.master);
8061 server.replstate = REDIS_REPL_NONE;
8062 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8063 }
8064 } else {
8065 sdsfree(server.masterhost);
8066 server.masterhost = sdsdup(c->argv[1]->ptr);
8067 server.masterport = atoi(c->argv[2]->ptr);
8068 if (server.master) freeClient(server.master);
8069 server.replstate = REDIS_REPL_CONNECT;
8070 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8071 server.masterhost, server.masterport);
8072 }
8073 addReply(c,shared.ok);
8074}
8075
3fd78bcd 8076/* ============================ Maxmemory directive ======================== */
8077
a5819310 8078/* Try to free one object form the pre-allocated objects free list.
8079 * This is useful under low mem conditions as by default we take 1 million
8080 * free objects allocated. On success REDIS_OK is returned, otherwise
8081 * REDIS_ERR. */
8082static int tryFreeOneObjectFromFreelist(void) {
f870935d 8083 robj *o;
8084
a5819310 8085 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8086 if (listLength(server.objfreelist)) {
8087 listNode *head = listFirst(server.objfreelist);
8088 o = listNodeValue(head);
8089 listDelNode(server.objfreelist,head);
8090 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8091 zfree(o);
8092 return REDIS_OK;
8093 } else {
8094 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8095 return REDIS_ERR;
8096 }
f870935d 8097}
8098
3fd78bcd 8099/* This function gets called when 'maxmemory' is set on the config file to limit
8100 * the max memory used by the server, and we are out of memory.
8101 * This function will try to, in order:
8102 *
8103 * - Free objects from the free list
8104 * - Try to remove keys with an EXPIRE set
8105 *
8106 * It is not possible to free enough memory to reach used-memory < maxmemory
8107 * the server will start refusing commands that will enlarge even more the
8108 * memory usage.
8109 */
8110static void freeMemoryIfNeeded(void) {
8111 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 8112 int j, k, freed = 0;
8113
8114 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8115 for (j = 0; j < server.dbnum; j++) {
8116 int minttl = -1;
8117 robj *minkey = NULL;
8118 struct dictEntry *de;
8119
8120 if (dictSize(server.db[j].expires)) {
8121 freed = 1;
8122 /* From a sample of three keys drop the one nearest to
8123 * the natural expire */
8124 for (k = 0; k < 3; k++) {
8125 time_t t;
8126
8127 de = dictGetRandomKey(server.db[j].expires);
8128 t = (time_t) dictGetEntryVal(de);
8129 if (minttl == -1 || t < minttl) {
8130 minkey = dictGetEntryKey(de);
8131 minttl = t;
3fd78bcd 8132 }
3fd78bcd 8133 }
a5819310 8134 deleteKey(server.db+j,minkey);
3fd78bcd 8135 }
3fd78bcd 8136 }
a5819310 8137 if (!freed) return; /* nothing to free... */
3fd78bcd 8138 }
8139}
8140
f80dff62 8141/* ============================== Append Only file ========================== */
8142
28ed1f33 8143/* Write the append only file buffer on disk.
8144 *
8145 * Since we are required to write the AOF before replying to the client,
8146 * and the only way the client socket can get a write is entering when the
8147 * the event loop, we accumulate all the AOF writes in a memory
8148 * buffer and write it on disk using this function just before entering
8149 * the event loop again. */
8150static void flushAppendOnlyFile(void) {
8151 time_t now;
8152 ssize_t nwritten;
8153
8154 if (sdslen(server.aofbuf) == 0) return;
8155
8156 /* We want to perform a single write. This should be guaranteed atomic
8157 * at least if the filesystem we are writing is a real physical one.
8158 * While this will save us against the server being killed I don't think
8159 * there is much to do about the whole server stopping for power problems
8160 * or alike */
8161 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8162 if (nwritten != (signed)sdslen(server.aofbuf)) {
8163 /* Ooops, we are in troubles. The best thing to do for now is
8164 * aborting instead of giving the illusion that everything is
8165 * working as expected. */
8166 if (nwritten == -1) {
8167 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8168 } else {
8169 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8170 }
8171 exit(1);
8172 }
8173 sdsfree(server.aofbuf);
8174 server.aofbuf = sdsempty();
8175
8176 /* Fsync if needed */
8177 now = time(NULL);
8178 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8179 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8180 now-server.lastfsync > 1))
8181 {
8182 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8183 * flushing metadata. */
8184 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8185 server.lastfsync = now;
8186 }
8187}
8188
9376e434
PN
8189static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8190 int j;
8191 buf = sdscatprintf(buf,"*%d\r\n",argc);
8192 for (j = 0; j < argc; j++) {
8193 robj *o = getDecodedObject(argv[j]);
8194 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8195 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8196 buf = sdscatlen(buf,"\r\n",2);
8197 decrRefCount(o);
8198 }
8199 return buf;
8200}
8201
8202static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8203 int argc = 3;
8204 long when;
8205 robj *argv[3];
8206
8207 /* Make sure we can use strtol */
8208 seconds = getDecodedObject(seconds);
8209 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8210 decrRefCount(seconds);
8211
8212 argv[0] = createStringObject("EXPIREAT",8);
8213 argv[1] = key;
8214 argv[2] = createObject(REDIS_STRING,
8215 sdscatprintf(sdsempty(),"%ld",when));
8216 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8217 decrRefCount(argv[0]);
8218 decrRefCount(argv[2]);
8219 return buf;
8220}
8221
f80dff62 8222static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8223 sds buf = sdsempty();
f80dff62 8224 robj *tmpargv[3];
8225
8226 /* The DB this command was targetting is not the same as the last command
8227 * we appendend. To issue a SELECT command is needed. */
8228 if (dictid != server.appendseldb) {
8229 char seldb[64];
8230
8231 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 8232 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 8233 (unsigned long)strlen(seldb),seldb);
f80dff62 8234 server.appendseldb = dictid;
8235 }
8236
f80dff62 8237 if (cmd->proc == expireCommand) {
9376e434
PN
8238 /* Translate EXPIRE into EXPIREAT */
8239 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8240 } else if (cmd->proc == setexCommand) {
8241 /* Translate SETEX to SET and EXPIREAT */
8242 tmpargv[0] = createStringObject("SET",3);
f80dff62 8243 tmpargv[1] = argv[1];
9376e434
PN
8244 tmpargv[2] = argv[3];
8245 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8246 decrRefCount(tmpargv[0]);
8247 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8248 } else {
8249 buf = catAppendOnlyGenericCommand(buf,argc,argv);
f80dff62 8250 }
8251
28ed1f33 8252 /* Append to the AOF buffer. This will be flushed on disk just before
8253 * of re-entering the event loop, so before the client will get a
8254 * positive reply about the operation performed. */
8255 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8256
85a83172 8257 /* If a background append only file rewriting is in progress we want to
8258 * accumulate the differences between the child DB and the current one
8259 * in a buffer, so that when the child process will do its work we
8260 * can append the differences to the new append only file. */
8261 if (server.bgrewritechildpid != -1)
8262 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8263
8264 sdsfree(buf);
f80dff62 8265}
8266
8267/* In Redis commands are always executed in the context of a client, so in
8268 * order to load the append only file we need to create a fake client. */
8269static struct redisClient *createFakeClient(void) {
8270 struct redisClient *c = zmalloc(sizeof(*c));
8271
8272 selectDb(c,0);
8273 c->fd = -1;
8274 c->querybuf = sdsempty();
8275 c->argc = 0;
8276 c->argv = NULL;
8277 c->flags = 0;
9387d17d 8278 /* We set the fake client as a slave waiting for the synchronization
8279 * so that Redis will not try to send replies to this client. */
8280 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 8281 c->reply = listCreate();
8282 listSetFreeMethod(c->reply,decrRefCount);
8283 listSetDupMethod(c->reply,dupClientReplyValue);
4132ad8d 8284 initClientMultiState(c);
f80dff62 8285 return c;
8286}
8287
8288static void freeFakeClient(struct redisClient *c) {
8289 sdsfree(c->querybuf);
8290 listRelease(c->reply);
4132ad8d 8291 freeClientMultiState(c);
f80dff62 8292 zfree(c);
8293}
8294
8295/* Replay the append log file. On error REDIS_OK is returned. On non fatal
8296 * error (the append only file is zero-length) REDIS_ERR is returned. On
8297 * fatal error an error message is logged and the program exists. */
8298int loadAppendOnlyFile(char *filename) {
8299 struct redisClient *fakeClient;
8300 FILE *fp = fopen(filename,"r");
8301 struct redis_stat sb;
b492cf00 8302 unsigned long long loadedkeys = 0;
4132ad8d 8303 int appendonly = server.appendonly;
f80dff62 8304
8305 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8306 return REDIS_ERR;
8307
8308 if (fp == NULL) {
8309 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8310 exit(1);
8311 }
8312
4132ad8d
PN
8313 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8314 * to the same file we're about to read. */
8315 server.appendonly = 0;
8316
f80dff62 8317 fakeClient = createFakeClient();
8318 while(1) {
8319 int argc, j;
8320 unsigned long len;
8321 robj **argv;
8322 char buf[128];
8323 sds argsds;
8324 struct redisCommand *cmd;
8325
8326 if (fgets(buf,sizeof(buf),fp) == NULL) {
8327 if (feof(fp))
8328 break;
8329 else
8330 goto readerr;
8331 }
8332 if (buf[0] != '*') goto fmterr;
8333 argc = atoi(buf+1);
8334 argv = zmalloc(sizeof(robj*)*argc);
8335 for (j = 0; j < argc; j++) {
8336 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8337 if (buf[0] != '$') goto fmterr;
8338 len = strtol(buf+1,NULL,10);
8339 argsds = sdsnewlen(NULL,len);
0f151ef1 8340 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 8341 argv[j] = createObject(REDIS_STRING,argsds);
8342 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8343 }
8344
8345 /* Command lookup */
8346 cmd = lookupCommand(argv[0]->ptr);
8347 if (!cmd) {
8348 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8349 exit(1);
8350 }
bdcb92f2 8351 /* Try object encoding */
f80dff62 8352 if (cmd->flags & REDIS_CMD_BULK)
05df7621 8353 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 8354 /* Run the command in the context of a fake client */
8355 fakeClient->argc = argc;
8356 fakeClient->argv = argv;
8357 cmd->proc(fakeClient);
8358 /* Discard the reply objects list from the fake client */
8359 while(listLength(fakeClient->reply))
8360 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8361 /* Clean up, ready for the next command */
8362 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8363 zfree(argv);
b492cf00 8364 /* Handle swapping while loading big datasets when VM is on */
8365 loadedkeys++;
8366 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8367 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 8368 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 8369 }
8370 }
f80dff62 8371 }
4132ad8d
PN
8372
8373 /* This point can only be reached when EOF is reached without errors.
8374 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8375 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8376
f80dff62 8377 fclose(fp);
8378 freeFakeClient(fakeClient);
4132ad8d 8379 server.appendonly = appendonly;
f80dff62 8380 return REDIS_OK;
8381
8382readerr:
8383 if (feof(fp)) {
8384 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8385 } else {
8386 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8387 }
8388 exit(1);
8389fmterr:
8390 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8391 exit(1);
8392}
8393
9d65a1bb 8394/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
9c8e3cee 8395static int fwriteBulkObject(FILE *fp, robj *obj) {
9d65a1bb 8396 char buf[128];
b9bc0eef 8397 int decrrc = 0;
8398
f2d9f50f 8399 /* Avoid the incr/decr ref count business if possible to help
8400 * copy-on-write (we are often in a child process when this function
8401 * is called).
8402 * Also makes sure that key objects don't get incrRefCount-ed when VM
8403 * is enabled */
8404 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 8405 obj = getDecodedObject(obj);
8406 decrrc = 1;
8407 }
9d65a1bb 8408 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8409 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 8410 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8411 goto err;
9d65a1bb 8412 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 8413 if (decrrc) decrRefCount(obj);
9d65a1bb 8414 return 1;
8415err:
b9bc0eef 8416 if (decrrc) decrRefCount(obj);
9d65a1bb 8417 return 0;
8418}
8419
/* Write binary-safe string into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes of payload (it is not required to be null
 * terminated). Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned long: print it with the matching conversion. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() returns 0 for a zero sized item, so skip empty payloads. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8431
/* Write a double value in bulk format $<count>\r\n<payload>\r\n
 *
 * The payload is the value printed with "%.17g". Returns 1 on success,
 * 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload is formatted together with its trailing CRLF, that is
     * not part of the announced count (hence the -2). */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    return (fwrite(payload,paylen,1,fp) == 0) ? 0 : 1;
}
8442
/* Write a long value in bulk format $<count>\r\n<payload>\r\n
 *
 * The payload is the decimal representation of 'l'. Returns 1 on success,
 * 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload is formatted together with its trailing CRLF, that is
     * not part of the announced count (hence the -2). */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    return (fwrite(payload,paylen,1,fp) == 0) ? 0 : 1;
}
8453
8454/* Write a sequence of commands able to fully rebuild the dataset into
8455 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8456static int rewriteAppendOnlyFile(char *filename) {
8457 dictIterator *di = NULL;
8458 dictEntry *de;
8459 FILE *fp;
8460 char tmpfile[256];
8461 int j;
8462 time_t now = time(NULL);
8463
8464 /* Note that we have to use a different temp name here compared to the
8465 * one used by rewriteAppendOnlyFileBackground() function. */
8466 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8467 fp = fopen(tmpfile,"w");
8468 if (!fp) {
8469 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8470 return REDIS_ERR;
8471 }
8472 for (j = 0; j < server.dbnum; j++) {
8473 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8474 redisDb *db = server.db+j;
8475 dict *d = db->dict;
8476 if (dictSize(d) == 0) continue;
8477 di = dictGetIterator(d);
8478 if (!di) {
8479 fclose(fp);
8480 return REDIS_ERR;
8481 }
8482
8483 /* SELECT the new DB */
8484 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 8485 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 8486
8487 /* Iterate this DB writing every entry */
8488 while((de = dictNext(di)) != NULL) {
e7546c63 8489 robj *key, *o;
8490 time_t expiretime;
8491 int swapped;
8492
8493 key = dictGetEntryKey(de);
b9bc0eef 8494 /* If the value for this key is swapped, load a preview in memory.
8495 * We use a "swapped" flag to remember if we need to free the
8496 * value object instead to just increment the ref count anyway
8497 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 8498 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8499 key->storage == REDIS_VM_SWAPPING) {
e7546c63 8500 o = dictGetEntryVal(de);
8501 swapped = 0;
8502 } else {
8503 o = vmPreviewObject(key);
e7546c63 8504 swapped = 1;
8505 }
8506 expiretime = getExpire(db,key);
9d65a1bb 8507
8508 /* Save the key and associated value */
9d65a1bb 8509 if (o->type == REDIS_STRING) {
8510 /* Emit a SET command */
8511 char cmd[]="*3\r\n$3\r\nSET\r\n";
8512 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8513 /* Key and value */
9c8e3cee 8514 if (fwriteBulkObject(fp,key) == 0) goto werr;
8515 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 8516 } else if (o->type == REDIS_LIST) {
8517 /* Emit the RPUSHes needed to rebuild the list */
8518 list *list = o->ptr;
8519 listNode *ln;
c7df85a4 8520 listIter li;
9d65a1bb 8521
c7df85a4 8522 listRewind(list,&li);
8523 while((ln = listNext(&li))) {
9d65a1bb 8524 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8525 robj *eleobj = listNodeValue(ln);
8526
8527 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8528 if (fwriteBulkObject(fp,key) == 0) goto werr;
8529 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8530 }
8531 } else if (o->type == REDIS_SET) {
8532 /* Emit the SADDs needed to rebuild the set */
8533 dict *set = o->ptr;
8534 dictIterator *di = dictGetIterator(set);
8535 dictEntry *de;
8536
8537 while((de = dictNext(di)) != NULL) {
8538 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8539 robj *eleobj = dictGetEntryKey(de);
8540
8541 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8542 if (fwriteBulkObject(fp,key) == 0) goto werr;
8543 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8544 }
8545 dictReleaseIterator(di);
8546 } else if (o->type == REDIS_ZSET) {
8547 /* Emit the ZADDs needed to rebuild the sorted set */
8548 zset *zs = o->ptr;
8549 dictIterator *di = dictGetIterator(zs->dict);
8550 dictEntry *de;
8551
8552 while((de = dictNext(di)) != NULL) {
8553 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8554 robj *eleobj = dictGetEntryKey(de);
8555 double *score = dictGetEntryVal(de);
8556
8557 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8558 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8559 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 8560 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8561 }
8562 dictReleaseIterator(di);
9c8e3cee 8563 } else if (o->type == REDIS_HASH) {
8564 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8565
8566 /* Emit the HSETs needed to rebuild the hash */
8567 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8568 unsigned char *p = zipmapRewind(o->ptr);
8569 unsigned char *field, *val;
8570 unsigned int flen, vlen;
8571
8572 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8573 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8574 if (fwriteBulkObject(fp,key) == 0) goto werr;
8575 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8576 return -1;
8577 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8578 return -1;
8579 }
8580 } else {
8581 dictIterator *di = dictGetIterator(o->ptr);
8582 dictEntry *de;
8583
8584 while((de = dictNext(di)) != NULL) {
8585 robj *field = dictGetEntryKey(de);
8586 robj *val = dictGetEntryVal(de);
8587
8588 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8589 if (fwriteBulkObject(fp,key) == 0) goto werr;
8590 if (fwriteBulkObject(fp,field) == -1) return -1;
8591 if (fwriteBulkObject(fp,val) == -1) return -1;
8592 }
8593 dictReleaseIterator(di);
8594 }
9d65a1bb 8595 } else {
f83c6cb5 8596 redisPanic("Unknown object type");
9d65a1bb 8597 }
8598 /* Save the expire time */
8599 if (expiretime != -1) {
e96e4fbf 8600 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 8601 /* If this key is already expired skip it */
8602 if (expiretime < now) continue;
8603 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8604 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8605 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8606 }
b9bc0eef 8607 if (swapped) decrRefCount(o);
9d65a1bb 8608 }
8609 dictReleaseIterator(di);
8610 }
8611
8612 /* Make sure data will not remain on the OS's output buffers */
8613 fflush(fp);
8614 fsync(fileno(fp));
8615 fclose(fp);
e0a62c7f 8616
9d65a1bb 8617 /* Use RENAME to make sure the DB file is changed atomically only
8618 * if the generate DB file is ok. */
8619 if (rename(tmpfile,filename) == -1) {
8620 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8621 unlink(tmpfile);
8622 return REDIS_ERR;
8623 }
8624 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8625 return REDIS_OK;
8626
8627werr:
8628 fclose(fp);
8629 unlink(tmpfile);
e96e4fbf 8630 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 8631 if (di) dictReleaseIterator(di);
8632 return REDIS_ERR;
8633}
8634
8635/* This is how rewriting of the append only file in background works:
8636 *
8637 * 1) The user calls BGREWRITEAOF
8638 * 2) Redis calls this function, that forks():
8639 * 2a) the child rewrite the append only file in a temp file.
8640 * 2b) the parent accumulates differences in server.bgrewritebuf.
8641 * 3) When the child finished '2a' exists.
8642 * 4) The parent will trap the exit code, if it's OK, will append the
8643 * data accumulated into server.bgrewritebuf into the temp file, and
8644 * finally will rename(2) the temp file in the actual file name.
8645 * The the new file is reopened as the new append only file. Profit!
8646 */
8647static int rewriteAppendOnlyFileBackground(void) {
8648 pid_t childpid;
8649
8650 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 8651 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 8652 if ((childpid = fork()) == 0) {
8653 /* Child */
8654 char tmpfile[256];
9d65a1bb 8655
054e426d 8656 if (server.vm_enabled) vmReopenSwapFile();
8657 close(server.fd);
9d65a1bb 8658 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8659 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 8660 _exit(0);
9d65a1bb 8661 } else {
478c2c6f 8662 _exit(1);
9d65a1bb 8663 }
8664 } else {
8665 /* Parent */
8666 if (childpid == -1) {
8667 redisLog(REDIS_WARNING,
8668 "Can't rewrite append only file in background: fork: %s",
8669 strerror(errno));
8670 return REDIS_ERR;
8671 }
8672 redisLog(REDIS_NOTICE,
8673 "Background append only file rewriting started by pid %d",childpid);
8674 server.bgrewritechildpid = childpid;
884d4b39 8675 updateDictResizePolicy();
85a83172 8676 /* We set appendseldb to -1 in order to force the next call to the
8677 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8678 * accumulated by the parent into server.bgrewritebuf will start
8679 * with a SELECT statement and it will be safe to merge. */
8680 server.appendseldb = -1;
9d65a1bb 8681 return REDIS_OK;
8682 }
8683 return REDIS_OK; /* unreached */
8684}
8685
8686static void bgrewriteaofCommand(redisClient *c) {
8687 if (server.bgrewritechildpid != -1) {
8688 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8689 return;
8690 }
8691 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 8692 char *status = "+Background append only file rewriting started\r\n";
8693 addReplySds(c,sdsnew(status));
9d65a1bb 8694 } else {
8695 addReply(c,shared.err);
8696 }
8697}
8698
/* Remove the temp file "temp-rewriteaof-bg-<pid>.aof" used by the
 * background rewrite child having process ID 'childpid'. Errors removing
 * the file are ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8705
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make it clearer which functionality belongs to the blocking VM and which
 * to the threaded (non-blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This is basically almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
8726
8727/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 8728
75680a3c 8729static void vmInit(void) {
8730 off_t totsize;
996cb5f7 8731 int pipefds[2];
bcaa7a4f 8732 size_t stacksize;
8b5bb414 8733 struct flock fl;
75680a3c 8734
4ad37480 8735 if (server.vm_max_threads != 0)
8736 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8737
054e426d 8738 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8b5bb414 8739 /* Try to open the old swap file, otherwise create it */
6fa987e3 8740 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8741 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8742 }
75680a3c 8743 if (server.vm_fp == NULL) {
6fa987e3 8744 redisLog(REDIS_WARNING,
8b5bb414 8745 "Can't open the swap file: %s. Exiting.",
6fa987e3 8746 strerror(errno));
75680a3c 8747 exit(1);
8748 }
8749 server.vm_fd = fileno(server.vm_fp);
8b5bb414 8750 /* Lock the swap file for writing, this is useful in order to avoid
8751 * another instance to use the same swap file for a config error. */
8752 fl.l_type = F_WRLCK;
8753 fl.l_whence = SEEK_SET;
8754 fl.l_start = fl.l_len = 0;
8755 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8756 redisLog(REDIS_WARNING,
8757 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8758 exit(1);
8759 }
8760 /* Initialize */
75680a3c 8761 server.vm_next_page = 0;
8762 server.vm_near_pages = 0;
7d98e08c 8763 server.vm_stats_used_pages = 0;
8764 server.vm_stats_swapped_objects = 0;
8765 server.vm_stats_swapouts = 0;
8766 server.vm_stats_swapins = 0;
75680a3c 8767 totsize = server.vm_pages*server.vm_page_size;
8768 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8769 if (ftruncate(server.vm_fd,totsize) == -1) {
8770 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8771 strerror(errno));
8772 exit(1);
8773 } else {
8774 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8775 }
7d30035d 8776 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 8777 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 8778 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 8779 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 8780
996cb5f7 8781 /* Initialize threaded I/O (used by Virtual Memory) */
8782 server.io_newjobs = listCreate();
8783 server.io_processing = listCreate();
8784 server.io_processed = listCreate();
d5d55fc3 8785 server.io_ready_clients = listCreate();
92f8e882 8786 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 8787 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8788 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 8789 server.io_active_threads = 0;
996cb5f7 8790 if (pipe(pipefds) == -1) {
8791 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8792 ,strerror(errno));
8793 exit(1);
8794 }
8795 server.io_ready_pipe_read = pipefds[0];
8796 server.io_ready_pipe_write = pipefds[1];
8797 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 8798 /* LZF requires a lot of stack */
8799 pthread_attr_init(&server.io_threads_attr);
8800 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8801 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8802 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 8803 /* Listen for events in the threaded I/O pipe */
8804 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8805 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8806 oom("creating file event");
75680a3c 8807}
8808
06224fec 8809/* Mark the page as used */
8810static void vmMarkPageUsed(off_t page) {
8811 off_t byte = page/8;
8812 int bit = page&7;
970e10bb 8813 redisAssert(vmFreePage(page) == 1);
06224fec 8814 server.vm_bitmap[byte] |= 1<<bit;
8815}
8816
8817/* Mark N contiguous pages as used, with 'page' being the first. */
8818static void vmMarkPagesUsed(off_t page, off_t count) {
8819 off_t j;
8820
8821 for (j = 0; j < count; j++)
7d30035d 8822 vmMarkPageUsed(page+j);
7d98e08c 8823 server.vm_stats_used_pages += count;
7c775e09 8824 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8825 (long long)count, (long long)page);
06224fec 8826}
8827
8828/* Mark the page as free */
8829static void vmMarkPageFree(off_t page) {
8830 off_t byte = page/8;
8831 int bit = page&7;
970e10bb 8832 redisAssert(vmFreePage(page) == 0);
06224fec 8833 server.vm_bitmap[byte] &= ~(1<<bit);
8834}
8835
8836/* Mark N contiguous pages as free, with 'page' being the first. */
8837static void vmMarkPagesFree(off_t page, off_t count) {
8838 off_t j;
8839
8840 for (j = 0; j < count; j++)
7d30035d 8841 vmMarkPageFree(page+j);
7d98e08c 8842 server.vm_stats_used_pages -= count;
7c775e09 8843 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8844 (long long)count, (long long)page);
06224fec 8845}
8846
8847/* Test if the page is free */
8848static int vmFreePage(off_t page) {
8849 off_t byte = page/8;
8850 int bit = page&7;
7d30035d 8851 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 8852}
8853
8854/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 8855 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 8856 * REDIS_ERR is returned.
06224fec 8857 *
8858 * This function uses a simple algorithm: we try to allocate
8859 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8860 * again from the start of the swap file searching for free spaces.
8861 *
8862 * If it looks pretty clear that there are no free pages near our offset
8863 * we try to find less populated places doing a forward jump of
8864 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8865 * without hurry, and then we jump again and so forth...
e0a62c7f 8866 *
06224fec 8867 * This function can be improved using a free list to avoid to guess
8868 * too much, since we could collect data about freed pages.
8869 *
8870 * note: I implemented this function just after watching an episode of
8871 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8872 */
c7df85a4 8873static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 8874 off_t base, offset = 0, since_jump = 0, numfree = 0;
8875
8876 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8877 server.vm_near_pages = 0;
8878 server.vm_next_page = 0;
8879 }
8880 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8881 base = server.vm_next_page;
8882
8883 while(offset < server.vm_pages) {
8884 off_t this = base+offset;
8885
8886 /* If we overflow, restart from page zero */
8887 if (this >= server.vm_pages) {
8888 this -= server.vm_pages;
8889 if (this == 0) {
8890 /* Just overflowed, what we found on tail is no longer
8891 * interesting, as it's no longer contiguous. */
8892 numfree = 0;
8893 }
8894 }
8895 if (vmFreePage(this)) {
8896 /* This is a free page */
8897 numfree++;
8898 /* Already got N free pages? Return to the caller, with success */
8899 if (numfree == n) {
7d30035d 8900 *first = this-(n-1);
8901 server.vm_next_page = this+1;
7c775e09 8902 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 8903 return REDIS_OK;
06224fec 8904 }
8905 } else {
8906 /* The current one is not a free page */
8907 numfree = 0;
8908 }
8909
8910 /* Fast-forward if the current page is not free and we already
8911 * searched enough near this place. */
8912 since_jump++;
8913 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8914 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8915 since_jump = 0;
8916 /* Note that even if we rewind after the jump, we are don't need
8917 * to make sure numfree is set to zero as we only jump *if* it
8918 * is set to zero. */
8919 } else {
8920 /* Otherwise just check the next page */
8921 offset++;
8922 }
8923 }
3a66edc7 8924 return REDIS_ERR;
8925}
8926
a5819310 8927/* Write the specified object at the specified page of the swap file */
8928static int vmWriteObjectOnSwap(robj *o, off_t page) {
8929 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8930 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8931 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8932 redisLog(REDIS_WARNING,
9ebed7cf 8933 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 8934 strerror(errno));
8935 return REDIS_ERR;
8936 }
8937 rdbSaveObject(server.vm_fp,o);
ba76a8f9 8938 fflush(server.vm_fp);
a5819310 8939 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8940 return REDIS_OK;
8941}
8942
3a66edc7 8943/* Swap the 'val' object relative to 'key' into disk. Store all the information
8944 * needed to later retrieve the object into the key object.
8945 * If we can't find enough contiguous empty pages to swap the object on disk
8946 * REDIS_ERR is returned. */
a69a0c9c 8947static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 8948 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 8949 off_t page;
8950
8951 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 8952 assert(key->refcount == 1);
3a66edc7 8953 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 8954 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 8955 key->vm.page = page;
8956 key->vm.usedpages = pages;
8957 key->storage = REDIS_VM_SWAPPED;
d894161b 8958 key->vtype = val->type;
3a66edc7 8959 decrRefCount(val); /* Deallocate the object from memory. */
8960 vmMarkPagesUsed(page,pages);
7d30035d 8961 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8962 (unsigned char*) key->ptr,
8963 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 8964 server.vm_stats_swapped_objects++;
8965 server.vm_stats_swapouts++;
3a66edc7 8966 return REDIS_OK;
8967}
8968
a5819310 8969static robj *vmReadObjectFromSwap(off_t page, int type) {
8970 robj *o;
3a66edc7 8971
a5819310 8972 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8973 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 8974 redisLog(REDIS_WARNING,
d5d55fc3 8975 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 8976 strerror(errno));
478c2c6f 8977 _exit(1);
3a66edc7 8978 }
a5819310 8979 o = rdbLoadObject(type,server.vm_fp);
8980 if (o == NULL) {
d5d55fc3 8981 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 8982 _exit(1);
3a66edc7 8983 }
a5819310 8984 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8985 return o;
8986}
8987
8988/* Load the value object relative to the 'key' object from swap to memory.
8989 * The newly allocated object is returned.
8990 *
8991 * If preview is true the unserialized object is returned to the caller but
8992 * no changes are made to the key object, nor the pages are marked as freed */
8993static robj *vmGenericLoadObject(robj *key, int preview) {
8994 robj *val;
8995
d5d55fc3 8996 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 8997 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 8998 if (!preview) {
8999 key->storage = REDIS_VM_MEMORY;
9000 key->vm.atime = server.unixtime;
9001 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9002 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
9003 (unsigned char*) key->ptr);
7d98e08c 9004 server.vm_stats_swapped_objects--;
38aba9a1 9005 } else {
9006 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
9007 (unsigned char*) key->ptr);
7e69548d 9008 }
7d98e08c 9009 server.vm_stats_swapins++;
3a66edc7 9010 return val;
06224fec 9011}
9012
7e69548d 9013/* Plain object loading, from swap to memory */
9014static robj *vmLoadObject(robj *key) {
996cb5f7 9015 /* If we are loading the object in background, stop it, we
9016 * need to load this object synchronously ASAP. */
9017 if (key->storage == REDIS_VM_LOADING)
9018 vmCancelThreadedIOJob(key);
7e69548d 9019 return vmGenericLoadObject(key,0);
9020}
9021
9022/* Just load the value on disk, without to modify the key.
9023 * This is useful when we want to perform some operation on the value
9024 * without to really bring it from swap to memory, like while saving the
9025 * dataset or rewriting the append only log. */
9026static robj *vmPreviewObject(robj *key) {
9027 return vmGenericLoadObject(key,1);
9028}
9029
4ef8de8a 9030/* How a good candidate is this object for swapping?
9031 * The better candidate it is, the greater the returned value.
9032 *
9033 * Currently we try to perform a fast estimation of the object size in
9034 * memory, and combine it with aging informations.
9035 *
9036 * Basically swappability = idle-time * log(estimated size)
9037 *
9038 * Bigger objects are preferred over smaller objects, but not
9039 * proportionally, this is why we use the logarithm. This algorithm is
9040 * just a first try and will probably be tuned later. */
9041static double computeObjectSwappability(robj *o) {
9042 time_t age = server.unixtime - o->vm.atime;
9043 long asize = 0;
9044 list *l;
9045 dict *d;
9046 struct dictEntry *de;
9047 int z;
9048
9049 if (age <= 0) return 0;
9050 switch(o->type) {
9051 case REDIS_STRING:
9052 if (o->encoding != REDIS_ENCODING_RAW) {
9053 asize = sizeof(*o);
9054 } else {
9055 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9056 }
9057 break;
9058 case REDIS_LIST:
9059 l = o->ptr;
9060 listNode *ln = listFirst(l);
9061
9062 asize = sizeof(list);
9063 if (ln) {
9064 robj *ele = ln->value;
9065 long elesize;
9066
9067 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9068 (sizeof(*o)+sdslen(ele->ptr)) :
9069 sizeof(*o);
9070 asize += (sizeof(listNode)+elesize)*listLength(l);
9071 }
9072 break;
9073 case REDIS_SET:
9074 case REDIS_ZSET:
9075 z = (o->type == REDIS_ZSET);
9076 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9077
9078 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9079 if (z) asize += sizeof(zset)-sizeof(dict);
9080 if (dictSize(d)) {
9081 long elesize;
9082 robj *ele;
9083
9084 de = dictGetRandomKey(d);
9085 ele = dictGetEntryKey(de);
9086 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9087 (sizeof(*o)+sdslen(ele->ptr)) :
9088 sizeof(*o);
9089 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9090 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9091 }
9092 break;
a97b9060 9093 case REDIS_HASH:
9094 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9095 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9096 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9097 unsigned int klen, vlen;
9098 unsigned char *key, *val;
9099
9100 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9101 klen = 0;
9102 vlen = 0;
9103 }
9104 asize = len*(klen+vlen+3);
9105 } else if (o->encoding == REDIS_ENCODING_HT) {
9106 d = o->ptr;
9107 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9108 if (dictSize(d)) {
9109 long elesize;
9110 robj *ele;
9111
9112 de = dictGetRandomKey(d);
9113 ele = dictGetEntryKey(de);
9114 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9115 (sizeof(*o)+sdslen(ele->ptr)) :
9116 sizeof(*o);
9117 ele = dictGetEntryVal(de);
9118 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9119 (sizeof(*o)+sdslen(ele->ptr)) :
9120 sizeof(*o);
9121 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9122 }
9123 }
9124 break;
4ef8de8a 9125 }
c8c72447 9126 return (double)age*log(1+asize);
4ef8de8a 9127}
9128
9129/* Try to swap an object that's a good candidate for swapping.
9130 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 9131 * to swap any object at all.
9132 *
9133 * If 'usethreaded' is true, Redis will try to swap the object in background
9134 * using I/O threads. */
9135static int vmSwapOneObject(int usethreads) {
4ef8de8a 9136 int j, i;
9137 struct dictEntry *best = NULL;
9138 double best_swappability = 0;
b9bc0eef 9139 redisDb *best_db = NULL;
4ef8de8a 9140 robj *key, *val;
9141
9142 for (j = 0; j < server.dbnum; j++) {
9143 redisDb *db = server.db+j;
b72f6a4b 9144 /* Why maxtries is set to 100?
9145 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9146 * are swappable objects */
b0d8747d 9147 int maxtries = 100;
4ef8de8a 9148
9149 if (dictSize(db->dict) == 0) continue;
9150 for (i = 0; i < 5; i++) {
9151 dictEntry *de;
9152 double swappability;
9153
e3cadb8a 9154 if (maxtries) maxtries--;
4ef8de8a 9155 de = dictGetRandomKey(db->dict);
9156 key = dictGetEntryKey(de);
9157 val = dictGetEntryVal(de);
1064ef87 9158 /* Only swap objects that are currently in memory.
9159 *
9160 * Also don't swap shared objects if threaded VM is on, as we
9161 * try to ensure that the main thread does not touch the
9162 * object while the I/O thread is using it, but we can't
9163 * control other keys without adding additional mutex. */
9164 if (key->storage != REDIS_VM_MEMORY ||
9165 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 9166 if (maxtries) i--; /* don't count this try */
9167 continue;
9168 }
4ef8de8a 9169 swappability = computeObjectSwappability(val);
9170 if (!best || swappability > best_swappability) {
9171 best = de;
9172 best_swappability = swappability;
b9bc0eef 9173 best_db = db;
4ef8de8a 9174 }
9175 }
9176 }
7c775e09 9177 if (best == NULL) return REDIS_ERR;
4ef8de8a 9178 key = dictGetEntryKey(best);
9179 val = dictGetEntryVal(best);
9180
e3cadb8a 9181 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 9182 key->ptr, best_swappability);
9183
9184 /* Unshare the key if needed */
9185 if (key->refcount > 1) {
9186 robj *newkey = dupStringObject(key);
9187 decrRefCount(key);
9188 key = dictGetEntryKey(best) = newkey;
9189 }
9190 /* Swap it */
a69a0c9c 9191 if (usethreads) {
b9bc0eef 9192 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 9193 return REDIS_OK;
9194 } else {
a69a0c9c 9195 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9196 dictGetEntryVal(best) = NULL;
9197 return REDIS_OK;
9198 } else {
9199 return REDIS_ERR;
9200 }
4ef8de8a 9201 }
9202}
9203
/* Swap out one object synchronously. Returns what vmSwapOneObject()
 * returns when called with threads disabled: REDIS_OK if an object was
 * swapped, REDIS_ERR otherwise. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9207
/* Queue one object for swapping in the I/O threads. Returns what
 * vmSwapOneObject() returns when called with threads enabled: REDIS_OK if a
 * swap job was queued, REDIS_ERR if no candidate was found. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9211
7e69548d 9212/* Return true if it's safe to swap out objects in a given moment.
9213 * Basically we don't want to swap objects out while there is a BGSAVE
9214 * or a BGAEOREWRITE running in backgroud. */
9215static int vmCanSwapOut(void) {
9216 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9217}
9218
1b03836c 9219/* Delete a key if swapped. Returns 1 if the key was found, was swapped
9220 * and was deleted. Otherwise 0 is returned. */
9221static int deleteIfSwapped(redisDb *db, robj *key) {
9222 dictEntry *de;
9223 robj *foundkey;
9224
9225 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9226 foundkey = dictGetEntryKey(de);
9227 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9228 deleteKey(db,key);
9229 return 1;
9230}
9231
996cb5f7 9232/* =================== Virtual Memory - Threaded I/O ======================= */
9233
b9bc0eef 9234static void freeIOJob(iojob *j) {
d5d55fc3 9235 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9236 j->type == REDIS_IOJOB_DO_SWAP ||
9237 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 9238 decrRefCount(j->val);
78ebe4c8 9239 /* We don't decrRefCount the j->key field as we did't incremented
9240 * the count creating IO Jobs. This is because the key field here is
9241 * just used as an indentifier and if a key is removed the Job should
9242 * never be touched again. */
b9bc0eef 9243 zfree(j);
9244}
9245
996cb5f7 9246/* Every time a thread finished a Job, it writes a byte into the write side
9247 * of an unix pipe in order to "awake" the main thread, and this function
9248 * is called. */
9249static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9250 int mask)
9251{
9252 char buf[1];
b0d8747d 9253 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 9254 REDIS_NOTUSED(el);
9255 REDIS_NOTUSED(mask);
9256 REDIS_NOTUSED(privdata);
9257
9258 /* For every byte we read in the read side of the pipe, there is one
9259 * I/O job completed to process. */
9260 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 9261 iojob *j;
9262 listNode *ln;
9263 robj *key;
9264 struct dictEntry *de;
9265
996cb5f7 9266 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 9267
9268 /* Get the processed element (the oldest one) */
9269 lockThreadedIO();
1064ef87 9270 assert(listLength(server.io_processed) != 0);
f6c0bba8 9271 if (toprocess == -1) {
9272 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9273 if (toprocess <= 0) toprocess = 1;
9274 }
b9bc0eef 9275 ln = listFirst(server.io_processed);
9276 j = ln->value;
9277 listDelNode(server.io_processed,ln);
9278 unlockThreadedIO();
9279 /* If this job is marked as canceled, just ignore it */
9280 if (j->canceled) {
9281 freeIOJob(j);
9282 continue;
9283 }
9284 /* Post process it in the main thread, as there are things we
9285 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 9286 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 9287 de = dictFind(j->db->dict,j->key);
9288 assert(de != NULL);
9289 key = dictGetEntryKey(de);
9290 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9291 redisDb *db;
9292
b9bc0eef 9293 /* Key loaded, bring it at home */
9294 key->storage = REDIS_VM_MEMORY;
9295 key->vm.atime = server.unixtime;
9296 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9297 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9298 (unsigned char*) key->ptr);
9299 server.vm_stats_swapped_objects--;
9300 server.vm_stats_swapins++;
d5d55fc3 9301 dictGetEntryVal(de) = j->val;
9302 incrRefCount(j->val);
9303 db = j->db;
b9bc0eef 9304 freeIOJob(j);
d5d55fc3 9305 /* Handle clients waiting for this key to be loaded. */
9306 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 9307 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9308 /* Now we know the amount of pages required to swap this object.
9309 * Let's find some space for it, and queue this task again
9310 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 9311 if (!vmCanSwapOut() ||
9312 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9313 {
9314 /* Ooops... no space or we can't swap as there is
9315 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 9316 freeIOJob(j);
054e426d 9317 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 9318 } else {
c7df85a4 9319 /* Note that we need to mark this pages as used now,
9320 * if the job will be canceled, we'll mark them as freed
9321 * again. */
9322 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 9323 j->type = REDIS_IOJOB_DO_SWAP;
9324 lockThreadedIO();
9325 queueIOJob(j);
9326 unlockThreadedIO();
9327 }
9328 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9329 robj *val;
9330
9331 /* Key swapped. We can finally free some memory. */
6c96ba7d 9332 if (key->storage != REDIS_VM_SWAPPING) {
9333 printf("key->storage: %d\n",key->storage);
9334 printf("key->name: %s\n",(char*)key->ptr);
9335 printf("key->refcount: %d\n",key->refcount);
9336 printf("val: %p\n",(void*)j->val);
9337 printf("val->type: %d\n",j->val->type);
9338 printf("val->ptr: %s\n",(char*)j->val->ptr);
9339 }
9340 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 9341 val = dictGetEntryVal(de);
9342 key->vm.page = j->page;
9343 key->vm.usedpages = j->pages;
9344 key->storage = REDIS_VM_SWAPPED;
9345 key->vtype = j->val->type;
9346 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 9347 dictGetEntryVal(de) = NULL;
b9bc0eef 9348 redisLog(REDIS_DEBUG,
9349 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9350 (unsigned char*) key->ptr,
9351 (unsigned long long) j->page, (unsigned long long) j->pages);
9352 server.vm_stats_swapped_objects++;
9353 server.vm_stats_swapouts++;
9354 freeIOJob(j);
f11b8647 9355 /* Put a few more swap requests in queue if we are still
9356 * out of memory */
b0d8747d 9357 if (trytoswap && vmCanSwapOut() &&
9358 zmalloc_used_memory() > server.vm_max_memory)
9359 {
f11b8647 9360 int more = 1;
9361 while(more) {
9362 lockThreadedIO();
9363 more = listLength(server.io_newjobs) <
9364 (unsigned) server.vm_max_threads;
9365 unlockThreadedIO();
9366 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 9367 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9368 trytoswap = 0;
9369 break;
9370 }
f11b8647 9371 }
9372 }
b9bc0eef 9373 }
c953f24b 9374 processed++;
f6c0bba8 9375 if (processed == toprocess) return;
996cb5f7 9376 }
9377 if (retval < 0 && errno != EAGAIN) {
9378 redisLog(REDIS_WARNING,
9379 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9380 strerror(errno));
9381 }
9382}
9383
9384static void lockThreadedIO(void) {
9385 pthread_mutex_lock(&server.io_mutex);
9386}
9387
9388static void unlockThreadedIO(void) {
9389 pthread_mutex_unlock(&server.io_mutex);
9390}
9391
9392/* Remove the specified object from the threaded I/O queue if still not
9393 * processed, otherwise make sure to flag it as canceled. */
9394static void vmCancelThreadedIOJob(robj *o) {
9395 list *lists[3] = {
6c96ba7d 9396 server.io_newjobs, /* 0 */
9397 server.io_processing, /* 1 */
9398 server.io_processed /* 2 */
996cb5f7 9399 };
9400 int i;
9401
9402 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 9403again:
996cb5f7 9404 lockThreadedIO();
9405 /* Search for a matching key in one of the queues */
9406 for (i = 0; i < 3; i++) {
9407 listNode *ln;
c7df85a4 9408 listIter li;
996cb5f7 9409
c7df85a4 9410 listRewind(lists[i],&li);
9411 while ((ln = listNext(&li)) != NULL) {
996cb5f7 9412 iojob *job = ln->value;
9413
6c96ba7d 9414 if (job->canceled) continue; /* Skip this, already canceled. */
78ebe4c8 9415 if (job->key == o) {
970e10bb 9416 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9417 (void*)job, (char*)o->ptr, job->type, i);
427a2153 9418 /* Mark the pages as free since the swap didn't happened
9419 * or happened but is now discarded. */
970e10bb 9420 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 9421 vmMarkPagesFree(job->page,job->pages);
9422 /* Cancel the job. It depends on the list the job is
9423 * living in. */
996cb5f7 9424 switch(i) {
9425 case 0: /* io_newjobs */
6c96ba7d 9426 /* If the job was yet not processed the best thing to do
996cb5f7 9427 * is to remove it from the queue at all */
6c96ba7d 9428 freeIOJob(job);
996cb5f7 9429 listDelNode(lists[i],ln);
9430 break;
9431 case 1: /* io_processing */
d5d55fc3 9432 /* Oh Shi- the thread is messing with the Job:
9433 *
9434 * Probably it's accessing the object if this is a
9435 * PREPARE_SWAP or DO_SWAP job.
9436 * If it's a LOAD job it may be reading from disk and
9437 * if we don't wait for the job to terminate before to
9438 * cancel it, maybe in a few microseconds data can be
9439 * corrupted in this pages. So the short story is:
9440 *
9441 * Better to wait for the job to move into the
9442 * next queue (processed)... */
9443
9444 /* We try again and again until the job is completed. */
9445 unlockThreadedIO();
9446 /* But let's wait some time for the I/O thread
9447 * to finish with this job. After all this condition
9448 * should be very rare. */
9449 usleep(1);
9450 goto again;
996cb5f7 9451 case 2: /* io_processed */
2e111efe 9452 /* The job was already processed, that's easy...
9453 * just mark it as canceled so that we'll ignore it
9454 * when processing completed jobs. */
996cb5f7 9455 job->canceled = 1;
9456 break;
9457 }
c7df85a4 9458 /* Finally we have to adjust the storage type of the object
9459 * in order to "UNDO" the operaiton. */
996cb5f7 9460 if (o->storage == REDIS_VM_LOADING)
9461 o->storage = REDIS_VM_SWAPPED;
9462 else if (o->storage == REDIS_VM_SWAPPING)
9463 o->storage = REDIS_VM_MEMORY;
9464 unlockThreadedIO();
9465 return;
9466 }
9467 }
9468 }
9469 unlockThreadedIO();
9470 assert(1 != 1); /* We should never reach this */
9471}
9472
b9bc0eef 9473static void *IOThreadEntryPoint(void *arg) {
9474 iojob *j;
9475 listNode *ln;
9476 REDIS_NOTUSED(arg);
9477
9478 pthread_detach(pthread_self());
9479 while(1) {
9480 /* Get a new job to process */
9481 lockThreadedIO();
9482 if (listLength(server.io_newjobs) == 0) {
9483 /* No new jobs in queue, exit. */
9ebed7cf 9484 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9485 (long) pthread_self());
b9bc0eef 9486 server.io_active_threads--;
9487 unlockThreadedIO();
9488 return NULL;
9489 }
9490 ln = listFirst(server.io_newjobs);
9491 j = ln->value;
9492 listDelNode(server.io_newjobs,ln);
9493 /* Add the job in the processing queue */
9494 j->thread = pthread_self();
9495 listAddNodeTail(server.io_processing,j);
9496 ln = listLast(server.io_processing); /* We use ln later to remove it */
9497 unlockThreadedIO();
9ebed7cf 9498 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9499 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 9500
9501 /* Process the Job */
9502 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9503 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 9504 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9505 FILE *fp = fopen("/dev/null","w+");
9506 j->pages = rdbSavedObjectPages(j->val,fp);
9507 fclose(fp);
9508 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 9509 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9510 j->canceled = 1;
b9bc0eef 9511 }
9512
9513 /* Done: insert the job into the processed queue */
9ebed7cf 9514 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9515 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 9516 lockThreadedIO();
9517 listDelNode(server.io_processing,ln);
9518 listAddNodeTail(server.io_processed,j);
9519 unlockThreadedIO();
e0a62c7f 9520
b9bc0eef 9521 /* Signal the main thread there is new stuff to process */
9522 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9523 }
9524 return NULL; /* never reached */
9525}
9526
9527static void spawnIOThread(void) {
9528 pthread_t thread;
478c2c6f 9529 sigset_t mask, omask;
a97b9060 9530 int err;
b9bc0eef 9531
478c2c6f 9532 sigemptyset(&mask);
9533 sigaddset(&mask,SIGCHLD);
9534 sigaddset(&mask,SIGHUP);
9535 sigaddset(&mask,SIGPIPE);
9536 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 9537 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9538 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9539 strerror(err));
9540 usleep(1000000);
9541 }
478c2c6f 9542 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 9543 server.io_active_threads++;
9544}
9545
4ee9488d 9546/* We need to wait for the last thread to exit before we are able to
9547 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 9548static void waitEmptyIOJobsQueue(void) {
4ee9488d 9549 while(1) {
76b7233a 9550 int io_processed_len;
9551
4ee9488d 9552 lockThreadedIO();
054e426d 9553 if (listLength(server.io_newjobs) == 0 &&
9554 listLength(server.io_processing) == 0 &&
9555 server.io_active_threads == 0)
9556 {
4ee9488d 9557 unlockThreadedIO();
9558 return;
9559 }
76b7233a 9560 /* While waiting for empty jobs queue condition we post-process some
9561 * finshed job, as I/O threads may be hanging trying to write against
9562 * the io_ready_pipe_write FD but there are so much pending jobs that
9563 * it's blocking. */
9564 io_processed_len = listLength(server.io_processed);
4ee9488d 9565 unlockThreadedIO();
76b7233a 9566 if (io_processed_len) {
9567 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9568 usleep(1000); /* 1 millisecond */
9569 } else {
9570 usleep(10000); /* 10 milliseconds */
9571 }
4ee9488d 9572 }
9573}
9574
054e426d 9575static void vmReopenSwapFile(void) {
478c2c6f 9576 /* Note: we don't close the old one as we are in the child process
9577 * and don't want to mess at all with the original file object. */
054e426d 9578 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9579 if (server.vm_fp == NULL) {
9580 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9581 server.vm_swap_file);
478c2c6f 9582 _exit(1);
054e426d 9583 }
9584 server.vm_fd = fileno(server.vm_fp);
9585}
9586
b9bc0eef 9587/* This function must be called while with threaded IO locked */
9588static void queueIOJob(iojob *j) {
6c96ba7d 9589 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9590 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 9591 listAddNodeTail(server.io_newjobs,j);
9592 if (server.io_active_threads < server.vm_max_threads)
9593 spawnIOThread();
9594}
9595
9596static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9597 iojob *j;
e0a62c7f 9598
b9bc0eef 9599 assert(key->storage == REDIS_VM_MEMORY);
9600 assert(key->refcount == 1);
9601
9602 j = zmalloc(sizeof(*j));
9603 j->type = REDIS_IOJOB_PREPARE_SWAP;
9604 j->db = db;
78ebe4c8 9605 j->key = key;
b9bc0eef 9606 j->val = val;
9607 incrRefCount(val);
9608 j->canceled = 0;
9609 j->thread = (pthread_t) -1;
f11b8647 9610 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 9611
9612 lockThreadedIO();
9613 queueIOJob(j);
9614 unlockThreadedIO();
9615 return REDIS_OK;
9616}
9617
b0d8747d 9618/* ============ Virtual Memory - Blocking clients on missing keys =========== */
9619
d5d55fc3 9620/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9621 * If there is not already a job loading the key, it is craeted.
9622 * The key is added to the io_keys list in the client structure, and also
9623 * in the hash table mapping swapped keys to waiting clients, that is,
9624 * server.io_waited_keys. */
9625static int waitForSwappedKey(redisClient *c, robj *key) {
9626 struct dictEntry *de;
9627 robj *o;
9628 list *l;
9629
9630 /* If the key does not exist or is already in RAM we don't need to
9631 * block the client at all. */
9632 de = dictFind(c->db->dict,key);
9633 if (de == NULL) return 0;
9634 o = dictGetEntryKey(de);
9635 if (o->storage == REDIS_VM_MEMORY) {
9636 return 0;
9637 } else if (o->storage == REDIS_VM_SWAPPING) {
9638 /* We were swapping the key, undo it! */
9639 vmCancelThreadedIOJob(o);
9640 return 0;
9641 }
e0a62c7f 9642
d5d55fc3 9643 /* OK: the key is either swapped, or being loaded just now. */
9644
9645 /* Add the key to the list of keys this client is waiting for.
9646 * This maps clients to keys they are waiting for. */
9647 listAddNodeTail(c->io_keys,key);
9648 incrRefCount(key);
9649
9650 /* Add the client to the swapped keys => clients waiting map. */
9651 de = dictFind(c->db->io_keys,key);
9652 if (de == NULL) {
9653 int retval;
9654
9655 /* For every key we take a list of clients blocked for it */
9656 l = listCreate();
9657 retval = dictAdd(c->db->io_keys,key,l);
9658 incrRefCount(key);
9659 assert(retval == DICT_OK);
9660 } else {
9661 l = dictGetEntryVal(de);
9662 }
9663 listAddNodeTail(l,c);
9664
9665 /* Are we already loading the key from disk? If not create a job */
9666 if (o->storage == REDIS_VM_SWAPPED) {
9667 iojob *j;
9668
9669 o->storage = REDIS_VM_LOADING;
9670 j = zmalloc(sizeof(*j));
9671 j->type = REDIS_IOJOB_LOAD;
9672 j->db = c->db;
78ebe4c8 9673 j->key = o;
d5d55fc3 9674 j->key->vtype = o->vtype;
9675 j->page = o->vm.page;
9676 j->val = NULL;
9677 j->canceled = 0;
9678 j->thread = (pthread_t) -1;
9679 lockThreadedIO();
9680 queueIOJob(j);
9681 unlockThreadedIO();
9682 }
9683 return 1;
9684}
9685
6f078746
PN
9686/* Preload keys for any command with first, last and step values for
9687 * the command keys prototype, as defined in the command table. */
9688static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9689 int j, last;
9690 if (cmd->vm_firstkey == 0) return;
9691 last = cmd->vm_lastkey;
9692 if (last < 0) last = argc+last;
9693 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9694 redisAssert(j < argc);
9695 waitForSwappedKey(c,argv[j]);
9696 }
9697}
9698
5d373da9 9699/* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
739ba0d2
PN
9700 * Note that the number of keys to preload is user-defined, so we need to
9701 * apply a sanity check against argc. */
ca1788b5 9702static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
76583ea4 9703 int i, num;
ca1788b5 9704 REDIS_NOTUSED(cmd);
ca1788b5
PN
9705
9706 num = atoi(argv[2]->ptr);
739ba0d2 9707 if (num > (argc-3)) return;
76583ea4 9708 for (i = 0; i < num; i++) {
ca1788b5 9709 waitForSwappedKey(c,argv[3+i]);
76583ea4
PN
9710 }
9711}
9712
3805e04f
PN
9713/* Preload keys needed to execute the entire MULTI/EXEC block.
9714 *
9715 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9716 * and will block the client when any command requires a swapped out value. */
9717static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9718 int i, margc;
9719 struct redisCommand *mcmd;
9720 robj **margv;
9721 REDIS_NOTUSED(cmd);
9722 REDIS_NOTUSED(argc);
9723 REDIS_NOTUSED(argv);
9724
9725 if (!(c->flags & REDIS_MULTI)) return;
9726 for (i = 0; i < c->mstate.count; i++) {
9727 mcmd = c->mstate.commands[i].cmd;
9728 margc = c->mstate.commands[i].argc;
9729 margv = c->mstate.commands[i].argv;
9730
9731 if (mcmd->vm_preload_proc != NULL) {
9732 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9733 } else {
9734 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9735 }
76583ea4
PN
9736 }
9737}
9738
b0d8747d 9739/* Is this client attempting to run a command against swapped keys?
d5d55fc3 9740 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 9741 *
d5d55fc3 9742 * The important idea about this function is that it can fail! If keys will
9743 * still be swapped when the client is resumed, this key lookups will
9744 * just block loading keys from disk. In practical terms this should only
9745 * happen with SORT BY command or if there is a bug in this function.
9746 *
9747 * Return 1 if the client is marked as blocked, 0 if the client can
9748 * continue as the keys it is going to access appear to be in memory. */
0a6f3f0f 9749static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
76583ea4 9750 if (cmd->vm_preload_proc != NULL) {
ca1788b5 9751 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
76583ea4 9752 } else {
6f078746 9753 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
76583ea4
PN
9754 }
9755
d5d55fc3 9756 /* If the client was blocked for at least one key, mark it as blocked. */
9757 if (listLength(c->io_keys)) {
9758 c->flags |= REDIS_IO_WAIT;
9759 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9760 server.vm_blocked_clients++;
9761 return 1;
9762 } else {
9763 return 0;
9764 }
9765}
9766
9767/* Remove the 'key' from the list of blocked keys for a given client.
9768 *
9769 * The function returns 1 when there are no longer blocking keys after
9770 * the current one was removed (and the client can be unblocked). */
9771static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9772 list *l;
9773 listNode *ln;
9774 listIter li;
9775 struct dictEntry *de;
9776
9777 /* Remove the key from the list of keys this client is waiting for. */
9778 listRewind(c->io_keys,&li);
9779 while ((ln = listNext(&li)) != NULL) {
bf028098 9780 if (equalStringObjects(ln->value,key)) {
d5d55fc3 9781 listDelNode(c->io_keys,ln);
9782 break;
9783 }
9784 }
9785 assert(ln != NULL);
9786
9787 /* Remove the client form the key => waiting clients map. */
9788 de = dictFind(c->db->io_keys,key);
9789 assert(de != NULL);
9790 l = dictGetEntryVal(de);
9791 ln = listSearchKey(l,c);
9792 assert(ln != NULL);
9793 listDelNode(l,ln);
9794 if (listLength(l) == 0)
9795 dictDelete(c->db->io_keys,key);
9796
9797 return listLength(c->io_keys) == 0;
9798}
9799
9800static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9801 struct dictEntry *de;
9802 list *l;
9803 listNode *ln;
9804 int len;
9805
9806 de = dictFind(db->io_keys,key);
9807 if (!de) return;
9808
9809 l = dictGetEntryVal(de);
9810 len = listLength(l);
9811 /* Note: we can't use something like while(listLength(l)) as the list
9812 * can be freed by the calling function when we remove the last element. */
9813 while (len--) {
9814 ln = listFirst(l);
9815 redisClient *c = ln->value;
9816
9817 if (dontWaitForSwappedKey(c,key)) {
9818 /* Put the client in the list of clients ready to go as we
9819 * loaded all the keys about it. */
9820 listAddNodeTail(server.io_ready_clients,c);
9821 }
9822 }
b0d8747d 9823}
b0d8747d 9824
500ece7c 9825/* =========================== Remote Configuration ========================= */
9826
9827static void configSetCommand(redisClient *c) {
9828 robj *o = getDecodedObject(c->argv[3]);
9829 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9830 zfree(server.dbfilename);
9831 server.dbfilename = zstrdup(o->ptr);
9832 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9833 zfree(server.requirepass);
9834 server.requirepass = zstrdup(o->ptr);
9835 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9836 zfree(server.masterauth);
9837 server.masterauth = zstrdup(o->ptr);
9838 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9839 server.maxmemory = strtoll(o->ptr, NULL, 10);
1b677732 9840 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
9841 if (!strcasecmp(o->ptr,"no")) {
9842 server.appendfsync = APPENDFSYNC_NO;
9843 } else if (!strcasecmp(o->ptr,"everysec")) {
9844 server.appendfsync = APPENDFSYNC_EVERYSEC;
9845 } else if (!strcasecmp(o->ptr,"always")) {
9846 server.appendfsync = APPENDFSYNC_ALWAYS;
9847 } else {
9848 goto badfmt;
9849 }
a34e0a25 9850 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
9851 int vlen, j;
9852 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
9853
9854 /* Perform sanity check before setting the new config:
9855 * - Even number of args
9856 * - Seconds >= 1, changes >= 0 */
9857 if (vlen & 1) {
9858 sdsfreesplitres(v,vlen);
9859 goto badfmt;
9860 }
9861 for (j = 0; j < vlen; j++) {
9862 char *eptr;
9863 long val;
9864
9865 val = strtoll(v[j], &eptr, 10);
9866 if (eptr[0] != '\0' ||
9867 ((j & 1) == 0 && val < 1) ||
9868 ((j & 1) == 1 && val < 0)) {
9869 sdsfreesplitres(v,vlen);
9870 goto badfmt;
9871 }
9872 }
9873 /* Finally set the new config */
9874 resetServerSaveParams();
9875 for (j = 0; j < vlen; j += 2) {
9876 time_t seconds;
9877 int changes;
9878
9879 seconds = strtoll(v[j],NULL,10);
9880 changes = strtoll(v[j+1],NULL,10);
9881 appendServerSaveParams(seconds, changes);
9882 }
9883 sdsfreesplitres(v,vlen);
500ece7c 9884 } else {
9885 addReplySds(c,sdscatprintf(sdsempty(),
9886 "-ERR not supported CONFIG parameter %s\r\n",
9887 (char*)c->argv[2]->ptr));
9888 decrRefCount(o);
9889 return;
9890 }
9891 decrRefCount(o);
9892 addReply(c,shared.ok);
a34e0a25 9893 return;
9894
9895badfmt: /* Bad format errors */
9896 addReplySds(c,sdscatprintf(sdsempty(),
9897 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
9898 (char*)o->ptr,
9899 (char*)c->argv[2]->ptr));
9900 decrRefCount(o);
500ece7c 9901}
9902
9903static void configGetCommand(redisClient *c) {
9904 robj *o = getDecodedObject(c->argv[2]);
9905 robj *lenobj = createObject(REDIS_STRING,NULL);
9906 char *pattern = o->ptr;
9907 int matches = 0;
9908
9909 addReply(c,lenobj);
9910 decrRefCount(lenobj);
9911
9912 if (stringmatch(pattern,"dbfilename",0)) {
9913 addReplyBulkCString(c,"dbfilename");
9914 addReplyBulkCString(c,server.dbfilename);
9915 matches++;
9916 }
9917 if (stringmatch(pattern,"requirepass",0)) {
9918 addReplyBulkCString(c,"requirepass");
9919 addReplyBulkCString(c,server.requirepass);
9920 matches++;
9921 }
9922 if (stringmatch(pattern,"masterauth",0)) {
9923 addReplyBulkCString(c,"masterauth");
9924 addReplyBulkCString(c,server.masterauth);
9925 matches++;
9926 }
9927 if (stringmatch(pattern,"maxmemory",0)) {
9928 char buf[128];
9929
9930 snprintf(buf,128,"%llu\n",server.maxmemory);
9931 addReplyBulkCString(c,"maxmemory");
9932 addReplyBulkCString(c,buf);
9933 matches++;
9934 }
1b677732 9935 if (stringmatch(pattern,"appendfsync",0)) {
9936 char *policy;
9937
9938 switch(server.appendfsync) {
9939 case APPENDFSYNC_NO: policy = "no"; break;
9940 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
9941 case APPENDFSYNC_ALWAYS: policy = "always"; break;
9942 default: policy = "unknown"; break; /* too harmless to panic */
9943 }
9944 addReplyBulkCString(c,"appendfsync");
9945 addReplyBulkCString(c,policy);
9946 matches++;
9947 }
a34e0a25 9948 if (stringmatch(pattern,"save",0)) {
9949 sds buf = sdsempty();
9950 int j;
9951
9952 for (j = 0; j < server.saveparamslen; j++) {
9953 buf = sdscatprintf(buf,"%ld %d",
9954 server.saveparams[j].seconds,
9955 server.saveparams[j].changes);
9956 if (j != server.saveparamslen-1)
9957 buf = sdscatlen(buf," ",1);
9958 }
9959 addReplyBulkCString(c,"save");
9960 addReplyBulkCString(c,buf);
9961 sdsfree(buf);
9962 matches++;
9963 }
500ece7c 9964 decrRefCount(o);
9965 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9966}
9967
9968static void configCommand(redisClient *c) {
9969 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9970 if (c->argc != 4) goto badarity;
9971 configSetCommand(c);
9972 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9973 if (c->argc != 3) goto badarity;
9974 configGetCommand(c);
9975 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9976 if (c->argc != 2) goto badarity;
9977 server.stat_numcommands = 0;
9978 server.stat_numconnections = 0;
9979 server.stat_expiredkeys = 0;
9980 server.stat_starttime = time(NULL);
9981 addReply(c,shared.ok);
9982 } else {
9983 addReplySds(c,sdscatprintf(sdsempty(),
9984 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9985 }
9986 return;
9987
9988badarity:
9989 addReplySds(c,sdscatprintf(sdsempty(),
9990 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9991 (char*) c->argv[1]->ptr));
9992}
9993
befec3cd 9994/* =========================== Pubsub implementation ======================== */
9995
ffc6b7f8 9996static void freePubsubPattern(void *p) {
9997 pubsubPattern *pat = p;
9998
9999 decrRefCount(pat->pattern);
10000 zfree(pat);
10001}
10002
10003static int listMatchPubsubPattern(void *a, void *b) {
10004 pubsubPattern *pa = a, *pb = b;
10005
10006 return (pa->client == pb->client) &&
bf028098 10007 (equalStringObjects(pa->pattern,pb->pattern));
ffc6b7f8 10008}
10009
10010/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10011 * 0 if the client was already subscribed to that channel. */
10012static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 10013 struct dictEntry *de;
10014 list *clients = NULL;
10015 int retval = 0;
10016
ffc6b7f8 10017 /* Add the channel to the client -> channels hash table */
10018 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 10019 retval = 1;
ffc6b7f8 10020 incrRefCount(channel);
10021 /* Add the client to the channel -> list of clients hash table */
10022 de = dictFind(server.pubsub_channels,channel);
befec3cd 10023 if (de == NULL) {
10024 clients = listCreate();
ffc6b7f8 10025 dictAdd(server.pubsub_channels,channel,clients);
10026 incrRefCount(channel);
befec3cd 10027 } else {
10028 clients = dictGetEntryVal(de);
10029 }
10030 listAddNodeTail(clients,c);
10031 }
10032 /* Notify the client */
10033 addReply(c,shared.mbulk3);
10034 addReply(c,shared.subscribebulk);
ffc6b7f8 10035 addReplyBulk(c,channel);
482b672d 10036 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 10037 return retval;
10038}
10039
ffc6b7f8 10040/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10041 * 0 if the client was not subscribed to the specified channel. */
10042static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 10043 struct dictEntry *de;
10044 list *clients;
10045 listNode *ln;
10046 int retval = 0;
10047
ffc6b7f8 10048 /* Remove the channel from the client -> channels hash table */
10049 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 10050 we have in the hash tables. Protect it... */
ffc6b7f8 10051 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 10052 retval = 1;
ffc6b7f8 10053 /* Remove the client from the channel -> clients list hash table */
10054 de = dictFind(server.pubsub_channels,channel);
befec3cd 10055 assert(de != NULL);
10056 clients = dictGetEntryVal(de);
10057 ln = listSearchKey(clients,c);
10058 assert(ln != NULL);
10059 listDelNode(clients,ln);
ff767a75 10060 if (listLength(clients) == 0) {
10061 /* Free the list and associated hash entry at all if this was
10062 * the latest client, so that it will be possible to abuse
ffc6b7f8 10063 * Redis PUBSUB creating millions of channels. */
10064 dictDelete(server.pubsub_channels,channel);
ff767a75 10065 }
befec3cd 10066 }
10067 /* Notify the client */
10068 if (notify) {
10069 addReply(c,shared.mbulk3);
10070 addReply(c,shared.unsubscribebulk);
ffc6b7f8 10071 addReplyBulk(c,channel);
482b672d 10072 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10073 listLength(c->pubsub_patterns));
10074
10075 }
10076 decrRefCount(channel); /* it is finally safe to release it */
10077 return retval;
10078}
10079
10080/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10081static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10082 int retval = 0;
10083
10084 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10085 retval = 1;
10086 pubsubPattern *pat;
10087 listAddNodeTail(c->pubsub_patterns,pattern);
10088 incrRefCount(pattern);
10089 pat = zmalloc(sizeof(*pat));
10090 pat->pattern = getDecodedObject(pattern);
10091 pat->client = c;
10092 listAddNodeTail(server.pubsub_patterns,pat);
10093 }
10094 /* Notify the client */
10095 addReply(c,shared.mbulk3);
10096 addReply(c,shared.psubscribebulk);
10097 addReplyBulk(c,pattern);
482b672d 10098 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
ffc6b7f8 10099 return retval;
10100}
10101
10102/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10103 * 0 if the client was not subscribed to the specified channel. */
10104static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10105 listNode *ln;
10106 pubsubPattern pat;
10107 int retval = 0;
10108
10109 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10110 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10111 retval = 1;
10112 listDelNode(c->pubsub_patterns,ln);
10113 pat.client = c;
10114 pat.pattern = pattern;
10115 ln = listSearchKey(server.pubsub_patterns,&pat);
10116 listDelNode(server.pubsub_patterns,ln);
10117 }
10118 /* Notify the client */
10119 if (notify) {
10120 addReply(c,shared.mbulk3);
10121 addReply(c,shared.punsubscribebulk);
10122 addReplyBulk(c,pattern);
482b672d 10123 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10124 listLength(c->pubsub_patterns));
befec3cd 10125 }
ffc6b7f8 10126 decrRefCount(pattern);
befec3cd 10127 return retval;
10128}
10129
ffc6b7f8 10130/* Unsubscribe from all the channels. Return the number of channels the
10131 * client was subscribed from. */
10132static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10133 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 10134 dictEntry *de;
10135 int count = 0;
10136
10137 while((de = dictNext(di)) != NULL) {
ffc6b7f8 10138 robj *channel = dictGetEntryKey(de);
befec3cd 10139
ffc6b7f8 10140 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 10141 }
10142 dictReleaseIterator(di);
10143 return count;
10144}
10145
ffc6b7f8 10146/* Unsubscribe from all the patterns. Return the number of patterns the
10147 * client was subscribed from. */
10148static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10149 listNode *ln;
10150 listIter li;
10151 int count = 0;
10152
10153 listRewind(c->pubsub_patterns,&li);
10154 while ((ln = listNext(&li)) != NULL) {
10155 robj *pattern = ln->value;
10156
10157 count += pubsubUnsubscribePattern(c,pattern,notify);
10158 }
10159 return count;
10160}
10161
befec3cd 10162/* Publish a message */
ffc6b7f8 10163static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 10164 int receivers = 0;
10165 struct dictEntry *de;
ffc6b7f8 10166 listNode *ln;
10167 listIter li;
befec3cd 10168
ffc6b7f8 10169 /* Send to clients listening for that channel */
10170 de = dictFind(server.pubsub_channels,channel);
befec3cd 10171 if (de) {
10172 list *list = dictGetEntryVal(de);
10173 listNode *ln;
10174 listIter li;
10175
10176 listRewind(list,&li);
10177 while ((ln = listNext(&li)) != NULL) {
10178 redisClient *c = ln->value;
10179
10180 addReply(c,shared.mbulk3);
10181 addReply(c,shared.messagebulk);
ffc6b7f8 10182 addReplyBulk(c,channel);
befec3cd 10183 addReplyBulk(c,message);
10184 receivers++;
10185 }
10186 }
ffc6b7f8 10187 /* Send to clients listening to matching channels */
10188 if (listLength(server.pubsub_patterns)) {
10189 listRewind(server.pubsub_patterns,&li);
10190 channel = getDecodedObject(channel);
10191 while ((ln = listNext(&li)) != NULL) {
10192 pubsubPattern *pat = ln->value;
10193
10194 if (stringmatchlen((char*)pat->pattern->ptr,
10195 sdslen(pat->pattern->ptr),
10196 (char*)channel->ptr,
10197 sdslen(channel->ptr),0)) {
c8d0ea0e 10198 addReply(pat->client,shared.mbulk4);
10199 addReply(pat->client,shared.pmessagebulk);
10200 addReplyBulk(pat->client,pat->pattern);
ffc6b7f8 10201 addReplyBulk(pat->client,channel);
10202 addReplyBulk(pat->client,message);
10203 receivers++;
10204 }
10205 }
10206 decrRefCount(channel);
10207 }
befec3cd 10208 return receivers;
10209}
10210
10211static void subscribeCommand(redisClient *c) {
10212 int j;
10213
10214 for (j = 1; j < c->argc; j++)
ffc6b7f8 10215 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 10216}
10217
10218static void unsubscribeCommand(redisClient *c) {
10219 if (c->argc == 1) {
ffc6b7f8 10220 pubsubUnsubscribeAllChannels(c,1);
10221 return;
10222 } else {
10223 int j;
10224
10225 for (j = 1; j < c->argc; j++)
10226 pubsubUnsubscribeChannel(c,c->argv[j],1);
10227 }
10228}
10229
10230static void psubscribeCommand(redisClient *c) {
10231 int j;
10232
10233 for (j = 1; j < c->argc; j++)
10234 pubsubSubscribePattern(c,c->argv[j]);
10235}
10236
10237static void punsubscribeCommand(redisClient *c) {
10238 if (c->argc == 1) {
10239 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 10240 return;
10241 } else {
10242 int j;
10243
10244 for (j = 1; j < c->argc; j++)
ffc6b7f8 10245 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 10246 }
10247}
10248
10249static void publishCommand(redisClient *c) {
10250 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
482b672d 10251 addReplyLongLong(c,receivers);
befec3cd 10252}
10253
7f957c92 10254/* ================================= Debugging ============================== */
10255
ba798261 10256/* Compute the sha1 of string at 's' with 'len' bytes long.
10257 * The SHA1 is then xored againt the string pointed by digest.
10258 * Since xor is commutative, this operation is used in order to
10259 * "add" digests relative to unordered elements.
10260 *
10261 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10262static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10263 SHA1_CTX ctx;
10264 unsigned char hash[20], *s = ptr;
10265 int j;
10266
10267 SHA1Init(&ctx);
10268 SHA1Update(&ctx,s,len);
10269 SHA1Final(hash,&ctx);
10270
10271 for (j = 0; j < 20; j++)
10272 digest[j] ^= hash[j];
10273}
10274
10275static void xorObjectDigest(unsigned char *digest, robj *o) {
10276 o = getDecodedObject(o);
10277 xorDigest(digest,o->ptr,sdslen(o->ptr));
10278 decrRefCount(o);
10279}
10280
10281/* This function instead of just computing the SHA1 and xoring it
10282 * against diget, also perform the digest of "digest" itself and
10283 * replace the old value with the new one.
10284 *
10285 * So the final digest will be:
10286 *
10287 * digest = SHA1(digest xor SHA1(data))
10288 *
10289 * This function is used every time we want to preserve the order so
10290 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10291 *
10292 * Also note that mixdigest("foo") followed by mixdigest("bar")
10293 * will lead to a different digest compared to "fo", "obar".
10294 */
10295static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10296 SHA1_CTX ctx;
10297 char *s = ptr;
10298
10299 xorDigest(digest,s,len);
10300 SHA1Init(&ctx);
10301 SHA1Update(&ctx,digest,20);
10302 SHA1Final(digest,&ctx);
10303}
10304
10305static void mixObjectDigest(unsigned char *digest, robj *o) {
10306 o = getDecodedObject(o);
10307 mixDigest(digest,o->ptr,sdslen(o->ptr));
10308 decrRefCount(o);
10309}
10310
10311/* Compute the dataset digest. Since keys, sets elements, hashes elements
10312 * are not ordered, we use a trick: every aggregate digest is the xor
10313 * of the digests of their elements. This way the order will not change
10314 * the result. For list instead we use a feedback entering the output digest
10315 * as input in order to ensure that a different ordered list will result in
10316 * a different digest. */
10317static void computeDatasetDigest(unsigned char *final) {
10318 unsigned char digest[20];
10319 char buf[128];
10320 dictIterator *di = NULL;
10321 dictEntry *de;
10322 int j;
10323 uint32_t aux;
10324
10325 memset(final,0,20); /* Start with a clean result */
10326
10327 for (j = 0; j < server.dbnum; j++) {
10328 redisDb *db = server.db+j;
10329
10330 if (dictSize(db->dict) == 0) continue;
10331 di = dictGetIterator(db->dict);
10332
10333 /* hash the DB id, so the same dataset moved in a different
10334 * DB will lead to a different digest */
10335 aux = htonl(j);
10336 mixDigest(final,&aux,sizeof(aux));
10337
10338 /* Iterate this DB writing every entry */
10339 while((de = dictNext(di)) != NULL) {
10340 robj *key, *o;
10341 time_t expiretime;
10342
10343 memset(digest,0,20); /* This key-val digest */
10344 key = dictGetEntryKey(de);
10345 mixObjectDigest(digest,key);
10346 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
10347 key->storage == REDIS_VM_SWAPPING) {
10348 o = dictGetEntryVal(de);
10349 incrRefCount(o);
10350 } else {
10351 o = vmPreviewObject(key);
10352 }
10353 aux = htonl(o->type);
10354 mixDigest(digest,&aux,sizeof(aux));
10355 expiretime = getExpire(db,key);
10356
10357 /* Save the key and associated value */
10358 if (o->type == REDIS_STRING) {
10359 mixObjectDigest(digest,o);
10360 } else if (o->type == REDIS_LIST) {
10361 list *list = o->ptr;
10362 listNode *ln;
10363 listIter li;
10364
10365 listRewind(list,&li);
10366 while((ln = listNext(&li))) {
10367 robj *eleobj = listNodeValue(ln);
10368
10369 mixObjectDigest(digest,eleobj);
10370 }
10371 } else if (o->type == REDIS_SET) {
10372 dict *set = o->ptr;
10373 dictIterator *di = dictGetIterator(set);
10374 dictEntry *de;
10375
10376 while((de = dictNext(di)) != NULL) {
10377 robj *eleobj = dictGetEntryKey(de);
10378
10379 xorObjectDigest(digest,eleobj);
10380 }
10381 dictReleaseIterator(di);
10382 } else if (o->type == REDIS_ZSET) {
10383 zset *zs = o->ptr;
10384 dictIterator *di = dictGetIterator(zs->dict);
10385 dictEntry *de;
10386
10387 while((de = dictNext(di)) != NULL) {
10388 robj *eleobj = dictGetEntryKey(de);
10389 double *score = dictGetEntryVal(de);
10390 unsigned char eledigest[20];
10391
10392 snprintf(buf,sizeof(buf),"%.17g",*score);
10393 memset(eledigest,0,20);
10394 mixObjectDigest(eledigest,eleobj);
10395 mixDigest(eledigest,buf,strlen(buf));
10396 xorDigest(digest,eledigest,20);
10397 }
10398 dictReleaseIterator(di);
10399 } else if (o->type == REDIS_HASH) {
10400 hashIterator *hi;
10401 robj *obj;
10402
10403 hi = hashInitIterator(o);
10404 while (hashNext(hi) != REDIS_ERR) {
10405 unsigned char eledigest[20];
10406
10407 memset(eledigest,0,20);
10408 obj = hashCurrent(hi,REDIS_HASH_KEY);
10409 mixObjectDigest(eledigest,obj);
10410 decrRefCount(obj);
10411 obj = hashCurrent(hi,REDIS_HASH_VALUE);
10412 mixObjectDigest(eledigest,obj);
10413 decrRefCount(obj);
10414 xorDigest(digest,eledigest,20);
10415 }
10416 hashReleaseIterator(hi);
10417 } else {
10418 redisPanic("Unknown object type");
10419 }
10420 decrRefCount(o);
10421 /* If the key has an expire, add it to the mix */
10422 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
10423 /* We can finally xor the key-val digest to the final digest */
10424 xorDigest(final,digest,20);
10425 }
10426 dictReleaseIterator(di);
10427 }
10428}
10429
7f957c92 10430static void debugCommand(redisClient *c) {
10431 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10432 *((char*)-1) = 'x';
210e29f7 10433 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10434 if (rdbSave(server.dbfilename) != REDIS_OK) {
10435 addReply(c,shared.err);
10436 return;
10437 }
10438 emptyDb();
10439 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10440 addReply(c,shared.err);
10441 return;
10442 }
10443 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10444 addReply(c,shared.ok);
71c2b467 10445 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10446 emptyDb();
10447 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10448 addReply(c,shared.err);
10449 return;
10450 }
10451 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10452 addReply(c,shared.ok);
333298da 10453 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10454 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10455 robj *key, *val;
10456
10457 if (!de) {
10458 addReply(c,shared.nokeyerr);
10459 return;
10460 }
10461 key = dictGetEntryKey(de);
10462 val = dictGetEntryVal(de);
59146ef3 10463 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10464 key->storage == REDIS_VM_SWAPPING)) {
07efaf74 10465 char *strenc;
10466 char buf[128];
10467
10468 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10469 strenc = strencoding[val->encoding];
10470 } else {
10471 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10472 strenc = buf;
10473 }
ace06542 10474 addReplySds(c,sdscatprintf(sdsempty(),
10475 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 10476 "encoding:%s serializedlength:%lld\r\n",
682ac724 10477 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 10478 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 10479 } else {
10480 addReplySds(c,sdscatprintf(sdsempty(),
10481 "+Key at:%p refcount:%d, value swapped at: page %llu "
10482 "using %llu pages\r\n",
10483 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10484 (unsigned long long) key->vm.usedpages));
10485 }
78ebe4c8 10486 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10487 lookupKeyRead(c->db,c->argv[2]);
10488 addReply(c,shared.ok);
7d30035d 10489 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10490 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10491 robj *key, *val;
10492
10493 if (!server.vm_enabled) {
10494 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10495 return;
10496 }
10497 if (!de) {
10498 addReply(c,shared.nokeyerr);
10499 return;
10500 }
10501 key = dictGetEntryKey(de);
10502 val = dictGetEntryVal(de);
4ef8de8a 10503 /* If the key is shared we want to create a copy */
10504 if (key->refcount > 1) {
10505 robj *newkey = dupStringObject(key);
10506 decrRefCount(key);
10507 key = dictGetEntryKey(de) = newkey;
10508 }
10509 /* Swap it */
7d30035d 10510 if (key->storage != REDIS_VM_MEMORY) {
10511 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 10512 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 10513 dictGetEntryVal(de) = NULL;
10514 addReply(c,shared.ok);
10515 } else {
10516 addReply(c,shared.err);
10517 }
59305dc7 10518 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10519 long keys, j;
10520 robj *key, *val;
10521 char buf[128];
10522
10523 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10524 return;
10525 for (j = 0; j < keys; j++) {
10526 snprintf(buf,sizeof(buf),"key:%lu",j);
10527 key = createStringObject(buf,strlen(buf));
10528 if (lookupKeyRead(c->db,key) != NULL) {
10529 decrRefCount(key);
10530 continue;
10531 }
10532 snprintf(buf,sizeof(buf),"value:%lu",j);
10533 val = createStringObject(buf,strlen(buf));
10534 dictAdd(c->db->dict,key,val);
10535 }
10536 addReply(c,shared.ok);
ba798261 10537 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
10538 unsigned char digest[20];
10539 sds d = sdsnew("+");
10540 int j;
10541
10542 computeDatasetDigest(digest);
10543 for (j = 0; j < 20; j++)
10544 d = sdscatprintf(d, "%02x",digest[j]);
10545
10546 d = sdscatlen(d,"\r\n",2);
10547 addReplySds(c,d);
7f957c92 10548 } else {
333298da 10549 addReplySds(c,sdsnew(
bdcb92f2 10550 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 10551 }
10552}
56906eef 10553
6c96ba7d 10554static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 10555 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
6c96ba7d 10556 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
dfc5e96c 10557#ifdef HAVE_BACKTRACE
10558 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10559 *((char*)-1) = 'x';
10560#endif
10561}
10562
c651fd9e 10563static void _redisPanic(char *msg, char *file, int line) {
10564 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
17772754 10565 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
c651fd9e 10566#ifdef HAVE_BACKTRACE
10567 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10568 *((char*)-1) = 'x';
10569#endif
10570}
10571
bcfc686d 10572/* =================================== Main! ================================ */
56906eef 10573
bcfc686d 10574#ifdef __linux__
/* Read the current value of /proc/sys/vm/overcommit_memory.
 *
 * Returns the non-negative integer found in the file, or -1 when the file
 * can't be opened, can't be read, or does not start with a valid
 * non-negative number. Returning -1 (instead of the 0 that atoi() would
 * silently produce for garbage) prevents the caller from mistaking an
 * unreadable setting for "overcommit_memory = 0". */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *end;
    long parsed;

    if (!fp) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    parsed = strtol(buf,&end,10);
    if (end == buf) return -1;                  /* no digits at all */
    if (parsed < 0 || parsed != (long)(int)parsed) return -1; /* out of range */
    return (int)parsed;
}
10588
10589void linuxOvercommitMemoryWarning(void) {
10590 if (linuxOvercommitMemoryValue() == 0) {
7ccd2d0a 10591 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
bcfc686d 10592 }
10593}
10594#endif /* __linux__ */
10595
10596static void daemonize(void) {
10597 int fd;
10598 FILE *fp;
10599
10600 if (fork() != 0) exit(0); /* parent exits */
10601 setsid(); /* create a new session */
10602
10603 /* Every output goes to /dev/null. If Redis is daemonized but
10604 * the 'logfile' is set to 'stdout' in the configuration file
10605 * it will not log at all. */
10606 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10607 dup2(fd, STDIN_FILENO);
10608 dup2(fd, STDOUT_FILENO);
10609 dup2(fd, STDERR_FILENO);
10610 if (fd > STDERR_FILENO) close(fd);
10611 }
10612 /* Try to write the pid file */
10613 fp = fopen(server.pidfile,"w");
10614 if (fp) {
10615 fprintf(fp,"%d\n",getpid());
10616 fclose(fp);
56906eef 10617 }
56906eef 10618}
10619
42ab0172
AO
10620static void version() {
10621 printf("Redis server version %s\n", REDIS_VERSION);
10622 exit(0);
10623}
10624
723fb69b
AO
/* Print the command line synopsis on stderr and terminate with exit
 * status 1. Declared with an explicit (void) parameter list so that the
 * definition is a real prototype. */
static void usage(void) {
    fprintf(stderr,"Usage: ./redis-server [/path/to/redis.conf]\n");
    fprintf(stderr,"       ./redis-server - (read config from stdin)\n");
    exit(1);
}
10630
bcfc686d 10631int main(int argc, char **argv) {
9651a787 10632 time_t start;
10633
bcfc686d 10634 initServerConfig();
10635 if (argc == 2) {
44efe66e 10636 if (strcmp(argv[1], "-v") == 0 ||
10637 strcmp(argv[1], "--version") == 0) version();
10638 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 10639 resetServerSaveParams();
10640 loadServerConfig(argv[1]);
723fb69b
AO
10641 } else if ((argc > 2)) {
10642 usage();
bcfc686d 10643 } else {
10644 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10645 }
bcfc686d 10646 if (server.daemonize) daemonize();
71c54b21 10647 initServer();
bcfc686d 10648 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10649#ifdef __linux__
10650 linuxOvercommitMemoryWarning();
10651#endif
9651a787 10652 start = time(NULL);
bcfc686d 10653 if (server.appendonly) {
10654 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 10655 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 10656 } else {
10657 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 10658 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 10659 }
bcfc686d 10660 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 10661 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 10662 aeMain(server.el);
10663 aeDeleteEventLoop(server.el);
10664 return 0;
10665}
10666
10667/* ============================= Backtrace support ========================= */
10668
10669#ifdef HAVE_BACKTRACE
10670static char *findFuncName(void *pointer, unsigned long *offset);
10671
56906eef 10672static void *getMcontextEip(ucontext_t *uc) {
10673#if defined(__FreeBSD__)
10674 return (void*) uc->uc_mcontext.mc_eip;
10675#elif defined(__dietlibc__)
10676 return (void*) uc->uc_mcontext.eip;
06db1f50 10677#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 10678 #if __x86_64__
10679 return (void*) uc->uc_mcontext->__ss.__rip;
10680 #else
56906eef 10681 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 10682 #endif
06db1f50 10683#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 10684 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 10685 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 10686 #else
10687 return (void*) uc->uc_mcontext->__ss.__eip;
e0a62c7f 10688 #endif
54bac49d 10689#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 10690 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 10691#elif defined(__ia64__) /* Linux IA64 */
10692 return (void*) uc->uc_mcontext.sc_ip;
10693#else
10694 return NULL;
56906eef 10695#endif
10696}
10697
10698static void segvHandler(int sig, siginfo_t *info, void *secret) {
10699 void *trace[100];
10700 char **messages = NULL;
10701 int i, trace_size = 0;
10702 unsigned long offset=0;
56906eef 10703 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 10704 sds infostring;
56906eef 10705 REDIS_NOTUSED(info);
10706
10707 redisLog(REDIS_WARNING,
10708 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 10709 infostring = genRedisInfoString();
10710 redisLog(REDIS_WARNING, "%s",infostring);
10711 /* It's not safe to sdsfree() the returned string under memory
10712 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 10713
56906eef 10714 trace_size = backtrace(trace, 100);
de96dbfe 10715 /* overwrite sigaction with caller's address */
b91cf5ef 10716 if (getMcontextEip(uc) != NULL) {
10717 trace[1] = getMcontextEip(uc);
10718 }
56906eef 10719 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 10720
d76412d1 10721 for (i=1; i<trace_size; ++i) {
56906eef 10722 char *fn = findFuncName(trace[i], &offset), *p;
10723
10724 p = strchr(messages[i],'+');
10725 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
10726 redisLog(REDIS_WARNING,"%s", messages[i]);
10727 } else {
10728 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10729 }
10730 }
b177fd30 10731 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 10732 _exit(0);
fe3bbfbe 10733}
56906eef 10734
10735static void setupSigSegvAction(void) {
10736 struct sigaction act;
10737
10738 sigemptyset (&act.sa_mask);
10739 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10740 * is used. Otherwise, sa_handler is used */
10741 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10742 act.sa_sigaction = segvHandler;
10743 sigaction (SIGSEGV, &act, NULL);
10744 sigaction (SIGBUS, &act, NULL);
12fea928 10745 sigaction (SIGFPE, &act, NULL);
10746 sigaction (SIGILL, &act, NULL);
10747 sigaction (SIGBUS, &act, NULL);
e65fdc78 10748 return;
56906eef 10749}
e65fdc78 10750
bcfc686d 10751#include "staticsymbols.h"
10752/* This function try to convert a pointer into a function name. It's used in
10753 * oreder to provide a backtrace under segmentation fault that's able to
10754 * display functions declared as static (otherwise the backtrace is useless). */
10755static char *findFuncName(void *pointer, unsigned long *offset){
10756 int i, ret = -1;
10757 unsigned long off, minoff = 0;
ed9b544e 10758
bcfc686d 10759 /* Try to match against the Symbol with the smallest offset */
10760 for (i=0; symsTable[i].pointer; i++) {
10761 unsigned long lp = (unsigned long) pointer;
0bc03378 10762
bcfc686d 10763 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10764 off=lp-symsTable[i].pointer;
10765 if (ret < 0 || off < minoff) {
10766 minoff=off;
10767 ret=i;
10768 }
10769 }
0bc03378 10770 }
bcfc686d 10771 if (ret == -1) return NULL;
10772 *offset = minoff;
10773 return symsTable[ret].name;
0bc03378 10774}
bcfc686d 10775#else /* HAVE_BACKTRACE */
/* Variant used when HAVE_BACKTRACE is not defined: no signal handler is
 * installed and the call is a no-op. */
static void setupSigSegvAction(void) {
    /* Intentionally empty. */
}
bcfc686d 10778#endif /* HAVE_BACKTRACE */
0bc03378 10779
ed9b544e 10780
ed9b544e 10781
bcfc686d 10782/* The End */
10783
10784
ed9b544e 10785