]> git.saurik.com Git - redis.git/blame - redis.c
long long to string conversion speedup applied in other places as well. Still the...
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
24df7698 30#define REDIS_VERSION "1.3.10"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
40#include <signal.h>
fbf9bcdb 41
42#ifdef HAVE_BACKTRACE
c9468bcf 43#include <execinfo.h>
44#include <ucontext.h>
fbf9bcdb 45#endif /* HAVE_BACKTRACE */
46
ed9b544e 47#include <sys/wait.h>
48#include <errno.h>
49#include <assert.h>
50#include <ctype.h>
51#include <stdarg.h>
52#include <inttypes.h>
53#include <arpa/inet.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#include <sys/time.h>
57#include <sys/resource.h>
2895e862 58#include <sys/uio.h>
f78fd11b 59#include <limits.h>
a7866db6 60#include <math.h>
92f8e882 61#include <pthread.h>
0bc1b2f6 62
63#if defined(__sun)
5043dff3 64#include "solarisfixes.h"
65#endif
ed9b544e 66
c9468bcf 67#include "redis.h"
ed9b544e 68#include "ae.h" /* Event driven programming library */
69#include "sds.h" /* Dynamic safe strings */
70#include "anet.h" /* Networking the easy way */
71#include "dict.h" /* Hash tables */
72#include "adlist.h" /* Linked lists */
73#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 74#include "lzf.h" /* LZF compression library */
75#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
5234952b 76#include "zipmap.h"
ed9b544e 77
78/* Error codes */
79#define REDIS_OK 0
80#define REDIS_ERR -1
81
82/* Static server configuration */
83#define REDIS_SERVERPORT 6379 /* TCP port */
84#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 85#define REDIS_IOBUF_LEN 1024
ed9b544e 86#define REDIS_LOADBUF_LEN 1024
248ea310 87#define REDIS_STATIC_ARGS 8
ed9b544e 88#define REDIS_DEFAULT_DBNUM 16
89#define REDIS_CONFIGLINE_MAX 1024
90#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
91#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
8ca3e9d1 92#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
6f376729 93#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 94#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
95
96/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
97#define REDIS_WRITEV_THRESHOLD 3
98/* Max number of iovecs used for each writev call */
99#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 100
101/* Hash table parameters */
102#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 103
104/* Command flags */
3fd78bcd 105#define REDIS_CMD_BULK 1 /* Bulk write command */
106#define REDIS_CMD_INLINE 2 /* Inline command */
107/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
108 this flag will return an error when the 'maxmemory' option is set in the
109 config file and the server is using more than maxmemory bytes of memory.
110 In short, these commands are denied on low memory conditions. */
111#define REDIS_CMD_DENYOOM 4
4005fef1 112#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 113
114/* Object types */
115#define REDIS_STRING 0
116#define REDIS_LIST 1
117#define REDIS_SET 2
1812e024 118#define REDIS_ZSET 3
119#define REDIS_HASH 4
f78fd11b 120
5234952b 121/* Objects encoding. Some kinds of objects like Strings and Hashes can be
122 * internally represented in multiple ways. The 'encoding' field of the object
123 * is set to one of these values for this object. */
942a3961 124#define REDIS_ENCODING_RAW 0 /* Raw representation */
125#define REDIS_ENCODING_INT 1 /* Encoded as integer */
5234952b 126#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
127#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
942a3961 128
/* Printable names for the object encodings. The entries are listed in the
 * same order as the numeric REDIS_ENCODING_* values (0 = raw, 1 = int,
 * 2 = zipmap, 3 = hashtable), so the array can be indexed by an encoding. */
07efaf74 129static char* strencoding[] = {
130 "raw", "int", "zipmap", "hashtable"
131};
132
f78fd11b 133/* Object types only used for dumping to disk */
bb32ede5 134#define REDIS_EXPIRETIME 253
ed9b544e 135#define REDIS_SELECTDB 254
136#define REDIS_EOF 255
137
f78fd11b 138/* Defines related to the dump file format. To store 32 bits lengths for short
139 * keys requires a lot of space, so we check the most significant 2 bits of
140 * the first byte to interpret the length:
141 *
142 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
143 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
144 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 145 * 11|000000 this means: specially encoded object will follow. The six bits
146 * number specifies the kind of object that follows.
147 * See the REDIS_RDB_ENC_* defines.
f78fd11b 148 *
10c43610 149 * Lengths up to 63 are stored using a single byte, most DB keys, and many
150 * values, will fit inside. */
f78fd11b 151#define REDIS_RDB_6BITLEN 0
152#define REDIS_RDB_14BITLEN 1
153#define REDIS_RDB_32BITLEN 2
17be1a4a 154#define REDIS_RDB_ENCVAL 3
f78fd11b 155#define REDIS_RDB_LENERR UINT_MAX
156
a4d1ba9a 157/* When a length of a string object stored on disk has the first two bits
158 * set, the remaining six bits specify a special encoding for the object
159 * according to the following defines: */
160#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
161#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
162#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 163#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
a4d1ba9a 164
75680a3c 165/* Virtual memory object->where field. */
166#define REDIS_VM_MEMORY 0 /* The object is on memory */
167#define REDIS_VM_SWAPPED 1 /* The object is on disk */
168#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
169#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
170
06224fec 171/* Virtual memory static configuration stuff.
172 * Check vmFindContiguousPages() to know more about this magic numbers. */
173#define REDIS_VM_MAX_NEAR_PAGES 65536
174#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 175#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 176#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
f6c0bba8 177/* The following is the *percentage* of completed I/O jobs to process when the
178 * handler is called. While Virtual Memory I/O operations are performed by
179 * threads, these operations must be processed by the main thread when completed
180 * in order to take effect. */
c953f24b 181#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 182
ed9b544e 183/* Client flags */
d5d55fc3 184#define REDIS_SLAVE 1 /* This client is a slave server */
185#define REDIS_MASTER 2 /* This client is a master server */
186#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
187#define REDIS_MULTI 8 /* This client is in a MULTI context */
188#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
189#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
ed9b544e 190
40d224a9 191/* Slave replication state - slave side */
ed9b544e 192#define REDIS_REPL_NONE 0 /* No active replication */
193#define REDIS_REPL_CONNECT 1 /* Must connect to master */
194#define REDIS_REPL_CONNECTED 2 /* Connected to master */
195
40d224a9 196/* Slave replication state - from the point of view of master
197 * Note that in SEND_BULK and ONLINE state the slave receives new updates
198 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
199 * to start the next background saving in order to send updates to it. */
200#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
201#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
202#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
203#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
204
ed9b544e 205/* List related stuff */
206#define REDIS_HEAD 0
207#define REDIS_TAIL 1
208
209/* Sort operations */
210#define REDIS_SORT_GET 0
443c6409 211#define REDIS_SORT_ASC 1
212#define REDIS_SORT_DESC 2
ed9b544e 213#define REDIS_SORTKEY_MAX 1024
214
215/* Log levels */
216#define REDIS_DEBUG 0
f870935d 217#define REDIS_VERBOSE 1
218#define REDIS_NOTICE 2
219#define REDIS_WARNING 3
ed9b544e 220
221/* Anti-warning macro... */
222#define REDIS_NOTUSED(V) ((void) V)
223
6b47e12e 224#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
225#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 226
48f0308a 227/* Append only defines */
228#define APPENDFSYNC_NO 0
229#define APPENDFSYNC_ALWAYS 1
230#define APPENDFSYNC_EVERYSEC 2
231
cbba7dd7 232/* Hashes related defaults */
233#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
234#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
235
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): evaluates _e; when it is false, calls _redisAssert() with
 * the stringified expression, the file name and the line number, then
 * terminates the process with _exit(1).
 *
 * redisPanic(_e): unconditionally calls _redisPanic() with the stringified
 * argument, the file name and the line number, then terminates the process
 * with _exit(1).
 *
 * Both expansions are fully parenthesized so that the comma operator inside
 * them cannot re-associate with operators at the call site. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
dfc5e96c 241
ed9b544e 242/*================================= Data types ============================== */
243
244/* A redis object, that is a type able to hold a string / list / set */
75680a3c 245
246/* The VM object structure: where an object lives in the swap file and when
 * it was last accessed.
 *
 * NOTE(review): the trailing "vm" declarator below also defines an object
 * named 'vm' of this type, in addition to declaring the struct. It is not
 * referenced in the visible part of the file -- confirm whether it is
 * intentional before removing it. */
247struct redisObjectVM {
3a66edc7 248 off_t page; /* the page at which the object is stored on disk */
249 off_t usedpages; /* number of pages used on disk */
250 time_t atime; /* Last access time */
75680a3c 251} vm;
252
253/* The actual Redis Object. The order of the fields matters: the VM fields
 * are kept last so that they can be left out of the allocation (see the
 * comment on 'vm' below). */
ed9b544e 254typedef struct redisObject {
ed9b544e 255 void *ptr; /* Payload pointer (set from the _ptr argument by initStaticStringObject) */
942a3961 256 unsigned char type; /* Object type, e.g. REDIS_STRING */
257 unsigned char encoding; /* Internal representation, e.g. REDIS_ENCODING_RAW */
d894161b 258 unsigned char storage; /* If this object is a key, where is the value?
259 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
260 unsigned char vtype; /* If this object is a key, and value is swapped out,
261 * this is the type of the swapped out object. */
ed9b544e 262 int refcount;
75680a3c 263 /* VM fields, these are only allocated if VM is active, otherwise the
264 * object allocation function will just allocate
265 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
266 * Redis without VM active will not have any overhead. */
267 struct redisObjectVM vm;
ed9b544e 268} robj;
269
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the object itself (an lvalue, not a pointer); _ptr is the value
 * stored in its 'ptr' field. The object is set up as a raw-encoded string
 * with a reference count of 1. The 'storage' field is only written when
 * server.vm_enabled is true.
 *
 * The expansion deliberately has no trailing semicolon: the caller supplies
 * it, so the macro acts as one statement even in an unbraced if/else. Both
 * arguments are parenthesized so that any expression can be passed safely. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
281
/* State of a single database: its keyspace plus the side dictionaries used
 * for expires and for clients blocked on keys. */
3305306f 282typedef struct redisDb {
4409877e 283 dict *dict; /* The keyspace for this DB */
284 dict *expires; /* Timeout of keys with a timeout set */
285 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 286 dict *io_keys; /* Keys with clients waiting for VM I/O */
3305306f 287 int id; /* presumably the numeric index of this DB -- TODO confirm */
288} redisDb;
289
6e469882 290/* Client MULTI/EXEC state: one command of a MULTI block, stored as its
 * argument vector, argument count and command table entry. */
291typedef struct multiCmd {
292 robj **argv;
293 int argc;
294 struct redisCommand *cmd;
295} multiCmd;
296
/* The set of commands accumulated by a client for MULTI/EXEC. */
297typedef struct multiState {
298 multiCmd *commands; /* Array of MULTI commands */
299 int count; /* Total number of MULTI commands */
300} multiState;
301
ed9b544e 302/* With multiplexing we need to keep per-client state.
303 * Clients are kept in a linked list. */
304typedef struct redisClient {
305 int fd;
3305306f 306 redisDb *db;
ed9b544e 307 int dictid;
308 sds querybuf;
e8a74421 309 robj **argv, **mbargv;
310 int argc, mbargc;
40d224a9 311 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 312 int multibulk; /* multi bulk command format active */
ed9b544e 313 list *reply;
314 int sentlen;
315 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 316 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 317 int slaveseldb; /* slave selected db, if this client is a slave */
318 int authenticated; /* when requirepass is non-NULL */
319 int replstate; /* replication state if this is a slave */
320 int repldbfd; /* replication DB file descriptor */
6e469882 321 long repldboff; /* replication DB file offset */
40d224a9 322 off_t repldbsize; /* replication DB file size */
6e469882 323 multiState mstate; /* MULTI/EXEC state */
d5d55fc3 324 robj **blockingkeys; /* The keys we are waiting on to terminate a blocking
4409877e 325 * operation such as BLPOP. Otherwise NULL. */
b177fd30 326 int blockingkeysnum; /* Number of blocking keys */
4409877e 327 time_t blockingto; /* Blocking operation timeout. If UNIX current time
328 * is >= blockingto then the operation timed out. */
92f8e882 329 list *io_keys; /* Keys this client is waiting to be loaded from the
330 * swap file in order to continue. */
ffc6b7f8 331 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
332 list *pubsub_patterns; /* patterns a client is interested in (original
 * comment says SUBSCRIBE; presumably PSUBSCRIBE -- TODO confirm) */
ed9b544e 333} redisClient;
334
/* A (seconds, changes) pair. NOTE(review): presumably one automatic-save
 * rule ("save after <seconds> if at least <changes> changes") -- confirm
 * against the code that reads server.saveparams. */
335struct saveparam {
336 time_t seconds;
337 int changes;
338};
339
340/* Global server state structure. Groups runtime state, statistics,
 * configuration, replication, virtual memory and pubsub fields. */
341struct redisServer {
342 int port;
343 int fd;
3305306f 344 redisDb *db;
ed9b544e 345 long long dirty; /* changes to DB from the last save */
346 list *clients;
87eca727 347 list *slaves, *monitors;
ed9b544e 348 char neterr[ANET_ERR_LEN];
349 aeEventLoop *el;
350 int cronloops; /* number of times the cron function run */
351 list *objfreelist; /* A list of freed objects to avoid malloc() */
352 time_t lastsave; /* Unix time of last successful save */
ed9b544e 353 /* Fields used only for stats */
354 time_t stat_starttime; /* server start time */
355 long long stat_numcommands; /* number of processed commands */
356 long long stat_numconnections; /* number of connections received */
2a6a2ed1 357 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 358 /* Configuration */
359 int verbosity;
360 int glueoutputbuf;
361 int maxidletime;
362 int dbnum;
363 int daemonize;
44b38ef4 364 int appendonly;
48f0308a 365 int appendfsync;
366 time_t lastfsync;
44b38ef4 367 int appendfd;
368 int appendseldb;
ed329fcf 369 char *pidfile;
9f3c422c 370 pid_t bgsavechildpid;
9d65a1bb 371 pid_t bgrewritechildpid;
372 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
28ed1f33 373 sds aofbuf; /* AOF buffer, written before entering the event loop */
ed9b544e 374 struct saveparam *saveparams;
375 int saveparamslen;
376 char *logfile;
377 char *bindaddr;
378 char *dbfilename;
44b38ef4 379 char *appendfilename;
abcb223e 380 char *requirepass;
121f70cf 381 int rdbcompression;
8ca3e9d1 382 int activerehashing;
ed9b544e 383 /* Replication related */
384 int isslave;
d0ccebcf 385 char *masterauth;
ed9b544e 386 char *masterhost;
387 int masterport;
40d224a9 388 redisClient *master; /* client that is master for this slave */
ed9b544e 389 int replstate;
285add55 390 unsigned int maxclients;
4ef8de8a 391 unsigned long long maxmemory;
d5d55fc3 392 unsigned int blpop_blocked_clients;
393 unsigned int vm_blocked_clients;
ed9b544e 394 /* Sort parameters - qsort_r() is only available under BSD so we
395 * have to keep this state global, in order to pass it to sortCompare() */
396 int sort_desc;
397 int sort_alpha;
398 int sort_bypattern;
75680a3c 399 /* Virtual memory configuration */
400 int vm_enabled;
054e426d 401 char *vm_swap_file;
75680a3c 402 off_t vm_page_size;
403 off_t vm_pages;
4ef8de8a 404 unsigned long long vm_max_memory;
cbba7dd7 405 /* Hashes config */
406 size_t hash_max_zipmap_entries;
407 size_t hash_max_zipmap_value;
75680a3c 408 /* Virtual memory state */
409 FILE *vm_fp;
410 int vm_fd;
411 off_t vm_next_page; /* Next probably empty page */
412 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 413 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 414 time_t unixtime; /* Unix time sampled every second. */
92f8e882 415 /* Virtual memory I/O threads stuff */
92f8e882 416 /* An I/O thread processes an element taken from the io_jobs queue and
996cb5f7 417 * puts the result of the operation in the io_done list. While the
418 * job is being processed, it's put on io_processing queue.
 * NOTE(review): "io_jobs" and "io_done" do not match the field names
 * below (io_newjobs, io_processed) -- the comment looks stale. */
419 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
420 list *io_processing; /* List of VM I/O jobs being processed */
421 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 422 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 423 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 424 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
425 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 426 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 427 int io_active_threads; /* Number of running I/O threads */
428 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 429 /* Our main thread is blocked on the event loop, waiting for sockets ready
430 * to be read or written, so when a threaded I/O operation is ready to be
431 * processed by the main thread, the I/O thread will use a unix pipe to
432 * awake the main thread. The following are the two pipe FDs. */
433 int io_ready_pipe_read;
434 int io_ready_pipe_write;
7d98e08c 435 /* Virtual memory stats */
436 unsigned long long vm_stats_used_pages;
437 unsigned long long vm_stats_swapped_objects;
438 unsigned long long vm_stats_swapouts;
439 unsigned long long vm_stats_swapins;
befec3cd 440 /* Pubsub */
ffc6b7f8 441 dict *pubsub_channels; /* Map channels to list of subscribed clients */
442 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 443 /* Misc */
b9bc0eef 444 FILE *devnull;
ed9b544e 445};
446
/* Associates a client with a pattern object. NOTE(review): presumably the
 * element type of the pubsub_patterns lists -- confirm. */
ffc6b7f8 447typedef struct pubsubPattern {
448 redisClient *client;
449 robj *pattern;
450} pubsubPattern;
451
/* Signature shared by command implementations and by VM preload functions:
 * they take the client and return nothing. */
ed9b544e 452typedef void redisCommandProc(redisClient *c);
/* One command description: name, implementation, arity, flags and the
 * information used to preload swapped keys before execution. */
453struct redisCommand {
454 char *name;
455 redisCommandProc *proc;
456 int arity;
457 int flags; /* presumably a combination of REDIS_CMD_* flags -- TODO confirm */
76583ea4
PN
458 /* Use a function to determine which keys need to be loaded
459 * in the background prior to executing this command. Takes precedence
460 * over vm_firstkey and others, ignored when NULL */
461 redisCommandProc *vm_preload_proc;
7c775e09 462 /* What keys should be loaded in background when calling this command? */
463 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
464 int vm_lastkey; /* The last argument that's a key */
465 int vm_keystep; /* The step between first and last key */
ed9b544e 466};
467
/* Pairs a name with an address stored as an unsigned long.
 * NOTE(review): presumably a symbol-table entry used to resolve addresses
 * to function names -- confirm against its users. */
de96dbfe 468struct redisFunctionSym {
469 char *name;
56906eef 470 unsigned long pointer;
de96dbfe 471};
472
/* An object together with the value it is compared by: either a numeric
 * score or another object, stored in the same union.
 * NOTE(review): presumably used by SORT; which union member is valid is
 * not recorded in the struct -- confirm how callers decide. */
ed9b544e 473typedef struct _redisSortObject {
474 robj *obj;
475 union {
476 double score;
477 robj *cmpobj;
478 } u;
479} redisSortObject;
480
/* A typed operation with an associated pattern object.
 * NOTE(review): 'type' presumably takes REDIS_SORT_* values such as
 * REDIS_SORT_GET -- confirm. */
481typedef struct _redisSortOperation {
482 int type;
483 robj *pattern;
484} redisSortOperation;
485
6b47e12e 486/* ZSETs use a specialized version of Skiplists */
487
/* A skiplist node holding a (score, object) pair.
 * NOTE(review): 'forward' and 'span' are pointers, presumably to per-level
 * arrays allocated elsewhere -- confirm against the node creation code. */
488typedef struct zskiplistNode {
489 struct zskiplistNode **forward;
e3870fab 490 struct zskiplistNode *backward;
912b9165 491 unsigned int *span;
6b47e12e 492 double score;
493 robj *obj;
494} zskiplistNode;
495
/* The skiplist itself: header and tail nodes plus a length and a level.
 * NOTE(review): 'level' is presumably the highest level currently in use,
 * bounded by ZSKIPLIST_MAXLEVEL -- confirm. */
496typedef struct zskiplist {
e3870fab 497 struct zskiplistNode *header, *tail;
d13f767c 498 unsigned long length;
6b47e12e 499 int level;
500} zskiplist;
501
/* A sorted set is represented by a dict and a skiplist together. */
1812e024 502typedef struct zset {
503 dict *dict;
6b47e12e 504 zskiplist *zsl;
1812e024 505} zset;
506
6b47e12e 507/* Our shared "common" objects */
508
/* Number of entries in the shared.integers[] array. */
05df7621 509#define REDIS_SHARED_INTEGERS 10000
/* Pointers to pre-built objects (replies, errors, SELECT commands, pubsub
 * bulks, integers). 'shared' is the single instance defined here. */
ed9b544e 510struct sharedObjectsStruct {
c937aa89 511 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 512 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 513 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
514 *outofrangeerr, *plus,
ed9b544e 515 *select0, *select1, *select2, *select3, *select4,
befec3cd 516 *select5, *select6, *select7, *select8, *select9,
c8d0ea0e 517 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
518 *mbulk4, *psubscribebulk, *punsubscribebulk,
519 *integers[REDIS_SHARED_INTEGERS];
ed9b544e 520} shared;
521
a7866db6 522/* Global vars that are actually used as constants. The following double
523 * values are used for double on-disk serialization, and are initialized
524 * at runtime to avoid strange compiler optimizations. */
525
526static double R_Zero, R_PosInf, R_NegInf, R_Nan;
527
92f8e882 528/* VM threaded I/O request message: the job types and the structure that
 * describes one job. */
b9bc0eef 529#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
530#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
531#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 532typedef struct iojob {
996cb5f7 533 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 534 redisDb *db;/* Redis database */
92f8e882 535 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 536 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
92f8e882 537 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
538 off_t page; /* Swap page where to read/write the object */
248ea310 539 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 540 int canceled; /* True if this command was canceled by blocking side of VM */
541 pthread_t thread; /* ID of the thread processing this entry */
542} iojob;
92f8e882 543
ed9b544e 544/*================================ Prototypes =============================== */
545
546static void freeStringObject(robj *o);
547static void freeListObject(robj *o);
548static void freeSetObject(robj *o);
549static void decrRefCount(void *o);
550static robj *createObject(int type, void *ptr);
551static void freeClient(redisClient *c);
f78fd11b 552static int rdbLoad(char *filename);
ed9b544e 553static void addReply(redisClient *c, robj *obj);
554static void addReplySds(redisClient *c, sds s);
555static void incrRefCount(robj *o);
f78fd11b 556static int rdbSaveBackground(char *filename);
ed9b544e 557static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 558static robj *dupStringObject(robj *o);
248ea310 559static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
dd142b9c 560static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
28ed1f33 561static void flushAppendOnlyFile(void);
44b38ef4 562static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 563static int syncWithMaster(void);
05df7621 564static robj *tryObjectEncoding(robj *o);
9d65a1bb 565static robj *getDecodedObject(robj *o);
3305306f 566static int removeExpire(redisDb *db, robj *key);
567static int expireIfNeeded(redisDb *db, robj *key);
568static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 569static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 570static int deleteKey(redisDb *db, robj *key);
bb32ede5 571static time_t getExpire(redisDb *db, robj *key);
572static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 573static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 574static void freeMemoryIfNeeded(void);
de96dbfe 575static int processCommand(redisClient *c);
56906eef 576static void setupSigSegvAction(void);
a3b21203 577static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 578static void aofRemoveTempFile(pid_t childpid);
0ea663ea 579static size_t stringObjectLen(robj *o);
638e42ac 580static void processInputBuffer(redisClient *c);
6b47e12e 581static zskiplist *zslCreate(void);
fd8ccf44 582static void zslFree(zskiplist *zsl);
2b59cfdf 583static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 584static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 585static void initClientMultiState(redisClient *c);
586static void freeClientMultiState(redisClient *c);
587static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 588static void unblockClientWaitingData(redisClient *c);
4409877e 589static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 590static void vmInit(void);
a35ddf12 591static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 592static robj *vmLoadObject(robj *key);
7e69548d 593static robj *vmPreviewObject(robj *key);
a69a0c9c 594static int vmSwapOneObjectBlocking(void);
595static int vmSwapOneObjectThreaded(void);
7e69548d 596static int vmCanSwapOut(void);
a5819310 597static int tryFreeOneObjectFromFreelist(void);
996cb5f7 598static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
599static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
600static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 601static void lockThreadedIO(void);
602static void unlockThreadedIO(void);
603static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
604static void freeIOJob(iojob *j);
605static void queueIOJob(iojob *j);
a5819310 606static int vmWriteObjectOnSwap(robj *o, off_t page);
607static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 608static void waitEmptyIOJobsQueue(void);
609static void vmReopenSwapFile(void);
970e10bb 610static int vmFreePage(off_t page);
76583ea4 611static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
d5d55fc3 612static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
613static int dontWaitForSwappedKey(redisClient *c, robj *key);
614static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
615static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
616static struct redisCommand *lookupCommand(char *name);
617static void call(redisClient *c, struct redisCommand *cmd);
618static void resetClient(redisClient *c);
ada386b2 619static void convertToRealHash(robj *o);
ffc6b7f8 620static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
621static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
622static void freePubsubPattern(void *p);
623static int listMatchPubsubPattern(void *a, void *b);
624static int compareStringObjects(robj *a, robj *b);
bf028098 625static int equalStringObjects(robj *a, robj *b);
/* Declared with an explicit (void) parameter list so that this is a real
 * prototype: an empty "()" leaves the parameters unspecified in C and turns
 * off argument checking at call sites. */
static void usage(void);
8f63ddca 627static int rewriteAppendOnlyFileBackground(void);
242a64f3 628static int vmSwapObjectBlocking(robj *key, robj *val);
ed9b544e 629
abcb223e 630static void authCommand(redisClient *c);
ed9b544e 631static void pingCommand(redisClient *c);
632static void echoCommand(redisClient *c);
633static void setCommand(redisClient *c);
634static void setnxCommand(redisClient *c);
526d00a5 635static void setexCommand(redisClient *c);
ed9b544e 636static void getCommand(redisClient *c);
637static void delCommand(redisClient *c);
638static void existsCommand(redisClient *c);
639static void incrCommand(redisClient *c);
640static void decrCommand(redisClient *c);
641static void incrbyCommand(redisClient *c);
642static void decrbyCommand(redisClient *c);
643static void selectCommand(redisClient *c);
644static void randomkeyCommand(redisClient *c);
645static void keysCommand(redisClient *c);
646static void dbsizeCommand(redisClient *c);
647static void lastsaveCommand(redisClient *c);
648static void saveCommand(redisClient *c);
649static void bgsaveCommand(redisClient *c);
9d65a1bb 650static void bgrewriteaofCommand(redisClient *c);
ed9b544e 651static void shutdownCommand(redisClient *c);
652static void moveCommand(redisClient *c);
653static void renameCommand(redisClient *c);
654static void renamenxCommand(redisClient *c);
655static void lpushCommand(redisClient *c);
656static void rpushCommand(redisClient *c);
657static void lpopCommand(redisClient *c);
658static void rpopCommand(redisClient *c);
659static void llenCommand(redisClient *c);
660static void lindexCommand(redisClient *c);
661static void lrangeCommand(redisClient *c);
662static void ltrimCommand(redisClient *c);
663static void typeCommand(redisClient *c);
664static void lsetCommand(redisClient *c);
665static void saddCommand(redisClient *c);
666static void sremCommand(redisClient *c);
a4460ef4 667static void smoveCommand(redisClient *c);
ed9b544e 668static void sismemberCommand(redisClient *c);
669static void scardCommand(redisClient *c);
12fea928 670static void spopCommand(redisClient *c);
2abb95a9 671static void srandmemberCommand(redisClient *c);
ed9b544e 672static void sinterCommand(redisClient *c);
673static void sinterstoreCommand(redisClient *c);
40d224a9 674static void sunionCommand(redisClient *c);
675static void sunionstoreCommand(redisClient *c);
f4f56e1d 676static void sdiffCommand(redisClient *c);
677static void sdiffstoreCommand(redisClient *c);
ed9b544e 678static void syncCommand(redisClient *c);
679static void flushdbCommand(redisClient *c);
680static void flushallCommand(redisClient *c);
681static void sortCommand(redisClient *c);
682static void lremCommand(redisClient *c);
0f5f7e9a 683static void rpoplpushcommand(redisClient *c);
ed9b544e 684static void infoCommand(redisClient *c);
70003d28 685static void mgetCommand(redisClient *c);
87eca727 686static void monitorCommand(redisClient *c);
3305306f 687static void expireCommand(redisClient *c);
802e8373 688static void expireatCommand(redisClient *c);
f6b141c5 689static void getsetCommand(redisClient *c);
fd88489a 690static void ttlCommand(redisClient *c);
321b0e13 691static void slaveofCommand(redisClient *c);
7f957c92 692static void debugCommand(redisClient *c);
f6b141c5 693static void msetCommand(redisClient *c);
694static void msetnxCommand(redisClient *c);
fd8ccf44 695static void zaddCommand(redisClient *c);
7db723ad 696static void zincrbyCommand(redisClient *c);
cc812361 697static void zrangeCommand(redisClient *c);
50c55df5 698static void zrangebyscoreCommand(redisClient *c);
f44dd428 699static void zcountCommand(redisClient *c);
e3870fab 700static void zrevrangeCommand(redisClient *c);
3c41331e 701static void zcardCommand(redisClient *c);
1b7106e7 702static void zremCommand(redisClient *c);
6e333bbe 703static void zscoreCommand(redisClient *c);
1807985b 704static void zremrangebyscoreCommand(redisClient *c);
6e469882 705static void multiCommand(redisClient *c);
706static void execCommand(redisClient *c);
18b6cb76 707static void discardCommand(redisClient *c);
4409877e 708static void blpopCommand(redisClient *c);
709static void brpopCommand(redisClient *c);
4b00bebd 710static void appendCommand(redisClient *c);
39191553 711static void substrCommand(redisClient *c);
69d95c3e 712static void zrankCommand(redisClient *c);
798d9e55 713static void zrevrankCommand(redisClient *c);
978c2c94 714static void hsetCommand(redisClient *c);
1f1c7695 715static void hsetnxCommand(redisClient *c);
978c2c94 716static void hgetCommand(redisClient *c);
09aeb579
PN
717static void hmsetCommand(redisClient *c);
718static void hmgetCommand(redisClient *c);
07efaf74 719static void hdelCommand(redisClient *c);
92b27fe9 720static void hlenCommand(redisClient *c);
9212eafd 721static void zremrangebyrankCommand(redisClient *c);
2830ca53
PN
722static void zunionCommand(redisClient *c);
723static void zinterCommand(redisClient *c);
78409a0f 724static void hkeysCommand(redisClient *c);
725static void hvalsCommand(redisClient *c);
726static void hgetallCommand(redisClient *c);
a86f14b1 727static void hexistsCommand(redisClient *c);
500ece7c 728static void configCommand(redisClient *c);
01426b05 729static void hincrbyCommand(redisClient *c);
befec3cd 730static void subscribeCommand(redisClient *c);
731static void unsubscribeCommand(redisClient *c);
ffc6b7f8 732static void psubscribeCommand(redisClient *c);
733static void punsubscribeCommand(redisClient *c);
befec3cd 734static void publishCommand(redisClient *c);
f6b141c5 735
ed9b544e 736/*================================= Globals ================================= */
737
738/* Global vars */
739static struct redisServer server; /* server global state */
740static struct redisCommand cmdTable[] = {
76583ea4
PN
741 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
742 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
743 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
526d00a5 744 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
745 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
746 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
748 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
749 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
750 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
751 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
752 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
753 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
754 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
755 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
756 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
757 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
758 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
760 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
761 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
762 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
763 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
764 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
765 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
766 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
767 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
768 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
769 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
770 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
771 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
772 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
773 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
774 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
775 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
776 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
777 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
778 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
780 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
781 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
782 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
783 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
784 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
785 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
786 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
790 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
791 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
792 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
793 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
794 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
1f1c7695 795 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 796 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
d33278d1 797 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 798 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
01426b05 799 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
800 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
801 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
802 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 805 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
806 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
807 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
808 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
809 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
810 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
811 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
812 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
814 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
815 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
816 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
818 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
819 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
820 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
823 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
824 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
825 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
826 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
829 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
958cd5f3 830 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
831 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
836 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
837 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
838 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
839 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
840 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 841 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 842 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
843 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 844 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
845 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 846 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
76583ea4 847 {NULL,NULL,0,0,NULL,0,0,0}
ed9b544e 848};
bcfc686d 849
ed9b544e 850/*============================ Utility functions ============================ */
851
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; [^abc] negates the set, [a-z] is a range
 *           and a backslash inside the set escapes the next byte
 *   \x      the byte x, taken literally
 *
 * If 'nocase' is non-zero bytes are compared after tolower().
 *
 * Returns 1 if the string matches the pattern, 0 otherwise.
 *
 * Both lengths are authoritative: no byte at or after pattern+patternLen or
 * string+stringLen is ever examined, so neither buffer needs to be nul
 * terminated and neither length can become negative. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    /* Every case below consumes one byte of the string, so we only enter
     * the loop while there is still string to consume. */
    while (patternLen > 0 && stringLen > 0) {
        switch (pattern[0]) {
        case '*':
            /* Collapse runs of '*'. The length test keeps pattern[1]
             * inside the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try the rest of the pattern at every remaining offset. */
            while (stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            pattern++;
            patternLen--;
            not = (patternLen > 0 && pattern[0] == '^');
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while (1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back on the last byte so
                     * that the common pattern++ below ends the pattern. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    /* Escaped byte: only if there is a byte to escape. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
    }

    /* The string is over: what is left of the pattern can still match, but
     * only if it is made of '*' alone. */
    if (stringLen == 0) {
        while (patternLen > 0 && pattern[0] == '*') {
            pattern++;
            patternLen--;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
974
/* Convenience wrapper around stringmatchlen() for nul terminated C strings:
 * both lengths are obtained with strlen(). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
978
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-', a run of decimal digits and an optional,
 * case insensitive unit: "b" (or nothing) is 1, "k" 1000, "kb" 1024,
 * "m" 1000^2, "mb" 1024^2, "g" 1000^3, "gb" 1024^3.
 *
 * If 'err' is not NULL, *err is set to 1 on parsing error and to 0
 * otherwise. An unknown unit is an error, and the number is then returned
 * as if no unit was given. Too many digits return LLONG_MAX; a result that
 * does not fit a long long returns LLONG_MAX or LLONG_MIN. Both are errors. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long long mul; /* unit multiplier, always positive */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. The cast is needed because
     * isdigit() is undefined for negative values other than EOF. */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    /* Copy sign and digits (without the unit) so strtoll() can parse them. */
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    /* val*mul would be a signed overflow: report it instead. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1025
/* Convert a long long into a string.
 *
 * At most 'len' bytes of 's' are written, nul terminator included, so the
 * output is truncated (keeping the leading characters) when the buffer is
 * too small. Returns the number of characters stored in 's', not counting
 * the nul terminator; with len == 0 nothing is written and 0 is returned. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Negate in the unsigned domain: -value overflows for LLONG_MIN. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    /* Emit the digits backward, least significant first. */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1049
56906eef 1050static void redisLog(int level, const char *fmt, ...) {
ed9b544e 1051 va_list ap;
1052 FILE *fp;
1053
1054 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1055 if (!fp) return;
1056
1057 va_start(ap, fmt);
1058 if (level >= server.verbosity) {
6766f45e 1059 char *c = ".-*#";
1904ecc1 1060 char buf[64];
1061 time_t now;
1062
1063 now = time(NULL);
6c9385e0 1064 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 1065 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 1066 vfprintf(fp, fmt, ap);
1067 fprintf(fp,"\n");
1068 fflush(fp);
1069 }
1070 va_end(ap);
1071
1072 if (server.logfile) fclose(fp);
1073}
1074
1075/*====================== Hash table type implementation ==================== */
1076
1077/* This is a hash table type that uses the SDS dynamic strings library as
1078 * keys and redis objects as values (objects can hold SDS strings,
1079 * lists, sets). */
1080
/* Destructor that releases 'val' with zfree(). The private data pointer is
 * not used. */
static void dictVanillaFree(void *privdata, void *val)
{
    zfree(val);
    DICT_NOTUSED(privdata);
}
1086
4409877e 1087static void dictListDestructor(void *privdata, void *val)
1088{
1089 DICT_NOTUSED(privdata);
1090 listRelease((list*)val);
1091}
1092
ed9b544e 1093static int sdsDictKeyCompare(void *privdata, const void *key1,
1094 const void *key2)
1095{
1096 int l1,l2;
1097 DICT_NOTUSED(privdata);
1098
1099 l1 = sdslen((sds)key1);
1100 l2 = sdslen((sds)key2);
1101 if (l1 != l2) return 0;
1102 return memcmp(key1, key2, l1) == 0;
1103}
1104
/* Destructor for redis objects stored in a dictionary: drops one reference
 * from 'val' unless it is NULL. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
1112
942a3961 1113static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1114 const void *key2)
1115{
1116 const robj *o1 = key1, *o2 = key2;
1117 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1118}
1119
942a3961 1120static unsigned int dictObjHash(const void *key) {
ed9b544e 1121 const robj *o = key;
1122 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1123}
1124
942a3961 1125static int dictEncObjKeyCompare(void *privdata, const void *key1,
1126 const void *key2)
1127{
9d65a1bb 1128 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1129 int cmp;
942a3961 1130
2a1198b4 1131 if (o1->encoding == REDIS_ENCODING_INT &&
dc05abde 1132 o2->encoding == REDIS_ENCODING_INT)
1133 return o1->ptr == o2->ptr;
2a1198b4 1134
9d65a1bb 1135 o1 = getDecodedObject(o1);
1136 o2 = getDecodedObject(o2);
1137 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1138 decrRefCount(o1);
1139 decrRefCount(o2);
1140 return cmp;
942a3961 1141}
1142
1143static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1144 robj *o = (robj*) key;
942a3961 1145
ed9e4966 1146 if (o->encoding == REDIS_ENCODING_RAW) {
1147 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1148 } else {
1149 if (o->encoding == REDIS_ENCODING_INT) {
1150 char buf[32];
1151 int len;
1152
ee14da56 1153 len = ll2string(buf,32,(long)o->ptr);
ed9e4966 1154 return dictGenHashFunction((unsigned char*)buf, len);
1155 } else {
1156 unsigned int hash;
1157
1158 o = getDecodedObject(o);
1159 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1160 decrRefCount(o);
1161 return hash;
1162 }
1163 }
942a3961 1164}
1165
f2d9f50f 1166/* Sets type and expires */
ed9b544e 1167static dictType setDictType = {
942a3961 1168 dictEncObjHash, /* hash function */
ed9b544e 1169 NULL, /* key dup */
1170 NULL, /* val dup */
942a3961 1171 dictEncObjKeyCompare, /* key compare */
ed9b544e 1172 dictRedisObjectDestructor, /* key destructor */
1173 NULL /* val destructor */
1174};
1175
f2d9f50f 1176/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1812e024 1177static dictType zsetDictType = {
1178 dictEncObjHash, /* hash function */
1179 NULL, /* key dup */
1180 NULL, /* val dup */
1181 dictEncObjKeyCompare, /* key compare */
1182 dictRedisObjectDestructor, /* key destructor */
da0a1620 1183 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1184};
1185
f2d9f50f 1186/* Db->dict */
5234952b 1187static dictType dbDictType = {
942a3961 1188 dictObjHash, /* hash function */
ed9b544e 1189 NULL, /* key dup */
1190 NULL, /* val dup */
942a3961 1191 dictObjKeyCompare, /* key compare */
ed9b544e 1192 dictRedisObjectDestructor, /* key destructor */
1193 dictRedisObjectDestructor /* val destructor */
1194};
1195
f2d9f50f 1196/* Db->expires */
1197static dictType keyptrDictType = {
1198 dictObjHash, /* hash function */
1199 NULL, /* key dup */
1200 NULL, /* val dup */
1201 dictObjKeyCompare, /* key compare */
1202 dictRedisObjectDestructor, /* key destructor */
1203 NULL /* val destructor */
1204};
1205
5234952b 1206/* Hash type hash table (note that small hashes are represented with zimpaps) */
1207static dictType hashDictType = {
1208 dictEncObjHash, /* hash function */
1209 NULL, /* key dup */
1210 NULL, /* val dup */
1211 dictEncObjKeyCompare, /* key compare */
1212 dictRedisObjectDestructor, /* key destructor */
1213 dictRedisObjectDestructor /* val destructor */
1214};
1215
4409877e 1216/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1217 * lists as values. It's used for blocking operations (BLPOP) and to
1218 * map swapped keys to a list of clients waiting for this keys to be loaded. */
4409877e 1219static dictType keylistDictType = {
1220 dictObjHash, /* hash function */
1221 NULL, /* key dup */
1222 NULL, /* val dup */
1223 dictObjKeyCompare, /* key compare */
1224 dictRedisObjectDestructor, /* key destructor */
1225 dictListDestructor /* val destructor */
1226};
1227
/* Forward declaration. (void) makes it a real prototype: with an empty
 * parameter list the compiler would not check the arguments of calls. */
static void version(void);
1229
ed9b544e 1230/* ========================= Random utility functions ======================= */
1231
1232/* Redis generally does not try to recover from out of memory conditions
1233 * when allocating objects or strings, it is not clear if it will be possible
1234 * to report this condition to the client since the networking layer itself
1235 * is based on heap allocation for send buffers, so we simply abort.
1236 * At least the code will be simpler to read... */
1237static void oom(const char *msg) {
71c54b21 1238 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1239 sleep(1);
1240 abort();
1241}
1242
1243/* ====================== Redis server networking stuff ===================== */
56906eef 1244static void closeTimedoutClients(void) {
ed9b544e 1245 redisClient *c;
ed9b544e 1246 listNode *ln;
1247 time_t now = time(NULL);
c7df85a4 1248 listIter li;
ed9b544e 1249
c7df85a4 1250 listRewind(server.clients,&li);
1251 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1252 c = listNodeValue(ln);
f86a74e9 1253 if (server.maxidletime &&
1254 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1255 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1256 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1257 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1258 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1259 {
f870935d 1260 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1261 freeClient(c);
f86a74e9 1262 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1263 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1264 addReply(c,shared.nullmultibulk);
b0d8747d 1265 unblockClientWaitingData(c);
f86a74e9 1266 }
ed9b544e 1267 }
1268 }
ed9b544e 1269}
1270
12fea928 1271static int htNeedsResize(dict *dict) {
1272 long long size, used;
1273
1274 size = dictSlots(dict);
1275 used = dictSize(dict);
1276 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1277 (used*100/size < REDIS_HT_MINFILL));
1278}
1279
0bc03378 1280/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1281 * we resize the hash table to save memory */
56906eef 1282static void tryResizeHashTables(void) {
0bc03378 1283 int j;
1284
1285 for (j = 0; j < server.dbnum; j++) {
5413c40d 1286 if (htNeedsResize(server.db[j].dict))
0bc03378 1287 dictResize(server.db[j].dict);
12fea928 1288 if (htNeedsResize(server.db[j].expires))
1289 dictResize(server.db[j].expires);
0bc03378 1290 }
1291}
1292
8ca3e9d1 1293/* Our hash table implementation performs rehashing incrementally while
1294 * we write/read from the hash table. Still if the server is idle, the hash
1295 * table will use two tables for a long time. So we try to use 1 millisecond
1296 * of CPU time at every serverCron() loop in order to rehash some key. */
1297static void incrementallyRehash(void) {
1298 int j;
1299
1300 for (j = 0; j < server.dbnum; j++) {
1301 if (dictIsRehashing(server.db[j].dict)) {
1302 dictRehashMilliseconds(server.db[j].dict,1);
1303 break; /* already used our millisecond for this loop... */
1304 }
1305 }
1306}
1307
9d65a1bb 1308/* A background saving child (BGSAVE) terminated its work. Handle this. */
1309void backgroundSaveDoneHandler(int statloc) {
1310 int exitcode = WEXITSTATUS(statloc);
1311 int bysignal = WIFSIGNALED(statloc);
1312
1313 if (!bysignal && exitcode == 0) {
1314 redisLog(REDIS_NOTICE,
1315 "Background saving terminated with success");
1316 server.dirty = 0;
1317 server.lastsave = time(NULL);
1318 } else if (!bysignal && exitcode != 0) {
1319 redisLog(REDIS_WARNING, "Background saving error");
1320 } else {
1321 redisLog(REDIS_WARNING,
454eea7c 1322 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1323 rdbRemoveTempFile(server.bgsavechildpid);
1324 }
1325 server.bgsavechildpid = -1;
1326 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1327 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1328 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1329}
1330
1331/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1332 * Handle this. */
1333void backgroundRewriteDoneHandler(int statloc) {
1334 int exitcode = WEXITSTATUS(statloc);
1335 int bysignal = WIFSIGNALED(statloc);
1336
1337 if (!bysignal && exitcode == 0) {
1338 int fd;
1339 char tmpfile[256];
1340
1341 redisLog(REDIS_NOTICE,
1342 "Background append only file rewriting terminated with success");
1343 /* Now it's time to flush the differences accumulated by the parent */
1344 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1345 fd = open(tmpfile,O_WRONLY|O_APPEND);
1346 if (fd == -1) {
1347 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1348 goto cleanup;
1349 }
1350 /* Flush our data... */
1351 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1352 (signed) sdslen(server.bgrewritebuf)) {
1353 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1354 close(fd);
1355 goto cleanup;
1356 }
b32627cd 1357 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1358 /* Now our work is to rename the temp file into the stable file. And
1359 * switch the file descriptor used by the server for append only. */
1360 if (rename(tmpfile,server.appendfilename) == -1) {
1361 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1362 close(fd);
1363 goto cleanup;
1364 }
1365 /* Mission completed... almost */
1366 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1367 if (server.appendfd != -1) {
1368 /* If append only is actually enabled... */
1369 close(server.appendfd);
1370 server.appendfd = fd;
1371 fsync(fd);
85a83172 1372 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1373 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1374 } else {
1375 /* If append only is disabled we just generate a dump in this
1376 * format. Why not? */
1377 close(fd);
1378 }
1379 } else if (!bysignal && exitcode != 0) {
1380 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1381 } else {
1382 redisLog(REDIS_WARNING,
454eea7c 1383 "Background append only file rewriting terminated by signal %d",
1384 WTERMSIG(statloc));
9d65a1bb 1385 }
1386cleanup:
1387 sdsfree(server.bgrewritebuf);
1388 server.bgrewritebuf = sdsempty();
1389 aofRemoveTempFile(server.bgrewritechildpid);
1390 server.bgrewritechildpid = -1;
1391}
1392
884d4b39 1393/* This function is called once a background process of some kind terminates,
1394 * as we want to avoid resizing the hash tables when there is a child in order
1395 * to play well with copy-on-write (otherwise when a resize happens lots of
1396 * memory pages are copied). The goal of this function is to update the ability
1397 * for dict.c to resize the hash tables accordingly to the fact we have o not
1398 * running childs. */
1399static void updateDictResizePolicy(void) {
1400 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1401 dictEnableResize();
1402 else
1403 dictDisableResize();
1404}
1405
56906eef 1406static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1407 int j, loops = server.cronloops++;
ed9b544e 1408 REDIS_NOTUSED(eventLoop);
1409 REDIS_NOTUSED(id);
1410 REDIS_NOTUSED(clientData);
1411
3a66edc7 1412 /* We take a cached value of the unix time in the global state because
1413 * with virtual memory and aging there is to store the current time
1414 * in objects at every object access, and accuracy is not needed.
1415 * To access a global var is faster than calling time(NULL) */
1416 server.unixtime = time(NULL);
1417
0bc03378 1418 /* Show some info about non-empty databases */
ed9b544e 1419 for (j = 0; j < server.dbnum; j++) {
dec423d9 1420 long long size, used, vkeys;
94754ccc 1421
3305306f 1422 size = dictSlots(server.db[j].dict);
1423 used = dictSize(server.db[j].dict);
94754ccc 1424 vkeys = dictSize(server.db[j].expires);
1763929f 1425 if (!(loops % 50) && (used || vkeys)) {
f870935d 1426 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1427 /* dictPrintStats(server.dict); */
ed9b544e 1428 }
ed9b544e 1429 }
1430
0bc03378 1431 /* We don't want to resize the hash tables while a bacground saving
1432 * is in progress: the saving child is created using fork() that is
1433 * implemented with a copy-on-write semantic in most modern systems, so
1434 * if we resize the HT while there is the saving child at work actually
1435 * a lot of memory movements in the parent will cause a lot of pages
1436 * copied. */
8ca3e9d1 1437 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1438 if (!(loops % 10)) tryResizeHashTables();
1439 if (server.activerehashing) incrementallyRehash();
884d4b39 1440 }
0bc03378 1441
ed9b544e 1442 /* Show information about connected clients */
1763929f 1443 if (!(loops % 50)) {
bdcb92f2 1444 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1445 listLength(server.clients)-listLength(server.slaves),
1446 listLength(server.slaves),
bdcb92f2 1447 zmalloc_used_memory());
ed9b544e 1448 }
1449
1450 /* Close connections of timedout clients */
1763929f 1451 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1452 closeTimedoutClients();
1453
9d65a1bb 1454 /* Check if a background saving or AOF rewrite in progress terminated */
1455 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1456 int statloc;
9d65a1bb 1457 pid_t pid;
1458
1459 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1460 if (pid == server.bgsavechildpid) {
1461 backgroundSaveDoneHandler(statloc);
ed9b544e 1462 } else {
9d65a1bb 1463 backgroundRewriteDoneHandler(statloc);
ed9b544e 1464 }
884d4b39 1465 updateDictResizePolicy();
ed9b544e 1466 }
1467 } else {
1468 /* If there is not a background saving in progress check if
1469 * we have to save now */
1470 time_t now = time(NULL);
1471 for (j = 0; j < server.saveparamslen; j++) {
1472 struct saveparam *sp = server.saveparams+j;
1473
1474 if (server.dirty >= sp->changes &&
1475 now-server.lastsave > sp->seconds) {
1476 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1477 sp->changes, sp->seconds);
f78fd11b 1478 rdbSaveBackground(server.dbfilename);
ed9b544e 1479 break;
1480 }
1481 }
1482 }
94754ccc 1483
f2324293 1484 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1485 * will use few CPU cycles if there are few expiring keys, otherwise
1486 * it will get more aggressive to avoid that too much memory is used by
1487 * keys that can be removed from the keyspace. */
94754ccc 1488 for (j = 0; j < server.dbnum; j++) {
f2324293 1489 int expired;
94754ccc 1490 redisDb *db = server.db+j;
94754ccc 1491
f2324293 1492 /* Continue to expire if at the end of the cycle more than 25%
1493 * of the keys were expired. */
1494 do {
4ef8de8a 1495 long num = dictSize(db->expires);
94754ccc 1496 time_t now = time(NULL);
1497
f2324293 1498 expired = 0;
94754ccc 1499 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1500 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1501 while (num--) {
1502 dictEntry *de;
1503 time_t t;
1504
1505 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1506 t = (time_t) dictGetEntryVal(de);
1507 if (now > t) {
1508 deleteKey(db,dictGetEntryKey(de));
f2324293 1509 expired++;
2a6a2ed1 1510 server.stat_expiredkeys++;
94754ccc 1511 }
1512 }
f2324293 1513 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1514 }
1515
4ef8de8a 1516 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1517 * is enbled. Try to free objects from the free list first. */
7e69548d 1518 if (vmCanSwapOut()) {
1519 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1520 server.vm_max_memory)
1521 {
72e9fd40 1522 int retval;
1523
a5819310 1524 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1525 retval = (server.vm_max_threads == 0) ?
1526 vmSwapOneObjectBlocking() :
1527 vmSwapOneObjectThreaded();
1763929f 1528 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1529 zmalloc_used_memory() >
1530 (server.vm_max_memory+server.vm_max_memory/10))
1531 {
1532 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1533 }
72e9fd40 1534 /* Note that when using threaded I/O we free just one object,
1535 * because anyway when the I/O thread in charge to swap this
1536 * object out will finish, the handler of completed jobs
1537 * will try to swap more objects if we are still out of memory. */
1538 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1539 }
1540 }
1541
ed9b544e 1542 /* Check if we should connect to a MASTER */
1763929f 1543 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1544 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1545 if (syncWithMaster() == REDIS_OK) {
1546 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
8f63ddca 1547 if (server.appendonly) rewriteAppendOnlyFileBackground();
ed9b544e 1548 }
1549 }
1763929f 1550 return 100;
ed9b544e 1551}
1552
d5d55fc3 1553/* This function gets called every time Redis is entering the
1554 * main loop of the event driven library, that is, before to sleep
1555 * for ready file descriptors. */
1556static void beforeSleep(struct aeEventLoop *eventLoop) {
1557 REDIS_NOTUSED(eventLoop);
1558
28ed1f33 1559 /* Awake clients that got all the swapped keys they requested */
d5d55fc3 1560 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1561 listIter li;
1562 listNode *ln;
1563
1564 listRewind(server.io_ready_clients,&li);
1565 while((ln = listNext(&li))) {
1566 redisClient *c = ln->value;
1567 struct redisCommand *cmd;
1568
1569 /* Resume the client. */
1570 listDelNode(server.io_ready_clients,ln);
1571 c->flags &= (~REDIS_IO_WAIT);
1572 server.vm_blocked_clients--;
1573 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1574 readQueryFromClient, c);
1575 cmd = lookupCommand(c->argv[0]->ptr);
1576 assert(cmd != NULL);
1577 call(c,cmd);
1578 resetClient(c);
1579 /* There may be more data to process in the input buffer. */
1580 if (c->querybuf && sdslen(c->querybuf) > 0)
1581 processInputBuffer(c);
1582 }
1583 }
28ed1f33 1584 /* Write the AOF buffer on disk */
1585 flushAppendOnlyFile();
d5d55fc3 1586}
1587
ed9b544e 1588static void createSharedObjects(void) {
05df7621 1589 int j;
1590
ed9b544e 1591 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1592 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1593 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1594 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1595 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1596 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1597 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1598 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1599 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1600 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1601 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1602 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1603 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1604 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1605 "-ERR no such key\r\n"));
ed9b544e 1606 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1607 "-ERR syntax error\r\n"));
c937aa89 1608 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1609 "-ERR source and destination objects are the same\r\n"));
1610 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1611 "-ERR index out of range\r\n"));
ed9b544e 1612 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1613 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1614 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1615 shared.select0 = createStringObject("select 0\r\n",10);
1616 shared.select1 = createStringObject("select 1\r\n",10);
1617 shared.select2 = createStringObject("select 2\r\n",10);
1618 shared.select3 = createStringObject("select 3\r\n",10);
1619 shared.select4 = createStringObject("select 4\r\n",10);
1620 shared.select5 = createStringObject("select 5\r\n",10);
1621 shared.select6 = createStringObject("select 6\r\n",10);
1622 shared.select7 = createStringObject("select 7\r\n",10);
1623 shared.select8 = createStringObject("select 8\r\n",10);
1624 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1625 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
c8d0ea0e 1626 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
befec3cd 1627 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1628 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1629 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1630 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1631 shared.mbulk3 = createStringObject("*3\r\n",4);
c8d0ea0e 1632 shared.mbulk4 = createStringObject("*4\r\n",4);
05df7621 1633 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1634 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1635 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1636 }
ed9b544e 1637}
1638
1639static void appendServerSaveParams(time_t seconds, int changes) {
1640 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1641 server.saveparams[server.saveparamslen].seconds = seconds;
1642 server.saveparams[server.saveparamslen].changes = changes;
1643 server.saveparamslen++;
1644}
1645
bcfc686d 1646static void resetServerSaveParams() {
ed9b544e 1647 zfree(server.saveparams);
1648 server.saveparams = NULL;
1649 server.saveparamslen = 0;
1650}
1651
1652static void initServerConfig() {
1653 server.dbnum = REDIS_DEFAULT_DBNUM;
1654 server.port = REDIS_SERVERPORT;
f870935d 1655 server.verbosity = REDIS_VERBOSE;
ed9b544e 1656 server.maxidletime = REDIS_MAXIDLETIME;
1657 server.saveparams = NULL;
1658 server.logfile = NULL; /* NULL = log on standard output */
1659 server.bindaddr = NULL;
1660 server.glueoutputbuf = 1;
1661 server.daemonize = 0;
44b38ef4 1662 server.appendonly = 0;
1b677732 1663 server.appendfsync = APPENDFSYNC_EVERYSEC;
48f0308a 1664 server.lastfsync = time(NULL);
44b38ef4 1665 server.appendfd = -1;
1666 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1667 server.pidfile = zstrdup("/var/run/redis.pid");
1668 server.dbfilename = zstrdup("dump.rdb");
1669 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1670 server.requirepass = NULL;
b0553789 1671 server.rdbcompression = 1;
8ca3e9d1 1672 server.activerehashing = 1;
285add55 1673 server.maxclients = 0;
d5d55fc3 1674 server.blpop_blocked_clients = 0;
3fd78bcd 1675 server.maxmemory = 0;
75680a3c 1676 server.vm_enabled = 0;
054e426d 1677 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1678 server.vm_page_size = 256; /* 256 bytes per page */
1679 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1680 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1681 server.vm_max_threads = 4;
d5d55fc3 1682 server.vm_blocked_clients = 0;
cbba7dd7 1683 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1684 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
75680a3c 1685
bcfc686d 1686 resetServerSaveParams();
ed9b544e 1687
1688 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1689 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1690 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1691 /* Replication related */
1692 server.isslave = 0;
d0ccebcf 1693 server.masterauth = NULL;
ed9b544e 1694 server.masterhost = NULL;
1695 server.masterport = 6379;
1696 server.master = NULL;
1697 server.replstate = REDIS_REPL_NONE;
a7866db6 1698
1699 /* Double constants initialization */
1700 R_Zero = 0.0;
1701 R_PosInf = 1.0/R_Zero;
1702 R_NegInf = -1.0/R_Zero;
1703 R_Nan = R_Zero/R_Zero;
ed9b544e 1704}
1705
1706static void initServer() {
1707 int j;
1708
1709 signal(SIGHUP, SIG_IGN);
1710 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1711 setupSigSegvAction();
ed9b544e 1712
b9bc0eef 1713 server.devnull = fopen("/dev/null","w");
1714 if (server.devnull == NULL) {
1715 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1716 exit(1);
1717 }
ed9b544e 1718 server.clients = listCreate();
1719 server.slaves = listCreate();
87eca727 1720 server.monitors = listCreate();
ed9b544e 1721 server.objfreelist = listCreate();
1722 createSharedObjects();
1723 server.el = aeCreateEventLoop();
3305306f 1724 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1725 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1726 if (server.fd == -1) {
1727 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1728 exit(1);
1729 }
3305306f 1730 for (j = 0; j < server.dbnum; j++) {
5234952b 1731 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1732 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
4409877e 1733 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1734 if (server.vm_enabled)
1735 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1736 server.db[j].id = j;
1737 }
ffc6b7f8 1738 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1739 server.pubsub_patterns = listCreate();
1740 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1741 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1742 server.cronloops = 0;
9f3c422c 1743 server.bgsavechildpid = -1;
9d65a1bb 1744 server.bgrewritechildpid = -1;
1745 server.bgrewritebuf = sdsempty();
28ed1f33 1746 server.aofbuf = sdsempty();
ed9b544e 1747 server.lastsave = time(NULL);
1748 server.dirty = 0;
ed9b544e 1749 server.stat_numcommands = 0;
1750 server.stat_numconnections = 0;
2a6a2ed1 1751 server.stat_expiredkeys = 0;
ed9b544e 1752 server.stat_starttime = time(NULL);
3a66edc7 1753 server.unixtime = time(NULL);
d8f8b666 1754 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1755 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1756 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1757
1758 if (server.appendonly) {
3bb225d6 1759 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1760 if (server.appendfd == -1) {
1761 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1762 strerror(errno));
1763 exit(1);
1764 }
1765 }
75680a3c 1766
1767 if (server.vm_enabled) vmInit();
ed9b544e 1768}
1769
1770/* Empty the whole database */
ca37e9cd 1771static long long emptyDb() {
ed9b544e 1772 int j;
ca37e9cd 1773 long long removed = 0;
ed9b544e 1774
3305306f 1775 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1776 removed += dictSize(server.db[j].dict);
3305306f 1777 dictEmpty(server.db[j].dict);
1778 dictEmpty(server.db[j].expires);
1779 }
ca37e9cd 1780 return removed;
ed9b544e 1781}
1782
/* Convert a "yes"/"no" string (compared case-insensitively) into 1/0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1788
ed9b544e 1789/* I agree, this is a very rudimental way to load a configuration...
1790 will improve later if the config gets more complex */
1791static void loadServerConfig(char *filename) {
c9a111ac 1792 FILE *fp;
ed9b544e 1793 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1794 int linenum = 0;
1795 sds line = NULL;
c9a111ac 1796
1797 if (filename[0] == '-' && filename[1] == '\0')
1798 fp = stdin;
1799 else {
1800 if ((fp = fopen(filename,"r")) == NULL) {
9a22de82 1801 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
c9a111ac 1802 exit(1);
1803 }
ed9b544e 1804 }
c9a111ac 1805
ed9b544e 1806 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1807 sds *argv;
1808 int argc, j;
1809
1810 linenum++;
1811 line = sdsnew(buf);
1812 line = sdstrim(line," \t\r\n");
1813
1814 /* Skip comments and blank lines*/
1815 if (line[0] == '#' || line[0] == '\0') {
1816 sdsfree(line);
1817 continue;
1818 }
1819
1820 /* Split into arguments */
1821 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1822 sdstolower(argv[0]);
1823
1824 /* Execute config directives */
bb0b03a3 1825 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1826 server.maxidletime = atoi(argv[1]);
0150db36 1827 if (server.maxidletime < 0) {
ed9b544e 1828 err = "Invalid timeout value"; goto loaderr;
1829 }
bb0b03a3 1830 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1831 server.port = atoi(argv[1]);
1832 if (server.port < 1 || server.port > 65535) {
1833 err = "Invalid port"; goto loaderr;
1834 }
bb0b03a3 1835 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1836 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1837 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1838 int seconds = atoi(argv[1]);
1839 int changes = atoi(argv[2]);
1840 if (seconds < 1 || changes < 0) {
1841 err = "Invalid save parameters"; goto loaderr;
1842 }
1843 appendServerSaveParams(seconds,changes);
bb0b03a3 1844 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1845 if (chdir(argv[1]) == -1) {
1846 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1847 argv[1], strerror(errno));
1848 exit(1);
1849 }
bb0b03a3 1850 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1851 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1852 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1853 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1854 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1855 else {
1856 err = "Invalid log level. Must be one of debug, notice, warning";
1857 goto loaderr;
1858 }
bb0b03a3 1859 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1860 FILE *logfp;
ed9b544e 1861
1862 server.logfile = zstrdup(argv[1]);
bb0b03a3 1863 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1864 zfree(server.logfile);
1865 server.logfile = NULL;
1866 }
1867 if (server.logfile) {
1868 /* Test if we are able to open the file. The server will not
1869 * be able to abort just for this problem later... */
c9a111ac 1870 logfp = fopen(server.logfile,"a");
1871 if (logfp == NULL) {
ed9b544e 1872 err = sdscatprintf(sdsempty(),
1873 "Can't open the log file: %s", strerror(errno));
1874 goto loaderr;
1875 }
c9a111ac 1876 fclose(logfp);
ed9b544e 1877 }
bb0b03a3 1878 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1879 server.dbnum = atoi(argv[1]);
1880 if (server.dbnum < 1) {
1881 err = "Invalid number of databases"; goto loaderr;
1882 }
b3f83f12
JZ
1883 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1884 loadServerConfig(argv[1]);
285add55 1885 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1886 server.maxclients = atoi(argv[1]);
3fd78bcd 1887 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
2b619329 1888 server.maxmemory = memtoll(argv[1],NULL);
bb0b03a3 1889 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1890 server.masterhost = sdsnew(argv[1]);
1891 server.masterport = atoi(argv[2]);
1892 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1893 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1894 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1895 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1896 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1897 err = "argument must be 'yes' or 'no'"; goto loaderr;
1898 }
121f70cf 1899 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1900 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
8ca3e9d1 1901 err = "argument must be 'yes' or 'no'"; goto loaderr;
1902 }
1903 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1904 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
121f70cf 1905 err = "argument must be 'yes' or 'no'"; goto loaderr;
1906 }
bb0b03a3 1907 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1908 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1909 err = "argument must be 'yes' or 'no'"; goto loaderr;
1910 }
44b38ef4 1911 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1912 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1913 err = "argument must be 'yes' or 'no'"; goto loaderr;
1914 }
48f0308a 1915 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1916 if (!strcasecmp(argv[1],"no")) {
48f0308a 1917 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1918 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1919 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1920 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1921 server.appendfsync = APPENDFSYNC_EVERYSEC;
1922 } else {
1923 err = "argument must be 'no', 'always' or 'everysec'";
1924 goto loaderr;
1925 }
bb0b03a3 1926 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1927 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1928 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 1929 zfree(server.pidfile);
054e426d 1930 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1931 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 1932 zfree(server.dbfilename);
054e426d 1933 server.dbfilename = zstrdup(argv[1]);
75680a3c 1934 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1935 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1936 err = "argument must be 'yes' or 'no'"; goto loaderr;
1937 }
054e426d 1938 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1939 zfree(server.vm_swap_file);
054e426d 1940 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1941 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2b619329 1942 server.vm_max_memory = memtoll(argv[1],NULL);
4ef8de8a 1943 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2b619329 1944 server.vm_page_size = memtoll(argv[1], NULL);
4ef8de8a 1945 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2b619329 1946 server.vm_pages = memtoll(argv[1], NULL);
92f8e882 1947 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1948 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 1949 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2b619329 1950 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
cbba7dd7 1951 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2b619329 1952 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
ed9b544e 1953 } else {
1954 err = "Bad directive or wrong number of arguments"; goto loaderr;
1955 }
1956 for (j = 0; j < argc; j++)
1957 sdsfree(argv[j]);
1958 zfree(argv);
1959 sdsfree(line);
1960 }
c9a111ac 1961 if (fp != stdin) fclose(fp);
ed9b544e 1962 return;
1963
1964loaderr:
1965 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1966 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1967 fprintf(stderr, ">>> '%s'\n", line);
1968 fprintf(stderr, "%s\n", err);
1969 exit(1);
1970}
1971
1972static void freeClientArgv(redisClient *c) {
1973 int j;
1974
1975 for (j = 0; j < c->argc; j++)
1976 decrRefCount(c->argv[j]);
e8a74421 1977 for (j = 0; j < c->mbargc; j++)
1978 decrRefCount(c->mbargv[j]);
ed9b544e 1979 c->argc = 0;
e8a74421 1980 c->mbargc = 0;
ed9b544e 1981}
1982
1983static void freeClient(redisClient *c) {
1984 listNode *ln;
1985
4409877e 1986 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 1987 * call, we have to set querybuf to NULL *before* to call
1988 * unblockClientWaitingData() to avoid processInputBuffer() will get
1989 * called. Also it is important to remove the file events after
1990 * this, because this call adds the READABLE event. */
4409877e 1991 sdsfree(c->querybuf);
1992 c->querybuf = NULL;
1993 if (c->flags & REDIS_BLOCKED)
b0d8747d 1994 unblockClientWaitingData(c);
4409877e 1995
ffc6b7f8 1996 /* Unsubscribe from all the pubsub channels */
1997 pubsubUnsubscribeAllChannels(c,0);
1998 pubsubUnsubscribeAllPatterns(c,0);
1999 dictRelease(c->pubsub_channels);
2000 listRelease(c->pubsub_patterns);
befec3cd 2001 /* Obvious cleanup */
ed9b544e 2002 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2003 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 2004 listRelease(c->reply);
2005 freeClientArgv(c);
2006 close(c->fd);
92f8e882 2007 /* Remove from the list of clients */
ed9b544e 2008 ln = listSearchKey(server.clients,c);
dfc5e96c 2009 redisAssert(ln != NULL);
ed9b544e 2010 listDelNode(server.clients,ln);
d5d55fc3 2011 /* Remove from the list of clients waiting for swapped keys */
2012 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2013 ln = listSearchKey(server.io_ready_clients,c);
2014 if (ln) {
2015 listDelNode(server.io_ready_clients,ln);
2016 server.vm_blocked_clients--;
2017 }
2018 }
2019 while (server.vm_enabled && listLength(c->io_keys)) {
2020 ln = listFirst(c->io_keys);
2021 dontWaitForSwappedKey(c,ln->value);
92f8e882 2022 }
b3e3d0d7 2023 listRelease(c->io_keys);
befec3cd 2024 /* Master/slave cleanup */
ed9b544e 2025 if (c->flags & REDIS_SLAVE) {
6208b3a7 2026 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2027 close(c->repldbfd);
87eca727 2028 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2029 ln = listSearchKey(l,c);
dfc5e96c 2030 redisAssert(ln != NULL);
87eca727 2031 listDelNode(l,ln);
ed9b544e 2032 }
2033 if (c->flags & REDIS_MASTER) {
2034 server.master = NULL;
2035 server.replstate = REDIS_REPL_CONNECT;
2036 }
befec3cd 2037 /* Release memory */
93ea3759 2038 zfree(c->argv);
e8a74421 2039 zfree(c->mbargv);
6e469882 2040 freeClientMultiState(c);
ed9b544e 2041 zfree(c);
2042}
2043
cc30e368 2044#define GLUEREPLY_UP_TO (1024)
ed9b544e 2045static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 2046 int copylen = 0;
2047 char buf[GLUEREPLY_UP_TO];
6208b3a7 2048 listNode *ln;
c7df85a4 2049 listIter li;
ed9b544e 2050 robj *o;
2051
c7df85a4 2052 listRewind(c->reply,&li);
2053 while((ln = listNext(&li))) {
c28b42ac 2054 int objlen;
2055
ed9b544e 2056 o = ln->value;
c28b42ac 2057 objlen = sdslen(o->ptr);
2058 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2059 memcpy(buf+copylen,o->ptr,objlen);
2060 copylen += objlen;
ed9b544e 2061 listDelNode(c->reply,ln);
c28b42ac 2062 } else {
2063 if (copylen == 0) return;
2064 break;
ed9b544e 2065 }
ed9b544e 2066 }
c28b42ac 2067 /* Now the output buffer is empty, add the new single element */
2068 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2069 listAddNodeHead(c->reply,o);
ed9b544e 2070}
2071
2072static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2073 redisClient *c = privdata;
2074 int nwritten = 0, totwritten = 0, objlen;
2075 robj *o;
2076 REDIS_NOTUSED(el);
2077 REDIS_NOTUSED(mask);
2078
2895e862 2079 /* Use writev() if we have enough buffers to send */
7ea870c0 2080 if (!server.glueoutputbuf &&
e0a62c7f 2081 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 2082 !(c->flags & REDIS_MASTER))
2895e862 2083 {
2084 sendReplyToClientWritev(el, fd, privdata, mask);
2085 return;
2086 }
2895e862 2087
ed9b544e 2088 while(listLength(c->reply)) {
c28b42ac 2089 if (server.glueoutputbuf && listLength(c->reply) > 1)
2090 glueReplyBuffersIfNeeded(c);
2091
ed9b544e 2092 o = listNodeValue(listFirst(c->reply));
2093 objlen = sdslen(o->ptr);
2094
2095 if (objlen == 0) {
2096 listDelNode(c->reply,listFirst(c->reply));
2097 continue;
2098 }
2099
2100 if (c->flags & REDIS_MASTER) {
6f376729 2101 /* Don't reply to a master */
ed9b544e 2102 nwritten = objlen - c->sentlen;
2103 } else {
a4d1ba9a 2104 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2105 if (nwritten <= 0) break;
2106 }
2107 c->sentlen += nwritten;
2108 totwritten += nwritten;
2109 /* If we fully sent the object on head go to the next one */
2110 if (c->sentlen == objlen) {
2111 listDelNode(c->reply,listFirst(c->reply));
2112 c->sentlen = 0;
2113 }
6f376729 2114 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2115 * bytes, in a single threaded server it's a good idea to serve
6f376729 2116 * other clients as well, even if a very large request comes from
2117 * super fast link that is always able to accept data (in real world
12f9d551 2118 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2119 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2120 }
2121 if (nwritten == -1) {
2122 if (errno == EAGAIN) {
2123 nwritten = 0;
2124 } else {
f870935d 2125 redisLog(REDIS_VERBOSE,
ed9b544e 2126 "Error writing to client: %s", strerror(errno));
2127 freeClient(c);
2128 return;
2129 }
2130 }
2131 if (totwritten > 0) c->lastinteraction = time(NULL);
2132 if (listLength(c->reply) == 0) {
2133 c->sentlen = 0;
2134 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2135 }
2136}
2137
2895e862 2138static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2139{
2140 redisClient *c = privdata;
2141 int nwritten = 0, totwritten = 0, objlen, willwrite;
2142 robj *o;
2143 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2144 int offset, ion = 0;
2145 REDIS_NOTUSED(el);
2146 REDIS_NOTUSED(mask);
2147
2148 listNode *node;
2149 while (listLength(c->reply)) {
2150 offset = c->sentlen;
2151 ion = 0;
2152 willwrite = 0;
2153
2154 /* fill-in the iov[] array */
2155 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2156 o = listNodeValue(node);
2157 objlen = sdslen(o->ptr);
2158
e0a62c7f 2159 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2160 break;
2161
2162 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2163 break; /* no more iovecs */
2164
2165 iov[ion].iov_base = ((char*)o->ptr) + offset;
2166 iov[ion].iov_len = objlen - offset;
2167 willwrite += objlen - offset;
2168 offset = 0; /* just for the first item */
2169 ion++;
2170 }
2171
2172 if(willwrite == 0)
2173 break;
2174
2175 /* write all collected blocks at once */
2176 if((nwritten = writev(fd, iov, ion)) < 0) {
2177 if (errno != EAGAIN) {
f870935d 2178 redisLog(REDIS_VERBOSE,
2895e862 2179 "Error writing to client: %s", strerror(errno));
2180 freeClient(c);
2181 return;
2182 }
2183 break;
2184 }
2185
2186 totwritten += nwritten;
2187 offset = c->sentlen;
2188
2189 /* remove written robjs from c->reply */
2190 while (nwritten && listLength(c->reply)) {
2191 o = listNodeValue(listFirst(c->reply));
2192 objlen = sdslen(o->ptr);
2193
2194 if(nwritten >= objlen - offset) {
2195 listDelNode(c->reply, listFirst(c->reply));
2196 nwritten -= objlen - offset;
2197 c->sentlen = 0;
2198 } else {
2199 /* partial write */
2200 c->sentlen += nwritten;
2201 break;
2202 }
2203 offset = 0;
2204 }
2205 }
2206
e0a62c7f 2207 if (totwritten > 0)
2895e862 2208 c->lastinteraction = time(NULL);
2209
2210 if (listLength(c->reply) == 0) {
2211 c->sentlen = 0;
2212 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2213 }
2214}
2215
ed9b544e 2216static struct redisCommand *lookupCommand(char *name) {
2217 int j = 0;
2218 while(cmdTable[j].name != NULL) {
bb0b03a3 2219 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 2220 j++;
2221 }
2222 return NULL;
2223}
2224
2225/* resetClient prepare the client to process the next command */
2226static void resetClient(redisClient *c) {
2227 freeClientArgv(c);
2228 c->bulklen = -1;
e8a74421 2229 c->multibulk = 0;
ed9b544e 2230}
2231
6e469882 2232/* Call() is the core of Redis execution of a command */
2233static void call(redisClient *c, struct redisCommand *cmd) {
2234 long long dirty;
2235
2236 dirty = server.dirty;
2237 cmd->proc(c);
4005fef1 2238 dirty = server.dirty-dirty;
2239
2240 if (server.appendonly && dirty)
6e469882 2241 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2242 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2243 listLength(server.slaves))
248ea310 2244 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2245 if (listLength(server.monitors))
dd142b9c 2246 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2247 server.stat_numcommands++;
2248}
2249
ed9b544e 2250/* If this function gets called we already read a whole
2251 * command, arguments are in the client argv/argc fields.
2252 * processCommand() execute the command or prepare the
2253 * server for a bulk read from the client.
2254 *
2255 * If 1 is returned the client is still alive and valid and
2256 * other operations can be performed by the caller. Otherwise
2257 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2258static int processCommand(redisClient *c) {
2259 struct redisCommand *cmd;
ed9b544e 2260
3fd78bcd 2261 /* Free some memory if needed (maxmemory setting) */
2262 if (server.maxmemory) freeMemoryIfNeeded();
2263
e8a74421 2264 /* Handle the multi bulk command type. This is an alternative protocol
2265 * supported by Redis in order to receive commands that are composed of
2266 * multiple binary-safe "bulk" arguments. The latency of processing is
2267 * a bit higher but this allows things like multi-sets, so if this
2268 * protocol is used only for MSET and similar commands this is a big win. */
2269 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2270 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2271 if (c->multibulk <= 0) {
2272 resetClient(c);
2273 return 1;
2274 } else {
2275 decrRefCount(c->argv[c->argc-1]);
2276 c->argc--;
2277 return 1;
2278 }
2279 } else if (c->multibulk) {
2280 if (c->bulklen == -1) {
2281 if (((char*)c->argv[0]->ptr)[0] != '$') {
2282 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2283 resetClient(c);
2284 return 1;
2285 } else {
2286 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2287 decrRefCount(c->argv[0]);
2288 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2289 c->argc--;
2290 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2291 resetClient(c);
2292 return 1;
2293 }
2294 c->argc--;
2295 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2296 return 1;
2297 }
2298 } else {
2299 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2300 c->mbargv[c->mbargc] = c->argv[0];
2301 c->mbargc++;
2302 c->argc--;
2303 c->multibulk--;
2304 if (c->multibulk == 0) {
2305 robj **auxargv;
2306 int auxargc;
2307
2308 /* Here we need to swap the multi-bulk argc/argv with the
2309 * normal argc/argv of the client structure. */
2310 auxargv = c->argv;
2311 c->argv = c->mbargv;
2312 c->mbargv = auxargv;
2313
2314 auxargc = c->argc;
2315 c->argc = c->mbargc;
2316 c->mbargc = auxargc;
2317
2318 /* We need to set bulklen to something different than -1
2319 * in order for the code below to process the command without
2320 * to try to read the last argument of a bulk command as
2321 * a special argument. */
2322 c->bulklen = 0;
2323 /* continue below and process the command */
2324 } else {
2325 c->bulklen = -1;
2326 return 1;
2327 }
2328 }
2329 }
2330 /* -- end of multi bulk commands processing -- */
2331
ed9b544e 2332 /* The QUIT command is handled as a special case. Normal command
2333 * procs are unable to close the client connection safely */
bb0b03a3 2334 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2335 freeClient(c);
2336 return 0;
2337 }
d5d55fc3 2338
2339 /* Now lookup the command and check ASAP about trivial error conditions
2340 * such wrong arity, bad command name and so forth. */
ed9b544e 2341 cmd = lookupCommand(c->argv[0]->ptr);
2342 if (!cmd) {
2c14807b 2343 addReplySds(c,
2344 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2345 (char*)c->argv[0]->ptr));
ed9b544e 2346 resetClient(c);
2347 return 1;
2348 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2349 (c->argc < -cmd->arity)) {
454d4e43 2350 addReplySds(c,
2351 sdscatprintf(sdsempty(),
2352 "-ERR wrong number of arguments for '%s' command\r\n",
2353 cmd->name));
ed9b544e 2354 resetClient(c);
2355 return 1;
2356 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2357 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2358 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2359
2360 decrRefCount(c->argv[c->argc-1]);
2361 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2362 c->argc--;
2363 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2364 resetClient(c);
2365 return 1;
2366 }
2367 c->argc--;
2368 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2369 /* It is possible that the bulk read is already in the
8d0490e7 2370 * buffer. Check this condition and handle it accordingly.
2371 * This is just a fast path, alternative to call processInputBuffer().
2372 * It's a good idea since the code is small and this condition
2373 * happens most of the times. */
ed9b544e 2374 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2375 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2376 c->argc++;
2377 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2378 } else {
d5d55fc3 2379 /* Otherwise return... there is to read the last argument
2380 * from the socket. */
ed9b544e 2381 return 1;
2382 }
2383 }
942a3961 2384 /* Let's try to encode the bulk object to save space. */
2385 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2386 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2387
e63943a4 2388 /* Check if the user is authenticated */
2389 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2390 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2391 resetClient(c);
2392 return 1;
2393 }
2394
b61a28fe 2395 /* Handle the maxmemory directive */
2396 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2397 zmalloc_used_memory() > server.maxmemory)
2398 {
2399 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2400 resetClient(c);
2401 return 1;
2402 }
2403
d6cc8867 2404 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
e6cca5db 2405 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2406 &&
ffc6b7f8 2407 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2408 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2409 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2410 resetClient(c);
2411 return 1;
2412 }
2413
ed9b544e 2414 /* Exec the command */
18b6cb76 2415 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
6e469882 2416 queueMultiCommand(c,cmd);
2417 addReply(c,shared.queued);
2418 } else {
d5d55fc3 2419 if (server.vm_enabled && server.vm_max_threads > 0 &&
2420 blockClientOnSwappedKeys(cmd,c)) return 1;
6e469882 2421 call(c,cmd);
2422 }
ed9b544e 2423
2424 /* Prepare the client for the next command */
ed9b544e 2425 resetClient(c);
2426 return 1;
2427}
2428
248ea310 2429static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2430 listNode *ln;
c7df85a4 2431 listIter li;
ed9b544e 2432 int outc = 0, j;
93ea3759 2433 robj **outv;
248ea310 2434 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2435 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2436 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2437 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2438 robj *lenobj;
93ea3759 2439
2440 if (argc <= REDIS_STATIC_ARGS) {
2441 outv = static_outv;
2442 } else {
248ea310 2443 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2444 }
248ea310 2445
2446 lenobj = createObject(REDIS_STRING,
2447 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2448 lenobj->refcount = 0;
2449 outv[outc++] = lenobj;
ed9b544e 2450 for (j = 0; j < argc; j++) {
248ea310 2451 lenobj = createObject(REDIS_STRING,
2452 sdscatprintf(sdsempty(),"$%lu\r\n",
2453 (unsigned long) stringObjectLen(argv[j])));
2454 lenobj->refcount = 0;
2455 outv[outc++] = lenobj;
ed9b544e 2456 outv[outc++] = argv[j];
248ea310 2457 outv[outc++] = shared.crlf;
ed9b544e 2458 }
ed9b544e 2459
40d224a9 2460 /* Increment all the refcounts at start and decrement at end in order to
2461 * be sure to free objects if there is no slave in a replication state
2462 * able to be feed with commands */
2463 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2464 listRewind(slaves,&li);
2465 while((ln = listNext(&li))) {
ed9b544e 2466 redisClient *slave = ln->value;
40d224a9 2467
2468 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2469 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2470
2471 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2472 if (slave->slaveseldb != dictid) {
2473 robj *selectcmd;
2474
2475 switch(dictid) {
2476 case 0: selectcmd = shared.select0; break;
2477 case 1: selectcmd = shared.select1; break;
2478 case 2: selectcmd = shared.select2; break;
2479 case 3: selectcmd = shared.select3; break;
2480 case 4: selectcmd = shared.select4; break;
2481 case 5: selectcmd = shared.select5; break;
2482 case 6: selectcmd = shared.select6; break;
2483 case 7: selectcmd = shared.select7; break;
2484 case 8: selectcmd = shared.select8; break;
2485 case 9: selectcmd = shared.select9; break;
2486 default:
2487 selectcmd = createObject(REDIS_STRING,
2488 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2489 selectcmd->refcount = 0;
2490 break;
2491 }
2492 addReply(slave,selectcmd);
2493 slave->slaveseldb = dictid;
2494 }
2495 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2496 }
40d224a9 2497 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2498 if (outv != static_outv) zfree(outv);
ed9b544e 2499}
2500
dd142b9c 2501static sds sdscatrepr(sds s, char *p, size_t len) {
2502 s = sdscatlen(s,"\"",1);
2503 while(len--) {
2504 switch(*p) {
2505 case '\\':
2506 case '"':
2507 s = sdscatprintf(s,"\\%c",*p);
2508 break;
2509 case '\n': s = sdscatlen(s,"\\n",1); break;
2510 case '\r': s = sdscatlen(s,"\\r",1); break;
2511 case '\t': s = sdscatlen(s,"\\t",1); break;
2512 case '\a': s = sdscatlen(s,"\\a",1); break;
2513 case '\b': s = sdscatlen(s,"\\b",1); break;
2514 default:
2515 if (isprint(*p))
2516 s = sdscatprintf(s,"%c",*p);
2517 else
2518 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2519 break;
2520 }
2521 p++;
2522 }
2523 return sdscatlen(s,"\"",1);
2524}
2525
2526static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2527 listNode *ln;
2528 listIter li;
2529 int j;
2530 sds cmdrepr = sdsnew("+");
2531 robj *cmdobj;
2532 struct timeval tv;
2533
2534 gettimeofday(&tv,NULL);
2535 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2536 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2537
2538 for (j = 0; j < argc; j++) {
2539 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2540 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2541 } else {
2542 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2543 sdslen(argv[j]->ptr));
2544 }
2545 if (j != argc-1)
2546 cmdrepr = sdscatlen(cmdrepr," ",1);
2547 }
2548 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2549 cmdobj = createObject(REDIS_STRING,cmdrepr);
2550
2551 listRewind(monitors,&li);
2552 while((ln = listNext(&li))) {
2553 redisClient *monitor = ln->value;
2554 addReply(monitor,cmdobj);
2555 }
2556 decrRefCount(cmdobj);
2557}
2558
638e42ac 2559static void processInputBuffer(redisClient *c) {
ed9b544e 2560again:
4409877e 2561 /* Before to process the input buffer, make sure the client is not
2562 * waitig for a blocking operation such as BLPOP. Note that the first
2563 * iteration the client is never blocked, otherwise the processInputBuffer
2564 * would not be called at all, but after the execution of the first commands
2565 * in the input buffer the client may be blocked, and the "goto again"
2566 * will try to reiterate. The following line will make it return asap. */
92f8e882 2567 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2568 if (c->bulklen == -1) {
2569 /* Read the first line of the query */
2570 char *p = strchr(c->querybuf,'\n');
2571 size_t querylen;
644fafa3 2572
ed9b544e 2573 if (p) {
2574 sds query, *argv;
2575 int argc, j;
e0a62c7f 2576
ed9b544e 2577 query = c->querybuf;
2578 c->querybuf = sdsempty();
2579 querylen = 1+(p-(query));
2580 if (sdslen(query) > querylen) {
2581 /* leave data after the first line of the query in the buffer */
2582 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2583 }
2584 *p = '\0'; /* remove "\n" */
2585 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2586 sdsupdatelen(query);
2587
2588 /* Now we can split the query in arguments */
ed9b544e 2589 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2590 sdsfree(query);
2591
2592 if (c->argv) zfree(c->argv);
2593 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2594
2595 for (j = 0; j < argc; j++) {
ed9b544e 2596 if (sdslen(argv[j])) {
2597 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2598 c->argc++;
2599 } else {
2600 sdsfree(argv[j]);
2601 }
2602 }
2603 zfree(argv);
7c49733c 2604 if (c->argc) {
2605 /* Execute the command. If the client is still valid
2606 * after processCommand() return and there is something
2607 * on the query buffer try to process the next command. */
2608 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2609 } else {
2610 /* Nothing to process, argc == 0. Just process the query
2611 * buffer if it's not empty or return to the caller */
2612 if (sdslen(c->querybuf)) goto again;
2613 }
ed9b544e 2614 return;
644fafa3 2615 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2616 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2617 freeClient(c);
2618 return;
2619 }
2620 } else {
2621 /* Bulk read handling. Note that if we are at this point
2622 the client already sent a command terminated with a newline,
2623 we are reading the bulk data that is actually the last
2624 argument of the command. */
2625 int qbl = sdslen(c->querybuf);
2626
2627 if (c->bulklen <= qbl) {
2628 /* Copy everything but the final CRLF as final argument */
2629 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2630 c->argc++;
2631 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2632 /* Process the command. If the client is still valid after
2633 * the processing and there is more data in the buffer
2634 * try to parse it. */
2635 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2636 return;
2637 }
2638 }
2639}
2640
638e42ac 2641static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2642 redisClient *c = (redisClient*) privdata;
2643 char buf[REDIS_IOBUF_LEN];
2644 int nread;
2645 REDIS_NOTUSED(el);
2646 REDIS_NOTUSED(mask);
2647
2648 nread = read(fd, buf, REDIS_IOBUF_LEN);
2649 if (nread == -1) {
2650 if (errno == EAGAIN) {
2651 nread = 0;
2652 } else {
f870935d 2653 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2654 freeClient(c);
2655 return;
2656 }
2657 } else if (nread == 0) {
f870935d 2658 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2659 freeClient(c);
2660 return;
2661 }
2662 if (nread) {
2663 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2664 c->lastinteraction = time(NULL);
2665 } else {
2666 return;
2667 }
168ac5c6 2668 processInputBuffer(c);
638e42ac 2669}
2670
ed9b544e 2671static int selectDb(redisClient *c, int id) {
2672 if (id < 0 || id >= server.dbnum)
2673 return REDIS_ERR;
3305306f 2674 c->db = &server.db[id];
ed9b544e 2675 return REDIS_OK;
2676}
2677
40d224a9 2678static void *dupClientReplyValue(void *o) {
2679 incrRefCount((robj*)o);
12d090d2 2680 return o;
40d224a9 2681}
2682
/* List match method: two elements match when equalStringObjects() says
 * they are equal. */
static int listMatchObjects(void *a, void *b) {
    int same = equalStringObjects(a,b);

    return same;
}
2686
ed9b544e 2687static redisClient *createClient(int fd) {
2688 redisClient *c = zmalloc(sizeof(*c));
2689
2690 anetNonBlock(NULL,fd);
2691 anetTcpNoDelay(NULL,fd);
2692 if (!c) return NULL;
2693 selectDb(c,0);
2694 c->fd = fd;
2695 c->querybuf = sdsempty();
2696 c->argc = 0;
93ea3759 2697 c->argv = NULL;
ed9b544e 2698 c->bulklen = -1;
e8a74421 2699 c->multibulk = 0;
2700 c->mbargc = 0;
2701 c->mbargv = NULL;
ed9b544e 2702 c->sentlen = 0;
2703 c->flags = 0;
2704 c->lastinteraction = time(NULL);
abcb223e 2705 c->authenticated = 0;
40d224a9 2706 c->replstate = REDIS_REPL_NONE;
6b47e12e 2707 c->reply = listCreate();
ed9b544e 2708 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2709 listSetDupMethod(c->reply,dupClientReplyValue);
92f8e882 2710 c->blockingkeys = NULL;
2711 c->blockingkeysnum = 0;
2712 c->io_keys = listCreate();
2713 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2714 c->pubsub_channels = dictCreate(&setDictType,NULL);
2715 c->pubsub_patterns = listCreate();
2716 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2717 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2718 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2719 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2720 freeClient(c);
2721 return NULL;
2722 }
6b47e12e 2723 listAddNodeTail(server.clients,c);
6e469882 2724 initClientMultiState(c);
ed9b544e 2725 return c;
2726}
2727
2728static void addReply(redisClient *c, robj *obj) {
2729 if (listLength(c->reply) == 0 &&
6208b3a7 2730 (c->replstate == REDIS_REPL_NONE ||
2731 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2732 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2733 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2734
2735 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2736 obj = dupStringObject(obj);
2737 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2738 }
9d65a1bb 2739 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2740}
2741
2742static void addReplySds(redisClient *c, sds s) {
2743 robj *o = createObject(REDIS_STRING,s);
2744 addReply(c,o);
2745 decrRefCount(o);
2746}
2747
e2665397 2748static void addReplyDouble(redisClient *c, double d) {
2749 char buf[128];
2750
2751 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2752 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2753 (unsigned long) strlen(buf),buf));
e2665397 2754}
2755
f44dd428 2756static void addReplyLong(redisClient *c, long l) {
2757 char buf[128];
2758 size_t len;
2759
dd88747b 2760 if (l == 0) {
2761 addReply(c,shared.czero);
2762 return;
2763 } else if (l == 1) {
2764 addReply(c,shared.cone);
2765 return;
2766 }
f44dd428 2767 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2768 addReplySds(c,sdsnewlen(buf,len));
2769}
2770
aa7c2934
PN
2771static void addReplyLongLong(redisClient *c, long long ll) {
2772 char buf[128];
2773 size_t len;
2774
2775 if (ll == 0) {
2776 addReply(c,shared.czero);
2777 return;
2778 } else if (ll == 1) {
2779 addReply(c,shared.cone);
2780 return;
2781 }
2782 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2783 addReplySds(c,sdsnewlen(buf,len));
2784}
2785
92b27fe9 2786static void addReplyUlong(redisClient *c, unsigned long ul) {
2787 char buf[128];
2788 size_t len;
2789
dd88747b 2790 if (ul == 0) {
2791 addReply(c,shared.czero);
2792 return;
2793 } else if (ul == 1) {
2794 addReply(c,shared.cone);
2795 return;
2796 }
92b27fe9 2797 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2798 addReplySds(c,sdsnewlen(buf,len));
2799}
2800
942a3961 2801static void addReplyBulkLen(redisClient *c, robj *obj) {
2802 size_t len;
2803
2804 if (obj->encoding == REDIS_ENCODING_RAW) {
2805 len = sdslen(obj->ptr);
2806 } else {
2807 long n = (long)obj->ptr;
2808
e054afda 2809 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2810 len = 1;
2811 if (n < 0) {
2812 len++;
2813 n = -n;
2814 }
2815 while((n = n/10) != 0) {
2816 len++;
2817 }
2818 }
83c6a618 2819 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
942a3961 2820}
2821
dd88747b 2822static void addReplyBulk(redisClient *c, robj *obj) {
2823 addReplyBulkLen(c,obj);
2824 addReply(c,obj);
2825 addReply(c,shared.crlf);
2826}
2827
500ece7c 2828/* In the CONFIG command we need to add vanilla C string as bulk replies */
2829static void addReplyBulkCString(redisClient *c, char *s) {
2830 if (s == NULL) {
2831 addReply(c,shared.nullbulk);
2832 } else {
2833 robj *o = createStringObject(s,strlen(s));
2834 addReplyBulk(c,o);
2835 decrRefCount(o);
2836 }
2837}
2838
ed9b544e 2839static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2840 int cport, cfd;
2841 char cip[128];
285add55 2842 redisClient *c;
ed9b544e 2843 REDIS_NOTUSED(el);
2844 REDIS_NOTUSED(mask);
2845 REDIS_NOTUSED(privdata);
2846
2847 cfd = anetAccept(server.neterr, fd, cip, &cport);
2848 if (cfd == AE_ERR) {
f870935d 2849 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2850 return;
2851 }
f870935d 2852 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2853 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2854 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2855 close(cfd); /* May be already closed, just ingore errors */
2856 return;
2857 }
285add55 2858 /* If maxclient directive is set and this is one client more... close the
2859 * connection. Note that we create the client instead to check before
2860 * for this condition, since now the socket is already set in nonblocking
2861 * mode and we can send an error for free using the Kernel I/O */
2862 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2863 char *err = "-ERR max number of clients reached\r\n";
2864
2865 /* That's a best effort error message, don't check write errors */
fee803ba 2866 if (write(c->fd,err,strlen(err)) == -1) {
2867 /* Nothing to do, Just to avoid the warning... */
2868 }
285add55 2869 freeClient(c);
2870 return;
2871 }
ed9b544e 2872 server.stat_numconnections++;
2873}
2874
2875/* ======================= Redis objects implementation ===================== */
2876
2877static robj *createObject(int type, void *ptr) {
2878 robj *o;
2879
a5819310 2880 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2881 if (listLength(server.objfreelist)) {
2882 listNode *head = listFirst(server.objfreelist);
2883 o = listNodeValue(head);
2884 listDelNode(server.objfreelist,head);
a5819310 2885 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2886 } else {
75680a3c 2887 if (server.vm_enabled) {
a5819310 2888 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2889 o = zmalloc(sizeof(*o));
2890 } else {
2891 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2892 }
ed9b544e 2893 }
ed9b544e 2894 o->type = type;
942a3961 2895 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2896 o->ptr = ptr;
2897 o->refcount = 1;
3a66edc7 2898 if (server.vm_enabled) {
1064ef87 2899 /* Note that this code may run in the context of an I/O thread
2900 * and accessing to server.unixtime in theory is an error
2901 * (no locks). But in practice this is safe, and even if we read
2902 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2903 o->vm.atime = server.unixtime;
2904 o->storage = REDIS_VM_MEMORY;
2905 }
ed9b544e 2906 return o;
2907}
2908
2909static robj *createStringObject(char *ptr, size_t len) {
2910 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2911}
2912
3f973463
PN
2913static robj *createStringObjectFromLongLong(long long value) {
2914 robj *o;
2915 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2916 incrRefCount(shared.integers[value]);
2917 o = shared.integers[value];
2918 } else {
2919 o = createObject(REDIS_STRING, NULL);
2920 if (value >= LONG_MIN && value <= LONG_MAX) {
2921 o->encoding = REDIS_ENCODING_INT;
2922 o->ptr = (void*)((long)value);
2923 } else {
ee14da56 2924 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3f973463
PN
2925 }
2926 }
2927 return o;
2928}
2929
4ef8de8a 2930static robj *dupStringObject(robj *o) {
b9bc0eef 2931 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2932 return createStringObject(o->ptr,sdslen(o->ptr));
2933}
2934
ed9b544e 2935static robj *createListObject(void) {
2936 list *l = listCreate();
2937
ed9b544e 2938 listSetFreeMethod(l,decrRefCount);
2939 return createObject(REDIS_LIST,l);
2940}
2941
2942static robj *createSetObject(void) {
2943 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2944 return createObject(REDIS_SET,d);
2945}
2946
5234952b 2947static robj *createHashObject(void) {
2948 /* All the Hashes start as zipmaps. Will be automatically converted
2949 * into hash tables if there are enough elements or big elements
2950 * inside. */
2951 unsigned char *zm = zipmapNew();
2952 robj *o = createObject(REDIS_HASH,zm);
2953 o->encoding = REDIS_ENCODING_ZIPMAP;
2954 return o;
2955}
2956
1812e024 2957static robj *createZsetObject(void) {
6b47e12e 2958 zset *zs = zmalloc(sizeof(*zs));
2959
2960 zs->dict = dictCreate(&zsetDictType,NULL);
2961 zs->zsl = zslCreate();
2962 return createObject(REDIS_ZSET,zs);
1812e024 2963}
2964
ed9b544e 2965static void freeStringObject(robj *o) {
942a3961 2966 if (o->encoding == REDIS_ENCODING_RAW) {
2967 sdsfree(o->ptr);
2968 }
ed9b544e 2969}
2970
2971static void freeListObject(robj *o) {
2972 listRelease((list*) o->ptr);
2973}
2974
2975static void freeSetObject(robj *o) {
2976 dictRelease((dict*) o->ptr);
2977}
2978
fd8ccf44 2979static void freeZsetObject(robj *o) {
2980 zset *zs = o->ptr;
2981
2982 dictRelease(zs->dict);
2983 zslFree(zs->zsl);
2984 zfree(zs);
2985}
2986
ed9b544e 2987static void freeHashObject(robj *o) {
cbba7dd7 2988 switch (o->encoding) {
2989 case REDIS_ENCODING_HT:
2990 dictRelease((dict*) o->ptr);
2991 break;
2992 case REDIS_ENCODING_ZIPMAP:
2993 zfree(o->ptr);
2994 break;
2995 default:
f83c6cb5 2996 redisPanic("Unknown hash encoding type");
cbba7dd7 2997 break;
2998 }
ed9b544e 2999}
3000
3001static void incrRefCount(robj *o) {
3002 o->refcount++;
3003}
3004
3005static void decrRefCount(void *obj) {
3006 robj *o = obj;
94754ccc 3007
c651fd9e 3008 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
970e10bb 3009 /* Object is a key of a swapped out value, or in the process of being
3010 * loaded. */
996cb5f7 3011 if (server.vm_enabled &&
3012 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3013 {
996cb5f7 3014 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 3015 redisAssert(o->type == REDIS_STRING);
a35ddf12 3016 freeStringObject(o);
3017 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 3018 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 3019 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3020 !listAddNodeHead(server.objfreelist,o))
3021 zfree(o);
a5819310 3022 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 3023 server.vm_stats_swapped_objects--;
a35ddf12 3024 return;
3025 }
996cb5f7 3026 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 3027 if (--(o->refcount) == 0) {
996cb5f7 3028 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3029 vmCancelThreadedIOJob(obj);
ed9b544e 3030 switch(o->type) {
3031 case REDIS_STRING: freeStringObject(o); break;
3032 case REDIS_LIST: freeListObject(o); break;
3033 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 3034 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 3035 case REDIS_HASH: freeHashObject(o); break;
f83c6cb5 3036 default: redisPanic("Unknown object type"); break;
ed9b544e 3037 }
a5819310 3038 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 3039 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3040 !listAddNodeHead(server.objfreelist,o))
3041 zfree(o);
a5819310 3042 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3043 }
3044}
3045
942a3961 3046static robj *lookupKey(redisDb *db, robj *key) {
3047 dictEntry *de = dictFind(db->dict,key);
3a66edc7 3048 if (de) {
55cf8433 3049 robj *key = dictGetEntryKey(de);
3050 robj *val = dictGetEntryVal(de);
3a66edc7 3051
55cf8433 3052 if (server.vm_enabled) {
996cb5f7 3053 if (key->storage == REDIS_VM_MEMORY ||
3054 key->storage == REDIS_VM_SWAPPING)
3055 {
3056 /* If we were swapping the object out, stop it, this key
3057 * was requested. */
3058 if (key->storage == REDIS_VM_SWAPPING)
3059 vmCancelThreadedIOJob(key);
55cf8433 3060 /* Update the access time of the key for the aging algorithm. */
3061 key->vm.atime = server.unixtime;
3062 } else {
d5d55fc3 3063 int notify = (key->storage == REDIS_VM_LOADING);
3064
55cf8433 3065 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 3066 redisAssert(val == NULL);
55cf8433 3067 val = vmLoadObject(key);
3068 dictGetEntryVal(de) = val;
d5d55fc3 3069
3070 /* Clients blocked by the VM subsystem may be waiting for
3071 * this key... */
3072 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 3073 }
3074 }
3075 return val;
3a66edc7 3076 } else {
3077 return NULL;
3078 }
942a3961 3079}
3080
3081static robj *lookupKeyRead(redisDb *db, robj *key) {
3082 expireIfNeeded(db,key);
3083 return lookupKey(db,key);
3084}
3085
3086static robj *lookupKeyWrite(redisDb *db, robj *key) {
3087 deleteIfVolatile(db,key);
3088 return lookupKey(db,key);
3089}
3090
92b27fe9 3091static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3092 robj *o = lookupKeyRead(c->db, key);
3093 if (!o) addReply(c,reply);
3094 return o;
3095}
3096
3097static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3098 robj *o = lookupKeyWrite(c->db, key);
3099 if (!o) addReply(c,reply);
3100 return o;
3101}
3102
3103static int checkType(redisClient *c, robj *o, int type) {
3104 if (o->type != type) {
3105 addReply(c,shared.wrongtypeerr);
3106 return 1;
3107 }
3108 return 0;
3109}
3110
942a3961 3111static int deleteKey(redisDb *db, robj *key) {
3112 int retval;
3113
3114 /* We need to protect key from destruction: after the first dictDelete()
3115 * it may happen that 'key' is no longer valid if we don't increment
3116 * it's count. This may happen when we get the object reference directly
3117 * from the hash table with dictRandomKey() or dict iterators */
3118 incrRefCount(key);
3119 if (dictSize(db->expires)) dictDelete(db->expires,key);
3120 retval = dictDelete(db->dict,key);
3121 decrRefCount(key);
3122
3123 return retval == DICT_OK;
3124}
3125
724a51b1 3126/* Check if the nul-terminated string 's' can be represented by a long
3127 * (that is, is a number that fits into long without any other space or
3128 * character before or after the digits).
3129 *
3130 * If so, the function returns REDIS_OK and *longval is set to the value
3131 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 3132static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 3133 char buf[32], *endptr;
3134 long value;
3135 int slen;
e0a62c7f 3136
724a51b1 3137 value = strtol(s, &endptr, 10);
3138 if (endptr[0] != '\0') return REDIS_ERR;
ee14da56 3139 slen = ll2string(buf,32,value);
724a51b1 3140
3141 /* If the number converted back into a string is not identical
3142 * then it's not possible to encode the string as integer */
f69f2cba 3143 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 3144 if (longval) *longval = value;
3145 return REDIS_OK;
3146}
3147
942a3961 3148/* Try to encode a string object in order to save space */
05df7621 3149static robj *tryObjectEncoding(robj *o) {
942a3961 3150 long value;
942a3961 3151 sds s = o->ptr;
3305306f 3152
942a3961 3153 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 3154 return o; /* Already encoded */
3305306f 3155
05df7621 3156 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 3157 * everywhere in the "object space" of Redis. Encoded objects can only
3158 * appear as "values" (and not, for instance, as keys) */
05df7621 3159 if (o->refcount > 1) return o;
3305306f 3160
942a3961 3161 /* Currently we try to encode only strings */
dfc5e96c 3162 redisAssert(o->type == REDIS_STRING);
94754ccc 3163
724a51b1 3164 /* Check if we can represent this string as a long integer */
05df7621 3165 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 3166
3167 /* Ok, this object can be encoded */
05df7621 3168 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3169 decrRefCount(o);
3170 incrRefCount(shared.integers[value]);
3171 return shared.integers[value];
3172 } else {
3173 o->encoding = REDIS_ENCODING_INT;
3174 sdsfree(o->ptr);
3175 o->ptr = (void*) value;
3176 return o;
3177 }
942a3961 3178}
3179
9d65a1bb 3180/* Get a decoded version of an encoded object (returned as a new object).
3181 * If the object is already raw-encoded just increment the ref count. */
3182static robj *getDecodedObject(robj *o) {
942a3961 3183 robj *dec;
e0a62c7f 3184
9d65a1bb 3185 if (o->encoding == REDIS_ENCODING_RAW) {
3186 incrRefCount(o);
3187 return o;
3188 }
942a3961 3189 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3190 char buf[32];
3191
ee14da56 3192 ll2string(buf,32,(long)o->ptr);
942a3961 3193 dec = createStringObject(buf,strlen(buf));
3194 return dec;
3195 } else {
08ee9b57 3196 redisPanic("Unknown encoding type");
942a3961 3197 }
3305306f 3198}
3199
d7f43c08 3200/* Compare two string objects via strcmp() or alike.
3201 * Note that the objects may be integer-encoded. In such a case we
ee14da56 3202 * use ll2string() to get a string representation of the numbers on the stack
1fd9bc8a 3203 * and compare the strings, it's much faster than calling getDecodedObject().
3204 *
3205 * Important note: if objects are not integer encoded, but binary-safe strings,
3206 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3207 * binary safe. */
724a51b1 3208static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3209 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3210 char bufa[128], bufb[128], *astr, *bstr;
3211 int bothsds = 1;
724a51b1 3212
e197b441 3213 if (a == b) return 0;
d7f43c08 3214 if (a->encoding != REDIS_ENCODING_RAW) {
ee14da56 3215 ll2string(bufa,sizeof(bufa),(long) a->ptr);
d7f43c08 3216 astr = bufa;
3217 bothsds = 0;
724a51b1 3218 } else {
d7f43c08 3219 astr = a->ptr;
724a51b1 3220 }
d7f43c08 3221 if (b->encoding != REDIS_ENCODING_RAW) {
ee14da56 3222 ll2string(bufb,sizeof(bufb),(long) b->ptr);
d7f43c08 3223 bstr = bufb;
3224 bothsds = 0;
3225 } else {
3226 bstr = b->ptr;
3227 }
3228 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3229}
3230
bf028098 3231/* Equal string objects return 1 if the two objects are the same from the
3232 * point of view of a string comparison, otherwise 0 is returned. Note that
3233 * this function is faster then checking for (compareStringObject(a,b) == 0)
3234 * because it can perform some more optimization. */
3235static int equalStringObjects(robj *a, robj *b) {
3236 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3237 return a->ptr == b->ptr;
3238 } else {
3239 return compareStringObjects(a,b) == 0;
3240 }
3241}
3242
0ea663ea 3243static size_t stringObjectLen(robj *o) {
dfc5e96c 3244 redisAssert(o->type == REDIS_STRING);
0ea663ea 3245 if (o->encoding == REDIS_ENCODING_RAW) {
3246 return sdslen(o->ptr);
3247 } else {
3248 char buf[32];
3249
ee14da56 3250 return ll2string(buf,32,(long)o->ptr);
0ea663ea 3251 }
3252}
3253
bd79a6bd
PN
3254static int getDoubleFromObject(robj *o, double *target) {
3255 double value;
682c73e8 3256 char *eptr;
bbe025e0 3257
bd79a6bd
PN
3258 if (o == NULL) {
3259 value = 0;
3260 } else {
3261 redisAssert(o->type == REDIS_STRING);
3262 if (o->encoding == REDIS_ENCODING_RAW) {
3263 value = strtod(o->ptr, &eptr);
682c73e8 3264 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3265 } else if (o->encoding == REDIS_ENCODING_INT) {
3266 value = (long)o->ptr;
3267 } else {
946342c1 3268 redisPanic("Unknown string encoding");
bd79a6bd
PN
3269 }
3270 }
3271
bd79a6bd
PN
3272 *target = value;
3273 return REDIS_OK;
3274}
bbe025e0 3275
bd79a6bd
PN
3276static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3277 double value;
3278 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3279 if (msg != NULL) {
3280 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3281 } else {
3282 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3283 }
bbe025e0
AM
3284 return REDIS_ERR;
3285 }
3286
bd79a6bd 3287 *target = value;
bbe025e0
AM
3288 return REDIS_OK;
3289}
3290
bd79a6bd
PN
3291static int getLongLongFromObject(robj *o, long long *target) {
3292 long long value;
682c73e8 3293 char *eptr;
bbe025e0 3294
bd79a6bd
PN
3295 if (o == NULL) {
3296 value = 0;
3297 } else {
3298 redisAssert(o->type == REDIS_STRING);
3299 if (o->encoding == REDIS_ENCODING_RAW) {
3300 value = strtoll(o->ptr, &eptr, 10);
682c73e8 3301 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3302 } else if (o->encoding == REDIS_ENCODING_INT) {
3303 value = (long)o->ptr;
3304 } else {
946342c1 3305 redisPanic("Unknown string encoding");
bd79a6bd
PN
3306 }
3307 }
3308
bd79a6bd
PN
3309 *target = value;
3310 return REDIS_OK;
3311}
bbe025e0 3312
bd79a6bd
PN
3313static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3314 long long value;
3315 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3316 if (msg != NULL) {
3317 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3318 } else {
3319 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3320 }
bbe025e0
AM
3321 return REDIS_ERR;
3322 }
3323
bd79a6bd 3324 *target = value;
bbe025e0
AM
3325 return REDIS_OK;
3326}
3327
bd79a6bd
PN
3328static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3329 long long value;
bbe025e0 3330
bd79a6bd
PN
3331 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3332 if (value < LONG_MIN || value > LONG_MAX) {
3333 if (msg != NULL) {
3334 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3335 } else {
3336 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3337 }
bbe025e0
AM
3338 return REDIS_ERR;
3339 }
3340
bd79a6bd 3341 *target = value;
bbe025e0
AM
3342 return REDIS_OK;
3343}
3344
06233c45 3345/*============================ RDB saving/loading =========================== */
ed9b544e 3346
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 if fwrite() wrote nothing. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3351
/* Write the time 't' to 'fp' as a 4 byte value: 't' is first converted to
 * int32_t and the bytes are written as they are laid out in memory.
 * Returns 0 on success, -1 if fwrite() wrote nothing. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t secs = (int32_t) t;

    return (fwrite(&secs,4,1,fp) == 0) ? -1 : 0;
}
3357
e3566d4b 3358/* check rdbLoadLen() comments for more info */
f78fd11b 3359static int rdbSaveLen(FILE *fp, uint32_t len) {
3360 unsigned char buf[2];
3361
3362 if (len < (1<<6)) {
3363 /* Save a 6 bit len */
10c43610 3364 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3365 if (fwrite(buf,1,1,fp) == 0) return -1;
3366 } else if (len < (1<<14)) {
3367 /* Save a 14 bit len */
10c43610 3368 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3369 buf[1] = len&0xFF;
17be1a4a 3370 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3371 } else {
3372 /* Save a 32 bit len */
10c43610 3373 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3374 if (fwrite(buf,1,1,fp) == 0) return -1;
3375 len = htonl(len);
3376 if (fwrite(&len,4,1,fp) == 0) return -1;
3377 }
3378 return 0;
3379}
3380
e3566d4b 3381/* String objects in the form "2391" "-100" without any space and with a
3382 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3383 * encoded as integers to save space */
b1befe6a 3384static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
e3566d4b 3385 long long value;
3386 char *endptr, buf[32];
3387
3388 /* Check if it's possible to encode this value as a number */
3389 value = strtoll(s, &endptr, 10);
3390 if (endptr[0] != '\0') return 0;
ee14da56 3391 ll2string(buf,32,value);
e3566d4b 3392
3393 /* If the number converted back into a string is not identical
3394 * then it's not possible to encode the string as integer */
b1befe6a 3395 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
e3566d4b 3396
3397 /* Finally check if it fits in our ranges */
3398 if (value >= -(1<<7) && value <= (1<<7)-1) {
3399 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3400 enc[1] = value&0xFF;
3401 return 2;
3402 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3403 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3404 enc[1] = value&0xFF;
3405 enc[2] = (value>>8)&0xFF;
3406 return 3;
3407 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3408 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3409 enc[1] = value&0xFF;
3410 enc[2] = (value>>8)&0xFF;
3411 enc[3] = (value>>16)&0xFF;
3412 enc[4] = (value>>24)&0xFF;
3413 return 5;
3414 } else {
3415 return 0;
3416 }
3417}
3418
b1befe6a 3419static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3420 size_t comprlen, outlen;
774e3047 3421 unsigned char byte;
3422 void *out;
3423
3424 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3425 if (len <= 4) return 0;
3426 outlen = len-4;
3a2694c4 3427 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3428 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3429 if (comprlen == 0) {
88e85998 3430 zfree(out);
774e3047 3431 return 0;
3432 }
3433 /* Data compressed! Let's save it on disk */
3434 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3435 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3436 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3437 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3438 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3439 zfree(out);
774e3047 3440 return comprlen;
3441
3442writeerr:
88e85998 3443 zfree(out);
774e3047 3444 return -1;
3445}
3446
e3566d4b 3447/* Save a string objet as [len][data] on disk. If the object is a string
3448 * representation of an integer value we try to safe it in a special form */
b1befe6a 3449static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3450 int enclen;
10c43610 3451
774e3047 3452 /* Try integer encoding */
e3566d4b 3453 if (len <= 11) {
3454 unsigned char buf[5];
b1befe6a 3455 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3456 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3457 return 0;
3458 }
3459 }
774e3047 3460
3461 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3462 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3463 if (server.rdbcompression && len > 20) {
774e3047 3464 int retval;
3465
b1befe6a 3466 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3467 if (retval == -1) return -1;
3468 if (retval > 0) return 0;
3469 /* retval == 0 means data can't be compressed, save the old way */
3470 }
3471
3472 /* Store verbatim */
10c43610 3473 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3474 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3475 return 0;
3476}
3477
942a3961 3478/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3479static int rdbSaveStringObject(FILE *fp, robj *obj) {
3480 int retval;
942a3961 3481
f2d9f50f 3482 /* Avoid incr/decr ref count business when possible.
3483 * This plays well with copy-on-write given that we are probably
3484 * in a child process (BGSAVE). Also this makes sure key objects
3485 * of swapped objects are not incRefCount-ed (an assert does not allow
3486 * this in order to avoid bugs) */
3487 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3488 obj = getDecodedObject(obj);
b1befe6a 3489 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3490 decrRefCount(obj);
3491 } else {
b1befe6a 3492 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3493 }
9d65a1bb 3494 return retval;
942a3961 3495}
3496
/* Save a double value. The first byte written is an unsigned 8 bit integer
 * that is either the length of the textual representation that follows
 * (produced with the "%.17g" format), or one of these special markers, in
 * which case nothing else is written:
 *
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char out[128];
    int total;

    if (isnan(val)) {
        out[0] = 253;
        total = 1;
    } else if (!isfinite(val)) {
        out[0] = (val < 0) ? 255 : 254;
        total = 1;
    } else {
        char *text = (char*)out+1;

        snprintf(text,sizeof(out)-1,"%.17g",val);
        out[0] = strlen(text);
        total = out[0]+1;
    }
    return (fwrite(out,total,1,fp) == 0) ? -1 : 0;
}
3523
06233c45 3524/* Save a Redis object. */
3525static int rdbSaveObject(FILE *fp, robj *o) {
3526 if (o->type == REDIS_STRING) {
3527 /* Save a string value */
3528 if (rdbSaveStringObject(fp,o) == -1) return -1;
3529 } else if (o->type == REDIS_LIST) {
3530 /* Save a list value */
3531 list *list = o->ptr;
c7df85a4 3532 listIter li;
06233c45 3533 listNode *ln;
3534
06233c45 3535 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3536 listRewind(list,&li);
3537 while((ln = listNext(&li))) {
06233c45 3538 robj *eleobj = listNodeValue(ln);
3539
3540 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3541 }
3542 } else if (o->type == REDIS_SET) {
3543 /* Save a set value */
3544 dict *set = o->ptr;
3545 dictIterator *di = dictGetIterator(set);
3546 dictEntry *de;
3547
3548 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3549 while((de = dictNext(di)) != NULL) {
3550 robj *eleobj = dictGetEntryKey(de);
3551
3552 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3553 }
3554 dictReleaseIterator(di);
3555 } else if (o->type == REDIS_ZSET) {
3556 /* Save a set value */
3557 zset *zs = o->ptr;
3558 dictIterator *di = dictGetIterator(zs->dict);
3559 dictEntry *de;
3560
3561 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3562 while((de = dictNext(di)) != NULL) {
3563 robj *eleobj = dictGetEntryKey(de);
3564 double *score = dictGetEntryVal(de);
3565
3566 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3567 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3568 }
3569 dictReleaseIterator(di);
b1befe6a 3570 } else if (o->type == REDIS_HASH) {
3571 /* Save a hash value */
3572 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3573 unsigned char *p = zipmapRewind(o->ptr);
3574 unsigned int count = zipmapLen(o->ptr);
3575 unsigned char *key, *val;
3576 unsigned int klen, vlen;
3577
3578 if (rdbSaveLen(fp,count) == -1) return -1;
3579 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3580 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3581 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3582 }
3583 } else {
3584 dictIterator *di = dictGetIterator(o->ptr);
3585 dictEntry *de;
3586
3587 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3588 while((de = dictNext(di)) != NULL) {
3589 robj *key = dictGetEntryKey(de);
3590 robj *val = dictGetEntryVal(de);
3591
3592 if (rdbSaveStringObject(fp,key) == -1) return -1;
3593 if (rdbSaveStringObject(fp,val) == -1) return -1;
3594 }
3595 dictReleaseIterator(di);
3596 }
06233c45 3597 } else {
f83c6cb5 3598 redisPanic("Unknown object type");
06233c45 3599 }
3600 return 0;
3601}
3602
3603/* Return the length the object will have on disk if saved with
3604 * the rdbSaveObject() function. Currently we use a trick to get
3605 * this length with very little changes to the code. In the future
3606 * we could switch to a faster solution. */
b9bc0eef 3607static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3608 if (fp == NULL) fp = server.devnull;
06233c45 3609 rewind(fp);
3610 assert(rdbSaveObject(fp,o) != 1);
3611 return ftello(fp);
3612}
3613
06224fec 3614/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3615static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3616 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3617
06224fec 3618 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3619}
3620
ed9b544e 3621/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3622static int rdbSave(char *filename) {
ed9b544e 3623 dictIterator *di = NULL;
3624 dictEntry *de;
ed9b544e 3625 FILE *fp;
3626 char tmpfile[256];
3627 int j;
bb32ede5 3628 time_t now = time(NULL);
ed9b544e 3629
2316bb3b 3630 /* Wait for I/O therads to terminate, just in case this is a
3631 * foreground-saving, to avoid seeking the swap file descriptor at the
3632 * same time. */
3633 if (server.vm_enabled)
3634 waitEmptyIOJobsQueue();
3635
a3b21203 3636 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3637 fp = fopen(tmpfile,"w");
3638 if (!fp) {
3639 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3640 return REDIS_ERR;
3641 }
f78fd11b 3642 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3643 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3644 redisDb *db = server.db+j;
3645 dict *d = db->dict;
3305306f 3646 if (dictSize(d) == 0) continue;
ed9b544e 3647 di = dictGetIterator(d);
3648 if (!di) {
3649 fclose(fp);
3650 return REDIS_ERR;
3651 }
3652
3653 /* Write the SELECT DB opcode */
f78fd11b 3654 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3655 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3656
3657 /* Iterate this DB writing every entry */
3658 while((de = dictNext(di)) != NULL) {
3659 robj *key = dictGetEntryKey(de);
3660 robj *o = dictGetEntryVal(de);
bb32ede5 3661 time_t expiretime = getExpire(db,key);
3662
3663 /* Save the expire time */
3664 if (expiretime != -1) {
3665 /* If this key is already expired skip it */
3666 if (expiretime < now) continue;
3667 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3668 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3669 }
7e69548d 3670 /* Save the key and associated value. This requires special
3671 * handling if the value is swapped out. */
996cb5f7 3672 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3673 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3674 /* Save type, key, value */
3675 if (rdbSaveType(fp,o->type) == -1) goto werr;
3676 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3677 if (rdbSaveObject(fp,o) == -1) goto werr;
3678 } else {
996cb5f7 3679 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3680 robj *po;
7e69548d 3681 /* Get a preview of the object in memory */
3682 po = vmPreviewObject(key);
7e69548d 3683 /* Save type, key, value */
3684 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3685 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3686 if (rdbSaveObject(fp,po) == -1) goto werr;
3687 /* Remove the loaded object from memory */
3688 decrRefCount(po);
7e69548d 3689 }
ed9b544e 3690 }
3691 dictReleaseIterator(di);
3692 }
3693 /* EOF opcode */
f78fd11b 3694 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3695
3696 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3697 fflush(fp);
3698 fsync(fileno(fp));
3699 fclose(fp);
e0a62c7f 3700
ed9b544e 3701 /* Use RENAME to make sure the DB file is changed atomically only
3702 * if the generate DB file is ok. */
3703 if (rename(tmpfile,filename) == -1) {
325d1eb4 3704 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3705 unlink(tmpfile);
3706 return REDIS_ERR;
3707 }
3708 redisLog(REDIS_NOTICE,"DB saved on disk");
3709 server.dirty = 0;
3710 server.lastsave = time(NULL);
3711 return REDIS_OK;
3712
3713werr:
3714 fclose(fp);
3715 unlink(tmpfile);
3716 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3717 if (di) dictReleaseIterator(di);
3718 return REDIS_ERR;
3719}
3720
f78fd11b 3721static int rdbSaveBackground(char *filename) {
ed9b544e 3722 pid_t childpid;
3723
9d65a1bb 3724 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3725 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3726 if ((childpid = fork()) == 0) {
3727 /* Child */
054e426d 3728 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3729 close(server.fd);
f78fd11b 3730 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3731 _exit(0);
ed9b544e 3732 } else {
478c2c6f 3733 _exit(1);
ed9b544e 3734 }
3735 } else {
3736 /* Parent */
5a7c647e 3737 if (childpid == -1) {
3738 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3739 strerror(errno));
3740 return REDIS_ERR;
3741 }
ed9b544e 3742 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3743 server.bgsavechildpid = childpid;
884d4b39 3744 updateDictResizePolicy();
ed9b544e 3745 return REDIS_OK;
3746 }
3747 return REDIS_OK; /* unreached */
3748}
3749
/* Unlink the file "temp-<childpid>.rdb", that is the temporary file name
 * rdbSave() builds from the pid of the process running it. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3756
/* Read a single byte from 'fp' and return it as a value in the 0-255
 * range, or -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char opcode;
    size_t got = fread(&opcode,1,1,fp);

    if (got == 0) return -1;
    return opcode;
}
3762
/* Read a 4 byte int32_t from 'fp' (same in-memory byte layout used by
 * rdbSaveTime()) and return it as a time_t, or -1 if the read failed. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;
    size_t got = fread(&raw,4,1,fp);

    if (got == 0) return -1;
    return (time_t) raw;
}
3768
e3566d4b 3769/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3770 * of this file for a description of how this are stored on disk.
3771 *
3772 * isencoded is set to 1 if the readed length is not actually a length but
3773 * an "encoding type", check the above comments for more info */
c78a8ccc 3774static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3775 unsigned char buf[2];
3776 uint32_t len;
c78a8ccc 3777 int type;
f78fd11b 3778
e3566d4b 3779 if (isencoded) *isencoded = 0;
c78a8ccc 3780 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3781 type = (buf[0]&0xC0)>>6;
3782 if (type == REDIS_RDB_6BITLEN) {
3783 /* Read a 6 bit len */
3784 return buf[0]&0x3F;
3785 } else if (type == REDIS_RDB_ENCVAL) {
3786 /* Read a 6 bit len encoding type */
3787 if (isencoded) *isencoded = 1;
3788 return buf[0]&0x3F;
3789 } else if (type == REDIS_RDB_14BITLEN) {
3790 /* Read a 14 bit len */
3791 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3792 return ((buf[0]&0x3F)<<8)|buf[1];
3793 } else {
3794 /* Read a 32 bit len */
f78fd11b 3795 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3796 return ntohl(len);
f78fd11b 3797 }
f78fd11b 3798}
3799
e3566d4b 3800static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3801 unsigned char enc[4];
3802 long long val;
3803
3804 if (enctype == REDIS_RDB_ENC_INT8) {
3805 if (fread(enc,1,1,fp) == 0) return NULL;
3806 val = (signed char)enc[0];
3807 } else if (enctype == REDIS_RDB_ENC_INT16) {
3808 uint16_t v;
3809 if (fread(enc,2,1,fp) == 0) return NULL;
3810 v = enc[0]|(enc[1]<<8);
3811 val = (int16_t)v;
3812 } else if (enctype == REDIS_RDB_ENC_INT32) {
3813 uint32_t v;
3814 if (fread(enc,4,1,fp) == 0) return NULL;
3815 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3816 val = (int32_t)v;
3817 } else {
3818 val = 0; /* anti-warning */
f83c6cb5 3819 redisPanic("Unknown RDB integer encoding type");
e3566d4b 3820 }
e5b7a215 3821 return createStringObjectFromLongLong(val);
e3566d4b 3822}
3823
c78a8ccc 3824static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3825 unsigned int len, clen;
3826 unsigned char *c = NULL;
3827 sds val = NULL;
3828
c78a8ccc 3829 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3830 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3831 if ((c = zmalloc(clen)) == NULL) goto err;
3832 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3833 if (fread(c,clen,1,fp) == 0) goto err;
3834 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3835 zfree(c);
88e85998 3836 return createObject(REDIS_STRING,val);
3837err:
3838 zfree(c);
3839 sdsfree(val);
3840 return NULL;
3841}
3842
c78a8ccc 3843static robj *rdbLoadStringObject(FILE*fp) {
e3566d4b 3844 int isencoded;
3845 uint32_t len;
f78fd11b 3846 sds val;
3847
c78a8ccc 3848 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3849 if (isencoded) {
3850 switch(len) {
3851 case REDIS_RDB_ENC_INT8:
3852 case REDIS_RDB_ENC_INT16:
3853 case REDIS_RDB_ENC_INT32:
bdcb92f2 3854 return rdbLoadIntegerObject(fp,len);
88e85998 3855 case REDIS_RDB_ENC_LZF:
bdcb92f2 3856 return rdbLoadLzfStringObject(fp);
e3566d4b 3857 default:
f83c6cb5 3858 redisPanic("Unknown RDB encoding type");
e3566d4b 3859 }
3860 }
3861
f78fd11b 3862 if (len == REDIS_RDB_LENERR) return NULL;
3863 val = sdsnewlen(NULL,len);
3864 if (len && fread(val,len,1,fp) == 0) {
3865 sdsfree(val);
3866 return NULL;
3867 }
bdcb92f2 3868 return createObject(REDIS_STRING,val);
f78fd11b 3869}
3870
a7866db6 3871/* For information about double serialization check rdbSaveDoubleValue() */
3872static int rdbLoadDoubleValue(FILE *fp, double *val) {
3873 char buf[128];
3874 unsigned char len;
3875
3876 if (fread(&len,1,1,fp) == 0) return -1;
3877 switch(len) {
3878 case 255: *val = R_NegInf; return 0;
3879 case 254: *val = R_PosInf; return 0;
3880 case 253: *val = R_Nan; return 0;
3881 default:
3882 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3883 buf[len] = '\0';
a7866db6 3884 sscanf(buf, "%lg", val);
3885 return 0;
3886 }
3887}
3888
c78a8ccc 3889/* Load a Redis object of the specified type from the specified file.
3890 * On success a newly allocated object is returned, otherwise NULL. */
3891static robj *rdbLoadObject(int type, FILE *fp) {
3892 robj *o;
3893
bcd11906 3894 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 3895 if (type == REDIS_STRING) {
3896 /* Read string value */
3897 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3898 o = tryObjectEncoding(o);
c78a8ccc 3899 } else if (type == REDIS_LIST || type == REDIS_SET) {
3900 /* Read list/set value */
3901 uint32_t listlen;
3902
3903 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3904 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 3905 /* It's faster to expand the dict to the right size asap in order
3906 * to avoid rehashing */
3907 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3908 dictExpand(o->ptr,listlen);
c78a8ccc 3909 /* Load every single element of the list/set */
3910 while(listlen--) {
3911 robj *ele;
3912
3913 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3914 ele = tryObjectEncoding(ele);
c78a8ccc 3915 if (type == REDIS_LIST) {
3916 listAddNodeTail((list*)o->ptr,ele);
3917 } else {
3918 dictAdd((dict*)o->ptr,ele,NULL);
3919 }
3920 }
3921 } else if (type == REDIS_ZSET) {
3922 /* Read list/set value */
ada386b2 3923 size_t zsetlen;
c78a8ccc 3924 zset *zs;
3925
3926 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3927 o = createZsetObject();
3928 zs = o->ptr;
3929 /* Load every single element of the list/set */
3930 while(zsetlen--) {
3931 robj *ele;
3932 double *score = zmalloc(sizeof(double));
3933
3934 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3935 ele = tryObjectEncoding(ele);
c78a8ccc 3936 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3937 dictAdd(zs->dict,ele,score);
3938 zslInsert(zs->zsl,*score,ele);
3939 incrRefCount(ele); /* added to skiplist */
3940 }
ada386b2 3941 } else if (type == REDIS_HASH) {
3942 size_t hashlen;
3943
3944 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3945 o = createHashObject();
3946 /* Too many entries? Use an hash table. */
3947 if (hashlen > server.hash_max_zipmap_entries)
3948 convertToRealHash(o);
3949 /* Load every key/value, then set it into the zipmap or hash
3950 * table, as needed. */
3951 while(hashlen--) {
3952 robj *key, *val;
3953
3954 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3955 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3956 /* If we are using a zipmap and there are too big values
3957 * the object is converted to real hash table encoding. */
3958 if (o->encoding != REDIS_ENCODING_HT &&
3959 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3960 sdslen(val->ptr) > server.hash_max_zipmap_value))
3961 {
3962 convertToRealHash(o);
3963 }
3964
3965 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3966 unsigned char *zm = o->ptr;
3967
3968 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3969 val->ptr,sdslen(val->ptr),NULL);
3970 o->ptr = zm;
3971 decrRefCount(key);
3972 decrRefCount(val);
3973 } else {
05df7621 3974 key = tryObjectEncoding(key);
3975 val = tryObjectEncoding(val);
ada386b2 3976 dictAdd((dict*)o->ptr,key,val);
ada386b2 3977 }
3978 }
c78a8ccc 3979 } else {
f83c6cb5 3980 redisPanic("Unknown object type");
c78a8ccc 3981 }
3982 return o;
3983}
3984
f78fd11b 3985static int rdbLoad(char *filename) {
ed9b544e 3986 FILE *fp;
f78fd11b 3987 uint32_t dbid;
bb32ede5 3988 int type, retval, rdbver;
585af7e2 3989 int swap_all_values = 0;
3305306f 3990 dict *d = server.db[0].dict;
bb32ede5 3991 redisDb *db = server.db+0;
f78fd11b 3992 char buf[1024];
242a64f3 3993 time_t expiretime, now = time(NULL);
b492cf00 3994 long long loadedkeys = 0;
bb32ede5 3995
ed9b544e 3996 fp = fopen(filename,"r");
3997 if (!fp) return REDIS_ERR;
3998 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 3999 buf[9] = '\0';
4000 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 4001 fclose(fp);
4002 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4003 return REDIS_ERR;
4004 }
f78fd11b 4005 rdbver = atoi(buf+5);
c78a8ccc 4006 if (rdbver != 1) {
f78fd11b 4007 fclose(fp);
4008 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4009 return REDIS_ERR;
4010 }
ed9b544e 4011 while(1) {
585af7e2 4012 robj *key, *val;
ed9b544e 4013
585af7e2 4014 expiretime = -1;
ed9b544e 4015 /* Read type. */
f78fd11b 4016 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 4017 if (type == REDIS_EXPIRETIME) {
4018 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4019 /* We read the time so we need to read the object type again */
4020 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4021 }
ed9b544e 4022 if (type == REDIS_EOF) break;
4023 /* Handle SELECT DB opcode as a special case */
4024 if (type == REDIS_SELECTDB) {
c78a8ccc 4025 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 4026 goto eoferr;
ed9b544e 4027 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 4028 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 4029 exit(1);
4030 }
bb32ede5 4031 db = server.db+dbid;
4032 d = db->dict;
ed9b544e 4033 continue;
4034 }
4035 /* Read key */
585af7e2 4036 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
c78a8ccc 4037 /* Read value */
585af7e2 4038 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
89e689c5 4039 /* Check if the key already expired */
4040 if (expiretime != -1 && expiretime < now) {
4041 decrRefCount(key);
4042 decrRefCount(val);
4043 continue;
4044 }
ed9b544e 4045 /* Add the new object in the hash table */
585af7e2 4046 retval = dictAdd(d,key,val);
ed9b544e 4047 if (retval == DICT_ERR) {
585af7e2 4048 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
ed9b544e 4049 exit(1);
4050 }
242a64f3 4051 loadedkeys++;
bb32ede5 4052 /* Set the expire time if needed */
89e689c5 4053 if (expiretime != -1) setExpire(db,key,expiretime);
242a64f3 4054
b492cf00 4055 /* Handle swapping while loading big datasets when VM is on */
242a64f3 4056
4057 /* If we detecter we are hopeless about fitting something in memory
4058 * we just swap every new key on disk. Directly...
4059 * Note that's important to check for this condition before resorting
4060 * to random sampling, otherwise we may try to swap already
4061 * swapped keys. */
585af7e2 4062 if (swap_all_values) {
4063 dictEntry *de = dictFind(d,key);
242a64f3 4064
4065 /* de may be NULL since the key already expired */
4066 if (de) {
585af7e2 4067 key = dictGetEntryKey(de);
4068 val = dictGetEntryVal(de);
242a64f3 4069
585af7e2 4070 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
242a64f3 4071 dictGetEntryVal(de) = NULL;
4072 }
4073 }
4074 continue;
4075 }
4076
4077 /* If we have still some hope of having some value fitting memory
4078 * then we try random sampling. */
585af7e2 4079 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
b492cf00 4080 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 4081 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 4082 }
242a64f3 4083 if (zmalloc_used_memory() > server.vm_max_memory)
585af7e2 4084 swap_all_values = 1; /* We are already using too much mem */
b492cf00 4085 }
ed9b544e 4086 }
4087 fclose(fp);
4088 return REDIS_OK;
4089
4090eoferr: /* unexpected end of file is handled here with a fatal exit */
f80dff62 4091 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 4092 exit(1);
4093 return REDIS_ERR; /* Just to avoid warning */
4094}
4095
4096/*================================== Commands =============================== */
4097
abcb223e 4098static void authCommand(redisClient *c) {
2e77c2ee 4099 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
4100 c->authenticated = 1;
4101 addReply(c,shared.ok);
4102 } else {
4103 c->authenticated = 0;
fa4c0aba 4104 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
4105 }
4106}
4107
ed9b544e 4108static void pingCommand(redisClient *c) {
4109 addReply(c,shared.pong);
4110}
4111
4112static void echoCommand(redisClient *c) {
dd88747b 4113 addReplyBulk(c,c->argv[1]);
ed9b544e 4114}
4115
4116/*=================================== Strings =============================== */
4117
526d00a5 4118static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
ed9b544e 4119 int retval;
10ce1276 4120 long seconds = 0; /* initialized to avoid an harmness warning */
ed9b544e 4121
526d00a5 4122 if (expire) {
4123 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4124 return;
4125 if (seconds <= 0) {
4126 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4127 return;
4128 }
4129 }
4130
4131 if (nx) deleteIfVolatile(c->db,key);
4132 retval = dictAdd(c->db->dict,key,val);
ed9b544e 4133 if (retval == DICT_ERR) {
4134 if (!nx) {
1b03836c 4135 /* If the key is about a swapped value, we want a new key object
4136 * to overwrite the old. So we delete the old key in the database.
4137 * This will also make sure that swap pages about the old object
4138 * will be marked as free. */
526d00a5 4139 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4140 incrRefCount(key);
4141 dictReplace(c->db->dict,key,val);
4142 incrRefCount(val);
ed9b544e 4143 } else {
c937aa89 4144 addReply(c,shared.czero);
ed9b544e 4145 return;
4146 }
4147 } else {
526d00a5 4148 incrRefCount(key);
4149 incrRefCount(val);
ed9b544e 4150 }
4151 server.dirty++;
526d00a5 4152 removeExpire(c->db,key);
4153 if (expire) setExpire(c->db,key,time(NULL)+seconds);
c937aa89 4154 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 4155}
4156
4157static void setCommand(redisClient *c) {
526d00a5 4158 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
ed9b544e 4159}
4160
4161static void setnxCommand(redisClient *c) {
526d00a5 4162 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4163}
4164
4165static void setexCommand(redisClient *c) {
4166 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
ed9b544e 4167}
4168
322fc7d8 4169static int getGenericCommand(redisClient *c) {
dd88747b 4170 robj *o;
e0a62c7f 4171
dd88747b 4172 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 4173 return REDIS_OK;
dd88747b 4174
4175 if (o->type != REDIS_STRING) {
4176 addReply(c,shared.wrongtypeerr);
4177 return REDIS_ERR;
ed9b544e 4178 } else {
dd88747b 4179 addReplyBulk(c,o);
4180 return REDIS_OK;
ed9b544e 4181 }
4182}
4183
322fc7d8 4184static void getCommand(redisClient *c) {
4185 getGenericCommand(c);
4186}
4187
f6b141c5 4188static void getsetCommand(redisClient *c) {
322fc7d8 4189 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 4190 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4191 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4192 } else {
4193 incrRefCount(c->argv[1]);
4194 }
4195 incrRefCount(c->argv[2]);
4196 server.dirty++;
4197 removeExpire(c->db,c->argv[1]);
4198}
4199
70003d28 4200static void mgetCommand(redisClient *c) {
70003d28 4201 int j;
e0a62c7f 4202
c937aa89 4203 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 4204 for (j = 1; j < c->argc; j++) {
3305306f 4205 robj *o = lookupKeyRead(c->db,c->argv[j]);
4206 if (o == NULL) {
c937aa89 4207 addReply(c,shared.nullbulk);
70003d28 4208 } else {
70003d28 4209 if (o->type != REDIS_STRING) {
c937aa89 4210 addReply(c,shared.nullbulk);
70003d28 4211 } else {
dd88747b 4212 addReplyBulk(c,o);
70003d28 4213 }
4214 }
4215 }
4216}
4217
6c446631 4218static void msetGenericCommand(redisClient *c, int nx) {
906573e7 4219 int j, busykeys = 0;
6c446631 4220
4221 if ((c->argc % 2) == 0) {
454d4e43 4222 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 4223 return;
4224 }
4225 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4226 * set nothing at all if at least one already key exists. */
4227 if (nx) {
4228 for (j = 1; j < c->argc; j += 2) {
906573e7 4229 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4230 busykeys++;
6c446631 4231 }
4232 }
4233 }
906573e7 4234 if (busykeys) {
4235 addReply(c, shared.czero);
4236 return;
4237 }
6c446631 4238
4239 for (j = 1; j < c->argc; j += 2) {
4240 int retval;
4241
05df7621 4242 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
6c446631 4243 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4244 if (retval == DICT_ERR) {
4245 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4246 incrRefCount(c->argv[j+1]);
4247 } else {
4248 incrRefCount(c->argv[j]);
4249 incrRefCount(c->argv[j+1]);
4250 }
4251 removeExpire(c->db,c->argv[j]);
4252 }
4253 server.dirty += (c->argc-1)/2;
4254 addReply(c, nx ? shared.cone : shared.ok);
4255}
4256
4257static void msetCommand(redisClient *c) {
4258 msetGenericCommand(c,0);
4259}
4260
4261static void msetnxCommand(redisClient *c) {
4262 msetGenericCommand(c,1);
4263}
4264
d68ed120 4265static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4266 long long value;
4267 int retval;
4268 robj *o;
e0a62c7f 4269
3305306f 4270 o = lookupKeyWrite(c->db,c->argv[1]);
ed9b544e 4271
bd79a6bd 4272 if (getLongLongFromObjectOrReply(c, o, &value, NULL) != REDIS_OK) return;
ed9b544e 4273
4274 value += incr;
4275 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
05df7621 4276 o = tryObjectEncoding(o);
3305306f 4277 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 4278 if (retval == DICT_ERR) {
3305306f 4279 dictReplace(c->db->dict,c->argv[1],o);
4280 removeExpire(c->db,c->argv[1]);
ed9b544e 4281 } else {
4282 incrRefCount(c->argv[1]);
4283 }
4284 server.dirty++;
c937aa89 4285 addReply(c,shared.colon);
ed9b544e 4286 addReply(c,o);
4287 addReply(c,shared.crlf);
4288}
4289
4290static void incrCommand(redisClient *c) {
a4d1ba9a 4291 incrDecrCommand(c,1);
ed9b544e 4292}
4293
4294static void decrCommand(redisClient *c) {
a4d1ba9a 4295 incrDecrCommand(c,-1);
ed9b544e 4296}
4297
4298static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4299 long long incr;
4300
bd79a6bd 4301 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4302 incrDecrCommand(c,incr);
ed9b544e 4303}
4304
4305static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4306 long long incr;
4307
bd79a6bd 4308 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4309 incrDecrCommand(c,-incr);
ed9b544e 4310}
4311
4b00bebd 4312static void appendCommand(redisClient *c) {
4313 int retval;
4314 size_t totlen;
4315 robj *o;
4316
4317 o = lookupKeyWrite(c->db,c->argv[1]);
4318 if (o == NULL) {
4319 /* Create the key */
4320 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4321 incrRefCount(c->argv[1]);
4322 incrRefCount(c->argv[2]);
4323 totlen = stringObjectLen(c->argv[2]);
4324 } else {
4325 dictEntry *de;
e0a62c7f 4326
4b00bebd 4327 de = dictFind(c->db->dict,c->argv[1]);
4328 assert(de != NULL);
4329
4330 o = dictGetEntryVal(de);
4331 if (o->type != REDIS_STRING) {
4332 addReply(c,shared.wrongtypeerr);
4333 return;
4334 }
4335 /* If the object is specially encoded or shared we have to make
4336 * a copy */
4337 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4338 robj *decoded = getDecodedObject(o);
4339
4340 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4341 decrRefCount(decoded);
4342 dictReplace(c->db->dict,c->argv[1],o);
4343 }
4344 /* APPEND! */
4345 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4346 o->ptr = sdscatlen(o->ptr,
4347 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4348 } else {
4349 o->ptr = sdscatprintf(o->ptr, "%ld",
4350 (unsigned long) c->argv[2]->ptr);
4351 }
4352 totlen = sdslen(o->ptr);
4353 }
4354 server.dirty++;
4355 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4356}
4357
39191553 4358static void substrCommand(redisClient *c) {
4359 robj *o;
4360 long start = atoi(c->argv[2]->ptr);
4361 long end = atoi(c->argv[3]->ptr);
dd88747b 4362 size_t rangelen, strlen;
4363 sds range;
39191553 4364
dd88747b 4365 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4366 checkType(c,o,REDIS_STRING)) return;
39191553 4367
dd88747b 4368 o = getDecodedObject(o);
4369 strlen = sdslen(o->ptr);
8fe7fad7 4370
dd88747b 4371 /* convert negative indexes */
4372 if (start < 0) start = strlen+start;
4373 if (end < 0) end = strlen+end;
4374 if (start < 0) start = 0;
4375 if (end < 0) end = 0;
39191553 4376
dd88747b 4377 /* indexes sanity checks */
4378 if (start > end || (size_t)start >= strlen) {
4379 /* Out of range start or start > end result in null reply */
4380 addReply(c,shared.nullbulk);
4381 decrRefCount(o);
4382 return;
39191553 4383 }
dd88747b 4384 if ((size_t)end >= strlen) end = strlen-1;
4385 rangelen = (end-start)+1;
4386
4387 /* Return the result */
4388 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4389 range = sdsnewlen((char*)o->ptr+start,rangelen);
4390 addReplySds(c,range);
4391 addReply(c,shared.crlf);
4392 decrRefCount(o);
39191553 4393}
4394
ed9b544e 4395/* ========================= Type agnostic commands ========================= */
4396
4397static void delCommand(redisClient *c) {
5109cdff 4398 int deleted = 0, j;
4399
4400 for (j = 1; j < c->argc; j++) {
4401 if (deleteKey(c->db,c->argv[j])) {
4402 server.dirty++;
4403 deleted++;
4404 }
4405 }
dd88747b 4406 addReplyLong(c,deleted);
ed9b544e 4407}
4408
4409static void existsCommand(redisClient *c) {
3305306f 4410 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
ed9b544e 4411}
4412
4413static void selectCommand(redisClient *c) {
4414 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4415
ed9b544e 4416 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4417 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4418 } else {
4419 addReply(c,shared.ok);
4420 }
4421}
4422
4423static void randomkeyCommand(redisClient *c) {
4424 dictEntry *de;
dc4be23e 4425 robj *key;
e0a62c7f 4426
3305306f 4427 while(1) {
4428 de = dictGetRandomKey(c->db->dict);
ce7bef07 4429 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4430 }
2b619329 4431
ed9b544e 4432 if (de == NULL) {
dc4be23e 4433 addReply(c,shared.nullbulk);
4434 return;
4435 }
4436
4437 key = dictGetEntryKey(de);
4438 if (server.vm_enabled) {
4439 key = dupStringObject(key);
4440 addReplyBulk(c,key);
4441 decrRefCount(key);
ed9b544e 4442 } else {
dc4be23e 4443 addReplyBulk(c,key);
ed9b544e 4444 }
4445}
4446
4447static void keysCommand(redisClient *c) {
4448 dictIterator *di;
4449 dictEntry *de;
4450 sds pattern = c->argv[1]->ptr;
4451 int plen = sdslen(pattern);
a3f9eec2 4452 unsigned long numkeys = 0;
ed9b544e 4453 robj *lenobj = createObject(REDIS_STRING,NULL);
4454
3305306f 4455 di = dictGetIterator(c->db->dict);
ed9b544e 4456 addReply(c,lenobj);
4457 decrRefCount(lenobj);
4458 while((de = dictNext(di)) != NULL) {
4459 robj *keyobj = dictGetEntryKey(de);
3305306f 4460
ed9b544e 4461 sds key = keyobj->ptr;
4462 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4463 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4464 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4465 addReplyBulk(c,keyobj);
3305306f 4466 numkeys++;
3305306f 4467 }
ed9b544e 4468 }
4469 }
4470 dictReleaseIterator(di);
a3f9eec2 4471 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4472}
4473
4474static void dbsizeCommand(redisClient *c) {
4475 addReplySds(c,
3305306f 4476 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4477}
4478
4479static void lastsaveCommand(redisClient *c) {
4480 addReplySds(c,
c937aa89 4481 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4482}
4483
4484static void typeCommand(redisClient *c) {
3305306f 4485 robj *o;
ed9b544e 4486 char *type;
3305306f 4487
4488 o = lookupKeyRead(c->db,c->argv[1]);
4489 if (o == NULL) {
c937aa89 4490 type = "+none";
ed9b544e 4491 } else {
ed9b544e 4492 switch(o->type) {
c937aa89 4493 case REDIS_STRING: type = "+string"; break;
4494 case REDIS_LIST: type = "+list"; break;
4495 case REDIS_SET: type = "+set"; break;
412a8bce 4496 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4497 case REDIS_HASH: type = "+hash"; break;
4498 default: type = "+unknown"; break;
ed9b544e 4499 }
4500 }
4501 addReplySds(c,sdsnew(type));
4502 addReply(c,shared.crlf);
4503}
4504
4505static void saveCommand(redisClient *c) {
9d65a1bb 4506 if (server.bgsavechildpid != -1) {
05557f6d 4507 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4508 return;
4509 }
f78fd11b 4510 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4511 addReply(c,shared.ok);
4512 } else {
4513 addReply(c,shared.err);
4514 }
4515}
4516
4517static void bgsaveCommand(redisClient *c) {
9d65a1bb 4518 if (server.bgsavechildpid != -1) {
ed9b544e 4519 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4520 return;
4521 }
f78fd11b 4522 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4523 char *status = "+Background saving started\r\n";
4524 addReplySds(c,sdsnew(status));
ed9b544e 4525 } else {
4526 addReply(c,shared.err);
4527 }
4528}
4529
4530static void shutdownCommand(redisClient *c) {
4531 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
a3b21203 4532 /* Kill the saving child if there is a background saving in progress.
4533 We want to avoid race conditions, for instance our saving child may
4534 overwrite the synchronous saving did by SHUTDOWN. */
9d65a1bb 4535 if (server.bgsavechildpid != -1) {
9f3c422c 4536 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4537 kill(server.bgsavechildpid,SIGKILL);
a3b21203 4538 rdbRemoveTempFile(server.bgsavechildpid);
9f3c422c 4539 }
ac945e2d 4540 if (server.appendonly) {
4541 /* Append only file: fsync() the AOF and exit */
4542 fsync(server.appendfd);
054e426d 4543 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4544 exit(0);
ed9b544e 4545 } else {
ac945e2d 4546 /* Snapshotting. Perform a SYNC SAVE and exit */
4547 if (rdbSave(server.dbfilename) == REDIS_OK) {
4548 if (server.daemonize)
4549 unlink(server.pidfile);
4550 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4551 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4552 exit(0);
4553 } else {
dd88747b 4554 /* Ooops.. error saving! The best we can do is to continue
4555 * operating. Note that if there was a background saving process,
4556 * in the next cron() Redis will be notified that the background
4557 * saving aborted, handling special stuff like slaves pending for
4558 * synchronization... */
e0a62c7f 4559 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
dd88747b 4560 addReplySds(c,
4561 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
ac945e2d 4562 }
ed9b544e 4563 }
4564}
4565
4566static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4567 robj *o;
4568
4569 /* To use the same key as src and dst is probably an error */
4570 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4571 addReply(c,shared.sameobjecterr);
ed9b544e 4572 return;
4573 }
4574
dd88747b 4575 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4576 return;
dd88747b 4577
ed9b544e 4578 incrRefCount(o);
3305306f 4579 deleteIfVolatile(c->db,c->argv[2]);
4580 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4581 if (nx) {
4582 decrRefCount(o);
c937aa89 4583 addReply(c,shared.czero);
ed9b544e 4584 return;
4585 }
3305306f 4586 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4587 } else {
4588 incrRefCount(c->argv[2]);
4589 }
3305306f 4590 deleteKey(c->db,c->argv[1]);
ed9b544e 4591 server.dirty++;
c937aa89 4592 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4593}
4594
4595static void renameCommand(redisClient *c) {
4596 renameGenericCommand(c,0);
4597}
4598
4599static void renamenxCommand(redisClient *c) {
4600 renameGenericCommand(c,1);
4601}
4602
4603static void moveCommand(redisClient *c) {
3305306f 4604 robj *o;
4605 redisDb *src, *dst;
ed9b544e 4606 int srcid;
4607
4608 /* Obtain source and target DB pointers */
3305306f 4609 src = c->db;
4610 srcid = c->db->id;
ed9b544e 4611 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4612 addReply(c,shared.outofrangeerr);
ed9b544e 4613 return;
4614 }
3305306f 4615 dst = c->db;
4616 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4617
4618 /* If the user is moving using as target the same
4619 * DB as the source DB it is probably an error. */
4620 if (src == dst) {
c937aa89 4621 addReply(c,shared.sameobjecterr);
ed9b544e 4622 return;
4623 }
4624
4625 /* Check if the element exists and get a reference */
3305306f 4626 o = lookupKeyWrite(c->db,c->argv[1]);
4627 if (!o) {
c937aa89 4628 addReply(c,shared.czero);
ed9b544e 4629 return;
4630 }
4631
4632 /* Try to add the element to the target DB */
3305306f 4633 deleteIfVolatile(dst,c->argv[1]);
4634 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4635 addReply(c,shared.czero);
ed9b544e 4636 return;
4637 }
3305306f 4638 incrRefCount(c->argv[1]);
ed9b544e 4639 incrRefCount(o);
4640
4641 /* OK! key moved, free the entry in the source DB */
3305306f 4642 deleteKey(src,c->argv[1]);
ed9b544e 4643 server.dirty++;
c937aa89 4644 addReply(c,shared.cone);
ed9b544e 4645}
4646
4647/* =================================== Lists ================================ */
4648static void pushGenericCommand(redisClient *c, int where) {
4649 robj *lobj;
ed9b544e 4650 list *list;
3305306f 4651
4652 lobj = lookupKeyWrite(c->db,c->argv[1]);
4653 if (lobj == NULL) {
95242ab5 4654 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4655 addReply(c,shared.cone);
95242ab5 4656 return;
4657 }
ed9b544e 4658 lobj = createListObject();
4659 list = lobj->ptr;
4660 if (where == REDIS_HEAD) {
6b47e12e 4661 listAddNodeHead(list,c->argv[2]);
ed9b544e 4662 } else {
6b47e12e 4663 listAddNodeTail(list,c->argv[2]);
ed9b544e 4664 }
3305306f 4665 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4666 incrRefCount(c->argv[1]);
4667 incrRefCount(c->argv[2]);
4668 } else {
ed9b544e 4669 if (lobj->type != REDIS_LIST) {
4670 addReply(c,shared.wrongtypeerr);
4671 return;
4672 }
95242ab5 4673 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4674 addReply(c,shared.cone);
95242ab5 4675 return;
4676 }
ed9b544e 4677 list = lobj->ptr;
4678 if (where == REDIS_HEAD) {
6b47e12e 4679 listAddNodeHead(list,c->argv[2]);
ed9b544e 4680 } else {
6b47e12e 4681 listAddNodeTail(list,c->argv[2]);
ed9b544e 4682 }
4683 incrRefCount(c->argv[2]);
4684 }
4685 server.dirty++;
520b5a33 4686 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
ed9b544e 4687}
4688
4689static void lpushCommand(redisClient *c) {
4690 pushGenericCommand(c,REDIS_HEAD);
4691}
4692
4693static void rpushCommand(redisClient *c) {
4694 pushGenericCommand(c,REDIS_TAIL);
4695}
4696
4697static void llenCommand(redisClient *c) {
3305306f 4698 robj *o;
ed9b544e 4699 list *l;
dd88747b 4700
4701 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4702 checkType(c,o,REDIS_LIST)) return;
e0a62c7f 4703
dd88747b 4704 l = o->ptr;
4705 addReplyUlong(c,listLength(l));
ed9b544e 4706}
4707
4708static void lindexCommand(redisClient *c) {
3305306f 4709 robj *o;
ed9b544e 4710 int index = atoi(c->argv[2]->ptr);
dd88747b 4711 list *list;
4712 listNode *ln;
4713
4714 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4715 checkType(c,o,REDIS_LIST)) return;
4716 list = o->ptr;
4717
4718 ln = listIndex(list, index);
4719 if (ln == NULL) {
c937aa89 4720 addReply(c,shared.nullbulk);
ed9b544e 4721 } else {
dd88747b 4722 robj *ele = listNodeValue(ln);
4723 addReplyBulk(c,ele);
ed9b544e 4724 }
4725}
4726
4727static void lsetCommand(redisClient *c) {
3305306f 4728 robj *o;
ed9b544e 4729 int index = atoi(c->argv[2]->ptr);
dd88747b 4730 list *list;
4731 listNode *ln;
4732
4733 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4734 checkType(c,o,REDIS_LIST)) return;
4735 list = o->ptr;
4736
4737 ln = listIndex(list, index);
4738 if (ln == NULL) {
4739 addReply(c,shared.outofrangeerr);
ed9b544e 4740 } else {
dd88747b 4741 robj *ele = listNodeValue(ln);
ed9b544e 4742
dd88747b 4743 decrRefCount(ele);
4744 listNodeValue(ln) = c->argv[3];
4745 incrRefCount(c->argv[3]);
4746 addReply(c,shared.ok);
4747 server.dirty++;
ed9b544e 4748 }
4749}
4750
4751static void popGenericCommand(redisClient *c, int where) {
3305306f 4752 robj *o;
dd88747b 4753 list *list;
4754 listNode *ln;
3305306f 4755
dd88747b 4756 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4757 checkType(c,o,REDIS_LIST)) return;
4758 list = o->ptr;
ed9b544e 4759
dd88747b 4760 if (where == REDIS_HEAD)
4761 ln = listFirst(list);
4762 else
4763 ln = listLast(list);
ed9b544e 4764
dd88747b 4765 if (ln == NULL) {
4766 addReply(c,shared.nullbulk);
4767 } else {
4768 robj *ele = listNodeValue(ln);
4769 addReplyBulk(c,ele);
4770 listDelNode(list,ln);
3ea27d37 4771 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4772 server.dirty++;
ed9b544e 4773 }
4774}
4775
4776static void lpopCommand(redisClient *c) {
4777 popGenericCommand(c,REDIS_HEAD);
4778}
4779
4780static void rpopCommand(redisClient *c) {
4781 popGenericCommand(c,REDIS_TAIL);
4782}
4783
4784static void lrangeCommand(redisClient *c) {
3305306f 4785 robj *o;
ed9b544e 4786 int start = atoi(c->argv[2]->ptr);
4787 int end = atoi(c->argv[3]->ptr);
dd88747b 4788 int llen;
4789 int rangelen, j;
4790 list *list;
4791 listNode *ln;
4792 robj *ele;
4793
4e27f268 4794 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4795 || checkType(c,o,REDIS_LIST)) return;
dd88747b 4796 list = o->ptr;
4797 llen = listLength(list);
4798
4799 /* convert negative indexes */
4800 if (start < 0) start = llen+start;
4801 if (end < 0) end = llen+end;
4802 if (start < 0) start = 0;
4803 if (end < 0) end = 0;
4804
4805 /* indexes sanity checks */
4806 if (start > end || start >= llen) {
4807 /* Out of range start or start > end result in empty list */
4808 addReply(c,shared.emptymultibulk);
4809 return;
4810 }
4811 if (end >= llen) end = llen-1;
4812 rangelen = (end-start)+1;
3305306f 4813
dd88747b 4814 /* Return the result in form of a multi-bulk reply */
4815 ln = listIndex(list, start);
4816 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4817 for (j = 0; j < rangelen; j++) {
4818 ele = listNodeValue(ln);
4819 addReplyBulk(c,ele);
4820 ln = ln->next;
ed9b544e 4821 }
4822}
4823
4824static void ltrimCommand(redisClient *c) {
3305306f 4825 robj *o;
ed9b544e 4826 int start = atoi(c->argv[2]->ptr);
4827 int end = atoi(c->argv[3]->ptr);
dd88747b 4828 int llen;
4829 int j, ltrim, rtrim;
4830 list *list;
4831 listNode *ln;
4832
4833 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4834 checkType(c,o,REDIS_LIST)) return;
4835 list = o->ptr;
4836 llen = listLength(list);
4837
4838 /* convert negative indexes */
4839 if (start < 0) start = llen+start;
4840 if (end < 0) end = llen+end;
4841 if (start < 0) start = 0;
4842 if (end < 0) end = 0;
4843
4844 /* indexes sanity checks */
4845 if (start > end || start >= llen) {
4846 /* Out of range start or start > end result in empty list */
4847 ltrim = llen;
4848 rtrim = 0;
ed9b544e 4849 } else {
dd88747b 4850 if (end >= llen) end = llen-1;
4851 ltrim = start;
4852 rtrim = llen-end-1;
4853 }
ed9b544e 4854
dd88747b 4855 /* Remove list elements to perform the trim */
4856 for (j = 0; j < ltrim; j++) {
4857 ln = listFirst(list);
4858 listDelNode(list,ln);
4859 }
4860 for (j = 0; j < rtrim; j++) {
4861 ln = listLast(list);
4862 listDelNode(list,ln);
ed9b544e 4863 }
3ea27d37 4864 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4865 server.dirty++;
4866 addReply(c,shared.ok);
ed9b544e 4867}
4868
4869static void lremCommand(redisClient *c) {
3305306f 4870 robj *o;
dd88747b 4871 list *list;
4872 listNode *ln, *next;
4873 int toremove = atoi(c->argv[2]->ptr);
4874 int removed = 0;
4875 int fromtail = 0;
a4d1ba9a 4876
dd88747b 4877 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4878 checkType(c,o,REDIS_LIST)) return;
4879 list = o->ptr;
4880
4881 if (toremove < 0) {
4882 toremove = -toremove;
4883 fromtail = 1;
4884 }
4885 ln = fromtail ? list->tail : list->head;
4886 while (ln) {
4887 robj *ele = listNodeValue(ln);
4888
4889 next = fromtail ? ln->prev : ln->next;
bf028098 4890 if (equalStringObjects(ele,c->argv[3])) {
dd88747b 4891 listDelNode(list,ln);
4892 server.dirty++;
4893 removed++;
4894 if (toremove && removed == toremove) break;
ed9b544e 4895 }
dd88747b 4896 ln = next;
ed9b544e 4897 }
3ea27d37 4898 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4899 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 4900}
4901
12f9d551 4902/* This is the semantic of this command:
0f5f7e9a 4903 * RPOPLPUSH srclist dstlist:
12f9d551 4904 * IF LLEN(srclist) > 0
4905 * element = RPOP srclist
4906 * LPUSH dstlist element
4907 * RETURN element
4908 * ELSE
4909 * RETURN nil
4910 * END
4911 * END
4912 *
4913 * The idea is to be able to get an element from a list in a reliable way
4914 * since the element is not just returned but pushed against another list
4915 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4916 */
0f5f7e9a 4917static void rpoplpushcommand(redisClient *c) {
12f9d551 4918 robj *sobj;
dd88747b 4919 list *srclist;
4920 listNode *ln;
4921
4922 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4923 checkType(c,sobj,REDIS_LIST)) return;
4924 srclist = sobj->ptr;
4925 ln = listLast(srclist);
12f9d551 4926
dd88747b 4927 if (ln == NULL) {
12f9d551 4928 addReply(c,shared.nullbulk);
4929 } else {
dd88747b 4930 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4931 robj *ele = listNodeValue(ln);
4932 list *dstlist;
e20fb74f 4933
dd88747b 4934 if (dobj && dobj->type != REDIS_LIST) {
4935 addReply(c,shared.wrongtypeerr);
4936 return;
4937 }
12f9d551 4938
dd88747b 4939 /* Add the element to the target list (unless it's directly
4940 * passed to some BLPOP-ing client */
4941 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4942 if (dobj == NULL) {
4943 /* Create the list if the key does not exist */
4944 dobj = createListObject();
4945 dictAdd(c->db->dict,c->argv[2],dobj);
4946 incrRefCount(c->argv[2]);
12f9d551 4947 }
dd88747b 4948 dstlist = dobj->ptr;
4949 listAddNodeHead(dstlist,ele);
4950 incrRefCount(ele);
12f9d551 4951 }
dd88747b 4952
4953 /* Send the element to the client as reply as well */
4954 addReplyBulk(c,ele);
4955
4956 /* Finally remove the element from the source list */
4957 listDelNode(srclist,ln);
3ea27d37 4958 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4959 server.dirty++;
12f9d551 4960 }
4961}
4962
ed9b544e 4963/* ==================================== Sets ================================ */
4964
4965static void saddCommand(redisClient *c) {
ed9b544e 4966 robj *set;
4967
3305306f 4968 set = lookupKeyWrite(c->db,c->argv[1]);
4969 if (set == NULL) {
ed9b544e 4970 set = createSetObject();
3305306f 4971 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 4972 incrRefCount(c->argv[1]);
4973 } else {
ed9b544e 4974 if (set->type != REDIS_SET) {
c937aa89 4975 addReply(c,shared.wrongtypeerr);
ed9b544e 4976 return;
4977 }
4978 }
4979 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4980 incrRefCount(c->argv[2]);
4981 server.dirty++;
c937aa89 4982 addReply(c,shared.cone);
ed9b544e 4983 } else {
c937aa89 4984 addReply(c,shared.czero);
ed9b544e 4985 }
4986}
4987
4988static void sremCommand(redisClient *c) {
3305306f 4989 robj *set;
ed9b544e 4990
dd88747b 4991 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4992 checkType(c,set,REDIS_SET)) return;
4993
4994 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4995 server.dirty++;
4996 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 4997 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4998 addReply(c,shared.cone);
ed9b544e 4999 } else {
dd88747b 5000 addReply(c,shared.czero);
ed9b544e 5001 }
5002}
5003
a4460ef4 5004static void smoveCommand(redisClient *c) {
5005 robj *srcset, *dstset;
5006
5007 srcset = lookupKeyWrite(c->db,c->argv[1]);
5008 dstset = lookupKeyWrite(c->db,c->argv[2]);
5009
5010 /* If the source key does not exist return 0, if it's of the wrong type
5011 * raise an error */
5012 if (srcset == NULL || srcset->type != REDIS_SET) {
5013 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5014 return;
5015 }
5016 /* Error if the destination key is not a set as well */
5017 if (dstset && dstset->type != REDIS_SET) {
5018 addReply(c,shared.wrongtypeerr);
5019 return;
5020 }
5021 /* Remove the element from the source set */
5022 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5023 /* Key not found in the src set! return zero */
5024 addReply(c,shared.czero);
5025 return;
5026 }
3ea27d37 5027 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5028 deleteKey(c->db,c->argv[1]);
a4460ef4 5029 server.dirty++;
5030 /* Add the element to the destination set */
5031 if (!dstset) {
5032 dstset = createSetObject();
5033 dictAdd(c->db->dict,c->argv[2],dstset);
5034 incrRefCount(c->argv[2]);
5035 }
5036 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5037 incrRefCount(c->argv[3]);
5038 addReply(c,shared.cone);
5039}
5040
ed9b544e 5041static void sismemberCommand(redisClient *c) {
3305306f 5042 robj *set;
ed9b544e 5043
dd88747b 5044 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5045 checkType(c,set,REDIS_SET)) return;
5046
5047 if (dictFind(set->ptr,c->argv[2]))
5048 addReply(c,shared.cone);
5049 else
c937aa89 5050 addReply(c,shared.czero);
ed9b544e 5051}
5052
5053static void scardCommand(redisClient *c) {
3305306f 5054 robj *o;
ed9b544e 5055 dict *s;
dd88747b 5056
5057 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5058 checkType(c,o,REDIS_SET)) return;
e0a62c7f 5059
dd88747b 5060 s = o->ptr;
5061 addReplyUlong(c,dictSize(s));
ed9b544e 5062}
5063
12fea928 5064static void spopCommand(redisClient *c) {
5065 robj *set;
5066 dictEntry *de;
5067
dd88747b 5068 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5069 checkType(c,set,REDIS_SET)) return;
5070
5071 de = dictGetRandomKey(set->ptr);
5072 if (de == NULL) {
12fea928 5073 addReply(c,shared.nullbulk);
5074 } else {
dd88747b 5075 robj *ele = dictGetEntryKey(de);
12fea928 5076
dd88747b 5077 addReplyBulk(c,ele);
5078 dictDelete(set->ptr,ele);
5079 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5080 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5081 server.dirty++;
12fea928 5082 }
5083}
5084
2abb95a9 5085static void srandmemberCommand(redisClient *c) {
5086 robj *set;
5087 dictEntry *de;
5088
dd88747b 5089 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5090 checkType(c,set,REDIS_SET)) return;
5091
5092 de = dictGetRandomKey(set->ptr);
5093 if (de == NULL) {
2abb95a9 5094 addReply(c,shared.nullbulk);
5095 } else {
dd88747b 5096 robj *ele = dictGetEntryKey(de);
2abb95a9 5097
dd88747b 5098 addReplyBulk(c,ele);
2abb95a9 5099 }
5100}
5101
ed9b544e 5102static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5103 dict **d1 = (void*) s1, **d2 = (void*) s2;
5104
3305306f 5105 return dictSize(*d1)-dictSize(*d2);
ed9b544e 5106}
5107
682ac724 5108static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 5109 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5110 dictIterator *di;
5111 dictEntry *de;
5112 robj *lenobj = NULL, *dstset = NULL;
682ac724 5113 unsigned long j, cardinality = 0;
ed9b544e 5114
ed9b544e 5115 for (j = 0; j < setsnum; j++) {
5116 robj *setobj;
3305306f 5117
5118 setobj = dstkey ?
5119 lookupKeyWrite(c->db,setskeys[j]) :
5120 lookupKeyRead(c->db,setskeys[j]);
5121 if (!setobj) {
ed9b544e 5122 zfree(dv);
5faa6025 5123 if (dstkey) {
fdcaae84 5124 if (deleteKey(c->db,dstkey))
5125 server.dirty++;
0d36ded0 5126 addReply(c,shared.czero);
5faa6025 5127 } else {
4e27f268 5128 addReply(c,shared.emptymultibulk);
5faa6025 5129 }
ed9b544e 5130 return;
5131 }
ed9b544e 5132 if (setobj->type != REDIS_SET) {
5133 zfree(dv);
c937aa89 5134 addReply(c,shared.wrongtypeerr);
ed9b544e 5135 return;
5136 }
5137 dv[j] = setobj->ptr;
5138 }
5139 /* Sort sets from the smallest to largest, this will improve our
5140 * algorithm's performace */
5141 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5142
5143 /* The first thing we should output is the total number of elements...
5144 * since this is a multi-bulk write, but at this stage we don't know
5145 * the intersection set size, so we use a trick, append an empty object
5146 * to the output list and save the pointer to later modify it with the
5147 * right length */
5148 if (!dstkey) {
5149 lenobj = createObject(REDIS_STRING,NULL);
5150 addReply(c,lenobj);
5151 decrRefCount(lenobj);
5152 } else {
5153 /* If we have a target key where to store the resulting set
5154 * create this key with an empty set inside */
5155 dstset = createSetObject();
ed9b544e 5156 }
5157
5158 /* Iterate all the elements of the first (smallest) set, and test
5159 * the element against all the other sets, if at least one set does
5160 * not include the element it is discarded */
5161 di = dictGetIterator(dv[0]);
ed9b544e 5162
5163 while((de = dictNext(di)) != NULL) {
5164 robj *ele;
5165
5166 for (j = 1; j < setsnum; j++)
5167 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5168 if (j != setsnum)
5169 continue; /* at least one set does not contain the member */
5170 ele = dictGetEntryKey(de);
5171 if (!dstkey) {
dd88747b 5172 addReplyBulk(c,ele);
ed9b544e 5173 cardinality++;
5174 } else {
5175 dictAdd(dstset->ptr,ele,NULL);
5176 incrRefCount(ele);
5177 }
5178 }
5179 dictReleaseIterator(di);
5180
83cdfe18 5181 if (dstkey) {
3ea27d37 5182 /* Store the resulting set into the target, if the intersection
5183 * is not an empty set. */
83cdfe18 5184 deleteKey(c->db,dstkey);
3ea27d37 5185 if (dictSize((dict*)dstset->ptr) > 0) {
5186 dictAdd(c->db->dict,dstkey,dstset);
5187 incrRefCount(dstkey);
d36c4e97 5188 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5189 } else {
5190 decrRefCount(dstset);
d36c4e97 5191 addReply(c,shared.czero);
3ea27d37 5192 }
40d224a9 5193 server.dirty++;
d36c4e97 5194 } else {
5195 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 5196 }
ed9b544e 5197 zfree(dv);
5198}
5199
5200static void sinterCommand(redisClient *c) {
5201 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5202}
5203
5204static void sinterstoreCommand(redisClient *c) {
5205 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5206}
5207
/* Operation selectors for the generic set / sorted set commands.
 * sunionDiffGenericCommand() is called with UNION or DIFF,
 * zunionInterGenericCommand() with UNION or INTER. */
f4f56e1d 5208#define REDIS_OP_UNION 0
5209#define REDIS_OP_DIFF 1
2830ca53 5210#define REDIS_OP_INTER 2
f4f56e1d 5211
5212static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 5213 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5214 dictIterator *di;
5215 dictEntry *de;
f4f56e1d 5216 robj *dstset = NULL;
40d224a9 5217 int j, cardinality = 0;
5218
40d224a9 5219 for (j = 0; j < setsnum; j++) {
5220 robj *setobj;
5221
5222 setobj = dstkey ?
5223 lookupKeyWrite(c->db,setskeys[j]) :
5224 lookupKeyRead(c->db,setskeys[j]);
5225 if (!setobj) {
5226 dv[j] = NULL;
5227 continue;
5228 }
5229 if (setobj->type != REDIS_SET) {
5230 zfree(dv);
5231 addReply(c,shared.wrongtypeerr);
5232 return;
5233 }
5234 dv[j] = setobj->ptr;
5235 }
5236
5237 /* We need a temp set object to store our union. If the dstkey
5238 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5239 * this set object will be the resulting object to set into the target key*/
5240 dstset = createSetObject();
5241
40d224a9 5242 /* Iterate all the elements of all the sets, add every element a single
5243 * time to the result set */
5244 for (j = 0; j < setsnum; j++) {
51829ed3 5245 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 5246 if (!dv[j]) continue; /* non existing keys are like empty sets */
5247
5248 di = dictGetIterator(dv[j]);
40d224a9 5249
5250 while((de = dictNext(di)) != NULL) {
5251 robj *ele;
5252
5253 /* dictAdd will not add the same element multiple times */
5254 ele = dictGetEntryKey(de);
f4f56e1d 5255 if (op == REDIS_OP_UNION || j == 0) {
5256 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5257 incrRefCount(ele);
40d224a9 5258 cardinality++;
5259 }
f4f56e1d 5260 } else if (op == REDIS_OP_DIFF) {
5261 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5262 cardinality--;
5263 }
40d224a9 5264 }
5265 }
5266 dictReleaseIterator(di);
51829ed3 5267
d36c4e97 5268 /* result set is empty? Exit asap. */
5269 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5270 }
5271
f4f56e1d 5272 /* Output the content of the resulting set, if not in STORE mode */
5273 if (!dstkey) {
5274 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5275 di = dictGetIterator(dstset->ptr);
f4f56e1d 5276 while((de = dictNext(di)) != NULL) {
5277 robj *ele;
5278
5279 ele = dictGetEntryKey(de);
dd88747b 5280 addReplyBulk(c,ele);
f4f56e1d 5281 }
5282 dictReleaseIterator(di);
d36c4e97 5283 decrRefCount(dstset);
83cdfe18
AG
5284 } else {
5285 /* If we have a target key where to store the resulting set
5286 * create this key with the result set inside */
5287 deleteKey(c->db,dstkey);
3ea27d37 5288 if (dictSize((dict*)dstset->ptr) > 0) {
5289 dictAdd(c->db->dict,dstkey,dstset);
5290 incrRefCount(dstkey);
d36c4e97 5291 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5292 } else {
5293 decrRefCount(dstset);
d36c4e97 5294 addReply(c,shared.czero);
3ea27d37 5295 }
40d224a9 5296 server.dirty++;
5297 }
5298 zfree(dv);
5299}
5300
5301static void sunionCommand(redisClient *c) {
f4f56e1d 5302 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 5303}
5304
5305static void sunionstoreCommand(redisClient *c) {
f4f56e1d 5306 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5307}
5308
5309static void sdiffCommand(redisClient *c) {
5310 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5311}
5312
5313static void sdiffstoreCommand(redisClient *c) {
5314 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 5315}
5316
6b47e12e 5317/* ==================================== ZSets =============================== */
5318
5319/* ZSETs are ordered sets using two data structures to hold the same elements
5320 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5321 * data structure.
5322 *
5323 * The elements are added to a hash table mapping Redis objects to scores.
5324 * At the same time the elements are added to a skip list mapping scores
5325 * to Redis objects (so objects are sorted by scores in this "view"). */
5326
5327/* This skiplist implementation is almost a C translation of the original
5328 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5329 * Alternative to Balanced Trees", modified in three ways:
5330 * a) this implementation allows for repeated values.
5331 * b) the comparison is not just by key (our 'score') but by satellite data.
5332 * c) there is a back pointer, so it's a doubly linked list with the back
5333 * pointers being only at "level 1". This allows traversing the list
5334 * from tail to head, useful for ZREVRANGE. */
5335
5336static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5337 zskiplistNode *zn = zmalloc(sizeof(*zn));
5338
5339 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2b37892e
PN
5340 if (level > 0)
5341 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
6b47e12e 5342 zn->score = score;
5343 zn->obj = obj;
5344 return zn;
5345}
5346
5347static zskiplist *zslCreate(void) {
5348 int j;
5349 zskiplist *zsl;
e0a62c7f 5350
6b47e12e 5351 zsl = zmalloc(sizeof(*zsl));
5352 zsl->level = 1;
cc812361 5353 zsl->length = 0;
6b47e12e 5354 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5355 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5356 zsl->header->forward[j] = NULL;
94e543b5 5357
5358 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5359 if (j < ZSKIPLIST_MAXLEVEL-1)
5360 zsl->header->span[j] = 0;
69d95c3e 5361 }
e3870fab 5362 zsl->header->backward = NULL;
5363 zsl->tail = NULL;
6b47e12e 5364 return zsl;
5365}
5366
fd8ccf44 5367static void zslFreeNode(zskiplistNode *node) {
5368 decrRefCount(node->obj);
ad807e6f 5369 zfree(node->forward);
69d95c3e 5370 zfree(node->span);
fd8ccf44 5371 zfree(node);
5372}
5373
5374static void zslFree(zskiplist *zsl) {
ad807e6f 5375 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5376
ad807e6f 5377 zfree(zsl->header->forward);
69d95c3e 5378 zfree(zsl->header->span);
ad807e6f 5379 zfree(zsl->header);
fd8ccf44 5380 while(node) {
599379dd 5381 next = node->forward[0];
fd8ccf44 5382 zslFreeNode(node);
5383 node = next;
5384 }
ad807e6f 5385 zfree(zsl);
fd8ccf44 5386}
5387
6b47e12e 5388static int zslRandomLevel(void) {
5389 int level = 1;
5390 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5391 level += 1;
10c2baa5 5392 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5393}
5394
5395static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5396 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5397 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5398 int i, level;
5399
5400 x = zsl->header;
5401 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5402 /* store rank that is crossed to reach the insert position */
5403 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5404
9d60e6e4 5405 while (x->forward[i] &&
5406 (x->forward[i]->score < score ||
5407 (x->forward[i]->score == score &&
69d95c3e 5408 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5409 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5410 x = x->forward[i];
69d95c3e 5411 }
6b47e12e 5412 update[i] = x;
5413 }
6b47e12e 5414 /* we assume the key is not already inside, since we allow duplicated
5415 * scores, and the re-insertion of score and redis object should never
5416 * happpen since the caller of zslInsert() should test in the hash table
5417 * if the element is already inside or not. */
5418 level = zslRandomLevel();
5419 if (level > zsl->level) {
69d95c3e 5420 for (i = zsl->level; i < level; i++) {
2b37892e 5421 rank[i] = 0;
6b47e12e 5422 update[i] = zsl->header;
2b37892e 5423 update[i]->span[i-1] = zsl->length;
69d95c3e 5424 }
6b47e12e 5425 zsl->level = level;
5426 }
5427 x = zslCreateNode(level,score,obj);
5428 for (i = 0; i < level; i++) {
5429 x->forward[i] = update[i]->forward[i];
5430 update[i]->forward[i] = x;
69d95c3e
PN
5431
5432 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5433 if (i > 0) {
5434 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5435 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5436 }
6b47e12e 5437 }
69d95c3e
PN
5438
5439 /* increment span for untouched levels */
5440 for (i = level; i < zsl->level; i++) {
2b37892e 5441 update[i]->span[i-1]++;
69d95c3e
PN
5442 }
5443
bb975144 5444 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5445 if (x->forward[0])
5446 x->forward[0]->backward = x;
5447 else
5448 zsl->tail = x;
cc812361 5449 zsl->length++;
6b47e12e 5450}
5451
84105336
PN
5452/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5453void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5454 int i;
5455 for (i = 0; i < zsl->level; i++) {
5456 if (update[i]->forward[i] == x) {
5457 if (i > 0) {
5458 update[i]->span[i-1] += x->span[i-1] - 1;
5459 }
5460 update[i]->forward[i] = x->forward[i];
5461 } else {
5462 /* invariant: i > 0, because update[0]->forward[0]
5463 * is always equal to x */
5464 update[i]->span[i-1] -= 1;
5465 }
5466 }
5467 if (x->forward[0]) {
5468 x->forward[0]->backward = x->backward;
5469 } else {
5470 zsl->tail = x->backward;
5471 }
5472 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5473 zsl->level--;
5474 zsl->length--;
5475}
5476
50c55df5 5477/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5478static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5479 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5480 int i;
5481
5482 x = zsl->header;
5483 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5484 while (x->forward[i] &&
5485 (x->forward[i]->score < score ||
5486 (x->forward[i]->score == score &&
5487 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5488 x = x->forward[i];
5489 update[i] = x;
5490 }
5491 /* We may have multiple elements with the same score, what we need
5492 * is to find the element with both the right score and object. */
5493 x = x->forward[0];
bf028098 5494 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
84105336 5495 zslDeleteNode(zsl, x, update);
9d60e6e4 5496 zslFreeNode(x);
9d60e6e4 5497 return 1;
5498 } else {
5499 return 0; /* not found */
e197b441 5500 }
5501 return 0; /* not found */
fd8ccf44 5502}
5503
1807985b 5504/* Delete all the elements with score between min and max from the skiplist.
5505 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5506 * Note that this function takes the reference to the hash table view of the
5507 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5508static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5509 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5510 unsigned long removed = 0;
5511 int i;
5512
5513 x = zsl->header;
5514 for (i = zsl->level-1; i >= 0; i--) {
5515 while (x->forward[i] && x->forward[i]->score < min)
5516 x = x->forward[i];
5517 update[i] = x;
5518 }
5519 /* We may have multiple elements with the same score, what we need
5520 * is to find the element with both the right score and object. */
5521 x = x->forward[0];
5522 while (x && x->score <= max) {
84105336
PN
5523 zskiplistNode *next = x->forward[0];
5524 zslDeleteNode(zsl, x, update);
1807985b 5525 dictDelete(dict,x->obj);
5526 zslFreeNode(x);
1807985b 5527 removed++;
5528 x = next;
5529 }
5530 return removed; /* not found */
5531}
1807985b 5532
9212eafd 5533/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5534 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5535static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5536 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5537 unsigned long traversed = 0, removed = 0;
5538 int i;
5539
9212eafd
PN
5540 x = zsl->header;
5541 for (i = zsl->level-1; i >= 0; i--) {
5542 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5543 traversed += i > 0 ? x->span[i-1] : 1;
5544 x = x->forward[i];
1807985b 5545 }
9212eafd
PN
5546 update[i] = x;
5547 }
5548
5549 traversed++;
5550 x = x->forward[0];
5551 while (x && traversed <= end) {
84105336
PN
5552 zskiplistNode *next = x->forward[0];
5553 zslDeleteNode(zsl, x, update);
1807985b 5554 dictDelete(dict,x->obj);
5555 zslFreeNode(x);
1807985b 5556 removed++;
9212eafd 5557 traversed++;
1807985b 5558 x = next;
5559 }
9212eafd 5560 return removed;
1807985b 5561}
5562
50c55df5 5563/* Find the first node having a score equal or greater than the specified one.
5564 * Returns NULL if there is no match. */
5565static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5566 zskiplistNode *x;
5567 int i;
5568
5569 x = zsl->header;
5570 for (i = zsl->level-1; i >= 0; i--) {
5571 while (x->forward[i] && x->forward[i]->score < score)
5572 x = x->forward[i];
5573 }
5574 /* We may have multiple elements with the same score, what we need
5575 * is to find the element with both the right score and object. */
5576 return x->forward[0];
5577}
5578
27b0ccca
PN
5579/* Find the rank for an element by both score and key.
5580 * Returns 0 when the element cannot be found, rank otherwise.
5581 * Note that the rank is 1-based due to the span of zsl->header to the
5582 * first element. */
5583static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5584 zskiplistNode *x;
5585 unsigned long rank = 0;
5586 int i;
5587
5588 x = zsl->header;
5589 for (i = zsl->level-1; i >= 0; i--) {
5590 while (x->forward[i] &&
5591 (x->forward[i]->score < score ||
5592 (x->forward[i]->score == score &&
5593 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5594 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5595 x = x->forward[i];
5596 }
5597
5598 /* x might be equal to zsl->header, so test if obj is non-NULL */
bf028098 5599 if (x->obj && equalStringObjects(x->obj,o)) {
27b0ccca
PN
5600 return rank;
5601 }
5602 }
5603 return 0;
5604}
5605
e74825c2
PN
5606/* Finds an element by its rank. The rank argument needs to be 1-based. */
5607zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5608 zskiplistNode *x;
5609 unsigned long traversed = 0;
5610 int i;
5611
5612 x = zsl->header;
5613 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 5614 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5615 {
a50ea45c 5616 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5617 x = x->forward[i];
5618 }
e74825c2
PN
5619 if (traversed == rank) {
5620 return x;
5621 }
5622 }
5623 return NULL;
5624}
5625
fd8ccf44 5626/* The actual Z-commands implementations */
5627
7db723ad 5628/* This generic command implements both ZADD and ZINCRBY.
e2665397 5629 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5630 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5631static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5632 robj *zsetobj;
5633 zset *zs;
5634 double *score;
5635
e2665397 5636 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5637 if (zsetobj == NULL) {
5638 zsetobj = createZsetObject();
e2665397 5639 dictAdd(c->db->dict,key,zsetobj);
5640 incrRefCount(key);
fd8ccf44 5641 } else {
5642 if (zsetobj->type != REDIS_ZSET) {
5643 addReply(c,shared.wrongtypeerr);
5644 return;
5645 }
5646 }
fd8ccf44 5647 zs = zsetobj->ptr;
e2665397 5648
7db723ad 5649 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5650 * needs to handle the two different conditions. It's all about setting
5651 * '*score', that is, the new score to set, to the right value. */
5652 score = zmalloc(sizeof(double));
5653 if (doincrement) {
5654 dictEntry *de;
5655
5656 /* Read the old score. If the element was not present starts from 0 */
5657 de = dictFind(zs->dict,ele);
5658 if (de) {
5659 double *oldscore = dictGetEntryVal(de);
5660 *score = *oldscore + scoreval;
5661 } else {
5662 *score = scoreval;
5663 }
5664 } else {
5665 *score = scoreval;
5666 }
5667
5668 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5669 * to both ZADD and ZINCRBY... */
e2665397 5670 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5671 /* case 1: New element */
e2665397 5672 incrRefCount(ele); /* added to hash */
5673 zslInsert(zs->zsl,*score,ele);
5674 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5675 server.dirty++;
e2665397 5676 if (doincrement)
e2665397 5677 addReplyDouble(c,*score);
91d71bfc 5678 else
5679 addReply(c,shared.cone);
fd8ccf44 5680 } else {
5681 dictEntry *de;
5682 double *oldscore;
e0a62c7f 5683
fd8ccf44 5684 /* case 2: Score update operation */
e2665397 5685 de = dictFind(zs->dict,ele);
dfc5e96c 5686 redisAssert(de != NULL);
fd8ccf44 5687 oldscore = dictGetEntryVal(de);
5688 if (*score != *oldscore) {
5689 int deleted;
5690
e2665397 5691 /* Remove and insert the element in the skip list with new score */
5692 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5693 redisAssert(deleted != 0);
e2665397 5694 zslInsert(zs->zsl,*score,ele);
5695 incrRefCount(ele);
5696 /* Update the score in the hash table */
5697 dictReplace(zs->dict,ele,score);
fd8ccf44 5698 server.dirty++;
2161a965 5699 } else {
5700 zfree(score);
fd8ccf44 5701 }
e2665397 5702 if (doincrement)
5703 addReplyDouble(c,*score);
5704 else
5705 addReply(c,shared.czero);
fd8ccf44 5706 }
5707}
5708
e2665397 5709static void zaddCommand(redisClient *c) {
5710 double scoreval;
5711
bd79a6bd 5712 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5713 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5714}
5715
7db723ad 5716static void zincrbyCommand(redisClient *c) {
e2665397 5717 double scoreval;
5718
bd79a6bd 5719 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5720 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5721}
5722
1b7106e7 5723static void zremCommand(redisClient *c) {
5724 robj *zsetobj;
5725 zset *zs;
dd88747b 5726 dictEntry *de;
5727 double *oldscore;
5728 int deleted;
1b7106e7 5729
dd88747b 5730 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5731 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 5732
dd88747b 5733 zs = zsetobj->ptr;
5734 de = dictFind(zs->dict,c->argv[2]);
5735 if (de == NULL) {
5736 addReply(c,shared.czero);
5737 return;
1b7106e7 5738 }
dd88747b 5739 /* Delete from the skiplist */
5740 oldscore = dictGetEntryVal(de);
5741 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5742 redisAssert(deleted != 0);
5743
5744 /* Delete from the hash table */
5745 dictDelete(zs->dict,c->argv[2]);
5746 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5747 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5748 server.dirty++;
5749 addReply(c,shared.cone);
1b7106e7 5750}
5751
1807985b 5752static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
5753 double min;
5754 double max;
dd88747b 5755 long deleted;
1807985b 5756 robj *zsetobj;
5757 zset *zs;
5758
bd79a6bd
PN
5759 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5760 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
bbe025e0 5761
dd88747b 5762 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5763 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 5764
dd88747b 5765 zs = zsetobj->ptr;
5766 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5767 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5768 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5769 server.dirty += deleted;
5770 addReplyLong(c,deleted);
1807985b 5771}
5772
9212eafd 5773static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
5774 long start;
5775 long end;
dd88747b 5776 int llen;
5777 long deleted;
9212eafd
PN
5778 robj *zsetobj;
5779 zset *zs;
5780
bd79a6bd
PN
5781 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5782 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 5783
dd88747b 5784 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5785 checkType(c,zsetobj,REDIS_ZSET)) return;
5786 zs = zsetobj->ptr;
5787 llen = zs->zsl->length;
9212eafd 5788
dd88747b 5789 /* convert negative indexes */
5790 if (start < 0) start = llen+start;
5791 if (end < 0) end = llen+end;
5792 if (start < 0) start = 0;
5793 if (end < 0) end = 0;
9212eafd 5794
dd88747b 5795 /* indexes sanity checks */
5796 if (start > end || start >= llen) {
5797 addReply(c,shared.czero);
5798 return;
9212eafd 5799 }
dd88747b 5800 if (end >= llen) end = llen-1;
5801
5802 /* increment start and end because zsl*Rank functions
5803 * use 1-based rank */
5804 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5805 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5806 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5807 server.dirty += deleted;
5808 addReplyLong(c, deleted);
9212eafd
PN
5809}
5810
/* One input of a ZUNION/ZINTER operation, as filled in by
 * zunionInterGenericCommand(). */
8f92e768
PN
5811typedef struct {
/* hash table of the source sorted set, NULL when the key does not exist */
5812 dict *dict;
/* factor applied to every score read from this source (defaults to 1.0) */
5813 double weight;
5814} zsetopsrc;
5815
5816static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5817 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5818 unsigned long size1, size2;
5819 size1 = d1->dict ? dictSize(d1->dict) : 0;
5820 size2 = d2->dict ? dictSize(d2->dict) : 0;
5821 return size1 - size2;
5822}
5823
/* Ways zunionInterAggregate() can combine the scores an element has in
 * the different input sorted sets: add them, keep the smallest, or keep
 * the largest. */
d2764cd6
PN
5824#define REDIS_AGGR_SUM 1
5825#define REDIS_AGGR_MIN 2
5826#define REDIS_AGGR_MAX 3
5827
5828inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5829 if (aggregate == REDIS_AGGR_SUM) {
5830 *target = *target + val;
5831 } else if (aggregate == REDIS_AGGR_MIN) {
5832 *target = val < *target ? val : *target;
5833 } else if (aggregate == REDIS_AGGR_MAX) {
5834 *target = val > *target ? val : *target;
5835 } else {
5836 /* safety net */
f83c6cb5 5837 redisPanic("Unknown ZUNION/INTER aggregate type");
d2764cd6
PN
5838 }
5839}
5840
2830ca53 5841static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
8f92e768 5842 int i, j, zsetnum;
d2764cd6 5843 int aggregate = REDIS_AGGR_SUM;
8f92e768 5844 zsetopsrc *src;
2830ca53
PN
5845 robj *dstobj;
5846 zset *dstzset;
b287c9bb
PN
5847 dictIterator *di;
5848 dictEntry *de;
5849
2830ca53
PN
5850 /* expect zsetnum input keys to be given */
5851 zsetnum = atoi(c->argv[2]->ptr);
5852 if (zsetnum < 1) {
5853 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5854 return;
b287c9bb 5855 }
2830ca53
PN
5856
5857 /* test if the expected number of keys would overflow */
5858 if (3+zsetnum > c->argc) {
b287c9bb
PN
5859 addReply(c,shared.syntaxerr);
5860 return;
5861 }
5862
2830ca53 5863 /* read keys to be used for input */
b9eed483 5864 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
2830ca53 5865 for (i = 0, j = 3; i < zsetnum; i++, j++) {
b287c9bb
PN
5866 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5867 if (!zsetobj) {
8f92e768 5868 src[i].dict = NULL;
b287c9bb
PN
5869 } else {
5870 if (zsetobj->type != REDIS_ZSET) {
8f92e768 5871 zfree(src);
b287c9bb
PN
5872 addReply(c,shared.wrongtypeerr);
5873 return;
5874 }
8f92e768 5875 src[i].dict = ((zset*)zsetobj->ptr)->dict;
b287c9bb 5876 }
2830ca53
PN
5877
5878 /* default all weights to 1 */
8f92e768 5879 src[i].weight = 1.0;
b287c9bb
PN
5880 }
5881
2830ca53
PN
5882 /* parse optional extra arguments */
5883 if (j < c->argc) {
d2764cd6 5884 int remaining = c->argc - j;
b287c9bb 5885
2830ca53 5886 while (remaining) {
d2764cd6 5887 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 5888 j++; remaining--;
2830ca53 5889 for (i = 0; i < zsetnum; i++, j++, remaining--) {
bd79a6bd 5890 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
bbe025e0 5891 return;
2830ca53 5892 }
d2764cd6
PN
5893 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5894 j++; remaining--;
5895 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5896 aggregate = REDIS_AGGR_SUM;
5897 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5898 aggregate = REDIS_AGGR_MIN;
5899 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5900 aggregate = REDIS_AGGR_MAX;
5901 } else {
5902 zfree(src);
5903 addReply(c,shared.syntaxerr);
5904 return;
5905 }
5906 j++; remaining--;
2830ca53 5907 } else {
8f92e768 5908 zfree(src);
2830ca53
PN
5909 addReply(c,shared.syntaxerr);
5910 return;
5911 }
5912 }
5913 }
b287c9bb 5914
d2764cd6
PN
5915 /* sort sets from the smallest to largest, this will improve our
5916 * algorithm's performance */
5917 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5918
2830ca53
PN
5919 dstobj = createZsetObject();
5920 dstzset = dstobj->ptr;
5921
5922 if (op == REDIS_OP_INTER) {
8f92e768
PN
5923 /* skip going over all entries if the smallest zset is NULL or empty */
5924 if (src[0].dict && dictSize(src[0].dict) > 0) {
5925 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5926 * from small to large, all src[i > 0].dict are non-empty too */
5927 di = dictGetIterator(src[0].dict);
2830ca53 5928 while((de = dictNext(di)) != NULL) {
d2764cd6
PN
5929 double *score = zmalloc(sizeof(double)), value;
5930 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5931
d2764cd6
PN
5932 for (j = 1; j < zsetnum; j++) {
5933 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5934 if (other) {
d2764cd6
PN
5935 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5936 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5937 } else {
5938 break;
5939 }
5940 }
b287c9bb 5941
2830ca53 5942 /* skip entry when not present in every source dict */
8f92e768 5943 if (j != zsetnum) {
2830ca53
PN
5944 zfree(score);
5945 } else {
5946 robj *o = dictGetEntryKey(de);
5947 dictAdd(dstzset->dict,o,score);
5948 incrRefCount(o); /* added to dictionary */
5949 zslInsert(dstzset->zsl,*score,o);
5950 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
5951 }
5952 }
2830ca53
PN
5953 dictReleaseIterator(di);
5954 }
5955 } else if (op == REDIS_OP_UNION) {
5956 for (i = 0; i < zsetnum; i++) {
8f92e768 5957 if (!src[i].dict) continue;
2830ca53 5958
8f92e768 5959 di = dictGetIterator(src[i].dict);
2830ca53
PN
5960 while((de = dictNext(di)) != NULL) {
5961 /* skip key when already processed */
5962 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5963
d2764cd6
PN
5964 double *score = zmalloc(sizeof(double)), value;
5965 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5966
d2764cd6
PN
5967 /* because the zsets are sorted by size, its only possible
5968 * for sets at larger indices to hold this entry */
5969 for (j = (i+1); j < zsetnum; j++) {
5970 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5971 if (other) {
d2764cd6
PN
5972 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5973 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5974 }
5975 }
b287c9bb 5976
2830ca53
PN
5977 robj *o = dictGetEntryKey(de);
5978 dictAdd(dstzset->dict,o,score);
5979 incrRefCount(o); /* added to dictionary */
5980 zslInsert(dstzset->zsl,*score,o);
5981 incrRefCount(o); /* added to skiplist */
5982 }
5983 dictReleaseIterator(di);
b287c9bb 5984 }
2830ca53
PN
5985 } else {
5986 /* unknown operator */
5987 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
5988 }
5989
5990 deleteKey(c->db,dstkey);
3ea27d37 5991 if (dstzset->zsl->length) {
5992 dictAdd(c->db->dict,dstkey,dstobj);
5993 incrRefCount(dstkey);
5994 addReplyLong(c, dstzset->zsl->length);
5995 server.dirty++;
5996 } else {
8bca8773 5997 decrRefCount(dstobj);
3ea27d37 5998 addReply(c, shared.czero);
5999 }
8f92e768 6000 zfree(src);
b287c9bb
PN
6001}
6002
2830ca53
PN
6003static void zunionCommand(redisClient *c) {
6004 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
6005}
6006
2830ca53
PN
6007static void zinterCommand(redisClient *c) {
6008 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
6009}
6010
e3870fab 6011static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 6012 robj *o;
bbe025e0
AM
6013 long start;
6014 long end;
752da584 6015 int withscores = 0;
dd88747b 6016 int llen;
6017 int rangelen, j;
6018 zset *zsetobj;
6019 zskiplist *zsl;
6020 zskiplistNode *ln;
6021 robj *ele;
752da584 6022
bd79a6bd
PN
6023 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6024 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6025
752da584 6026 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6027 withscores = 1;
6028 } else if (c->argc >= 5) {
6029 addReply(c,shared.syntaxerr);
6030 return;
6031 }
cc812361 6032
4e27f268 6033 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6034 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 6035 zsetobj = o->ptr;
6036 zsl = zsetobj->zsl;
6037 llen = zsl->length;
cc812361 6038
dd88747b 6039 /* convert negative indexes */
6040 if (start < 0) start = llen+start;
6041 if (end < 0) end = llen+end;
6042 if (start < 0) start = 0;
6043 if (end < 0) end = 0;
cc812361 6044
dd88747b 6045 /* indexes sanity checks */
6046 if (start > end || start >= llen) {
6047 /* Out of range start or start > end result in empty list */
6048 addReply(c,shared.emptymultibulk);
6049 return;
6050 }
6051 if (end >= llen) end = llen-1;
6052 rangelen = (end-start)+1;
cc812361 6053
dd88747b 6054 /* check if starting point is trivial, before searching
6055 * the element in log(N) time */
6056 if (reverse) {
6057 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6058 } else {
6059 ln = start == 0 ?
6060 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6061 }
cc812361 6062
dd88747b 6063 /* Return the result in form of a multi-bulk reply */
6064 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6065 withscores ? (rangelen*2) : rangelen));
6066 for (j = 0; j < rangelen; j++) {
6067 ele = ln->obj;
6068 addReplyBulk(c,ele);
6069 if (withscores)
6070 addReplyDouble(c,ln->score);
6071 ln = reverse ? ln->backward : ln->forward[0];
cc812361 6072 }
6073}
6074
e3870fab 6075static void zrangeCommand(redisClient *c) {
6076 zrangeGenericCommand(c,0);
6077}
6078
6079static void zrevrangeCommand(redisClient *c) {
6080 zrangeGenericCommand(c,1);
6081}
6082
f44dd428 6083/* This command implements both ZRANGEBYSCORE and ZCOUNT.
6084 * If justcount is non-zero, just the count is returned. */
6085static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 6086 robj *o;
f44dd428 6087 double min, max;
6088 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 6089 int offset = 0, limit = -1;
0500ef27
SH
6090 int withscores = 0;
6091 int badsyntax = 0;
6092
f44dd428 6093 /* Parse the min-max interval. If one of the values is prefixed
6094 * by the "(" character, it's considered "open". For instance
6095 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6096 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6097 if (((char*)c->argv[2]->ptr)[0] == '(') {
6098 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6099 minex = 1;
6100 } else {
6101 min = strtod(c->argv[2]->ptr,NULL);
6102 }
6103 if (((char*)c->argv[3]->ptr)[0] == '(') {
6104 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6105 maxex = 1;
6106 } else {
6107 max = strtod(c->argv[3]->ptr,NULL);
6108 }
6109
6110 /* Parse "WITHSCORES": note that if the command was called with
6111 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6112 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 6113 if (c->argc == 5 || c->argc == 8) {
3a3978b1 6114 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6115 withscores = 1;
6116 else
6117 badsyntax = 1;
0500ef27 6118 }
3a3978b1 6119 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 6120 badsyntax = 1;
0500ef27 6121 if (badsyntax) {
454d4e43 6122 addReplySds(c,
6123 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 6124 return;
0500ef27
SH
6125 }
6126
f44dd428 6127 /* Parse "LIMIT" */
0500ef27 6128 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 6129 addReply(c,shared.syntaxerr);
6130 return;
0500ef27 6131 } else if (c->argc == (7 + withscores)) {
80181f78 6132 offset = atoi(c->argv[5]->ptr);
6133 limit = atoi(c->argv[6]->ptr);
0b13687c 6134 if (offset < 0) offset = 0;
80181f78 6135 }
50c55df5 6136
f44dd428 6137 /* Ok, lookup the key and get the range */
50c55df5 6138 o = lookupKeyRead(c->db,c->argv[1]);
6139 if (o == NULL) {
4e27f268 6140 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6141 } else {
6142 if (o->type != REDIS_ZSET) {
6143 addReply(c,shared.wrongtypeerr);
6144 } else {
6145 zset *zsetobj = o->ptr;
6146 zskiplist *zsl = zsetobj->zsl;
6147 zskiplistNode *ln;
f44dd428 6148 robj *ele, *lenobj = NULL;
6149 unsigned long rangelen = 0;
50c55df5 6150
f44dd428 6151 /* Get the first node with the score >= min, or with
6152 * score > min if 'minex' is true. */
50c55df5 6153 ln = zslFirstWithScore(zsl,min);
f44dd428 6154 while (minex && ln && ln->score == min) ln = ln->forward[0];
6155
50c55df5 6156 if (ln == NULL) {
6157 /* No element matching the speciifed interval */
f44dd428 6158 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6159 return;
6160 }
6161
6162 /* We don't know in advance how many matching elements there
6163 * are in the list, so we push this object that will represent
6164 * the multi-bulk length in the output buffer, and will "fix"
6165 * it later */
f44dd428 6166 if (!justcount) {
6167 lenobj = createObject(REDIS_STRING,NULL);
6168 addReply(c,lenobj);
6169 decrRefCount(lenobj);
6170 }
50c55df5 6171
f44dd428 6172 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 6173 if (offset) {
6174 offset--;
6175 ln = ln->forward[0];
6176 continue;
6177 }
6178 if (limit == 0) break;
f44dd428 6179 if (!justcount) {
6180 ele = ln->obj;
dd88747b 6181 addReplyBulk(c,ele);
f44dd428 6182 if (withscores)
6183 addReplyDouble(c,ln->score);
6184 }
50c55df5 6185 ln = ln->forward[0];
6186 rangelen++;
80181f78 6187 if (limit > 0) limit--;
50c55df5 6188 }
f44dd428 6189 if (justcount) {
6190 addReplyLong(c,(long)rangelen);
6191 } else {
6192 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6193 withscores ? (rangelen*2) : rangelen);
6194 }
50c55df5 6195 }
6196 }
6197}
6198
f44dd428 6199static void zrangebyscoreCommand(redisClient *c) {
6200 genericZrangebyscoreCommand(c,0);
6201}
6202
6203static void zcountCommand(redisClient *c) {
6204 genericZrangebyscoreCommand(c,1);
6205}
6206
3c41331e 6207static void zcardCommand(redisClient *c) {
e197b441 6208 robj *o;
6209 zset *zs;
dd88747b 6210
6211 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6212 checkType(c,o,REDIS_ZSET)) return;
6213
6214 zs = o->ptr;
6215 addReplyUlong(c,zs->zsl->length);
e197b441 6216}
6217
6e333bbe 6218static void zscoreCommand(redisClient *c) {
6219 robj *o;
6220 zset *zs;
dd88747b 6221 dictEntry *de;
6222
6223 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6224 checkType(c,o,REDIS_ZSET)) return;
6225
6226 zs = o->ptr;
6227 de = dictFind(zs->dict,c->argv[2]);
6228 if (!de) {
96d8b4ee 6229 addReply(c,shared.nullbulk);
6e333bbe 6230 } else {
dd88747b 6231 double *score = dictGetEntryVal(de);
6e333bbe 6232
dd88747b 6233 addReplyDouble(c,*score);
6e333bbe 6234 }
6235}
6236
798d9e55 6237static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 6238 robj *o;
dd88747b 6239 zset *zs;
6240 zskiplist *zsl;
6241 dictEntry *de;
6242 unsigned long rank;
6243 double *score;
6244
6245 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6246 checkType(c,o,REDIS_ZSET)) return;
6247
6248 zs = o->ptr;
6249 zsl = zs->zsl;
6250 de = dictFind(zs->dict,c->argv[2]);
6251 if (!de) {
69d95c3e
PN
6252 addReply(c,shared.nullbulk);
6253 return;
6254 }
69d95c3e 6255
dd88747b 6256 score = dictGetEntryVal(de);
6257 rank = zslGetRank(zsl, *score, c->argv[2]);
6258 if (rank) {
6259 if (reverse) {
6260 addReplyLong(c, zsl->length - rank);
27b0ccca 6261 } else {
dd88747b 6262 addReplyLong(c, rank-1);
69d95c3e 6263 }
dd88747b 6264 } else {
6265 addReply(c,shared.nullbulk);
978c2c94 6266 }
6267}
6268
798d9e55
PN
6269static void zrankCommand(redisClient *c) {
6270 zrankGenericCommand(c, 0);
6271}
6272
6273static void zrevrankCommand(redisClient *c) {
6274 zrankGenericCommand(c, 1);
6275}
6276
7fb16bac
PN
6277/* ========================= Hashes utility functions ======================= */
/* Selectors for the two halves of a hash entry. They are tested with '&'
 * and may be OR-ed together to request both the field and the value. */
#define REDIS_HASH_KEY 1    /* the field name */
#define REDIS_HASH_VALUE 2  /* the value associated with the field */
978c2c94 6280
7fb16bac
PN
6281/* Check the length of a number of objects to see if we need to convert a
6282 * zipmap to a real hash. Note that we only check string encoded objects
6283 * as their string length can be queried in constant time. */
6284static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6285 int i;
6286 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
978c2c94 6287
7fb16bac
PN
6288 for (i = start; i <= end; i++) {
6289 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6290 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6291 {
6292 convertToRealHash(subject);
978c2c94 6293 return;
6294 }
6295 }
7fb16bac 6296}
bae2c7ec 6297
97224de7
PN
6298/* Encode given objects in-place when the hash uses a dict. */
6299static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6300 if (subject->encoding == REDIS_ENCODING_HT) {
3f973463
PN
6301 if (o1) *o1 = tryObjectEncoding(*o1);
6302 if (o2) *o2 = tryObjectEncoding(*o2);
97224de7
PN
6303 }
6304}
6305
7fb16bac 6306/* Get the value from a hash identified by key. Returns either a string
a3f3af86
PN
6307 * object or NULL if the value cannot be found. The refcount of the object
6308 * is always increased by 1 when the value was found. */
7fb16bac
PN
6309static robj *hashGet(robj *o, robj *key) {
6310 robj *value = NULL;
978c2c94 6311 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7fb16bac
PN
6312 unsigned char *v;
6313 unsigned int vlen;
6314 key = getDecodedObject(key);
6315 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6316 value = createStringObject((char*)v,vlen);
6317 }
6318 decrRefCount(key);
6319 } else {
6320 dictEntry *de = dictFind(o->ptr,key);
6321 if (de != NULL) {
6322 value = dictGetEntryVal(de);
a3f3af86 6323 incrRefCount(value);
7fb16bac
PN
6324 }
6325 }
6326 return value;
6327}
978c2c94 6328
7fb16bac
PN
6329/* Test if the key exists in the given hash. Returns 1 if the key
6330 * exists and 0 when it doesn't. */
6331static int hashExists(robj *o, robj *key) {
6332 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6333 key = getDecodedObject(key);
6334 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6335 decrRefCount(key);
6336 return 1;
6337 }
6338 decrRefCount(key);
6339 } else {
6340 if (dictFind(o->ptr,key) != NULL) {
6341 return 1;
6342 }
6343 }
6344 return 0;
6345}
bae2c7ec 6346
7fb16bac
PN
6347/* Add an element, discard the old if the key already exists.
6348 * Return 0 on insert and 1 on update. */
feb8d7e6 6349static int hashSet(robj *o, robj *key, robj *value) {
7fb16bac
PN
6350 int update = 0;
6351 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6352 key = getDecodedObject(key);
6353 value = getDecodedObject(value);
6354 o->ptr = zipmapSet(o->ptr,
6355 key->ptr,sdslen(key->ptr),
6356 value->ptr,sdslen(value->ptr), &update);
6357 decrRefCount(key);
6358 decrRefCount(value);
6359
6360 /* Check if the zipmap needs to be upgraded to a real hash table */
6361 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
bae2c7ec 6362 convertToRealHash(o);
978c2c94 6363 } else {
7fb16bac
PN
6364 if (dictReplace(o->ptr,key,value)) {
6365 /* Insert */
6366 incrRefCount(key);
978c2c94 6367 } else {
7fb16bac 6368 /* Update */
978c2c94 6369 update = 1;
6370 }
7fb16bac 6371 incrRefCount(value);
978c2c94 6372 }
7fb16bac 6373 return update;
978c2c94 6374}
6375
7fb16bac
PN
6376/* Delete an element from a hash.
6377 * Return 1 on deleted and 0 on not found. */
6378static int hashDelete(robj *o, robj *key) {
6379 int deleted = 0;
6380 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6381 key = getDecodedObject(key);
6382 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6383 decrRefCount(key);
6384 } else {
6385 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6386 /* Always check if the dictionary needs a resize after a delete. */
6387 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
d33278d1 6388 }
7fb16bac
PN
6389 return deleted;
6390}
d33278d1 6391
7fb16bac 6392/* Return the number of elements in a hash. */
c811bb38 6393static unsigned long hashLength(robj *o) {
7fb16bac
PN
6394 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6395 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6396}
6397
6398/* Structure to hold hash iteration abstration. Note that iteration over
6399 * hashes involves both fields and values. Because it is possible that
6400 * not both are required, store pointers in the iterator to avoid
6401 * unnecessary memory allocation for fields/values. */
6402typedef struct {
6403 int encoding;
6404 unsigned char *zi;
6405 unsigned char *zk, *zv;
6406 unsigned int zklen, zvlen;
6407
6408 dictIterator *di;
6409 dictEntry *de;
6410} hashIterator;
6411
c44d3b56
PN
6412static hashIterator *hashInitIterator(robj *subject) {
6413 hashIterator *hi = zmalloc(sizeof(hashIterator));
7fb16bac
PN
6414 hi->encoding = subject->encoding;
6415 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6416 hi->zi = zipmapRewind(subject->ptr);
6417 } else if (hi->encoding == REDIS_ENCODING_HT) {
6418 hi->di = dictGetIterator(subject->ptr);
d33278d1 6419 } else {
7fb16bac 6420 redisAssert(NULL);
d33278d1 6421 }
c44d3b56 6422 return hi;
7fb16bac 6423}
d33278d1 6424
7fb16bac
PN
6425static void hashReleaseIterator(hashIterator *hi) {
6426 if (hi->encoding == REDIS_ENCODING_HT) {
6427 dictReleaseIterator(hi->di);
d33278d1 6428 }
c44d3b56 6429 zfree(hi);
7fb16bac 6430}
d33278d1 6431
7fb16bac
PN
6432/* Move to the next entry in the hash. Return REDIS_OK when the next entry
6433 * could be found and REDIS_ERR when the iterator reaches the end. */
c811bb38 6434static int hashNext(hashIterator *hi) {
7fb16bac
PN
6435 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6436 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6437 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6438 } else {
6439 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6440 }
6441 return REDIS_OK;
6442}
d33278d1 6443
0c390abc 6444/* Get key or value object at current iteration position.
a3f3af86 6445 * This increases the refcount of the field object by 1. */
c811bb38 6446static robj *hashCurrent(hashIterator *hi, int what) {
7fb16bac
PN
6447 robj *o;
6448 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6449 if (what & REDIS_HASH_KEY) {
6450 o = createStringObject((char*)hi->zk,hi->zklen);
6451 } else {
6452 o = createStringObject((char*)hi->zv,hi->zvlen);
d33278d1 6453 }
d33278d1 6454 } else {
7fb16bac
PN
6455 if (what & REDIS_HASH_KEY) {
6456 o = dictGetEntryKey(hi->de);
6457 } else {
6458 o = dictGetEntryVal(hi->de);
d33278d1 6459 }
a3f3af86 6460 incrRefCount(o);
d33278d1 6461 }
7fb16bac 6462 return o;
d33278d1
PN
6463}
6464
7fb16bac
PN
6465static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6466 robj *o = lookupKeyWrite(c->db,key);
01426b05
PN
6467 if (o == NULL) {
6468 o = createHashObject();
7fb16bac
PN
6469 dictAdd(c->db->dict,key,o);
6470 incrRefCount(key);
01426b05
PN
6471 } else {
6472 if (o->type != REDIS_HASH) {
6473 addReply(c,shared.wrongtypeerr);
7fb16bac 6474 return NULL;
01426b05
PN
6475 }
6476 }
7fb16bac
PN
6477 return o;
6478}
01426b05 6479
7fb16bac
PN
6480/* ============================= Hash commands ============================== */
6481static void hsetCommand(redisClient *c) {
6e9e463f 6482 int update;
7fb16bac 6483 robj *o;
bbe025e0 6484
7fb16bac
PN
6485 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6486 hashTryConversion(o,c->argv,2,3);
97224de7 6487 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6488 update = hashSet(o,c->argv[2],c->argv[3]);
6e9e463f 6489 addReply(c, update ? shared.czero : shared.cone);
7fb16bac
PN
6490 server.dirty++;
6491}
01426b05 6492
1f1c7695
PN
6493static void hsetnxCommand(redisClient *c) {
6494 robj *o;
6495 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6496 hashTryConversion(o,c->argv,2,3);
6497
6498 if (hashExists(o, c->argv[2])) {
6499 addReply(c, shared.czero);
01426b05 6500 } else {
97224de7 6501 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6502 hashSet(o,c->argv[2],c->argv[3]);
1f1c7695
PN
6503 addReply(c, shared.cone);
6504 server.dirty++;
6505 }
6506}
01426b05 6507
7fb16bac
PN
6508static void hmsetCommand(redisClient *c) {
6509 int i;
6510 robj *o;
01426b05 6511
7fb16bac
PN
6512 if ((c->argc % 2) == 1) {
6513 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6514 return;
6515 }
01426b05 6516
7fb16bac
PN
6517 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6518 hashTryConversion(o,c->argv,2,c->argc-1);
6519 for (i = 2; i < c->argc; i += 2) {
97224de7 6520 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
feb8d7e6 6521 hashSet(o,c->argv[i],c->argv[i+1]);
7fb16bac
PN
6522 }
6523 addReply(c, shared.ok);
edc2f63a 6524 server.dirty++;
7fb16bac
PN
6525}
6526
6527static void hincrbyCommand(redisClient *c) {
6528 long long value, incr;
6529 robj *o, *current, *new;
6530
bd79a6bd 6531 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7fb16bac
PN
6532 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6533 if ((current = hashGet(o,c->argv[2])) != NULL) {
946342c1
PN
6534 if (getLongLongFromObjectOrReply(c,current,&value,
6535 "hash value is not an integer") != REDIS_OK) {
6536 decrRefCount(current);
6537 return;
6538 }
a3f3af86 6539 decrRefCount(current);
7fb16bac
PN
6540 } else {
6541 value = 0;
01426b05
PN
6542 }
6543
7fb16bac 6544 value += incr;
3f973463
PN
6545 new = createStringObjectFromLongLong(value);
6546 hashTryObjectEncoding(o,&c->argv[2],NULL);
feb8d7e6 6547 hashSet(o,c->argv[2],new);
7fb16bac
PN
6548 decrRefCount(new);
6549 addReplyLongLong(c,value);
01426b05 6550 server.dirty++;
01426b05
PN
6551}
6552
978c2c94 6553static void hgetCommand(redisClient *c) {
7fb16bac 6554 robj *o, *value;
dd88747b 6555 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6556 checkType(c,o,REDIS_HASH)) return;
6557
7fb16bac
PN
6558 if ((value = hashGet(o,c->argv[2])) != NULL) {
6559 addReplyBulk(c,value);
a3f3af86 6560 decrRefCount(value);
dd88747b 6561 } else {
7fb16bac 6562 addReply(c,shared.nullbulk);
69d95c3e 6563 }
69d95c3e
PN
6564}
6565
09aeb579
PN
6566static void hmgetCommand(redisClient *c) {
6567 int i;
7fb16bac
PN
6568 robj *o, *value;
6569 o = lookupKeyRead(c->db,c->argv[1]);
6570 if (o != NULL && o->type != REDIS_HASH) {
6571 addReply(c,shared.wrongtypeerr);
09aeb579
PN
6572 }
6573
7fb16bac
PN
6574 /* Note the check for o != NULL happens inside the loop. This is
6575 * done because objects that cannot be found are considered to be
6576 * an empty hash. The reply should then be a series of NULLs. */
09aeb579 6577 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7fb16bac
PN
6578 for (i = 2; i < c->argc; i++) {
6579 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6580 addReplyBulk(c,value);
a3f3af86 6581 decrRefCount(value);
7fb16bac
PN
6582 } else {
6583 addReply(c,shared.nullbulk);
09aeb579
PN
6584 }
6585 }
6586}
6587
07efaf74 6588static void hdelCommand(redisClient *c) {
dd88747b 6589 robj *o;
dd88747b 6590 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6591 checkType(c,o,REDIS_HASH)) return;
07efaf74 6592
7fb16bac
PN
6593 if (hashDelete(o,c->argv[2])) {
6594 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6595 addReply(c,shared.cone);
6596 server.dirty++;
dd88747b 6597 } else {
7fb16bac 6598 addReply(c,shared.czero);
07efaf74 6599 }
6600}
6601
92b27fe9 6602static void hlenCommand(redisClient *c) {
6603 robj *o;
dd88747b 6604 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 6605 checkType(c,o,REDIS_HASH)) return;
6606
7fb16bac 6607 addReplyUlong(c,hashLength(o));
92b27fe9 6608}
6609
78409a0f 6610static void genericHgetallCommand(redisClient *c, int flags) {
7fb16bac 6611 robj *o, *lenobj, *obj;
78409a0f 6612 unsigned long count = 0;
c44d3b56 6613 hashIterator *hi;
78409a0f 6614
4e27f268 6615 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 6616 || checkType(c,o,REDIS_HASH)) return;
6617
6618 lenobj = createObject(REDIS_STRING,NULL);
6619 addReply(c,lenobj);
6620 decrRefCount(lenobj);
6621
c44d3b56
PN
6622 hi = hashInitIterator(o);
6623 while (hashNext(hi) != REDIS_ERR) {
7fb16bac 6624 if (flags & REDIS_HASH_KEY) {
c44d3b56 6625 obj = hashCurrent(hi,REDIS_HASH_KEY);
7fb16bac 6626 addReplyBulk(c,obj);
a3f3af86 6627 decrRefCount(obj);
7fb16bac 6628 count++;
78409a0f 6629 }
7fb16bac 6630 if (flags & REDIS_HASH_VALUE) {
c44d3b56 6631 obj = hashCurrent(hi,REDIS_HASH_VALUE);
7fb16bac 6632 addReplyBulk(c,obj);
a3f3af86 6633 decrRefCount(obj);
7fb16bac 6634 count++;
78409a0f 6635 }
78409a0f 6636 }
c44d3b56 6637 hashReleaseIterator(hi);
7fb16bac 6638
78409a0f 6639 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6640}
6641
6642static void hkeysCommand(redisClient *c) {
7fb16bac 6643 genericHgetallCommand(c,REDIS_HASH_KEY);
78409a0f 6644}
6645
6646static void hvalsCommand(redisClient *c) {
7fb16bac 6647 genericHgetallCommand(c,REDIS_HASH_VALUE);
78409a0f 6648}
6649
6650static void hgetallCommand(redisClient *c) {
7fb16bac 6651 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
78409a0f 6652}
6653
a86f14b1 6654static void hexistsCommand(redisClient *c) {
6655 robj *o;
a86f14b1 6656 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6657 checkType(c,o,REDIS_HASH)) return;
6658
7fb16bac 6659 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
a86f14b1 6660}
6661
ada386b2 6662static void convertToRealHash(robj *o) {
6663 unsigned char *key, *val, *p, *zm = o->ptr;
6664 unsigned int klen, vlen;
6665 dict *dict = dictCreate(&hashDictType,NULL);
6666
6667 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6668 p = zipmapRewind(zm);
6669 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6670 robj *keyobj, *valobj;
6671
6672 keyobj = createStringObject((char*)key,klen);
6673 valobj = createStringObject((char*)val,vlen);
05df7621 6674 keyobj = tryObjectEncoding(keyobj);
6675 valobj = tryObjectEncoding(valobj);
ada386b2 6676 dictAdd(dict,keyobj,valobj);
6677 }
6678 o->encoding = REDIS_ENCODING_HT;
6679 o->ptr = dict;
6680 zfree(zm);
6681}
6682
6b47e12e 6683/* ========================= Non type-specific commands ==================== */
6684
ed9b544e 6685static void flushdbCommand(redisClient *c) {
ca37e9cd 6686 server.dirty += dictSize(c->db->dict);
3305306f 6687 dictEmpty(c->db->dict);
6688 dictEmpty(c->db->expires);
ed9b544e 6689 addReply(c,shared.ok);
ed9b544e 6690}
6691
6692static void flushallCommand(redisClient *c) {
ca37e9cd 6693 server.dirty += emptyDb();
ed9b544e 6694 addReply(c,shared.ok);
500ece7c 6695 if (server.bgsavechildpid != -1) {
6696 kill(server.bgsavechildpid,SIGKILL);
6697 rdbRemoveTempFile(server.bgsavechildpid);
6698 }
f78fd11b 6699 rdbSave(server.dbfilename);
ca37e9cd 6700 server.dirty++;
ed9b544e 6701}
6702
56906eef 6703static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 6704 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 6705 so->type = type;
6706 so->pattern = pattern;
6707 return so;
6708}
6709
6710/* Return the value associated to the key with a name obtained
55017f9d
PN
6711 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6712 * The returned object will always have its refcount increased by 1
6713 * when it is non-NULL. */
56906eef 6714static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6d7d1370 6715 char *p, *f;
ed9b544e 6716 sds spat, ssub;
6d7d1370
PN
6717 robj keyobj, fieldobj, *o;
6718 int prefixlen, sublen, postfixlen, fieldlen;
ed9b544e 6719 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6720 struct {
f1017b3f 6721 long len;
6722 long free;
ed9b544e 6723 char buf[REDIS_SORTKEY_MAX+1];
6d7d1370 6724 } keyname, fieldname;
ed9b544e 6725
28173a49 6726 /* If the pattern is "#" return the substitution object itself in order
6727 * to implement the "SORT ... GET #" feature. */
6728 spat = pattern->ptr;
6729 if (spat[0] == '#' && spat[1] == '\0') {
55017f9d 6730 incrRefCount(subst);
28173a49 6731 return subst;
6732 }
6733
6734 /* The substitution object may be specially encoded. If so we create
9d65a1bb 6735 * a decoded object on the fly. Otherwise getDecodedObject will just
6736 * increment the ref count, that we'll decrement later. */
6737 subst = getDecodedObject(subst);
942a3961 6738
ed9b544e 6739 ssub = subst->ptr;
6740 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6741 p = strchr(spat,'*');
ed5a857a 6742 if (!p) {
6743 decrRefCount(subst);
6744 return NULL;
6745 }
ed9b544e 6746
6d7d1370
PN
6747 /* Find out if we're dealing with a hash dereference. */
6748 if ((f = strstr(p+1, "->")) != NULL) {
6749 fieldlen = sdslen(spat)-(f-spat);
6750 /* this also copies \0 character */
6751 memcpy(fieldname.buf,f+2,fieldlen-1);
6752 fieldname.len = fieldlen-2;
6753 } else {
6754 fieldlen = 0;
6755 }
6756
ed9b544e 6757 prefixlen = p-spat;
6758 sublen = sdslen(ssub);
6d7d1370 6759 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
ed9b544e 6760 memcpy(keyname.buf,spat,prefixlen);
6761 memcpy(keyname.buf+prefixlen,ssub,sublen);
6762 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6763 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6764 keyname.len = prefixlen+sublen+postfixlen;
942a3961 6765 decrRefCount(subst);
6766
6d7d1370
PN
6767 /* Lookup substituted key */
6768 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6769 o = lookupKeyRead(db,&keyobj);
55017f9d
PN
6770 if (o == NULL) return NULL;
6771
6772 if (fieldlen > 0) {
6773 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6d7d1370 6774
705dad38
PN
6775 /* Retrieve value from hash by the field name. This operation
6776 * already increases the refcount of the returned object. */
6d7d1370
PN
6777 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6778 o = hashGet(o, &fieldobj);
705dad38 6779 } else {
55017f9d 6780 if (o->type != REDIS_STRING) return NULL;
b6f07345 6781
705dad38
PN
6782 /* Every object that this function returns needs to have its refcount
6783 * increased. sortCommand decreases it again. */
6784 incrRefCount(o);
6d7d1370
PN
6785 }
6786
6787 return o;
ed9b544e 6788}
6789
6790/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6791 * the additional parameter is not standard but a BSD-specific we have to
6792 * pass sorting parameters via the global 'server' structure */
6793static int sortCompare(const void *s1, const void *s2) {
6794 const redisSortObject *so1 = s1, *so2 = s2;
6795 int cmp;
6796
6797 if (!server.sort_alpha) {
6798 /* Numeric sorting. Here it's trivial as we precomputed scores */
6799 if (so1->u.score > so2->u.score) {
6800 cmp = 1;
6801 } else if (so1->u.score < so2->u.score) {
6802 cmp = -1;
6803 } else {
6804 cmp = 0;
6805 }
6806 } else {
6807 /* Alphanumeric sorting */
6808 if (server.sort_bypattern) {
6809 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6810 /* At least one compare object is NULL */
6811 if (so1->u.cmpobj == so2->u.cmpobj)
6812 cmp = 0;
6813 else if (so1->u.cmpobj == NULL)
6814 cmp = -1;
6815 else
6816 cmp = 1;
6817 } else {
6818 /* We have both the objects, use strcoll */
6819 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6820 }
6821 } else {
08ee9b57 6822 /* Compare elements directly. */
6823 cmp = compareStringObjects(so1->obj,so2->obj);
ed9b544e 6824 }
6825 }
6826 return server.sort_desc ? -cmp : cmp;
6827}
6828
6829/* The SORT command is the most complex command in Redis. Warning: this code
6830 * is optimized for speed and a bit less for readability */
6831static void sortCommand(redisClient *c) {
ed9b544e 6832 list *operations;
6833 int outputlen = 0;
6834 int desc = 0, alpha = 0;
6835 int limit_start = 0, limit_count = -1, start, end;
6836 int j, dontsort = 0, vectorlen;
6837 int getop = 0; /* GET operation counter */
443c6409 6838 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 6839 redisSortObject *vector; /* Resulting vector to sort */
6840
6841 /* Lookup the key to sort. It must be of the right types */
3305306f 6842 sortval = lookupKeyRead(c->db,c->argv[1]);
6843 if (sortval == NULL) {
4e27f268 6844 addReply(c,shared.emptymultibulk);
ed9b544e 6845 return;
6846 }
a5eb649b 6847 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6848 sortval->type != REDIS_ZSET)
6849 {
c937aa89 6850 addReply(c,shared.wrongtypeerr);
ed9b544e 6851 return;
6852 }
6853
6854 /* Create a list of operations to perform for every sorted element.
6855 * Operations can be GET/DEL/INCR/DECR */
6856 operations = listCreate();
092dac2a 6857 listSetFreeMethod(operations,zfree);
ed9b544e 6858 j = 2;
6859
6860 /* Now we need to protect sortval incrementing its count, in the future
6861 * SORT may have options able to overwrite/delete keys during the sorting
6862 * and the sorted key itself may get destroied */
6863 incrRefCount(sortval);
6864
6865 /* The SORT command has an SQL-alike syntax, parse it */
6866 while(j < c->argc) {
6867 int leftargs = c->argc-j-1;
6868 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6869 desc = 0;
6870 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6871 desc = 1;
6872 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6873 alpha = 1;
6874 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6875 limit_start = atoi(c->argv[j+1]->ptr);
6876 limit_count = atoi(c->argv[j+2]->ptr);
6877 j+=2;
443c6409 6878 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6879 storekey = c->argv[j+1];
6880 j++;
ed9b544e 6881 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6882 sortby = c->argv[j+1];
6883 /* If the BY pattern does not contain '*', i.e. it is constant,
6884 * we don't need to sort nor to lookup the weight keys. */
6885 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6886 j++;
6887 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6888 listAddNodeTail(operations,createSortOperation(
6889 REDIS_SORT_GET,c->argv[j+1]));
6890 getop++;
6891 j++;
ed9b544e 6892 } else {
6893 decrRefCount(sortval);
6894 listRelease(operations);
c937aa89 6895 addReply(c,shared.syntaxerr);
ed9b544e 6896 return;
6897 }
6898 j++;
6899 }
6900
6901 /* Load the sorting vector with all the objects to sort */
a5eb649b 6902 switch(sortval->type) {
6903 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6904 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6905 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
f83c6cb5 6906 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
a5eb649b 6907 }
ed9b544e 6908 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 6909 j = 0;
a5eb649b 6910
ed9b544e 6911 if (sortval->type == REDIS_LIST) {
6912 list *list = sortval->ptr;
6208b3a7 6913 listNode *ln;
c7df85a4 6914 listIter li;
6208b3a7 6915
c7df85a4 6916 listRewind(list,&li);
6917 while((ln = listNext(&li))) {
ed9b544e 6918 robj *ele = ln->value;
6919 vector[j].obj = ele;
6920 vector[j].u.score = 0;
6921 vector[j].u.cmpobj = NULL;
ed9b544e 6922 j++;
6923 }
6924 } else {
a5eb649b 6925 dict *set;
ed9b544e 6926 dictIterator *di;
6927 dictEntry *setele;
6928
a5eb649b 6929 if (sortval->type == REDIS_SET) {
6930 set = sortval->ptr;
6931 } else {
6932 zset *zs = sortval->ptr;
6933 set = zs->dict;
6934 }
6935
ed9b544e 6936 di = dictGetIterator(set);
ed9b544e 6937 while((setele = dictNext(di)) != NULL) {
6938 vector[j].obj = dictGetEntryKey(setele);
6939 vector[j].u.score = 0;
6940 vector[j].u.cmpobj = NULL;
6941 j++;
6942 }
6943 dictReleaseIterator(di);
6944 }
dfc5e96c 6945 redisAssert(j == vectorlen);
ed9b544e 6946
6947 /* Now it's time to load the right scores in the sorting vector */
6948 if (dontsort == 0) {
6949 for (j = 0; j < vectorlen; j++) {
6d7d1370 6950 robj *byval;
ed9b544e 6951 if (sortby) {
6d7d1370 6952 /* lookup value to sort by */
3305306f 6953 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
705dad38 6954 if (!byval) continue;
ed9b544e 6955 } else {
6d7d1370
PN
6956 /* use object itself to sort by */
6957 byval = vector[j].obj;
6958 }
6959
6960 if (alpha) {
08ee9b57 6961 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6d7d1370
PN
6962 } else {
6963 if (byval->encoding == REDIS_ENCODING_RAW) {
6964 vector[j].u.score = strtod(byval->ptr,NULL);
16fa22f1 6965 } else if (byval->encoding == REDIS_ENCODING_INT) {
6d7d1370
PN
6966 /* Don't need to decode the object if it's
6967 * integer-encoded (the only encoding supported) so
6968 * far. We can just cast it */
16fa22f1
PN
6969 vector[j].u.score = (long)byval->ptr;
6970 } else {
6971 redisAssert(1 != 1);
942a3961 6972 }
ed9b544e 6973 }
6d7d1370 6974
705dad38
PN
6975 /* when the object was retrieved using lookupKeyByPattern,
6976 * its refcount needs to be decreased. */
6977 if (sortby) {
6978 decrRefCount(byval);
ed9b544e 6979 }
6980 }
6981 }
6982
6983 /* We are ready to sort the vector... perform a bit of sanity check
6984 * on the LIMIT option too. We'll use a partial version of quicksort. */
6985 start = (limit_start < 0) ? 0 : limit_start;
6986 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6987 if (start >= vectorlen) {
6988 start = vectorlen-1;
6989 end = vectorlen-2;
6990 }
6991 if (end >= vectorlen) end = vectorlen-1;
6992
6993 if (dontsort == 0) {
6994 server.sort_desc = desc;
6995 server.sort_alpha = alpha;
6996 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 6997 if (sortby && (start != 0 || end != vectorlen-1))
6998 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6999 else
7000 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 7001 }
7002
7003 /* Send command output to the output buffer, performing the specified
7004 * GET/DEL/INCR/DECR operations if any. */
7005 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 7006 if (storekey == NULL) {
7007 /* STORE option not specified, sent the sorting result to client */
7008 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7009 for (j = start; j <= end; j++) {
7010 listNode *ln;
c7df85a4 7011 listIter li;
7012
dd88747b 7013 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 7014 listRewind(operations,&li);
7015 while((ln = listNext(&li))) {
443c6409 7016 redisSortOperation *sop = ln->value;
7017 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7018 vector[j].obj);
7019
7020 if (sop->type == REDIS_SORT_GET) {
55017f9d 7021 if (!val) {
443c6409 7022 addReply(c,shared.nullbulk);
7023 } else {
dd88747b 7024 addReplyBulk(c,val);
55017f9d 7025 decrRefCount(val);
443c6409 7026 }
7027 } else {
dfc5e96c 7028 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 7029 }
7030 }
ed9b544e 7031 }
443c6409 7032 } else {
7033 robj *listObject = createListObject();
7034 list *listPtr = (list*) listObject->ptr;
7035
7036 /* STORE option specified, set the sorting result as a List object */
7037 for (j = start; j <= end; j++) {
7038 listNode *ln;
c7df85a4 7039 listIter li;
7040
443c6409 7041 if (!getop) {
7042 listAddNodeTail(listPtr,vector[j].obj);
7043 incrRefCount(vector[j].obj);
7044 }
c7df85a4 7045 listRewind(operations,&li);
7046 while((ln = listNext(&li))) {
443c6409 7047 redisSortOperation *sop = ln->value;
7048 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7049 vector[j].obj);
7050
7051 if (sop->type == REDIS_SORT_GET) {
55017f9d 7052 if (!val) {
443c6409 7053 listAddNodeTail(listPtr,createStringObject("",0));
7054 } else {
55017f9d
PN
7055 /* We should do a incrRefCount on val because it is
7056 * added to the list, but also a decrRefCount because
7057 * it is returned by lookupKeyByPattern. This results
7058 * in doing nothing at all. */
443c6409 7059 listAddNodeTail(listPtr,val);
443c6409 7060 }
ed9b544e 7061 } else {
dfc5e96c 7062 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 7063 }
ed9b544e 7064 }
ed9b544e 7065 }
121796f7 7066 if (dictReplace(c->db->dict,storekey,listObject)) {
7067 incrRefCount(storekey);
7068 }
443c6409 7069 /* Note: we add 1 because the DB is dirty anyway since even if the
7070 * SORT result is empty a new key is set and maybe the old content
7071 * replaced. */
7072 server.dirty += 1+outputlen;
7073 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 7074 }
7075
7076 /* Cleanup */
7077 decrRefCount(sortval);
7078 listRelease(operations);
7079 for (j = 0; j < vectorlen; j++) {
16fa22f1 7080 if (alpha && vector[j].u.cmpobj)
ed9b544e 7081 decrRefCount(vector[j].u.cmpobj);
7082 }
7083 zfree(vector);
7084}
7085
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer; the caller must make it large enough for
 * the longest result (a 20-digit byte count, the unit letter and the
 * terminator). 'n' is the byte count to format.
 *
 * The buffer is written for every value of 'n': amounts of 1024^4 bytes and
 * above are reported in T or P, and anything too large for P is printed as
 * a plain byte count, so the caller never reads an unset buffer. */
static void bytesToHuman(char *s, unsigned long long n) {
    const unsigned long long kib = 1024ULL;
    const unsigned long long mib = kib*1024;
    const unsigned long long gib = mib*1024;
    const unsigned long long tib = gib*1024;
    const unsigned long long pib = tib*1024;
    const unsigned long long eib = pib*1024;

    if (n < kib) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < mib) {
        sprintf(s,"%.2fK",(double)n/kib);
    } else if (n < gib) {
        sprintf(s,"%.2fM",(double)n/mib);
    } else if (n < tib) {
        sprintf(s,"%.2fG",(double)n/gib);
    } else if (n < pib) {
        sprintf(s,"%.2fT",(double)n/tib);
    } else if (n < eib) {
        sprintf(s,"%.2fP",(double)n/pib);
    } else {
        /* Larger than any unit handled above: fall back to raw bytes. */
        sprintf(s,"%lluB",n);
    }
}
7106
1c85b79f 7107/* Create the string returned by the INFO command. This is decoupled
7108 * by the INFO command itself as we need to report the same information
7109 * on memory corruption problems. */
7110static sds genRedisInfoString(void) {
ed9b544e 7111 sds info;
7112 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 7113 int j;
ec6c7a1d 7114 char hmem[64];
55a8298f 7115
b72f6a4b 7116 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 7117 info = sdscatprintf(sdsempty(),
7118 "redis_version:%s\r\n"
f1017b3f 7119 "arch_bits:%s\r\n"
7a932b74 7120 "multiplexing_api:%s\r\n"
0d7170a4 7121 "process_id:%ld\r\n"
682ac724 7122 "uptime_in_seconds:%ld\r\n"
7123 "uptime_in_days:%ld\r\n"
ed9b544e 7124 "connected_clients:%d\r\n"
7125 "connected_slaves:%d\r\n"
f86a74e9 7126 "blocked_clients:%d\r\n"
5fba9f71 7127 "used_memory:%zu\r\n"
ec6c7a1d 7128 "used_memory_human:%s\r\n"
ed9b544e 7129 "changes_since_last_save:%lld\r\n"
be2bb6b0 7130 "bgsave_in_progress:%d\r\n"
682ac724 7131 "last_save_time:%ld\r\n"
b3fad521 7132 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 7133 "total_connections_received:%lld\r\n"
7134 "total_commands_processed:%lld\r\n"
2a6a2ed1 7135 "expired_keys:%lld\r\n"
55a8298f 7136 "hash_max_zipmap_entries:%ld\r\n"
7137 "hash_max_zipmap_value:%ld\r\n"
ffc6b7f8 7138 "pubsub_channels:%ld\r\n"
7139 "pubsub_patterns:%u\r\n"
7d98e08c 7140 "vm_enabled:%d\r\n"
a0f643ea 7141 "role:%s\r\n"
ed9b544e 7142 ,REDIS_VERSION,
f1017b3f 7143 (sizeof(long) == 8) ? "64" : "32",
7a932b74 7144 aeGetApiName(),
0d7170a4 7145 (long) getpid(),
a0f643ea 7146 uptime,
7147 uptime/(3600*24),
ed9b544e 7148 listLength(server.clients)-listLength(server.slaves),
7149 listLength(server.slaves),
d5d55fc3 7150 server.blpop_blocked_clients,
b72f6a4b 7151 zmalloc_used_memory(),
ec6c7a1d 7152 hmem,
ed9b544e 7153 server.dirty,
9d65a1bb 7154 server.bgsavechildpid != -1,
ed9b544e 7155 server.lastsave,
b3fad521 7156 server.bgrewritechildpid != -1,
ed9b544e 7157 server.stat_numconnections,
7158 server.stat_numcommands,
2a6a2ed1 7159 server.stat_expiredkeys,
55a8298f 7160 server.hash_max_zipmap_entries,
7161 server.hash_max_zipmap_value,
ffc6b7f8 7162 dictSize(server.pubsub_channels),
7163 listLength(server.pubsub_patterns),
7d98e08c 7164 server.vm_enabled != 0,
a0f643ea 7165 server.masterhost == NULL ? "master" : "slave"
ed9b544e 7166 );
a0f643ea 7167 if (server.masterhost) {
7168 info = sdscatprintf(info,
7169 "master_host:%s\r\n"
7170 "master_port:%d\r\n"
7171 "master_link_status:%s\r\n"
7172 "master_last_io_seconds_ago:%d\r\n"
7173 ,server.masterhost,
7174 server.masterport,
7175 (server.replstate == REDIS_REPL_CONNECTED) ?
7176 "up" : "down",
f72b934d 7177 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 7178 );
7179 }
7d98e08c 7180 if (server.vm_enabled) {
1064ef87 7181 lockThreadedIO();
7d98e08c 7182 info = sdscatprintf(info,
7183 "vm_conf_max_memory:%llu\r\n"
7184 "vm_conf_page_size:%llu\r\n"
7185 "vm_conf_pages:%llu\r\n"
7186 "vm_stats_used_pages:%llu\r\n"
7187 "vm_stats_swapped_objects:%llu\r\n"
7188 "vm_stats_swappin_count:%llu\r\n"
7189 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 7190 "vm_stats_io_newjobs_len:%lu\r\n"
7191 "vm_stats_io_processing_len:%lu\r\n"
7192 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 7193 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 7194 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 7195 ,(unsigned long long) server.vm_max_memory,
7196 (unsigned long long) server.vm_page_size,
7197 (unsigned long long) server.vm_pages,
7198 (unsigned long long) server.vm_stats_used_pages,
7199 (unsigned long long) server.vm_stats_swapped_objects,
7200 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 7201 (unsigned long long) server.vm_stats_swapouts,
7202 (unsigned long) listLength(server.io_newjobs),
7203 (unsigned long) listLength(server.io_processing),
7204 (unsigned long) listLength(server.io_processed),
d5d55fc3 7205 (unsigned long) server.io_active_threads,
7206 (unsigned long) server.vm_blocked_clients
7d98e08c 7207 );
1064ef87 7208 unlockThreadedIO();
7d98e08c 7209 }
c3cb078d 7210 for (j = 0; j < server.dbnum; j++) {
7211 long long keys, vkeys;
7212
7213 keys = dictSize(server.db[j].dict);
7214 vkeys = dictSize(server.db[j].expires);
7215 if (keys || vkeys) {
9d65a1bb 7216 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 7217 j, keys, vkeys);
7218 }
7219 }
1c85b79f 7220 return info;
7221}
7222
7223static void infoCommand(redisClient *c) {
7224 sds info = genRedisInfoString();
83c6a618 7225 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7226 (unsigned long)sdslen(info)));
ed9b544e 7227 addReplySds(c,info);
70003d28 7228 addReply(c,shared.crlf);
ed9b544e 7229}
7230
3305306f 7231static void monitorCommand(redisClient *c) {
7232 /* ignore MONITOR if aleady slave or in monitor mode */
7233 if (c->flags & REDIS_SLAVE) return;
7234
7235 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7236 c->slaveseldb = 0;
6b47e12e 7237 listAddNodeTail(server.monitors,c);
3305306f 7238 addReply(c,shared.ok);
7239}
7240
7241/* ================================= Expire ================================= */
7242static int removeExpire(redisDb *db, robj *key) {
7243 if (dictDelete(db->expires,key) == DICT_OK) {
7244 return 1;
7245 } else {
7246 return 0;
7247 }
7248}
7249
7250static int setExpire(redisDb *db, robj *key, time_t when) {
7251 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7252 return 0;
7253 } else {
7254 incrRefCount(key);
7255 return 1;
7256 }
7257}
7258
bb32ede5 7259/* Return the expire time of the specified key, or -1 if no expire
7260 * is associated with this key (i.e. the key is non volatile) */
7261static time_t getExpire(redisDb *db, robj *key) {
7262 dictEntry *de;
7263
7264 /* No expire? return ASAP */
7265 if (dictSize(db->expires) == 0 ||
7266 (de = dictFind(db->expires,key)) == NULL) return -1;
7267
7268 return (time_t) dictGetEntryVal(de);
7269}
7270
3305306f 7271static int expireIfNeeded(redisDb *db, robj *key) {
7272 time_t when;
7273 dictEntry *de;
7274
7275 /* No expire? return ASAP */
7276 if (dictSize(db->expires) == 0 ||
7277 (de = dictFind(db->expires,key)) == NULL) return 0;
7278
7279 /* Lookup the expire */
7280 when = (time_t) dictGetEntryVal(de);
7281 if (time(NULL) <= when) return 0;
7282
7283 /* Delete the key */
7284 dictDelete(db->expires,key);
2a6a2ed1 7285 server.stat_expiredkeys++;
3305306f 7286 return dictDelete(db->dict,key) == DICT_OK;
7287}
7288
7289static int deleteIfVolatile(redisDb *db, robj *key) {
7290 dictEntry *de;
7291
7292 /* No expire? return ASAP */
7293 if (dictSize(db->expires) == 0 ||
7294 (de = dictFind(db->expires,key)) == NULL) return 0;
7295
7296 /* Delete the key */
0c66a471 7297 server.dirty++;
2a6a2ed1 7298 server.stat_expiredkeys++;
3305306f 7299 dictDelete(db->expires,key);
7300 return dictDelete(db->dict,key) == DICT_OK;
7301}
7302
bbe025e0 7303static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 7304 dictEntry *de;
bbe025e0
AM
7305 time_t seconds;
7306
bd79a6bd 7307 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
bbe025e0
AM
7308
7309 seconds -= offset;
3305306f 7310
802e8373 7311 de = dictFind(c->db->dict,key);
3305306f 7312 if (de == NULL) {
7313 addReply(c,shared.czero);
7314 return;
7315 }
d4dd6556 7316 if (seconds <= 0) {
43e5ccdf 7317 if (deleteKey(c->db,key)) server.dirty++;
7318 addReply(c, shared.cone);
3305306f 7319 return;
7320 } else {
7321 time_t when = time(NULL)+seconds;
802e8373 7322 if (setExpire(c->db,key,when)) {
3305306f 7323 addReply(c,shared.cone);
77423026 7324 server.dirty++;
7325 } else {
3305306f 7326 addReply(c,shared.czero);
77423026 7327 }
3305306f 7328 return;
7329 }
7330}
7331
802e8373 7332static void expireCommand(redisClient *c) {
bbe025e0 7333 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 7334}
7335
7336static void expireatCommand(redisClient *c) {
bbe025e0 7337 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 7338}
7339
fd88489a 7340static void ttlCommand(redisClient *c) {
7341 time_t expire;
7342 int ttl = -1;
7343
7344 expire = getExpire(c->db,c->argv[1]);
7345 if (expire != -1) {
7346 ttl = (int) (expire-time(NULL));
7347 if (ttl < 0) ttl = -1;
7348 }
7349 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7350}
7351
6e469882 7352/* ================================ MULTI/EXEC ============================== */
7353
7354/* Client state initialization for MULTI/EXEC */
7355static void initClientMultiState(redisClient *c) {
7356 c->mstate.commands = NULL;
7357 c->mstate.count = 0;
7358}
7359
7360/* Release all the resources associated with MULTI/EXEC state */
7361static void freeClientMultiState(redisClient *c) {
7362 int j;
7363
7364 for (j = 0; j < c->mstate.count; j++) {
7365 int i;
7366 multiCmd *mc = c->mstate.commands+j;
7367
7368 for (i = 0; i < mc->argc; i++)
7369 decrRefCount(mc->argv[i]);
7370 zfree(mc->argv);
7371 }
7372 zfree(c->mstate.commands);
7373}
7374
7375/* Add a new command into the MULTI commands queue */
7376static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7377 multiCmd *mc;
7378 int j;
7379
7380 c->mstate.commands = zrealloc(c->mstate.commands,
7381 sizeof(multiCmd)*(c->mstate.count+1));
7382 mc = c->mstate.commands+c->mstate.count;
7383 mc->cmd = cmd;
7384 mc->argc = c->argc;
7385 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7386 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7387 for (j = 0; j < c->argc; j++)
7388 incrRefCount(mc->argv[j]);
7389 c->mstate.count++;
7390}
7391
7392static void multiCommand(redisClient *c) {
7393 c->flags |= REDIS_MULTI;
36c548f0 7394 addReply(c,shared.ok);
6e469882 7395}
7396
18b6cb76
DJ
7397static void discardCommand(redisClient *c) {
7398 if (!(c->flags & REDIS_MULTI)) {
7399 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7400 return;
7401 }
7402
7403 freeClientMultiState(c);
7404 initClientMultiState(c);
7405 c->flags &= (~REDIS_MULTI);
7406 addReply(c,shared.ok);
7407}
7408
66c8853f 7409/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7410 * implememntation for more information. */
7411static void execCommandReplicateMulti(redisClient *c) {
7412 struct redisCommand *cmd;
7413 robj *multistring = createStringObject("MULTI",5);
7414
7415 cmd = lookupCommand("multi");
7416 if (server.appendonly)
7417 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7418 if (listLength(server.slaves))
7419 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7420 decrRefCount(multistring);
7421}
7422
6e469882 7423static void execCommand(redisClient *c) {
7424 int j;
7425 robj **orig_argv;
7426 int orig_argc;
7427
7428 if (!(c->flags & REDIS_MULTI)) {
7429 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7430 return;
7431 }
7432
66c8853f 7433 /* Replicate a MULTI request now that we are sure the block is executed.
7434 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7435 * both the AOF and the replication link will have the same consistency
7436 * and atomicity guarantees. */
7437 execCommandReplicateMulti(c);
7438
7439 /* Exec all the queued commands */
6e469882 7440 orig_argv = c->argv;
7441 orig_argc = c->argc;
7442 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7443 for (j = 0; j < c->mstate.count; j++) {
7444 c->argc = c->mstate.commands[j].argc;
7445 c->argv = c->mstate.commands[j].argv;
7446 call(c,c->mstate.commands[j].cmd);
7447 }
7448 c->argv = orig_argv;
7449 c->argc = orig_argc;
7450 freeClientMultiState(c);
7451 initClientMultiState(c);
7452 c->flags &= (~REDIS_MULTI);
66c8853f 7453 /* Make sure the EXEC command is always replicated / AOF, since we
7454 * always send the MULTI command (we can't know beforehand if the
7455 * next operations will contain at least a modification to the DB). */
7456 server.dirty++;
6e469882 7457}
7458
4409877e 7459/* =========================== Blocking Operations ========================= */
7460
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, using BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non-empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocked on these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocked client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
7489
7490/* Set a client in blocking mode for the specified key, with the specified
7491 * timeout */
b177fd30 7492static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 7493 dictEntry *de;
7494 list *l;
b177fd30 7495 int j;
4409877e 7496
b177fd30 7497 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7498 c->blockingkeysnum = numkeys;
4409877e 7499 c->blockingto = timeout;
b177fd30 7500 for (j = 0; j < numkeys; j++) {
7501 /* Add the key in the client structure, to map clients -> keys */
7502 c->blockingkeys[j] = keys[j];
7503 incrRefCount(keys[j]);
4409877e 7504
b177fd30 7505 /* And in the other "side", to map keys -> clients */
7506 de = dictFind(c->db->blockingkeys,keys[j]);
7507 if (de == NULL) {
7508 int retval;
7509
7510 /* For every key we take a list of clients blocked for it */
7511 l = listCreate();
7512 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7513 incrRefCount(keys[j]);
7514 assert(retval == DICT_OK);
7515 } else {
7516 l = dictGetEntryVal(de);
7517 }
7518 listAddNodeTail(l,c);
4409877e 7519 }
b177fd30 7520 /* Mark the client as a blocked client */
4409877e 7521 c->flags |= REDIS_BLOCKED;
d5d55fc3 7522 server.blpop_blocked_clients++;
4409877e 7523}
7524
7525/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 7526static void unblockClientWaitingData(redisClient *c) {
4409877e 7527 dictEntry *de;
7528 list *l;
b177fd30 7529 int j;
4409877e 7530
b177fd30 7531 assert(c->blockingkeys != NULL);
7532 /* The client may wait for multiple keys, so unblock it for every key. */
7533 for (j = 0; j < c->blockingkeysnum; j++) {
7534 /* Remove this client from the list of clients waiting for this key. */
7535 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7536 assert(de != NULL);
7537 l = dictGetEntryVal(de);
7538 listDelNode(l,listSearchKey(l,c));
7539 /* If the list is empty we need to remove it to avoid wasting memory */
7540 if (listLength(l) == 0)
7541 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7542 decrRefCount(c->blockingkeys[j]);
7543 }
7544 /* Cleanup the client structure */
7545 zfree(c->blockingkeys);
7546 c->blockingkeys = NULL;
4409877e 7547 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 7548 server.blpop_blocked_clients--;
5921aa36 7549 /* We want to process data if there is some command waiting
b0d8747d 7550 * in the input buffer. Note that this is safe even if
7551 * unblockClientWaitingData() gets called from freeClient() because
7552 * freeClient() will be smart enough to call this function
7553 * *after* c->querybuf was set to NULL. */
4409877e 7554 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7555}
7556
7557/* This should be called from any function PUSHing into lists.
7558 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7559 * 'ele' is the element pushed.
7560 *
7561 * If the function returns 0 there was no client waiting for a list push
7562 * against this key.
7563 *
7564 * If the function returns 1 there was a client waiting for a list push
7565 * against this key, the element was passed to this client thus it's not
7566 * needed to actually add it to the list and the caller should return asap. */
7567static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7568 struct dictEntry *de;
7569 redisClient *receiver;
7570 list *l;
7571 listNode *ln;
7572
7573 de = dictFind(c->db->blockingkeys,key);
7574 if (de == NULL) return 0;
7575 l = dictGetEntryVal(de);
7576 ln = listFirst(l);
7577 assert(ln != NULL);
7578 receiver = ln->value;
4409877e 7579
b177fd30 7580 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 7581 addReplyBulk(receiver,key);
7582 addReplyBulk(receiver,ele);
b0d8747d 7583 unblockClientWaitingData(receiver);
4409877e 7584 return 1;
7585}
7586
7587/* Blocking RPOP/LPOP */
7588static void blockingPopGenericCommand(redisClient *c, int where) {
7589 robj *o;
7590 time_t timeout;
b177fd30 7591 int j;
4409877e 7592
b177fd30 7593 for (j = 1; j < c->argc-1; j++) {
7594 o = lookupKeyWrite(c->db,c->argv[j]);
7595 if (o != NULL) {
7596 if (o->type != REDIS_LIST) {
7597 addReply(c,shared.wrongtypeerr);
4409877e 7598 return;
b177fd30 7599 } else {
7600 list *list = o->ptr;
7601 if (listLength(list) != 0) {
7602 /* If the list contains elements fall back to the usual
7603 * non-blocking POP operation */
7604 robj *argv[2], **orig_argv;
7605 int orig_argc;
e0a62c7f 7606
b177fd30 7607 /* We need to alter the command arguments before to call
7608 * popGenericCommand() as the command takes a single key. */
7609 orig_argv = c->argv;
7610 orig_argc = c->argc;
7611 argv[1] = c->argv[j];
7612 c->argv = argv;
7613 c->argc = 2;
7614
7615 /* Also the return value is different, we need to output
7616 * the multi bulk reply header and the key name. The
7617 * "real" command will add the last element (the value)
7618 * for us. If this souds like an hack to you it's just
7619 * because it is... */
7620 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 7621 addReplyBulk(c,argv[1]);
b177fd30 7622 popGenericCommand(c,where);
7623
7624 /* Fix the client structure with the original stuff */
7625 c->argv = orig_argv;
7626 c->argc = orig_argc;
7627 return;
7628 }
4409877e 7629 }
7630 }
7631 }
7632 /* If the list is empty or the key does not exists we must block */
b177fd30 7633 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 7634 if (timeout > 0) timeout += time(NULL);
b177fd30 7635 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 7636}
7637
7638static void blpopCommand(redisClient *c) {
7639 blockingPopGenericCommand(c,REDIS_HEAD);
7640}
7641
7642static void brpopCommand(redisClient *c) {
7643 blockingPopGenericCommand(c,REDIS_TAIL);
7644}
7645
ed9b544e 7646/* =============================== Replication ============================= */
7647
a4d1ba9a 7648static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7649 ssize_t nwritten, ret = size;
7650 time_t start = time(NULL);
7651
7652 timeout++;
7653 while(size) {
7654 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7655 nwritten = write(fd,ptr,size);
7656 if (nwritten == -1) return -1;
7657 ptr += nwritten;
7658 size -= nwritten;
7659 }
7660 if ((time(NULL)-start) > timeout) {
7661 errno = ETIMEDOUT;
7662 return -1;
7663 }
7664 }
7665 return ret;
7666}
7667
a4d1ba9a 7668static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7669 ssize_t nread, totread = 0;
7670 time_t start = time(NULL);
7671
7672 timeout++;
7673 while(size) {
7674 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7675 nread = read(fd,ptr,size);
7676 if (nread == -1) return -1;
7677 ptr += nread;
7678 size -= nread;
7679 totread += nread;
7680 }
7681 if ((time(NULL)-start) > timeout) {
7682 errno = ETIMEDOUT;
7683 return -1;
7684 }
7685 }
7686 return totread;
7687}
7688
/* Read a line from 'fd' into 'ptr', one byte at a time via syncRead().
 *
 * 'size' is the capacity of the buffer at 'ptr', terminator included;
 * 'timeout' is forwarded to every syncRead() call.
 *
 * The line terminator ("\n", optionally preceded by "\r") is stripped and
 * the buffer is nul-terminated. Returns the number of bytes stored before
 * the newline was found (a stripped "\r" is included in that count), or
 * the number of bytes stored so far when the buffer fills up first; -1 if
 * syncRead() fails.
 *
 * At most size-1 bytes are stored: the remaining capacity is decremented
 * for every stored byte, so an overlong line can no longer write past the
 * end of the caller's buffer. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    /* Reserve one byte for the terminator. */
    size--;
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            size--;
        }
    }
    return nread;
}
7709
7710static void syncCommand(redisClient *c) {
40d224a9 7711 /* ignore SYNC if aleady slave or in monitor mode */
7712 if (c->flags & REDIS_SLAVE) return;
7713
7714 /* SYNC can't be issued when the server has pending data to send to
7715 * the client about already issued commands. We need a fresh reply
7716 * buffer registering the differences between the BGSAVE and the current
7717 * dataset, so that we can copy to other slaves if needed. */
7718 if (listLength(c->reply) != 0) {
7719 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7720 return;
7721 }
7722
7723 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7724 /* Here we need to check if there is a background saving operation
7725 * in progress, or if it is required to start one */
9d65a1bb 7726 if (server.bgsavechildpid != -1) {
40d224a9 7727 /* Ok a background save is in progress. Let's check if it is a good
7728 * one for replication, i.e. if there is another slave that is
7729 * registering differences since the server forked to save */
7730 redisClient *slave;
7731 listNode *ln;
c7df85a4 7732 listIter li;
40d224a9 7733
c7df85a4 7734 listRewind(server.slaves,&li);
7735 while((ln = listNext(&li))) {
40d224a9 7736 slave = ln->value;
7737 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 7738 }
7739 if (ln) {
7740 /* Perfect, the server is already registering differences for
7741 * another slave. Set the right state, and copy the buffer. */
7742 listRelease(c->reply);
7743 c->reply = listDup(slave->reply);
40d224a9 7744 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7745 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7746 } else {
7747 /* No way, we need to wait for the next BGSAVE in order to
7748 * register differences */
7749 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7750 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7751 }
7752 } else {
7753 /* Ok we don't have a BGSAVE in progress, let's start one */
7754 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7755 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7756 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7757 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7758 return;
7759 }
7760 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7761 }
6208b3a7 7762 c->repldbfd = -1;
40d224a9 7763 c->flags |= REDIS_SLAVE;
7764 c->slaveseldb = 0;
6b47e12e 7765 listAddNodeTail(server.slaves,c);
40d224a9 7766 return;
7767}
7768
6208b3a7 7769static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7770 redisClient *slave = privdata;
7771 REDIS_NOTUSED(el);
7772 REDIS_NOTUSED(mask);
7773 char buf[REDIS_IOBUF_LEN];
7774 ssize_t nwritten, buflen;
7775
7776 if (slave->repldboff == 0) {
7777 /* Write the bulk write count before to transfer the DB. In theory here
7778 * we don't know how much room there is in the output buffer of the
7779 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7780 * operations) will never be smaller than the few bytes we need. */
7781 sds bulkcount;
7782
7783 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7784 slave->repldbsize);
7785 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7786 {
7787 sdsfree(bulkcount);
7788 freeClient(slave);
7789 return;
7790 }
7791 sdsfree(bulkcount);
7792 }
7793 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7794 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7795 if (buflen <= 0) {
7796 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7797 (buflen == 0) ? "premature EOF" : strerror(errno));
7798 freeClient(slave);
7799 return;
7800 }
7801 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 7802 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 7803 strerror(errno));
7804 freeClient(slave);
7805 return;
7806 }
7807 slave->repldboff += nwritten;
7808 if (slave->repldboff == slave->repldbsize) {
7809 close(slave->repldbfd);
7810 slave->repldbfd = -1;
7811 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7812 slave->replstate = REDIS_REPL_ONLINE;
7813 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 7814 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 7815 freeClient(slave);
7816 return;
7817 }
7818 addReplySds(slave,sdsempty());
7819 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7820 }
7821}
ed9b544e 7822
a3b21203 7823/* This function is called at the end of every backgrond saving.
7824 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7825 * otherwise REDIS_ERR is passed to the function.
7826 *
7827 * The goal of this function is to handle slaves waiting for a successful
7828 * background saving in order to perform non-blocking synchronization. */
7829static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 7830 listNode *ln;
7831 int startbgsave = 0;
c7df85a4 7832 listIter li;
ed9b544e 7833
c7df85a4 7834 listRewind(server.slaves,&li);
7835 while((ln = listNext(&li))) {
6208b3a7 7836 redisClient *slave = ln->value;
ed9b544e 7837
6208b3a7 7838 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7839 startbgsave = 1;
7840 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7841 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 7842 struct redis_stat buf;
e0a62c7f 7843
6208b3a7 7844 if (bgsaveerr != REDIS_OK) {
7845 freeClient(slave);
7846 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7847 continue;
7848 }
7849 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 7850 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 7851 freeClient(slave);
7852 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7853 continue;
7854 }
7855 slave->repldboff = 0;
7856 slave->repldbsize = buf.st_size;
7857 slave->replstate = REDIS_REPL_SEND_BULK;
7858 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 7859 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 7860 freeClient(slave);
7861 continue;
7862 }
7863 }
ed9b544e 7864 }
6208b3a7 7865 if (startbgsave) {
7866 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 7867 listIter li;
7868
7869 listRewind(server.slaves,&li);
6208b3a7 7870 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 7871 while((ln = listNext(&li))) {
6208b3a7 7872 redisClient *slave = ln->value;
ed9b544e 7873
6208b3a7 7874 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7875 freeClient(slave);
7876 }
7877 }
7878 }
ed9b544e 7879}
7880
7881static int syncWithMaster(void) {
d0ccebcf 7882 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 7883 long dumpsize;
ed9b544e 7884 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 7885 int dfd, maxtries = 5;
ed9b544e 7886
7887 if (fd == -1) {
7888 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7889 strerror(errno));
7890 return REDIS_ERR;
7891 }
d0ccebcf 7892
7893 /* AUTH with the master if required. */
7894 if(server.masterauth) {
7895 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7896 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7897 close(fd);
7898 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7899 strerror(errno));
7900 return REDIS_ERR;
7901 }
7902 /* Read the AUTH result. */
7903 if (syncReadLine(fd,buf,1024,3600) == -1) {
7904 close(fd);
7905 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7906 strerror(errno));
7907 return REDIS_ERR;
7908 }
7909 if (buf[0] != '+') {
7910 close(fd);
7911 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7912 return REDIS_ERR;
7913 }
7914 }
7915
ed9b544e 7916 /* Issue the SYNC command */
7917 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7918 close(fd);
7919 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7920 strerror(errno));
7921 return REDIS_ERR;
7922 }
7923 /* Read the bulk write count */
8c4d91fc 7924 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 7925 close(fd);
7926 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7927 strerror(errno));
7928 return REDIS_ERR;
7929 }
4aa701c1 7930 if (buf[0] != '$') {
7931 close(fd);
7932 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7933 return REDIS_ERR;
7934 }
18e61fa2 7935 dumpsize = strtol(buf+1,NULL,10);
7936 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 7937 /* Read the bulk write data on a temp file */
8c5abee8 7938 while(maxtries--) {
7939 snprintf(tmpfile,256,
7940 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7941 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7942 if (dfd != -1) break;
5de9ad7c 7943 sleep(1);
8c5abee8 7944 }
ed9b544e 7945 if (dfd == -1) {
7946 close(fd);
7947 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7948 return REDIS_ERR;
7949 }
7950 while(dumpsize) {
7951 int nread, nwritten;
7952
7953 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7954 if (nread == -1) {
7955 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7956 strerror(errno));
7957 close(fd);
7958 close(dfd);
7959 return REDIS_ERR;
7960 }
7961 nwritten = write(dfd,buf,nread);
7962 if (nwritten == -1) {
7963 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7964 close(fd);
7965 close(dfd);
7966 return REDIS_ERR;
7967 }
7968 dumpsize -= nread;
7969 }
7970 close(dfd);
7971 if (rename(tmpfile,server.dbfilename) == -1) {
7972 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7973 unlink(tmpfile);
7974 close(fd);
7975 return REDIS_ERR;
7976 }
7977 emptyDb();
f78fd11b 7978 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 7979 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7980 close(fd);
7981 return REDIS_ERR;
7982 }
7983 server.master = createClient(fd);
7984 server.master->flags |= REDIS_MASTER;
179b3952 7985 server.master->authenticated = 1;
ed9b544e 7986 server.replstate = REDIS_REPL_CONNECTED;
7987 return REDIS_OK;
7988}
7989
321b0e13 7990static void slaveofCommand(redisClient *c) {
7991 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7992 !strcasecmp(c->argv[2]->ptr,"one")) {
7993 if (server.masterhost) {
7994 sdsfree(server.masterhost);
7995 server.masterhost = NULL;
7996 if (server.master) freeClient(server.master);
7997 server.replstate = REDIS_REPL_NONE;
7998 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7999 }
8000 } else {
8001 sdsfree(server.masterhost);
8002 server.masterhost = sdsdup(c->argv[1]->ptr);
8003 server.masterport = atoi(c->argv[2]->ptr);
8004 if (server.master) freeClient(server.master);
8005 server.replstate = REDIS_REPL_CONNECT;
8006 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8007 server.masterhost, server.masterport);
8008 }
8009 addReply(c,shared.ok);
8010}
8011
3fd78bcd 8012/* ============================ Maxmemory directive ======================== */
8013
a5819310 8014/* Try to free one object form the pre-allocated objects free list.
8015 * This is useful under low mem conditions as by default we take 1 million
8016 * free objects allocated. On success REDIS_OK is returned, otherwise
8017 * REDIS_ERR. */
8018static int tryFreeOneObjectFromFreelist(void) {
f870935d 8019 robj *o;
8020
a5819310 8021 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8022 if (listLength(server.objfreelist)) {
8023 listNode *head = listFirst(server.objfreelist);
8024 o = listNodeValue(head);
8025 listDelNode(server.objfreelist,head);
8026 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8027 zfree(o);
8028 return REDIS_OK;
8029 } else {
8030 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8031 return REDIS_ERR;
8032 }
f870935d 8033}
8034
3fd78bcd 8035/* This function gets called when 'maxmemory' is set on the config file to limit
8036 * the max memory used by the server, and we are out of memory.
8037 * This function will try to, in order:
8038 *
8039 * - Free objects from the free list
8040 * - Try to remove keys with an EXPIRE set
8041 *
8042 * It is not possible to free enough memory to reach used-memory < maxmemory
8043 * the server will start refusing commands that will enlarge even more the
8044 * memory usage.
8045 */
8046static void freeMemoryIfNeeded(void) {
8047 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 8048 int j, k, freed = 0;
8049
8050 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8051 for (j = 0; j < server.dbnum; j++) {
8052 int minttl = -1;
8053 robj *minkey = NULL;
8054 struct dictEntry *de;
8055
8056 if (dictSize(server.db[j].expires)) {
8057 freed = 1;
8058 /* From a sample of three keys drop the one nearest to
8059 * the natural expire */
8060 for (k = 0; k < 3; k++) {
8061 time_t t;
8062
8063 de = dictGetRandomKey(server.db[j].expires);
8064 t = (time_t) dictGetEntryVal(de);
8065 if (minttl == -1 || t < minttl) {
8066 minkey = dictGetEntryKey(de);
8067 minttl = t;
3fd78bcd 8068 }
3fd78bcd 8069 }
a5819310 8070 deleteKey(server.db+j,minkey);
3fd78bcd 8071 }
3fd78bcd 8072 }
a5819310 8073 if (!freed) return; /* nothing to free... */
3fd78bcd 8074 }
8075}
8076
f80dff62 8077/* ============================== Append Only file ========================== */
8078
28ed1f33 8079/* Write the append only file buffer on disk.
8080 *
8081 * Since we are required to write the AOF before replying to the client,
8082 * and the only way the client socket can get a write is entering when the
8083 * the event loop, we accumulate all the AOF writes in a memory
8084 * buffer and write it on disk using this function just before entering
8085 * the event loop again. */
8086static void flushAppendOnlyFile(void) {
8087 time_t now;
8088 ssize_t nwritten;
8089
8090 if (sdslen(server.aofbuf) == 0) return;
8091
8092 /* We want to perform a single write. This should be guaranteed atomic
8093 * at least if the filesystem we are writing is a real physical one.
8094 * While this will save us against the server being killed I don't think
8095 * there is much to do about the whole server stopping for power problems
8096 * or alike */
8097 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8098 if (nwritten != (signed)sdslen(server.aofbuf)) {
8099 /* Ooops, we are in troubles. The best thing to do for now is
8100 * aborting instead of giving the illusion that everything is
8101 * working as expected. */
8102 if (nwritten == -1) {
8103 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8104 } else {
8105 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8106 }
8107 exit(1);
8108 }
8109 sdsfree(server.aofbuf);
8110 server.aofbuf = sdsempty();
8111
8112 /* Fsync if needed */
8113 now = time(NULL);
8114 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8115 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8116 now-server.lastfsync > 1))
8117 {
8118 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8119 * flushing metadata. */
8120 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8121 server.lastfsync = now;
8122 }
8123}
8124
f80dff62 8125static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8126 sds buf = sdsempty();
8127 int j;
f80dff62 8128 robj *tmpargv[3];
8129
8130 /* The DB this command was targetting is not the same as the last command
8131 * we appendend. To issue a SELECT command is needed. */
8132 if (dictid != server.appendseldb) {
8133 char seldb[64];
8134
8135 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 8136 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 8137 (unsigned long)strlen(seldb),seldb);
f80dff62 8138 server.appendseldb = dictid;
8139 }
8140
8141 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
8142 * EXPIREs into EXPIREATs calls */
8143 if (cmd->proc == expireCommand) {
8144 long when;
8145
8146 tmpargv[0] = createStringObject("EXPIREAT",8);
8147 tmpargv[1] = argv[1];
8148 incrRefCount(argv[1]);
8149 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
8150 tmpargv[2] = createObject(REDIS_STRING,
8151 sdscatprintf(sdsempty(),"%ld",when));
8152 argv = tmpargv;
8153 }
8154
8155 /* Append the actual command */
8156 buf = sdscatprintf(buf,"*%d\r\n",argc);
8157 for (j = 0; j < argc; j++) {
8158 robj *o = argv[j];
8159
9d65a1bb 8160 o = getDecodedObject(o);
83c6a618 8161 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
f80dff62 8162 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8163 buf = sdscatlen(buf,"\r\n",2);
9d65a1bb 8164 decrRefCount(o);
f80dff62 8165 }
8166
8167 /* Free the objects from the modified argv for EXPIREAT */
8168 if (cmd->proc == expireCommand) {
8169 for (j = 0; j < 3; j++)
8170 decrRefCount(argv[j]);
8171 }
8172
28ed1f33 8173 /* Append to the AOF buffer. This will be flushed on disk just before
8174 * of re-entering the event loop, so before the client will get a
8175 * positive reply about the operation performed. */
8176 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8177
85a83172 8178 /* If a background append only file rewriting is in progress we want to
8179 * accumulate the differences between the child DB and the current one
8180 * in a buffer, so that when the child process will do its work we
8181 * can append the differences to the new append only file. */
8182 if (server.bgrewritechildpid != -1)
8183 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8184
8185 sdsfree(buf);
f80dff62 8186}
8187
8188/* In Redis commands are always executed in the context of a client, so in
8189 * order to load the append only file we need to create a fake client. */
8190static struct redisClient *createFakeClient(void) {
8191 struct redisClient *c = zmalloc(sizeof(*c));
8192
8193 selectDb(c,0);
8194 c->fd = -1;
8195 c->querybuf = sdsempty();
8196 c->argc = 0;
8197 c->argv = NULL;
8198 c->flags = 0;
9387d17d 8199 /* We set the fake client as a slave waiting for the synchronization
8200 * so that Redis will not try to send replies to this client. */
8201 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 8202 c->reply = listCreate();
8203 listSetFreeMethod(c->reply,decrRefCount);
8204 listSetDupMethod(c->reply,dupClientReplyValue);
4132ad8d 8205 initClientMultiState(c);
f80dff62 8206 return c;
8207}
8208
8209static void freeFakeClient(struct redisClient *c) {
8210 sdsfree(c->querybuf);
8211 listRelease(c->reply);
4132ad8d 8212 freeClientMultiState(c);
f80dff62 8213 zfree(c);
8214}
8215
8216/* Replay the append log file. On error REDIS_OK is returned. On non fatal
8217 * error (the append only file is zero-length) REDIS_ERR is returned. On
8218 * fatal error an error message is logged and the program exists. */
8219int loadAppendOnlyFile(char *filename) {
8220 struct redisClient *fakeClient;
8221 FILE *fp = fopen(filename,"r");
8222 struct redis_stat sb;
b492cf00 8223 unsigned long long loadedkeys = 0;
4132ad8d 8224 int appendonly = server.appendonly;
f80dff62 8225
8226 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8227 return REDIS_ERR;
8228
8229 if (fp == NULL) {
8230 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8231 exit(1);
8232 }
8233
4132ad8d
PN
8234 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8235 * to the same file we're about to read. */
8236 server.appendonly = 0;
8237
f80dff62 8238 fakeClient = createFakeClient();
8239 while(1) {
8240 int argc, j;
8241 unsigned long len;
8242 robj **argv;
8243 char buf[128];
8244 sds argsds;
8245 struct redisCommand *cmd;
8246
8247 if (fgets(buf,sizeof(buf),fp) == NULL) {
8248 if (feof(fp))
8249 break;
8250 else
8251 goto readerr;
8252 }
8253 if (buf[0] != '*') goto fmterr;
8254 argc = atoi(buf+1);
8255 argv = zmalloc(sizeof(robj*)*argc);
8256 for (j = 0; j < argc; j++) {
8257 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8258 if (buf[0] != '$') goto fmterr;
8259 len = strtol(buf+1,NULL,10);
8260 argsds = sdsnewlen(NULL,len);
0f151ef1 8261 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 8262 argv[j] = createObject(REDIS_STRING,argsds);
8263 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8264 }
8265
8266 /* Command lookup */
8267 cmd = lookupCommand(argv[0]->ptr);
8268 if (!cmd) {
8269 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8270 exit(1);
8271 }
bdcb92f2 8272 /* Try object encoding */
f80dff62 8273 if (cmd->flags & REDIS_CMD_BULK)
05df7621 8274 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 8275 /* Run the command in the context of a fake client */
8276 fakeClient->argc = argc;
8277 fakeClient->argv = argv;
8278 cmd->proc(fakeClient);
8279 /* Discard the reply objects list from the fake client */
8280 while(listLength(fakeClient->reply))
8281 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8282 /* Clean up, ready for the next command */
8283 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8284 zfree(argv);
b492cf00 8285 /* Handle swapping while loading big datasets when VM is on */
8286 loadedkeys++;
8287 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8288 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 8289 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 8290 }
8291 }
f80dff62 8292 }
4132ad8d
PN
8293
8294 /* This point can only be reached when EOF is reached without errors.
8295 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8296 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8297
f80dff62 8298 fclose(fp);
8299 freeFakeClient(fakeClient);
4132ad8d 8300 server.appendonly = appendonly;
f80dff62 8301 return REDIS_OK;
8302
8303readerr:
8304 if (feof(fp)) {
8305 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8306 } else {
8307 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8308 }
8309 exit(1);
8310fmterr:
8311 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8312 exit(1);
8313}
8314
9d65a1bb 8315/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
9c8e3cee 8316static int fwriteBulkObject(FILE *fp, robj *obj) {
9d65a1bb 8317 char buf[128];
b9bc0eef 8318 int decrrc = 0;
8319
f2d9f50f 8320 /* Avoid the incr/decr ref count business if possible to help
8321 * copy-on-write (we are often in a child process when this function
8322 * is called).
8323 * Also makes sure that key objects don't get incrRefCount-ed when VM
8324 * is enabled */
8325 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 8326 obj = getDecodedObject(obj);
8327 decrrc = 1;
8328 }
9d65a1bb 8329 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8330 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 8331 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8332 goto err;
9d65a1bb 8333 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 8334 if (decrrc) decrRefCount(obj);
9d65a1bb 8335 return 1;
8336err:
b9bc0eef 8337 if (decrrc) decrRefCount(obj);
9d65a1bb 8338 return 0;
8339}
8340
/* Write a binary-safe string to 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * fp  - destination stream.
 * s   - payload bytes (may contain NUL bytes, not required to be
 *       NUL-terminated).
 * len - number of payload bytes.
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* len is unsigned: it must be formatted with %lu, not %ld. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() of zero bytes returns 0, so skip it for empty payloads. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8352
/* Write a double to 'fp' in the bulk format $<count>\r\n<payload>\r\n
 * where the payload is the value formatted with "%.17g".
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t paylen;

    /* The payload is formatted together with its trailing CRLF, which is
     * not part of the advertised count. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8363
/* Write a long to 'fp' in the bulk format $<count>\r\n<payload>\r\n
 * where the payload is the decimal representation of the value.
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t paylen;

    /* The payload is formatted together with its trailing CRLF, which is
     * not part of the advertised count. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8374
8375/* Write a sequence of commands able to fully rebuild the dataset into
8376 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8377static int rewriteAppendOnlyFile(char *filename) {
8378 dictIterator *di = NULL;
8379 dictEntry *de;
8380 FILE *fp;
8381 char tmpfile[256];
8382 int j;
8383 time_t now = time(NULL);
8384
8385 /* Note that we have to use a different temp name here compared to the
8386 * one used by rewriteAppendOnlyFileBackground() function. */
8387 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8388 fp = fopen(tmpfile,"w");
8389 if (!fp) {
8390 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8391 return REDIS_ERR;
8392 }
8393 for (j = 0; j < server.dbnum; j++) {
8394 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8395 redisDb *db = server.db+j;
8396 dict *d = db->dict;
8397 if (dictSize(d) == 0) continue;
8398 di = dictGetIterator(d);
8399 if (!di) {
8400 fclose(fp);
8401 return REDIS_ERR;
8402 }
8403
8404 /* SELECT the new DB */
8405 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 8406 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 8407
8408 /* Iterate this DB writing every entry */
8409 while((de = dictNext(di)) != NULL) {
e7546c63 8410 robj *key, *o;
8411 time_t expiretime;
8412 int swapped;
8413
8414 key = dictGetEntryKey(de);
b9bc0eef 8415 /* If the value for this key is swapped, load a preview in memory.
8416 * We use a "swapped" flag to remember if we need to free the
8417 * value object instead to just increment the ref count anyway
8418 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 8419 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8420 key->storage == REDIS_VM_SWAPPING) {
e7546c63 8421 o = dictGetEntryVal(de);
8422 swapped = 0;
8423 } else {
8424 o = vmPreviewObject(key);
e7546c63 8425 swapped = 1;
8426 }
8427 expiretime = getExpire(db,key);
9d65a1bb 8428
8429 /* Save the key and associated value */
9d65a1bb 8430 if (o->type == REDIS_STRING) {
8431 /* Emit a SET command */
8432 char cmd[]="*3\r\n$3\r\nSET\r\n";
8433 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8434 /* Key and value */
9c8e3cee 8435 if (fwriteBulkObject(fp,key) == 0) goto werr;
8436 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 8437 } else if (o->type == REDIS_LIST) {
8438 /* Emit the RPUSHes needed to rebuild the list */
8439 list *list = o->ptr;
8440 listNode *ln;
c7df85a4 8441 listIter li;
9d65a1bb 8442
c7df85a4 8443 listRewind(list,&li);
8444 while((ln = listNext(&li))) {
9d65a1bb 8445 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8446 robj *eleobj = listNodeValue(ln);
8447
8448 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8449 if (fwriteBulkObject(fp,key) == 0) goto werr;
8450 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8451 }
8452 } else if (o->type == REDIS_SET) {
8453 /* Emit the SADDs needed to rebuild the set */
8454 dict *set = o->ptr;
8455 dictIterator *di = dictGetIterator(set);
8456 dictEntry *de;
8457
8458 while((de = dictNext(di)) != NULL) {
8459 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8460 robj *eleobj = dictGetEntryKey(de);
8461
8462 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8463 if (fwriteBulkObject(fp,key) == 0) goto werr;
8464 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8465 }
8466 dictReleaseIterator(di);
8467 } else if (o->type == REDIS_ZSET) {
8468 /* Emit the ZADDs needed to rebuild the sorted set */
8469 zset *zs = o->ptr;
8470 dictIterator *di = dictGetIterator(zs->dict);
8471 dictEntry *de;
8472
8473 while((de = dictNext(di)) != NULL) {
8474 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8475 robj *eleobj = dictGetEntryKey(de);
8476 double *score = dictGetEntryVal(de);
8477
8478 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8479 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8480 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 8481 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8482 }
8483 dictReleaseIterator(di);
9c8e3cee 8484 } else if (o->type == REDIS_HASH) {
8485 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8486
8487 /* Emit the HSETs needed to rebuild the hash */
8488 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8489 unsigned char *p = zipmapRewind(o->ptr);
8490 unsigned char *field, *val;
8491 unsigned int flen, vlen;
8492
8493 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8494 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8495 if (fwriteBulkObject(fp,key) == 0) goto werr;
8496 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8497 return -1;
8498 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8499 return -1;
8500 }
8501 } else {
8502 dictIterator *di = dictGetIterator(o->ptr);
8503 dictEntry *de;
8504
8505 while((de = dictNext(di)) != NULL) {
8506 robj *field = dictGetEntryKey(de);
8507 robj *val = dictGetEntryVal(de);
8508
8509 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8510 if (fwriteBulkObject(fp,key) == 0) goto werr;
8511 if (fwriteBulkObject(fp,field) == -1) return -1;
8512 if (fwriteBulkObject(fp,val) == -1) return -1;
8513 }
8514 dictReleaseIterator(di);
8515 }
9d65a1bb 8516 } else {
f83c6cb5 8517 redisPanic("Unknown object type");
9d65a1bb 8518 }
8519 /* Save the expire time */
8520 if (expiretime != -1) {
e96e4fbf 8521 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 8522 /* If this key is already expired skip it */
8523 if (expiretime < now) continue;
8524 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8525 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8526 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8527 }
b9bc0eef 8528 if (swapped) decrRefCount(o);
9d65a1bb 8529 }
8530 dictReleaseIterator(di);
8531 }
8532
8533 /* Make sure data will not remain on the OS's output buffers */
8534 fflush(fp);
8535 fsync(fileno(fp));
8536 fclose(fp);
e0a62c7f 8537
9d65a1bb 8538 /* Use RENAME to make sure the DB file is changed atomically only
8539 * if the generate DB file is ok. */
8540 if (rename(tmpfile,filename) == -1) {
8541 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8542 unlink(tmpfile);
8543 return REDIS_ERR;
8544 }
8545 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8546 return REDIS_OK;
8547
8548werr:
8549 fclose(fp);
8550 unlink(tmpfile);
e96e4fbf 8551 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 8552 if (di) dictReleaseIterator(di);
8553 return REDIS_ERR;
8554}
8555
8556/* This is how rewriting of the append only file in background works:
8557 *
8558 * 1) The user calls BGREWRITEAOF
8559 * 2) Redis calls this function, that forks():
8560 * 2a) the child rewrite the append only file in a temp file.
8561 * 2b) the parent accumulates differences in server.bgrewritebuf.
8562 * 3) When the child finished '2a' exists.
8563 * 4) The parent will trap the exit code, if it's OK, will append the
8564 * data accumulated into server.bgrewritebuf into the temp file, and
8565 * finally will rename(2) the temp file in the actual file name.
8566 * The the new file is reopened as the new append only file. Profit!
8567 */
8568static int rewriteAppendOnlyFileBackground(void) {
8569 pid_t childpid;
8570
8571 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 8572 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 8573 if ((childpid = fork()) == 0) {
8574 /* Child */
8575 char tmpfile[256];
9d65a1bb 8576
054e426d 8577 if (server.vm_enabled) vmReopenSwapFile();
8578 close(server.fd);
9d65a1bb 8579 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8580 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 8581 _exit(0);
9d65a1bb 8582 } else {
478c2c6f 8583 _exit(1);
9d65a1bb 8584 }
8585 } else {
8586 /* Parent */
8587 if (childpid == -1) {
8588 redisLog(REDIS_WARNING,
8589 "Can't rewrite append only file in background: fork: %s",
8590 strerror(errno));
8591 return REDIS_ERR;
8592 }
8593 redisLog(REDIS_NOTICE,
8594 "Background append only file rewriting started by pid %d",childpid);
8595 server.bgrewritechildpid = childpid;
884d4b39 8596 updateDictResizePolicy();
85a83172 8597 /* We set appendseldb to -1 in order to force the next call to the
8598 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8599 * accumulated by the parent into server.bgrewritebuf will start
8600 * with a SELECT statement and it will be safe to merge. */
8601 server.appendseldb = -1;
9d65a1bb 8602 return REDIS_OK;
8603 }
8604 return REDIS_OK; /* unreached */
8605}
8606
8607static void bgrewriteaofCommand(redisClient *c) {
8608 if (server.bgrewritechildpid != -1) {
8609 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8610 return;
8611 }
8612 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 8613 char *status = "+Background append only file rewriting started\r\n";
8614 addReplySds(c,sdsnew(status));
9d65a1bb 8615 } else {
8616 addReply(c,shared.err);
8617 }
8618}
8619
/* Remove the temp file "temp-rewriteaof-bg-<childpid>.aof" from the current
 * directory. Errors from unlink(2) are ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8626
996cb5f7 8627/* Virtual Memory is composed mainly of two subsystems:
8628 * - Blocking Virtual Memory
8629 * - Threaded Virtual Memory I/O
8630 * The two parts are not fully decoupled, but functions are split among two
8631 * different sections of the source code (delimited by comments) in order to
8632 * make more clear what functionality is about the blocking VM and what about
8633 * the threaded (not blocking) VM.
8634 *
8635 * Redis VM design:
8636 *
8637 * Redis VM is a blocking VM (one that blocks reading swapped values from
8638 * disk into memory when a value swapped out is needed in memory) that is made
8639 * unblocking by trying to examine the command argument vector in order to
8640 * load in background values that will likely be needed in order to exec
8641 * the command. The command is executed only once all the relevant keys
8642 * are loaded into memory.
8643 *
8644 * This basically is almost as simple as a blocking VM, but almost as parallel
8645 * as a fully non-blocking VM.
8646 */
8647
8648/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 8649
75680a3c 8650static void vmInit(void) {
8651 off_t totsize;
996cb5f7 8652 int pipefds[2];
bcaa7a4f 8653 size_t stacksize;
8b5bb414 8654 struct flock fl;
75680a3c 8655
4ad37480 8656 if (server.vm_max_threads != 0)
8657 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8658
054e426d 8659 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8b5bb414 8660 /* Try to open the old swap file, otherwise create it */
6fa987e3 8661 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8662 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8663 }
75680a3c 8664 if (server.vm_fp == NULL) {
6fa987e3 8665 redisLog(REDIS_WARNING,
8b5bb414 8666 "Can't open the swap file: %s. Exiting.",
6fa987e3 8667 strerror(errno));
75680a3c 8668 exit(1);
8669 }
8670 server.vm_fd = fileno(server.vm_fp);
8b5bb414 8671 /* Lock the swap file for writing, this is useful in order to avoid
8672 * another instance to use the same swap file for a config error. */
8673 fl.l_type = F_WRLCK;
8674 fl.l_whence = SEEK_SET;
8675 fl.l_start = fl.l_len = 0;
8676 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8677 redisLog(REDIS_WARNING,
8678 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8679 exit(1);
8680 }
8681 /* Initialize */
75680a3c 8682 server.vm_next_page = 0;
8683 server.vm_near_pages = 0;
7d98e08c 8684 server.vm_stats_used_pages = 0;
8685 server.vm_stats_swapped_objects = 0;
8686 server.vm_stats_swapouts = 0;
8687 server.vm_stats_swapins = 0;
75680a3c 8688 totsize = server.vm_pages*server.vm_page_size;
8689 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8690 if (ftruncate(server.vm_fd,totsize) == -1) {
8691 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8692 strerror(errno));
8693 exit(1);
8694 } else {
8695 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8696 }
7d30035d 8697 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 8698 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 8699 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 8700 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 8701
996cb5f7 8702 /* Initialize threaded I/O (used by Virtual Memory) */
8703 server.io_newjobs = listCreate();
8704 server.io_processing = listCreate();
8705 server.io_processed = listCreate();
d5d55fc3 8706 server.io_ready_clients = listCreate();
92f8e882 8707 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 8708 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8709 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 8710 server.io_active_threads = 0;
996cb5f7 8711 if (pipe(pipefds) == -1) {
8712 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8713 ,strerror(errno));
8714 exit(1);
8715 }
8716 server.io_ready_pipe_read = pipefds[0];
8717 server.io_ready_pipe_write = pipefds[1];
8718 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 8719 /* LZF requires a lot of stack */
8720 pthread_attr_init(&server.io_threads_attr);
8721 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8722 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8723 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 8724 /* Listen for events in the threaded I/O pipe */
8725 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8726 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8727 oom("creating file event");
75680a3c 8728}
8729
06224fec 8730/* Mark the page as used */
8731static void vmMarkPageUsed(off_t page) {
8732 off_t byte = page/8;
8733 int bit = page&7;
970e10bb 8734 redisAssert(vmFreePage(page) == 1);
06224fec 8735 server.vm_bitmap[byte] |= 1<<bit;
8736}
8737
8738/* Mark N contiguous pages as used, with 'page' being the first. */
8739static void vmMarkPagesUsed(off_t page, off_t count) {
8740 off_t j;
8741
8742 for (j = 0; j < count; j++)
7d30035d 8743 vmMarkPageUsed(page+j);
7d98e08c 8744 server.vm_stats_used_pages += count;
7c775e09 8745 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8746 (long long)count, (long long)page);
06224fec 8747}
8748
8749/* Mark the page as free */
8750static void vmMarkPageFree(off_t page) {
8751 off_t byte = page/8;
8752 int bit = page&7;
970e10bb 8753 redisAssert(vmFreePage(page) == 0);
06224fec 8754 server.vm_bitmap[byte] &= ~(1<<bit);
8755}
8756
8757/* Mark N contiguous pages as free, with 'page' being the first. */
8758static void vmMarkPagesFree(off_t page, off_t count) {
8759 off_t j;
8760
8761 for (j = 0; j < count; j++)
7d30035d 8762 vmMarkPageFree(page+j);
7d98e08c 8763 server.vm_stats_used_pages -= count;
7c775e09 8764 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8765 (long long)count, (long long)page);
06224fec 8766}
8767
8768/* Test if the page is free */
8769static int vmFreePage(off_t page) {
8770 off_t byte = page/8;
8771 int bit = page&7;
7d30035d 8772 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 8773}
8774
8775/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 8776 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 8777 * REDIS_ERR is returned.
06224fec 8778 *
8779 * This function uses a simple algorithm: we try to allocate
8780 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8781 * again from the start of the swap file searching for free spaces.
8782 *
8783 * If it looks pretty clear that there are no free pages near our offset
8784 * we try to find less populated places doing a forward jump of
8785 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8786 * without hurry, and then we jump again and so forth...
e0a62c7f 8787 *
06224fec 8788 * This function can be improved using a free list to avoid to guess
8789 * too much, since we could collect data about freed pages.
8790 *
8791 * note: I implemented this function just after watching an episode of
8792 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8793 */
c7df85a4 8794static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 8795 off_t base, offset = 0, since_jump = 0, numfree = 0;
8796
8797 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8798 server.vm_near_pages = 0;
8799 server.vm_next_page = 0;
8800 }
8801 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8802 base = server.vm_next_page;
8803
8804 while(offset < server.vm_pages) {
8805 off_t this = base+offset;
8806
8807 /* If we overflow, restart from page zero */
8808 if (this >= server.vm_pages) {
8809 this -= server.vm_pages;
8810 if (this == 0) {
8811 /* Just overflowed, what we found on tail is no longer
8812 * interesting, as it's no longer contiguous. */
8813 numfree = 0;
8814 }
8815 }
8816 if (vmFreePage(this)) {
8817 /* This is a free page */
8818 numfree++;
8819 /* Already got N free pages? Return to the caller, with success */
8820 if (numfree == n) {
7d30035d 8821 *first = this-(n-1);
8822 server.vm_next_page = this+1;
7c775e09 8823 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 8824 return REDIS_OK;
06224fec 8825 }
8826 } else {
8827 /* The current one is not a free page */
8828 numfree = 0;
8829 }
8830
8831 /* Fast-forward if the current page is not free and we already
8832 * searched enough near this place. */
8833 since_jump++;
8834 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8835 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8836 since_jump = 0;
8837 /* Note that even if we rewind after the jump, we are don't need
8838 * to make sure numfree is set to zero as we only jump *if* it
8839 * is set to zero. */
8840 } else {
8841 /* Otherwise just check the next page */
8842 offset++;
8843 }
8844 }
3a66edc7 8845 return REDIS_ERR;
8846}
8847
a5819310 8848/* Write the specified object at the specified page of the swap file */
8849static int vmWriteObjectOnSwap(robj *o, off_t page) {
8850 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8851 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8852 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8853 redisLog(REDIS_WARNING,
9ebed7cf 8854 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 8855 strerror(errno));
8856 return REDIS_ERR;
8857 }
8858 rdbSaveObject(server.vm_fp,o);
ba76a8f9 8859 fflush(server.vm_fp);
a5819310 8860 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8861 return REDIS_OK;
8862}
8863
3a66edc7 8864/* Swap the 'val' object relative to 'key' into disk. Store all the information
8865 * needed to later retrieve the object into the key object.
8866 * If we can't find enough contiguous empty pages to swap the object on disk
8867 * REDIS_ERR is returned. */
a69a0c9c 8868static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 8869 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 8870 off_t page;
8871
8872 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 8873 assert(key->refcount == 1);
3a66edc7 8874 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 8875 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 8876 key->vm.page = page;
8877 key->vm.usedpages = pages;
8878 key->storage = REDIS_VM_SWAPPED;
d894161b 8879 key->vtype = val->type;
3a66edc7 8880 decrRefCount(val); /* Deallocate the object from memory. */
8881 vmMarkPagesUsed(page,pages);
7d30035d 8882 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8883 (unsigned char*) key->ptr,
8884 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 8885 server.vm_stats_swapped_objects++;
8886 server.vm_stats_swapouts++;
3a66edc7 8887 return REDIS_OK;
8888}
8889
a5819310 8890static robj *vmReadObjectFromSwap(off_t page, int type) {
8891 robj *o;
3a66edc7 8892
a5819310 8893 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8894 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 8895 redisLog(REDIS_WARNING,
d5d55fc3 8896 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 8897 strerror(errno));
478c2c6f 8898 _exit(1);
3a66edc7 8899 }
a5819310 8900 o = rdbLoadObject(type,server.vm_fp);
8901 if (o == NULL) {
d5d55fc3 8902 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 8903 _exit(1);
3a66edc7 8904 }
a5819310 8905 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8906 return o;
8907}
8908
8909/* Load the value object relative to the 'key' object from swap to memory.
8910 * The newly allocated object is returned.
8911 *
8912 * If preview is true the unserialized object is returned to the caller but
8913 * no changes are made to the key object, nor the pages are marked as freed */
8914static robj *vmGenericLoadObject(robj *key, int preview) {
8915 robj *val;
8916
d5d55fc3 8917 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 8918 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 8919 if (!preview) {
8920 key->storage = REDIS_VM_MEMORY;
8921 key->vm.atime = server.unixtime;
8922 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8923 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8924 (unsigned char*) key->ptr);
7d98e08c 8925 server.vm_stats_swapped_objects--;
38aba9a1 8926 } else {
8927 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8928 (unsigned char*) key->ptr);
7e69548d 8929 }
7d98e08c 8930 server.vm_stats_swapins++;
3a66edc7 8931 return val;
06224fec 8932}
8933
7e69548d 8934/* Plain object loading, from swap to memory */
8935static robj *vmLoadObject(robj *key) {
996cb5f7 8936 /* If we are loading the object in background, stop it, we
8937 * need to load this object synchronously ASAP. */
8938 if (key->storage == REDIS_VM_LOADING)
8939 vmCancelThreadedIOJob(key);
7e69548d 8940 return vmGenericLoadObject(key,0);
8941}
8942
8943/* Just load the value on disk, without to modify the key.
8944 * This is useful when we want to perform some operation on the value
8945 * without to really bring it from swap to memory, like while saving the
8946 * dataset or rewriting the append only log. */
8947static robj *vmPreviewObject(robj *key) {
8948 return vmGenericLoadObject(key,1);
8949}
8950
4ef8de8a 8951/* How a good candidate is this object for swapping?
8952 * The better candidate it is, the greater the returned value.
8953 *
8954 * Currently we try to perform a fast estimation of the object size in
8955 * memory, and combine it with aging informations.
8956 *
8957 * Basically swappability = idle-time * log(estimated size)
8958 *
8959 * Bigger objects are preferred over smaller objects, but not
8960 * proportionally, this is why we use the logarithm. This algorithm is
8961 * just a first try and will probably be tuned later. */
8962static double computeObjectSwappability(robj *o) {
8963 time_t age = server.unixtime - o->vm.atime;
8964 long asize = 0;
8965 list *l;
8966 dict *d;
8967 struct dictEntry *de;
8968 int z;
8969
8970 if (age <= 0) return 0;
8971 switch(o->type) {
8972 case REDIS_STRING:
8973 if (o->encoding != REDIS_ENCODING_RAW) {
8974 asize = sizeof(*o);
8975 } else {
8976 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8977 }
8978 break;
8979 case REDIS_LIST:
8980 l = o->ptr;
8981 listNode *ln = listFirst(l);
8982
8983 asize = sizeof(list);
8984 if (ln) {
8985 robj *ele = ln->value;
8986 long elesize;
8987
8988 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8989 (sizeof(*o)+sdslen(ele->ptr)) :
8990 sizeof(*o);
8991 asize += (sizeof(listNode)+elesize)*listLength(l);
8992 }
8993 break;
8994 case REDIS_SET:
8995 case REDIS_ZSET:
8996 z = (o->type == REDIS_ZSET);
8997 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8998
8999 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9000 if (z) asize += sizeof(zset)-sizeof(dict);
9001 if (dictSize(d)) {
9002 long elesize;
9003 robj *ele;
9004
9005 de = dictGetRandomKey(d);
9006 ele = dictGetEntryKey(de);
9007 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9008 (sizeof(*o)+sdslen(ele->ptr)) :
9009 sizeof(*o);
9010 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9011 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9012 }
9013 break;
a97b9060 9014 case REDIS_HASH:
9015 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9016 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9017 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9018 unsigned int klen, vlen;
9019 unsigned char *key, *val;
9020
9021 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9022 klen = 0;
9023 vlen = 0;
9024 }
9025 asize = len*(klen+vlen+3);
9026 } else if (o->encoding == REDIS_ENCODING_HT) {
9027 d = o->ptr;
9028 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9029 if (dictSize(d)) {
9030 long elesize;
9031 robj *ele;
9032
9033 de = dictGetRandomKey(d);
9034 ele = dictGetEntryKey(de);
9035 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9036 (sizeof(*o)+sdslen(ele->ptr)) :
9037 sizeof(*o);
9038 ele = dictGetEntryVal(de);
9039 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9040 (sizeof(*o)+sdslen(ele->ptr)) :
9041 sizeof(*o);
9042 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9043 }
9044 }
9045 break;
4ef8de8a 9046 }
c8c72447 9047 return (double)age*log(1+asize);
4ef8de8a 9048}
9049
9050/* Try to swap an object that's a good candidate for swapping.
9051 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 9052 * to swap any object at all.
9053 *
9054 * If 'usethreaded' is true, Redis will try to swap the object in background
9055 * using I/O threads. */
9056static int vmSwapOneObject(int usethreads) {
4ef8de8a 9057 int j, i;
9058 struct dictEntry *best = NULL;
9059 double best_swappability = 0;
b9bc0eef 9060 redisDb *best_db = NULL;
4ef8de8a 9061 robj *key, *val;
9062
9063 for (j = 0; j < server.dbnum; j++) {
9064 redisDb *db = server.db+j;
b72f6a4b 9065 /* Why maxtries is set to 100?
9066 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9067 * are swappable objects */
b0d8747d 9068 int maxtries = 100;
4ef8de8a 9069
9070 if (dictSize(db->dict) == 0) continue;
9071 for (i = 0; i < 5; i++) {
9072 dictEntry *de;
9073 double swappability;
9074
e3cadb8a 9075 if (maxtries) maxtries--;
4ef8de8a 9076 de = dictGetRandomKey(db->dict);
9077 key = dictGetEntryKey(de);
9078 val = dictGetEntryVal(de);
1064ef87 9079 /* Only swap objects that are currently in memory.
9080 *
9081 * Also don't swap shared objects if threaded VM is on, as we
9082 * try to ensure that the main thread does not touch the
9083 * object while the I/O thread is using it, but we can't
9084 * control other keys without adding additional mutex. */
9085 if (key->storage != REDIS_VM_MEMORY ||
9086 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 9087 if (maxtries) i--; /* don't count this try */
9088 continue;
9089 }
4ef8de8a 9090 swappability = computeObjectSwappability(val);
9091 if (!best || swappability > best_swappability) {
9092 best = de;
9093 best_swappability = swappability;
b9bc0eef 9094 best_db = db;
4ef8de8a 9095 }
9096 }
9097 }
7c775e09 9098 if (best == NULL) return REDIS_ERR;
4ef8de8a 9099 key = dictGetEntryKey(best);
9100 val = dictGetEntryVal(best);
9101
e3cadb8a 9102 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 9103 key->ptr, best_swappability);
9104
9105 /* Unshare the key if needed */
9106 if (key->refcount > 1) {
9107 robj *newkey = dupStringObject(key);
9108 decrRefCount(key);
9109 key = dictGetEntryKey(best) = newkey;
9110 }
9111 /* Swap it */
a69a0c9c 9112 if (usethreads) {
b9bc0eef 9113 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 9114 return REDIS_OK;
9115 } else {
a69a0c9c 9116 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9117 dictGetEntryVal(best) = NULL;
9118 return REDIS_OK;
9119 } else {
9120 return REDIS_ERR;
9121 }
4ef8de8a 9122 }
9123}
9124
/* Swap out one object synchronously. Returns what vmSwapOneObject()
 * returns when called with threading disabled. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9128
/* Swap out one object using the I/O threads. Returns what
 * vmSwapOneObject() returns when called with threading enabled. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9132
7e69548d 9133/* Return true if it's safe to swap out objects in a given moment.
9134 * Basically we don't want to swap objects out while there is a BGSAVE
9135 * or a BGAEOREWRITE running in backgroud. */
9136static int vmCanSwapOut(void) {
9137 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9138}
9139
1b03836c 9140/* Delete a key if swapped. Returns 1 if the key was found, was swapped
9141 * and was deleted. Otherwise 0 is returned. */
9142static int deleteIfSwapped(redisDb *db, robj *key) {
9143 dictEntry *de;
9144 robj *foundkey;
9145
9146 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9147 foundkey = dictGetEntryKey(de);
9148 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9149 deleteKey(db,key);
9150 return 1;
9151}
9152
996cb5f7 9153/* =================== Virtual Memory - Threaded I/O ======================= */
9154
b9bc0eef 9155static void freeIOJob(iojob *j) {
d5d55fc3 9156 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9157 j->type == REDIS_IOJOB_DO_SWAP ||
9158 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 9159 decrRefCount(j->val);
78ebe4c8 9160 /* We don't decrRefCount the j->key field as we did't incremented
9161 * the count creating IO Jobs. This is because the key field here is
9162 * just used as an indentifier and if a key is removed the Job should
9163 * never be touched again. */
b9bc0eef 9164 zfree(j);
9165}
9166
996cb5f7 9167/* Every time a thread finished a Job, it writes a byte into the write side
9168 * of an unix pipe in order to "awake" the main thread, and this function
9169 * is called. */
9170static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9171 int mask)
9172{
9173 char buf[1];
b0d8747d 9174 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 9175 REDIS_NOTUSED(el);
9176 REDIS_NOTUSED(mask);
9177 REDIS_NOTUSED(privdata);
9178
9179 /* For every byte we read in the read side of the pipe, there is one
9180 * I/O job completed to process. */
9181 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 9182 iojob *j;
9183 listNode *ln;
9184 robj *key;
9185 struct dictEntry *de;
9186
996cb5f7 9187 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 9188
9189 /* Get the processed element (the oldest one) */
9190 lockThreadedIO();
1064ef87 9191 assert(listLength(server.io_processed) != 0);
f6c0bba8 9192 if (toprocess == -1) {
9193 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9194 if (toprocess <= 0) toprocess = 1;
9195 }
b9bc0eef 9196 ln = listFirst(server.io_processed);
9197 j = ln->value;
9198 listDelNode(server.io_processed,ln);
9199 unlockThreadedIO();
9200 /* If this job is marked as canceled, just ignore it */
9201 if (j->canceled) {
9202 freeIOJob(j);
9203 continue;
9204 }
9205 /* Post process it in the main thread, as there are things we
9206 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 9207 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 9208 de = dictFind(j->db->dict,j->key);
9209 assert(de != NULL);
9210 key = dictGetEntryKey(de);
9211 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9212 redisDb *db;
9213
b9bc0eef 9214 /* Key loaded, bring it at home */
9215 key->storage = REDIS_VM_MEMORY;
9216 key->vm.atime = server.unixtime;
9217 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9218 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9219 (unsigned char*) key->ptr);
9220 server.vm_stats_swapped_objects--;
9221 server.vm_stats_swapins++;
d5d55fc3 9222 dictGetEntryVal(de) = j->val;
9223 incrRefCount(j->val);
9224 db = j->db;
b9bc0eef 9225 freeIOJob(j);
d5d55fc3 9226 /* Handle clients waiting for this key to be loaded. */
9227 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 9228 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9229 /* Now we know the amount of pages required to swap this object.
9230 * Let's find some space for it, and queue this task again
9231 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 9232 if (!vmCanSwapOut() ||
9233 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9234 {
9235 /* Ooops... no space or we can't swap as there is
9236 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 9237 freeIOJob(j);
054e426d 9238 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 9239 } else {
c7df85a4 9240 /* Note that we need to mark this pages as used now,
9241 * if the job will be canceled, we'll mark them as freed
9242 * again. */
9243 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 9244 j->type = REDIS_IOJOB_DO_SWAP;
9245 lockThreadedIO();
9246 queueIOJob(j);
9247 unlockThreadedIO();
9248 }
9249 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9250 robj *val;
9251
9252 /* Key swapped. We can finally free some memory. */
6c96ba7d 9253 if (key->storage != REDIS_VM_SWAPPING) {
9254 printf("key->storage: %d\n",key->storage);
9255 printf("key->name: %s\n",(char*)key->ptr);
9256 printf("key->refcount: %d\n",key->refcount);
9257 printf("val: %p\n",(void*)j->val);
9258 printf("val->type: %d\n",j->val->type);
9259 printf("val->ptr: %s\n",(char*)j->val->ptr);
9260 }
9261 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 9262 val = dictGetEntryVal(de);
9263 key->vm.page = j->page;
9264 key->vm.usedpages = j->pages;
9265 key->storage = REDIS_VM_SWAPPED;
9266 key->vtype = j->val->type;
9267 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 9268 dictGetEntryVal(de) = NULL;
b9bc0eef 9269 redisLog(REDIS_DEBUG,
9270 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9271 (unsigned char*) key->ptr,
9272 (unsigned long long) j->page, (unsigned long long) j->pages);
9273 server.vm_stats_swapped_objects++;
9274 server.vm_stats_swapouts++;
9275 freeIOJob(j);
f11b8647 9276 /* Put a few more swap requests in queue if we are still
9277 * out of memory */
b0d8747d 9278 if (trytoswap && vmCanSwapOut() &&
9279 zmalloc_used_memory() > server.vm_max_memory)
9280 {
f11b8647 9281 int more = 1;
9282 while(more) {
9283 lockThreadedIO();
9284 more = listLength(server.io_newjobs) <
9285 (unsigned) server.vm_max_threads;
9286 unlockThreadedIO();
9287 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 9288 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9289 trytoswap = 0;
9290 break;
9291 }
f11b8647 9292 }
9293 }
b9bc0eef 9294 }
c953f24b 9295 processed++;
f6c0bba8 9296 if (processed == toprocess) return;
996cb5f7 9297 }
9298 if (retval < 0 && errno != EAGAIN) {
9299 redisLog(REDIS_WARNING,
9300 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9301 strerror(errno));
9302 }
9303}
9304
9305static void lockThreadedIO(void) {
9306 pthread_mutex_lock(&server.io_mutex);
9307}
9308
9309static void unlockThreadedIO(void) {
9310 pthread_mutex_unlock(&server.io_mutex);
9311}
9312
9313/* Remove the specified object from the threaded I/O queue if still not
9314 * processed, otherwise make sure to flag it as canceled. */
9315static void vmCancelThreadedIOJob(robj *o) {
9316 list *lists[3] = {
6c96ba7d 9317 server.io_newjobs, /* 0 */
9318 server.io_processing, /* 1 */
9319 server.io_processed /* 2 */
996cb5f7 9320 };
9321 int i;
9322
9323 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 9324again:
996cb5f7 9325 lockThreadedIO();
9326 /* Search for a matching key in one of the queues */
9327 for (i = 0; i < 3; i++) {
9328 listNode *ln;
c7df85a4 9329 listIter li;
996cb5f7 9330
c7df85a4 9331 listRewind(lists[i],&li);
9332 while ((ln = listNext(&li)) != NULL) {
996cb5f7 9333 iojob *job = ln->value;
9334
6c96ba7d 9335 if (job->canceled) continue; /* Skip this, already canceled. */
78ebe4c8 9336 if (job->key == o) {
970e10bb 9337 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9338 (void*)job, (char*)o->ptr, job->type, i);
427a2153 9339 /* Mark the pages as free since the swap didn't happened
9340 * or happened but is now discarded. */
970e10bb 9341 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 9342 vmMarkPagesFree(job->page,job->pages);
9343 /* Cancel the job. It depends on the list the job is
9344 * living in. */
996cb5f7 9345 switch(i) {
9346 case 0: /* io_newjobs */
6c96ba7d 9347 /* If the job was yet not processed the best thing to do
996cb5f7 9348 * is to remove it from the queue at all */
6c96ba7d 9349 freeIOJob(job);
996cb5f7 9350 listDelNode(lists[i],ln);
9351 break;
9352 case 1: /* io_processing */
d5d55fc3 9353 /* Oh Shi- the thread is messing with the Job:
9354 *
9355 * Probably it's accessing the object if this is a
9356 * PREPARE_SWAP or DO_SWAP job.
9357 * If it's a LOAD job it may be reading from disk and
9358 * if we don't wait for the job to terminate before to
9359 * cancel it, maybe in a few microseconds data can be
9360 * corrupted in this pages. So the short story is:
9361 *
9362 * Better to wait for the job to move into the
9363 * next queue (processed)... */
9364
9365 /* We try again and again until the job is completed. */
9366 unlockThreadedIO();
9367 /* But let's wait some time for the I/O thread
9368 * to finish with this job. After all this condition
9369 * should be very rare. */
9370 usleep(1);
9371 goto again;
996cb5f7 9372 case 2: /* io_processed */
2e111efe 9373 /* The job was already processed, that's easy...
9374 * just mark it as canceled so that we'll ignore it
9375 * when processing completed jobs. */
996cb5f7 9376 job->canceled = 1;
9377 break;
9378 }
c7df85a4 9379 /* Finally we have to adjust the storage type of the object
9380 * in order to "UNDO" the operaiton. */
996cb5f7 9381 if (o->storage == REDIS_VM_LOADING)
9382 o->storage = REDIS_VM_SWAPPED;
9383 else if (o->storage == REDIS_VM_SWAPPING)
9384 o->storage = REDIS_VM_MEMORY;
9385 unlockThreadedIO();
9386 return;
9387 }
9388 }
9389 }
9390 unlockThreadedIO();
9391 assert(1 != 1); /* We should never reach this */
9392}
9393
b9bc0eef 9394static void *IOThreadEntryPoint(void *arg) {
9395 iojob *j;
9396 listNode *ln;
9397 REDIS_NOTUSED(arg);
9398
9399 pthread_detach(pthread_self());
9400 while(1) {
9401 /* Get a new job to process */
9402 lockThreadedIO();
9403 if (listLength(server.io_newjobs) == 0) {
9404 /* No new jobs in queue, exit. */
9ebed7cf 9405 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9406 (long) pthread_self());
b9bc0eef 9407 server.io_active_threads--;
9408 unlockThreadedIO();
9409 return NULL;
9410 }
9411 ln = listFirst(server.io_newjobs);
9412 j = ln->value;
9413 listDelNode(server.io_newjobs,ln);
9414 /* Add the job in the processing queue */
9415 j->thread = pthread_self();
9416 listAddNodeTail(server.io_processing,j);
9417 ln = listLast(server.io_processing); /* We use ln later to remove it */
9418 unlockThreadedIO();
9ebed7cf 9419 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9420 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 9421
9422 /* Process the Job */
9423 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9424 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 9425 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9426 FILE *fp = fopen("/dev/null","w+");
9427 j->pages = rdbSavedObjectPages(j->val,fp);
9428 fclose(fp);
9429 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 9430 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9431 j->canceled = 1;
b9bc0eef 9432 }
9433
9434 /* Done: insert the job into the processed queue */
9ebed7cf 9435 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9436 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 9437 lockThreadedIO();
9438 listDelNode(server.io_processing,ln);
9439 listAddNodeTail(server.io_processed,j);
9440 unlockThreadedIO();
e0a62c7f 9441
b9bc0eef 9442 /* Signal the main thread there is new stuff to process */
9443 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9444 }
9445 return NULL; /* never reached */
9446}
9447
9448static void spawnIOThread(void) {
9449 pthread_t thread;
478c2c6f 9450 sigset_t mask, omask;
a97b9060 9451 int err;
b9bc0eef 9452
478c2c6f 9453 sigemptyset(&mask);
9454 sigaddset(&mask,SIGCHLD);
9455 sigaddset(&mask,SIGHUP);
9456 sigaddset(&mask,SIGPIPE);
9457 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 9458 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9459 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9460 strerror(err));
9461 usleep(1000000);
9462 }
478c2c6f 9463 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 9464 server.io_active_threads++;
9465}
9466
4ee9488d 9467/* We need to wait for the last thread to exit before we are able to
9468 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 9469static void waitEmptyIOJobsQueue(void) {
4ee9488d 9470 while(1) {
76b7233a 9471 int io_processed_len;
9472
4ee9488d 9473 lockThreadedIO();
054e426d 9474 if (listLength(server.io_newjobs) == 0 &&
9475 listLength(server.io_processing) == 0 &&
9476 server.io_active_threads == 0)
9477 {
4ee9488d 9478 unlockThreadedIO();
9479 return;
9480 }
76b7233a 9481 /* While waiting for empty jobs queue condition we post-process some
9482 * finshed job, as I/O threads may be hanging trying to write against
9483 * the io_ready_pipe_write FD but there are so much pending jobs that
9484 * it's blocking. */
9485 io_processed_len = listLength(server.io_processed);
4ee9488d 9486 unlockThreadedIO();
76b7233a 9487 if (io_processed_len) {
9488 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9489 usleep(1000); /* 1 millisecond */
9490 } else {
9491 usleep(10000); /* 10 milliseconds */
9492 }
4ee9488d 9493 }
9494}
9495
054e426d 9496static void vmReopenSwapFile(void) {
478c2c6f 9497 /* Note: we don't close the old one as we are in the child process
9498 * and don't want to mess at all with the original file object. */
054e426d 9499 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9500 if (server.vm_fp == NULL) {
9501 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9502 server.vm_swap_file);
478c2c6f 9503 _exit(1);
054e426d 9504 }
9505 server.vm_fd = fileno(server.vm_fp);
9506}
9507
b9bc0eef 9508/* This function must be called while with threaded IO locked */
9509static void queueIOJob(iojob *j) {
6c96ba7d 9510 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9511 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 9512 listAddNodeTail(server.io_newjobs,j);
9513 if (server.io_active_threads < server.vm_max_threads)
9514 spawnIOThread();
9515}
9516
9517static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9518 iojob *j;
e0a62c7f 9519
b9bc0eef 9520 assert(key->storage == REDIS_VM_MEMORY);
9521 assert(key->refcount == 1);
9522
9523 j = zmalloc(sizeof(*j));
9524 j->type = REDIS_IOJOB_PREPARE_SWAP;
9525 j->db = db;
78ebe4c8 9526 j->key = key;
b9bc0eef 9527 j->val = val;
9528 incrRefCount(val);
9529 j->canceled = 0;
9530 j->thread = (pthread_t) -1;
f11b8647 9531 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 9532
9533 lockThreadedIO();
9534 queueIOJob(j);
9535 unlockThreadedIO();
9536 return REDIS_OK;
9537}
9538
b0d8747d 9539/* ============ Virtual Memory - Blocking clients on missing keys =========== */
9540
d5d55fc3 9541/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9542 * If there is not already a job loading the key, it is craeted.
9543 * The key is added to the io_keys list in the client structure, and also
9544 * in the hash table mapping swapped keys to waiting clients, that is,
9545 * server.io_waited_keys. */
9546static int waitForSwappedKey(redisClient *c, robj *key) {
9547 struct dictEntry *de;
9548 robj *o;
9549 list *l;
9550
9551 /* If the key does not exist or is already in RAM we don't need to
9552 * block the client at all. */
9553 de = dictFind(c->db->dict,key);
9554 if (de == NULL) return 0;
9555 o = dictGetEntryKey(de);
9556 if (o->storage == REDIS_VM_MEMORY) {
9557 return 0;
9558 } else if (o->storage == REDIS_VM_SWAPPING) {
9559 /* We were swapping the key, undo it! */
9560 vmCancelThreadedIOJob(o);
9561 return 0;
9562 }
e0a62c7f 9563
d5d55fc3 9564 /* OK: the key is either swapped, or being loaded just now. */
9565
9566 /* Add the key to the list of keys this client is waiting for.
9567 * This maps clients to keys they are waiting for. */
9568 listAddNodeTail(c->io_keys,key);
9569 incrRefCount(key);
9570
9571 /* Add the client to the swapped keys => clients waiting map. */
9572 de = dictFind(c->db->io_keys,key);
9573 if (de == NULL) {
9574 int retval;
9575
9576 /* For every key we take a list of clients blocked for it */
9577 l = listCreate();
9578 retval = dictAdd(c->db->io_keys,key,l);
9579 incrRefCount(key);
9580 assert(retval == DICT_OK);
9581 } else {
9582 l = dictGetEntryVal(de);
9583 }
9584 listAddNodeTail(l,c);
9585
9586 /* Are we already loading the key from disk? If not create a job */
9587 if (o->storage == REDIS_VM_SWAPPED) {
9588 iojob *j;
9589
9590 o->storage = REDIS_VM_LOADING;
9591 j = zmalloc(sizeof(*j));
9592 j->type = REDIS_IOJOB_LOAD;
9593 j->db = c->db;
78ebe4c8 9594 j->key = o;
d5d55fc3 9595 j->key->vtype = o->vtype;
9596 j->page = o->vm.page;
9597 j->val = NULL;
9598 j->canceled = 0;
9599 j->thread = (pthread_t) -1;
9600 lockThreadedIO();
9601 queueIOJob(j);
9602 unlockThreadedIO();
9603 }
9604 return 1;
9605}
9606
76583ea4
PN
9607/* Preload keys needed for the ZUNION and ZINTER commands. */
9608static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9609 int i, num;
9610 num = atoi(c->argv[2]->ptr);
9611 for (i = 0; i < num; i++) {
9612 waitForSwappedKey(c,c->argv[3+i]);
9613 }
9614}
9615
b0d8747d 9616/* Is this client attempting to run a command against swapped keys?
d5d55fc3 9617 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 9618 *
d5d55fc3 9619 * The important idea about this function is that it can fail! If keys will
9620 * still be swapped when the client is resumed, this key lookups will
9621 * just block loading keys from disk. In practical terms this should only
9622 * happen with SORT BY command or if there is a bug in this function.
9623 *
9624 * Return 1 if the client is marked as blocked, 0 if the client can
9625 * continue as the keys it is going to access appear to be in memory. */
9626static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
7c775e09 9627 int j, last;
9628
76583ea4
PN
9629 if (cmd->vm_preload_proc != NULL) {
9630 cmd->vm_preload_proc(c);
9631 } else {
9632 if (cmd->vm_firstkey == 0) return 0;
9633 last = cmd->vm_lastkey;
9634 if (last < 0) last = c->argc+last;
9635 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9636 waitForSwappedKey(c,c->argv[j]);
9637 }
9638
d5d55fc3 9639 /* If the client was blocked for at least one key, mark it as blocked. */
9640 if (listLength(c->io_keys)) {
9641 c->flags |= REDIS_IO_WAIT;
9642 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9643 server.vm_blocked_clients++;
9644 return 1;
9645 } else {
9646 return 0;
9647 }
9648}
9649
9650/* Remove the 'key' from the list of blocked keys for a given client.
9651 *
9652 * The function returns 1 when there are no longer blocking keys after
9653 * the current one was removed (and the client can be unblocked). */
9654static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9655 list *l;
9656 listNode *ln;
9657 listIter li;
9658 struct dictEntry *de;
9659
9660 /* Remove the key from the list of keys this client is waiting for. */
9661 listRewind(c->io_keys,&li);
9662 while ((ln = listNext(&li)) != NULL) {
bf028098 9663 if (equalStringObjects(ln->value,key)) {
d5d55fc3 9664 listDelNode(c->io_keys,ln);
9665 break;
9666 }
9667 }
9668 assert(ln != NULL);
9669
9670 /* Remove the client form the key => waiting clients map. */
9671 de = dictFind(c->db->io_keys,key);
9672 assert(de != NULL);
9673 l = dictGetEntryVal(de);
9674 ln = listSearchKey(l,c);
9675 assert(ln != NULL);
9676 listDelNode(l,ln);
9677 if (listLength(l) == 0)
9678 dictDelete(c->db->io_keys,key);
9679
9680 return listLength(c->io_keys) == 0;
9681}
9682
9683static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9684 struct dictEntry *de;
9685 list *l;
9686 listNode *ln;
9687 int len;
9688
9689 de = dictFind(db->io_keys,key);
9690 if (!de) return;
9691
9692 l = dictGetEntryVal(de);
9693 len = listLength(l);
9694 /* Note: we can't use something like while(listLength(l)) as the list
9695 * can be freed by the calling function when we remove the last element. */
9696 while (len--) {
9697 ln = listFirst(l);
9698 redisClient *c = ln->value;
9699
9700 if (dontWaitForSwappedKey(c,key)) {
9701 /* Put the client in the list of clients ready to go as we
9702 * loaded all the keys about it. */
9703 listAddNodeTail(server.io_ready_clients,c);
9704 }
9705 }
b0d8747d 9706}
b0d8747d 9707
500ece7c 9708/* =========================== Remote Configuration ========================= */
9709
9710static void configSetCommand(redisClient *c) {
9711 robj *o = getDecodedObject(c->argv[3]);
9712 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9713 zfree(server.dbfilename);
9714 server.dbfilename = zstrdup(o->ptr);
9715 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9716 zfree(server.requirepass);
9717 server.requirepass = zstrdup(o->ptr);
9718 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9719 zfree(server.masterauth);
9720 server.masterauth = zstrdup(o->ptr);
9721 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9722 server.maxmemory = strtoll(o->ptr, NULL, 10);
1b677732 9723 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
9724 if (!strcasecmp(o->ptr,"no")) {
9725 server.appendfsync = APPENDFSYNC_NO;
9726 } else if (!strcasecmp(o->ptr,"everysec")) {
9727 server.appendfsync = APPENDFSYNC_EVERYSEC;
9728 } else if (!strcasecmp(o->ptr,"always")) {
9729 server.appendfsync = APPENDFSYNC_ALWAYS;
9730 } else {
9731 goto badfmt;
9732 }
a34e0a25 9733 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
9734 int vlen, j;
9735 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
9736
9737 /* Perform sanity check before setting the new config:
9738 * - Even number of args
9739 * - Seconds >= 1, changes >= 0 */
9740 if (vlen & 1) {
9741 sdsfreesplitres(v,vlen);
9742 goto badfmt;
9743 }
9744 for (j = 0; j < vlen; j++) {
9745 char *eptr;
9746 long val;
9747
9748 val = strtoll(v[j], &eptr, 10);
9749 if (eptr[0] != '\0' ||
9750 ((j & 1) == 0 && val < 1) ||
9751 ((j & 1) == 1 && val < 0)) {
9752 sdsfreesplitres(v,vlen);
9753 goto badfmt;
9754 }
9755 }
9756 /* Finally set the new config */
9757 resetServerSaveParams();
9758 for (j = 0; j < vlen; j += 2) {
9759 time_t seconds;
9760 int changes;
9761
9762 seconds = strtoll(v[j],NULL,10);
9763 changes = strtoll(v[j+1],NULL,10);
9764 appendServerSaveParams(seconds, changes);
9765 }
9766 sdsfreesplitres(v,vlen);
500ece7c 9767 } else {
9768 addReplySds(c,sdscatprintf(sdsempty(),
9769 "-ERR not supported CONFIG parameter %s\r\n",
9770 (char*)c->argv[2]->ptr));
9771 decrRefCount(o);
9772 return;
9773 }
9774 decrRefCount(o);
9775 addReply(c,shared.ok);
a34e0a25 9776 return;
9777
9778badfmt: /* Bad format errors */
9779 addReplySds(c,sdscatprintf(sdsempty(),
9780 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
9781 (char*)o->ptr,
9782 (char*)c->argv[2]->ptr));
9783 decrRefCount(o);
500ece7c 9784}
9785
9786static void configGetCommand(redisClient *c) {
9787 robj *o = getDecodedObject(c->argv[2]);
9788 robj *lenobj = createObject(REDIS_STRING,NULL);
9789 char *pattern = o->ptr;
9790 int matches = 0;
9791
9792 addReply(c,lenobj);
9793 decrRefCount(lenobj);
9794
9795 if (stringmatch(pattern,"dbfilename",0)) {
9796 addReplyBulkCString(c,"dbfilename");
9797 addReplyBulkCString(c,server.dbfilename);
9798 matches++;
9799 }
9800 if (stringmatch(pattern,"requirepass",0)) {
9801 addReplyBulkCString(c,"requirepass");
9802 addReplyBulkCString(c,server.requirepass);
9803 matches++;
9804 }
9805 if (stringmatch(pattern,"masterauth",0)) {
9806 addReplyBulkCString(c,"masterauth");
9807 addReplyBulkCString(c,server.masterauth);
9808 matches++;
9809 }
9810 if (stringmatch(pattern,"maxmemory",0)) {
9811 char buf[128];
9812
9813 snprintf(buf,128,"%llu\n",server.maxmemory);
9814 addReplyBulkCString(c,"maxmemory");
9815 addReplyBulkCString(c,buf);
9816 matches++;
9817 }
1b677732 9818 if (stringmatch(pattern,"appendfsync",0)) {
9819 char *policy;
9820
9821 switch(server.appendfsync) {
9822 case APPENDFSYNC_NO: policy = "no"; break;
9823 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
9824 case APPENDFSYNC_ALWAYS: policy = "always"; break;
9825 default: policy = "unknown"; break; /* too harmless to panic */
9826 }
9827 addReplyBulkCString(c,"appendfsync");
9828 addReplyBulkCString(c,policy);
9829 matches++;
9830 }
a34e0a25 9831 if (stringmatch(pattern,"save",0)) {
9832 sds buf = sdsempty();
9833 int j;
9834
9835 for (j = 0; j < server.saveparamslen; j++) {
9836 buf = sdscatprintf(buf,"%ld %d",
9837 server.saveparams[j].seconds,
9838 server.saveparams[j].changes);
9839 if (j != server.saveparamslen-1)
9840 buf = sdscatlen(buf," ",1);
9841 }
9842 addReplyBulkCString(c,"save");
9843 addReplyBulkCString(c,buf);
9844 sdsfree(buf);
9845 matches++;
9846 }
500ece7c 9847 decrRefCount(o);
9848 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9849}
9850
9851static void configCommand(redisClient *c) {
9852 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9853 if (c->argc != 4) goto badarity;
9854 configSetCommand(c);
9855 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9856 if (c->argc != 3) goto badarity;
9857 configGetCommand(c);
9858 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9859 if (c->argc != 2) goto badarity;
9860 server.stat_numcommands = 0;
9861 server.stat_numconnections = 0;
9862 server.stat_expiredkeys = 0;
9863 server.stat_starttime = time(NULL);
9864 addReply(c,shared.ok);
9865 } else {
9866 addReplySds(c,sdscatprintf(sdsempty(),
9867 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9868 }
9869 return;
9870
9871badarity:
9872 addReplySds(c,sdscatprintf(sdsempty(),
9873 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9874 (char*) c->argv[1]->ptr));
9875}
9876
befec3cd 9877/* =========================== Pubsub implementation ======================== */
9878
ffc6b7f8 9879static void freePubsubPattern(void *p) {
9880 pubsubPattern *pat = p;
9881
9882 decrRefCount(pat->pattern);
9883 zfree(pat);
9884}
9885
9886static int listMatchPubsubPattern(void *a, void *b) {
9887 pubsubPattern *pa = a, *pb = b;
9888
9889 return (pa->client == pb->client) &&
bf028098 9890 (equalStringObjects(pa->pattern,pb->pattern));
ffc6b7f8 9891}
9892
9893/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9894 * 0 if the client was already subscribed to that channel. */
9895static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 9896 struct dictEntry *de;
9897 list *clients = NULL;
9898 int retval = 0;
9899
ffc6b7f8 9900 /* Add the channel to the client -> channels hash table */
9901 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 9902 retval = 1;
ffc6b7f8 9903 incrRefCount(channel);
9904 /* Add the client to the channel -> list of clients hash table */
9905 de = dictFind(server.pubsub_channels,channel);
befec3cd 9906 if (de == NULL) {
9907 clients = listCreate();
ffc6b7f8 9908 dictAdd(server.pubsub_channels,channel,clients);
9909 incrRefCount(channel);
befec3cd 9910 } else {
9911 clients = dictGetEntryVal(de);
9912 }
9913 listAddNodeTail(clients,c);
9914 }
9915 /* Notify the client */
9916 addReply(c,shared.mbulk3);
9917 addReply(c,shared.subscribebulk);
ffc6b7f8 9918 addReplyBulk(c,channel);
9919 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 9920 return retval;
9921}
9922
ffc6b7f8 9923/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9924 * 0 if the client was not subscribed to the specified channel. */
9925static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 9926 struct dictEntry *de;
9927 list *clients;
9928 listNode *ln;
9929 int retval = 0;
9930
ffc6b7f8 9931 /* Remove the channel from the client -> channels hash table */
9932 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 9933 we have in the hash tables. Protect it... */
ffc6b7f8 9934 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 9935 retval = 1;
ffc6b7f8 9936 /* Remove the client from the channel -> clients list hash table */
9937 de = dictFind(server.pubsub_channels,channel);
befec3cd 9938 assert(de != NULL);
9939 clients = dictGetEntryVal(de);
9940 ln = listSearchKey(clients,c);
9941 assert(ln != NULL);
9942 listDelNode(clients,ln);
ff767a75 9943 if (listLength(clients) == 0) {
9944 /* Free the list and associated hash entry at all if this was
9945 * the latest client, so that it will be possible to abuse
ffc6b7f8 9946 * Redis PUBSUB creating millions of channels. */
9947 dictDelete(server.pubsub_channels,channel);
ff767a75 9948 }
befec3cd 9949 }
9950 /* Notify the client */
9951 if (notify) {
9952 addReply(c,shared.mbulk3);
9953 addReply(c,shared.unsubscribebulk);
ffc6b7f8 9954 addReplyBulk(c,channel);
9955 addReplyLong(c,dictSize(c->pubsub_channels)+
9956 listLength(c->pubsub_patterns));
9957
9958 }
9959 decrRefCount(channel); /* it is finally safe to release it */
9960 return retval;
9961}
9962
9963/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9964static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9965 int retval = 0;
9966
9967 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9968 retval = 1;
9969 pubsubPattern *pat;
9970 listAddNodeTail(c->pubsub_patterns,pattern);
9971 incrRefCount(pattern);
9972 pat = zmalloc(sizeof(*pat));
9973 pat->pattern = getDecodedObject(pattern);
9974 pat->client = c;
9975 listAddNodeTail(server.pubsub_patterns,pat);
9976 }
9977 /* Notify the client */
9978 addReply(c,shared.mbulk3);
9979 addReply(c,shared.psubscribebulk);
9980 addReplyBulk(c,pattern);
9981 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9982 return retval;
9983}
9984
9985/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9986 * 0 if the client was not subscribed to the specified channel. */
9987static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9988 listNode *ln;
9989 pubsubPattern pat;
9990 int retval = 0;
9991
9992 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9993 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9994 retval = 1;
9995 listDelNode(c->pubsub_patterns,ln);
9996 pat.client = c;
9997 pat.pattern = pattern;
9998 ln = listSearchKey(server.pubsub_patterns,&pat);
9999 listDelNode(server.pubsub_patterns,ln);
10000 }
10001 /* Notify the client */
10002 if (notify) {
10003 addReply(c,shared.mbulk3);
10004 addReply(c,shared.punsubscribebulk);
10005 addReplyBulk(c,pattern);
10006 addReplyLong(c,dictSize(c->pubsub_channels)+
10007 listLength(c->pubsub_patterns));
befec3cd 10008 }
ffc6b7f8 10009 decrRefCount(pattern);
befec3cd 10010 return retval;
10011}
10012
ffc6b7f8 10013/* Unsubscribe from all the channels. Return the number of channels the
10014 * client was subscribed from. */
10015static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10016 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 10017 dictEntry *de;
10018 int count = 0;
10019
10020 while((de = dictNext(di)) != NULL) {
ffc6b7f8 10021 robj *channel = dictGetEntryKey(de);
befec3cd 10022
ffc6b7f8 10023 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 10024 }
10025 dictReleaseIterator(di);
10026 return count;
10027}
10028
ffc6b7f8 10029/* Unsubscribe from all the patterns. Return the number of patterns the
10030 * client was subscribed from. */
10031static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10032 listNode *ln;
10033 listIter li;
10034 int count = 0;
10035
10036 listRewind(c->pubsub_patterns,&li);
10037 while ((ln = listNext(&li)) != NULL) {
10038 robj *pattern = ln->value;
10039
10040 count += pubsubUnsubscribePattern(c,pattern,notify);
10041 }
10042 return count;
10043}
10044
befec3cd 10045/* Publish a message */
ffc6b7f8 10046static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 10047 int receivers = 0;
10048 struct dictEntry *de;
ffc6b7f8 10049 listNode *ln;
10050 listIter li;
befec3cd 10051
ffc6b7f8 10052 /* Send to clients listening for that channel */
10053 de = dictFind(server.pubsub_channels,channel);
befec3cd 10054 if (de) {
10055 list *list = dictGetEntryVal(de);
10056 listNode *ln;
10057 listIter li;
10058
10059 listRewind(list,&li);
10060 while ((ln = listNext(&li)) != NULL) {
10061 redisClient *c = ln->value;
10062
10063 addReply(c,shared.mbulk3);
10064 addReply(c,shared.messagebulk);
ffc6b7f8 10065 addReplyBulk(c,channel);
befec3cd 10066 addReplyBulk(c,message);
10067 receivers++;
10068 }
10069 }
ffc6b7f8 10070 /* Send to clients listening to matching channels */
10071 if (listLength(server.pubsub_patterns)) {
10072 listRewind(server.pubsub_patterns,&li);
10073 channel = getDecodedObject(channel);
10074 while ((ln = listNext(&li)) != NULL) {
10075 pubsubPattern *pat = ln->value;
10076
10077 if (stringmatchlen((char*)pat->pattern->ptr,
10078 sdslen(pat->pattern->ptr),
10079 (char*)channel->ptr,
10080 sdslen(channel->ptr),0)) {
c8d0ea0e 10081 addReply(pat->client,shared.mbulk4);
10082 addReply(pat->client,shared.pmessagebulk);
10083 addReplyBulk(pat->client,pat->pattern);
ffc6b7f8 10084 addReplyBulk(pat->client,channel);
10085 addReplyBulk(pat->client,message);
10086 receivers++;
10087 }
10088 }
10089 decrRefCount(channel);
10090 }
befec3cd 10091 return receivers;
10092}
10093
10094static void subscribeCommand(redisClient *c) {
10095 int j;
10096
10097 for (j = 1; j < c->argc; j++)
ffc6b7f8 10098 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 10099}
10100
10101static void unsubscribeCommand(redisClient *c) {
10102 if (c->argc == 1) {
ffc6b7f8 10103 pubsubUnsubscribeAllChannels(c,1);
10104 return;
10105 } else {
10106 int j;
10107
10108 for (j = 1; j < c->argc; j++)
10109 pubsubUnsubscribeChannel(c,c->argv[j],1);
10110 }
10111}
10112
10113static void psubscribeCommand(redisClient *c) {
10114 int j;
10115
10116 for (j = 1; j < c->argc; j++)
10117 pubsubSubscribePattern(c,c->argv[j]);
10118}
10119
10120static void punsubscribeCommand(redisClient *c) {
10121 if (c->argc == 1) {
10122 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 10123 return;
10124 } else {
10125 int j;
10126
10127 for (j = 1; j < c->argc; j++)
ffc6b7f8 10128 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 10129 }
10130}
10131
10132static void publishCommand(redisClient *c) {
10133 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10134 addReplyLong(c,receivers);
10135}
10136
7f957c92 10137/* ================================= Debugging ============================== */
10138
10139static void debugCommand(redisClient *c) {
10140 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10141 *((char*)-1) = 'x';
210e29f7 10142 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10143 if (rdbSave(server.dbfilename) != REDIS_OK) {
10144 addReply(c,shared.err);
10145 return;
10146 }
10147 emptyDb();
10148 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10149 addReply(c,shared.err);
10150 return;
10151 }
10152 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10153 addReply(c,shared.ok);
71c2b467 10154 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10155 emptyDb();
10156 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10157 addReply(c,shared.err);
10158 return;
10159 }
10160 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10161 addReply(c,shared.ok);
333298da 10162 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10163 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10164 robj *key, *val;
10165
10166 if (!de) {
10167 addReply(c,shared.nokeyerr);
10168 return;
10169 }
10170 key = dictGetEntryKey(de);
10171 val = dictGetEntryVal(de);
59146ef3 10172 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10173 key->storage == REDIS_VM_SWAPPING)) {
07efaf74 10174 char *strenc;
10175 char buf[128];
10176
10177 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10178 strenc = strencoding[val->encoding];
10179 } else {
10180 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10181 strenc = buf;
10182 }
ace06542 10183 addReplySds(c,sdscatprintf(sdsempty(),
10184 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 10185 "encoding:%s serializedlength:%lld\r\n",
682ac724 10186 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 10187 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 10188 } else {
10189 addReplySds(c,sdscatprintf(sdsempty(),
10190 "+Key at:%p refcount:%d, value swapped at: page %llu "
10191 "using %llu pages\r\n",
10192 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10193 (unsigned long long) key->vm.usedpages));
10194 }
78ebe4c8 10195 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10196 lookupKeyRead(c->db,c->argv[2]);
10197 addReply(c,shared.ok);
7d30035d 10198 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10199 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10200 robj *key, *val;
10201
10202 if (!server.vm_enabled) {
10203 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10204 return;
10205 }
10206 if (!de) {
10207 addReply(c,shared.nokeyerr);
10208 return;
10209 }
10210 key = dictGetEntryKey(de);
10211 val = dictGetEntryVal(de);
4ef8de8a 10212 /* If the key is shared we want to create a copy */
10213 if (key->refcount > 1) {
10214 robj *newkey = dupStringObject(key);
10215 decrRefCount(key);
10216 key = dictGetEntryKey(de) = newkey;
10217 }
10218 /* Swap it */
7d30035d 10219 if (key->storage != REDIS_VM_MEMORY) {
10220 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 10221 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 10222 dictGetEntryVal(de) = NULL;
10223 addReply(c,shared.ok);
10224 } else {
10225 addReply(c,shared.err);
10226 }
59305dc7 10227 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10228 long keys, j;
10229 robj *key, *val;
10230 char buf[128];
10231
10232 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10233 return;
10234 for (j = 0; j < keys; j++) {
10235 snprintf(buf,sizeof(buf),"key:%lu",j);
10236 key = createStringObject(buf,strlen(buf));
10237 if (lookupKeyRead(c->db,key) != NULL) {
10238 decrRefCount(key);
10239 continue;
10240 }
10241 snprintf(buf,sizeof(buf),"value:%lu",j);
10242 val = createStringObject(buf,strlen(buf));
10243 dictAdd(c->db->dict,key,val);
10244 }
10245 addReply(c,shared.ok);
7f957c92 10246 } else {
333298da 10247 addReplySds(c,sdsnew(
bdcb92f2 10248 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 10249 }
10250}
56906eef 10251
6c96ba7d 10252static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 10253 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
6c96ba7d 10254 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
dfc5e96c 10255#ifdef HAVE_BACKTRACE
10256 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10257 *((char*)-1) = 'x';
10258#endif
10259}
10260
c651fd9e 10261static void _redisPanic(char *msg, char *file, int line) {
10262 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
17772754 10263 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
c651fd9e 10264#ifdef HAVE_BACKTRACE
10265 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10266 *((char*)-1) = 'x';
10267#endif
10268}
10269
bcfc686d 10270/* =================================== Main! ================================ */
56906eef 10271
bcfc686d 10272#ifdef __linux__
/* Read /proc/sys/vm/overcommit_memory and return its content converted
 * with atoi(). Returns -1 if the file can't be opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    FILE *fp;
    int value = -1;

    fp = fopen("/proc/sys/vm/overcommit_memory","r");
    if (fp == NULL) return -1;

    if (fgets(line,sizeof(line),fp) != NULL)
        value = atoi(line);
    fclose(fp);
    return value;
}
10286
10287void linuxOvercommitMemoryWarning(void) {
10288 if (linuxOvercommitMemoryValue() == 0) {
7ccd2d0a 10289 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
bcfc686d 10290 }
10291}
10292#endif /* __linux__ */
10293
10294static void daemonize(void) {
10295 int fd;
10296 FILE *fp;
10297
10298 if (fork() != 0) exit(0); /* parent exits */
10299 setsid(); /* create a new session */
10300
10301 /* Every output goes to /dev/null. If Redis is daemonized but
10302 * the 'logfile' is set to 'stdout' in the configuration file
10303 * it will not log at all. */
10304 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10305 dup2(fd, STDIN_FILENO);
10306 dup2(fd, STDOUT_FILENO);
10307 dup2(fd, STDERR_FILENO);
10308 if (fd > STDERR_FILENO) close(fd);
10309 }
10310 /* Try to write the pid file */
10311 fp = fopen(server.pidfile,"w");
10312 if (fp) {
10313 fprintf(fp,"%d\n",getpid());
10314 fclose(fp);
56906eef 10315 }
56906eef 10316}
10317
42ab0172
AO
10318static void version() {
10319 printf("Redis server version %s\n", REDIS_VERSION);
10320 exit(0);
10321}
10322
723fb69b
AO
/* Print the command line help on standard error and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs("       ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
10328
bcfc686d 10329int main(int argc, char **argv) {
9651a787 10330 time_t start;
10331
bcfc686d 10332 initServerConfig();
10333 if (argc == 2) {
44efe66e 10334 if (strcmp(argv[1], "-v") == 0 ||
10335 strcmp(argv[1], "--version") == 0) version();
10336 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 10337 resetServerSaveParams();
10338 loadServerConfig(argv[1]);
723fb69b
AO
10339 } else if ((argc > 2)) {
10340 usage();
bcfc686d 10341 } else {
10342 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10343 }
bcfc686d 10344 if (server.daemonize) daemonize();
71c54b21 10345 initServer();
bcfc686d 10346 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10347#ifdef __linux__
10348 linuxOvercommitMemoryWarning();
10349#endif
9651a787 10350 start = time(NULL);
bcfc686d 10351 if (server.appendonly) {
10352 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 10353 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 10354 } else {
10355 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 10356 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 10357 }
bcfc686d 10358 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 10359 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 10360 aeMain(server.el);
10361 aeDeleteEventLoop(server.el);
10362 return 0;
10363}
10364
10365/* ============================= Backtrace support ========================= */
10366
10367#ifdef HAVE_BACKTRACE
10368static char *findFuncName(void *pointer, unsigned long *offset);
10369
56906eef 10370static void *getMcontextEip(ucontext_t *uc) {
10371#if defined(__FreeBSD__)
10372 return (void*) uc->uc_mcontext.mc_eip;
10373#elif defined(__dietlibc__)
10374 return (void*) uc->uc_mcontext.eip;
06db1f50 10375#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 10376 #if __x86_64__
10377 return (void*) uc->uc_mcontext->__ss.__rip;
10378 #else
56906eef 10379 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 10380 #endif
06db1f50 10381#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 10382 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 10383 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 10384 #else
10385 return (void*) uc->uc_mcontext->__ss.__eip;
e0a62c7f 10386 #endif
54bac49d 10387#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 10388 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 10389#elif defined(__ia64__) /* Linux IA64 */
10390 return (void*) uc->uc_mcontext.sc_ip;
10391#else
10392 return NULL;
56906eef 10393#endif
10394}
10395
10396static void segvHandler(int sig, siginfo_t *info, void *secret) {
10397 void *trace[100];
10398 char **messages = NULL;
10399 int i, trace_size = 0;
10400 unsigned long offset=0;
56906eef 10401 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 10402 sds infostring;
56906eef 10403 REDIS_NOTUSED(info);
10404
10405 redisLog(REDIS_WARNING,
10406 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 10407 infostring = genRedisInfoString();
10408 redisLog(REDIS_WARNING, "%s",infostring);
10409 /* It's not safe to sdsfree() the returned string under memory
10410 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 10411
56906eef 10412 trace_size = backtrace(trace, 100);
de96dbfe 10413 /* overwrite sigaction with caller's address */
b91cf5ef 10414 if (getMcontextEip(uc) != NULL) {
10415 trace[1] = getMcontextEip(uc);
10416 }
56906eef 10417 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 10418
d76412d1 10419 for (i=1; i<trace_size; ++i) {
56906eef 10420 char *fn = findFuncName(trace[i], &offset), *p;
10421
10422 p = strchr(messages[i],'+');
10423 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
10424 redisLog(REDIS_WARNING,"%s", messages[i]);
10425 } else {
10426 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10427 }
10428 }
b177fd30 10429 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 10430 _exit(0);
fe3bbfbe 10431}
56906eef 10432
10433static void setupSigSegvAction(void) {
10434 struct sigaction act;
10435
10436 sigemptyset (&act.sa_mask);
10437 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10438 * is used. Otherwise, sa_handler is used */
10439 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10440 act.sa_sigaction = segvHandler;
10441 sigaction (SIGSEGV, &act, NULL);
10442 sigaction (SIGBUS, &act, NULL);
12fea928 10443 sigaction (SIGFPE, &act, NULL);
10444 sigaction (SIGILL, &act, NULL);
10445 sigaction (SIGBUS, &act, NULL);
e65fdc78 10446 return;
56906eef 10447}
e65fdc78 10448
bcfc686d 10449#include "staticsymbols.h"
10450/* This function try to convert a pointer into a function name. It's used in
10451 * oreder to provide a backtrace under segmentation fault that's able to
10452 * display functions declared as static (otherwise the backtrace is useless). */
10453static char *findFuncName(void *pointer, unsigned long *offset){
10454 int i, ret = -1;
10455 unsigned long off, minoff = 0;
ed9b544e 10456
bcfc686d 10457 /* Try to match against the Symbol with the smallest offset */
10458 for (i=0; symsTable[i].pointer; i++) {
10459 unsigned long lp = (unsigned long) pointer;
0bc03378 10460
bcfc686d 10461 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10462 off=lp-symsTable[i].pointer;
10463 if (ret < 0 || off < minoff) {
10464 minoff=off;
10465 ret=i;
10466 }
10467 }
0bc03378 10468 }
bcfc686d 10469 if (ret == -1) return NULL;
10470 *offset = minoff;
10471 return symsTable[ret].name;
0bc03378 10472}
bcfc686d 10473#else /* HAVE_BACKTRACE */
10474static void setupSigSegvAction(void) {
0bc03378 10475}
bcfc686d 10476#endif /* HAVE_BACKTRACE */
0bc03378 10477
ed9b544e 10478
ed9b544e 10479
bcfc686d 10480/* The End */
10481
10482
ed9b544e 10483