]> git.saurik.com Git - redis.git/blame - redis.c
use ziplists in SORT STORE until the thresholds are determined
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
9005896c 30#define REDIS_VERSION "2.1.1"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
40#include <signal.h>
fbf9bcdb 41
42#ifdef HAVE_BACKTRACE
c9468bcf 43#include <execinfo.h>
44#include <ucontext.h>
fbf9bcdb 45#endif /* HAVE_BACKTRACE */
46
ed9b544e 47#include <sys/wait.h>
48#include <errno.h>
49#include <assert.h>
50#include <ctype.h>
51#include <stdarg.h>
52#include <inttypes.h>
53#include <arpa/inet.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#include <sys/time.h>
57#include <sys/resource.h>
2895e862 58#include <sys/uio.h>
f78fd11b 59#include <limits.h>
fb82e75c 60#include <float.h>
a7866db6 61#include <math.h>
92f8e882 62#include <pthread.h>
0bc1b2f6 63
64#if defined(__sun)
5043dff3 65#include "solarisfixes.h"
66#endif
ed9b544e 67
c9468bcf 68#include "redis.h"
ed9b544e 69#include "ae.h" /* Event driven programming library */
70#include "sds.h" /* Dynamic safe strings */
71#include "anet.h" /* Networking the easy way */
72#include "dict.h" /* Hash tables */
73#include "adlist.h" /* Linked lists */
74#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 75#include "lzf.h" /* LZF compression library */
76#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
ba798261 77#include "zipmap.h" /* Compact dictionary-alike data structure */
c7d9d662 78#include "ziplist.h" /* Compact list data structure */
ba798261 79#include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
5436146c 80#include "release.h" /* Release and/or git repository information */
ed9b544e 81
82/* Error codes */
83#define REDIS_OK 0
84#define REDIS_ERR -1
85
86/* Static server configuration */
87#define REDIS_SERVERPORT 6379 /* TCP port */
88#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 89#define REDIS_IOBUF_LEN 1024
ed9b544e 90#define REDIS_LOADBUF_LEN 1024
248ea310 91#define REDIS_STATIC_ARGS 8
ed9b544e 92#define REDIS_DEFAULT_DBNUM 16
93#define REDIS_CONFIGLINE_MAX 1024
94#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
95#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
8ca3e9d1 96#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
6f376729 97#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 98#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
99
100/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
101#define REDIS_WRITEV_THRESHOLD 3
102/* Max number of iovecs used for each writev call */
103#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 104
105/* Hash table parameters */
106#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 107
108/* Command flags */
3fd78bcd 109#define REDIS_CMD_BULK 1 /* Bulk write command */
110#define REDIS_CMD_INLINE 2 /* Inline command */
111/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
112 this flag will return an error when the 'maxmemory' option is set in the
113 config file and the server is using more than maxmemory bytes of memory.
114 In short, these commands are denied on low memory conditions. */
115#define REDIS_CMD_DENYOOM 4
4005fef1 116#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 117
118/* Object types */
119#define REDIS_STRING 0
120#define REDIS_LIST 1
121#define REDIS_SET 2
1812e024 122#define REDIS_ZSET 3
123#define REDIS_HASH 4
f78fd11b 124
5234952b 125/* Objects encoding. Some kind of objects like Strings and Hashes can be
126 * internally represented in multiple ways. The 'encoding' field of the object
127 * is set to one of this fields for this object. */
c7d9d662
PN
128#define REDIS_ENCODING_RAW 0 /* Raw representation */
129#define REDIS_ENCODING_INT 1 /* Encoded as integer */
130#define REDIS_ENCODING_HT 2 /* Encoded as hash table */
131#define REDIS_ENCODING_ZIPMAP 3 /* Encoded as zipmap */
132#define REDIS_ENCODING_LIST 4 /* Encoded as regular linked list */
133#define REDIS_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
942a3961 134
/* Human readable names of the object encodings. The table is indexed by the
 * numeric REDIS_ENCODING_* value, so the order of the entries must match
 * those defines (RAW=0, INT=1, HT=2, ZIPMAP=3, LIST=4, ZIPLIST=5) and every
 * encoding needs an entry, otherwise a lookup reads past the end of the
 * array. */
static char* strencoding[] = {
    "raw", "int", "hashtable", "zipmap", "linkedlist", "ziplist"
};
138
f78fd11b 139/* Object types only used for dumping to disk */
bb32ede5 140#define REDIS_EXPIRETIME 253
ed9b544e 141#define REDIS_SELECTDB 254
142#define REDIS_EOF 255
143
f78fd11b 144/* Defines related to the dump file format. To store 32 bits lengths for short
145 * keys requires a lot of space, so we check the most significant 2 bits of
146 * the first byte to interpret the length:
147 *
148 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
149 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
150 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 151 * 11|000000 this means: specially encoded object will follow. The six bits
152 * number specifies the kind of object that follows.
153 * See the REDIS_RDB_ENC_* defines.
f78fd11b 154 *
10c43610 155 * Lengths up to 63 are stored using a single byte, most DB keys, and many
156 * values, will fit inside. */
f78fd11b 157#define REDIS_RDB_6BITLEN 0
158#define REDIS_RDB_14BITLEN 1
159#define REDIS_RDB_32BITLEN 2
17be1a4a 160#define REDIS_RDB_ENCVAL 3
f78fd11b 161#define REDIS_RDB_LENERR UINT_MAX
162
a4d1ba9a 163/* When a length of a string object stored on disk has the first two bits
164 * set, the remaining two bits specify a special encoding for the object
165 * accordingly to the following defines: */
166#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
167#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
168#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 169#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
a4d1ba9a 170
75680a3c 171/* Virtual memory object->where field. */
172#define REDIS_VM_MEMORY 0 /* The object is on memory */
173#define REDIS_VM_SWAPPED 1 /* The object is on disk */
174#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
175#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
176
06224fec 177/* Virtual memory static configuration stuff.
178 * Check vmFindContiguousPages() to know more about these magic numbers. */
179#define REDIS_VM_MAX_NEAR_PAGES 65536
180#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 181#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 182#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
f6c0bba8 183/* The following is the *percentage* of completed I/O jobs to process when the
184 * handler is called. While Virtual Memory I/O operations are performed by
185 * threads, these operations must be processed by the main thread when completed
186 * in order to take effect. */
c953f24b 187#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 188
ed9b544e 189/* Client flags */
d5d55fc3 190#define REDIS_SLAVE 1 /* This client is a slave server */
191#define REDIS_MASTER 2 /* This client is a master server */
192#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
193#define REDIS_MULTI 8 /* This client is in a MULTI context */
194#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
195#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
37ab76c9 196#define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
ed9b544e 197
40d224a9 198/* Slave replication state - slave side */
ed9b544e 199#define REDIS_REPL_NONE 0 /* No active replication */
200#define REDIS_REPL_CONNECT 1 /* Must connect to master */
201#define REDIS_REPL_CONNECTED 2 /* Connected to master */
202
40d224a9 203/* Slave replication state - from the point of view of master
204 * Note that in SEND_BULK and ONLINE state the slave receives new updates
205 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
206 * to start the next background saving in order to send updates to it. */
207#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
208#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
209#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
210#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
211
ed9b544e 212/* List related stuff */
213#define REDIS_HEAD 0
214#define REDIS_TAIL 1
215
216/* Sort operations */
217#define REDIS_SORT_GET 0
443c6409 218#define REDIS_SORT_ASC 1
219#define REDIS_SORT_DESC 2
ed9b544e 220#define REDIS_SORTKEY_MAX 1024
221
222/* Log levels */
223#define REDIS_DEBUG 0
f870935d 224#define REDIS_VERBOSE 1
225#define REDIS_NOTICE 2
226#define REDIS_WARNING 3
ed9b544e 227
228/* Anti-warning macro... */
229#define REDIS_NOTUSED(V) ((void) V)
230
6b47e12e 231#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
232#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 233
48f0308a 234/* Append only defines */
235#define APPENDFSYNC_NO 0
236#define APPENDFSYNC_ALWAYS 1
237#define APPENDFSYNC_EVERYSEC 2
238
cbba7dd7 239/* Hashes related defaults */
240#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
241#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
242
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): evaluates _e; when it is false, calls _redisAssert() with
 * the stringified expression, the file name and the line number, then
 * terminates the process with _exit(1).
 *
 * redisPanic(_e): unconditionally calls _redisPanic() with the stringified
 * argument (a string literal argument is therefore reported including its
 * quotes), the file name and the line number, then calls _exit(1).
 *
 * Both replacement lists are fully parenthesized so that each macro expands
 * to a single expression whatever the surrounding operators are. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
dfc5e96c 248
ed9b544e 249/*================================= Data types ============================== */
250
251/* A redis object, that is a type able to hold a string / list / set */
75680a3c 252
253/* The VM object structure */
254struct redisObjectVM {
3a66edc7 255 off_t page; /* the page at witch the object is stored on disk */
256 off_t usedpages; /* number of pages used on disk */
257 time_t atime; /* Last access time */
75680a3c 258} vm;
259
260/* The actual Redis Object */
ed9b544e 261typedef struct redisObject {
ed9b544e 262 void *ptr; /* presumably the type/encoding specific payload -- TODO confirm */
942a3961 263 unsigned char type; /* REDIS_STRING, REDIS_LIST, ... (see "Object types") */
264 unsigned char encoding; /* One of the REDIS_ENCODING_* values */
d894161b 265 unsigned char storage; /* If this object is a key, where is the value?
266 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
267 unsigned char vtype; /* If this object is a key, and value is swapped out,
268 * this is the type of the swapped out object. */
ed9b544e 269 int refcount; /* Reference counter */
75680a3c 270 /* VM fields, these are only allocated if VM is active, otherwise the
271 * object allocation function will just allocate
272 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
273 * Redis without VM active will not have any overhead. */
274 struct redisObjectVM vm;
ed9b544e 275} robj;
276
dfc5e96c 277/* Macro used to initialize a Redis object allocated on the stack.
278 * Note that this macro is taken near the structure definition to make sure
279 * we'll update it when the structure is changed, to avoid bugs like
280 * bug #85 introduced exactly in this way.
 *
 * It sets refcount to 1, type to REDIS_STRING, encoding to
 * REDIS_ENCODING_RAW and ptr to _ptr; the storage field is set to
 * REDIS_VM_MEMORY only when server.vm_enabled is true.
 *
 * NOTE(review): the replacement list ends with "while(0);" -- because of
 * that trailing semicolon an invocation written as "macro(...);" expands to
 * two statements, so it cannot be used as the unbraced body of an if that
 * has an else. The _var and _ptr arguments are also used unparenthesized.
 * Not changed here because the callers are not visible. */
281#define initStaticStringObject(_var,_ptr) do { \
282 _var.refcount = 1; \
283 _var.type = REDIS_STRING; \
284 _var.encoding = REDIS_ENCODING_RAW; \
285 _var.ptr = _ptr; \
3a66edc7 286 if (server.vm_enabled) _var.storage = REDIS_VM_MEMORY; \
dfc5e96c 287} while(0);
288
/* State of a single database: the keyspace dictionary plus the auxiliary
 * dictionaries described field by field below. */
3305306f 289typedef struct redisDb {
4409877e 290 dict *dict; /* The keyspace for this DB */
291 dict *expires; /* Timeout of keys with a timeout set */
37ab76c9 292 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 293 dict *io_keys; /* Keys with clients waiting for VM I/O */
37ab76c9 294 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
3305306f 295 int id; /* presumably the index of this database -- TODO confirm */
296} redisDb;
297
6e469882 298/* Client MULTI/EXEC state: one element of the multiState 'commands' array. */
299typedef struct multiCmd {
300 robj **argv; /* Argument vector */
301 int argc; /* presumably the number of elements in argv -- TODO confirm */
302 struct redisCommand *cmd; /* Command table entry */
303} multiCmd;
304
/* Per-client container of multiCmd entries (stored in redisClient 'mstate'). */
305typedef struct multiState {
306 multiCmd *commands; /* Array of MULTI commands */
307 int count; /* Total number of MULTI commands */
308} multiState;
309
ed9b544e 310/* With multiplexing we need to take per-client state.
311 * Clients are taken in a linked list. */
312typedef struct redisClient {
313 int fd;
3305306f 314 redisDb *db;
ed9b544e 315 int dictid;
316 sds querybuf;
e8a74421 317 robj **argv, **mbargv;
318 int argc, mbargc;
40d224a9 319 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 320 int multibulk; /* multi bulk command format active */
ed9b544e 321 list *reply;
322 int sentlen;
323 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 324 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 325 int slaveseldb; /* slave selected db, if this client is a slave */
326 int authenticated; /* when requirepass is non-NULL */
327 int replstate; /* replication state if this is a slave */
328 int repldbfd; /* replication DB file descriptor */
6e469882 329 long repldboff; /* replication DB file offset */
40d224a9 330 off_t repldbsize; /* replication DB file size */
6e469882 331 multiState mstate; /* MULTI/EXEC state */
37ab76c9 332 robj **blocking_keys; /* The keys we are waiting on to terminate a blocking
4409877e 333 * operation such as BLPOP. Otherwise NULL. */
37ab76c9 334 int blocking_keys_num; /* Number of blocking keys */
4409877e 335 time_t blockingto; /* Blocking operation timeout. If UNIX current time
336 * is >= blockingto then the operation timed out. */
92f8e882 337 list *io_keys; /* Keys this client is waiting to be loaded from the
338 * swap file in order to continue. */
37ab76c9 339 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
ffc6b7f8 340 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
341 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 342} redisClient;
343
/* Element type of the redisServer 'saveparams' array.
 * NOTE(review): presumably one "save after <seconds> if at least <changes>
 * changes" rule -- confirm against the code that reads it. */
344struct saveparam {
345 time_t seconds;
346 int changes;
347};
348
349/* Global server state structure */
350struct redisServer {
351 int port;
352 int fd;
3305306f 353 redisDb *db;
ed9b544e 354 long long dirty; /* changes to DB from the last save */
355 list *clients;
87eca727 356 list *slaves, *monitors;
ed9b544e 357 char neterr[ANET_ERR_LEN];
358 aeEventLoop *el;
359 int cronloops; /* number of times the cron function ran */
360 list *objfreelist; /* A list of freed objects to avoid malloc() */
361 time_t lastsave; /* Unix time of last successful save */
ed9b544e 362 /* Fields used only for stats */
363 time_t stat_starttime; /* server start time */
364 long long stat_numcommands; /* number of processed commands */
365 long long stat_numconnections; /* number of connections received */
2a6a2ed1 366 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 367 /* Configuration */
368 int verbosity;
369 int glueoutputbuf;
370 int maxidletime;
371 int dbnum;
372 int daemonize;
44b38ef4 373 int appendonly;
48f0308a 374 int appendfsync; /* presumably one of the APPENDFSYNC_* values -- TODO confirm */
fab43727 375 int shutdown_asap;
48f0308a 376 time_t lastfsync;
44b38ef4 377 int appendfd;
378 int appendseldb;
ed329fcf 379 char *pidfile;
9f3c422c 380 pid_t bgsavechildpid;
9d65a1bb 381 pid_t bgrewritechildpid;
382 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
28ed1f33 383 sds aofbuf; /* AOF buffer, written before entering the event loop */
ed9b544e 384 struct saveparam *saveparams;
385 int saveparamslen;
386 char *logfile;
387 char *bindaddr;
388 char *dbfilename;
44b38ef4 389 char *appendfilename;
abcb223e 390 char *requirepass;
121f70cf 391 int rdbcompression;
8ca3e9d1 392 int activerehashing;
ed9b544e 393 /* Replication related */
394 int isslave;
d0ccebcf 395 char *masterauth;
ed9b544e 396 char *masterhost;
397 int masterport;
40d224a9 398 redisClient *master; /* client that is master for this slave */
ed9b544e 399 int replstate;
285add55 400 unsigned int maxclients;
4ef8de8a 401 unsigned long long maxmemory;
d5d55fc3 402 unsigned int blpop_blocked_clients;
403 unsigned int vm_blocked_clients;
ed9b544e 404 /* Sort parameters - qsort_r() is only available under BSD so we
405 * have to take this state global, in order to pass it to sortCompare() */
406 int sort_desc;
407 int sort_alpha;
408 int sort_bypattern;
75680a3c 409 /* Virtual memory configuration */
410 int vm_enabled;
054e426d 411 char *vm_swap_file;
75680a3c 412 off_t vm_page_size;
413 off_t vm_pages;
4ef8de8a 414 unsigned long long vm_max_memory;
cbba7dd7 415 /* Hashes config */
416 size_t hash_max_zipmap_entries;
417 size_t hash_max_zipmap_value;
75680a3c 418 /* Virtual memory state */
419 FILE *vm_fp;
420 int vm_fd;
421 off_t vm_next_page; /* Next probably empty page */
422 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 423 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 424 time_t unixtime; /* Unix time sampled every second. */
92f8e882 425 /* Virtual memory I/O threads stuff */
92f8e882 426 /* An I/O thread processes an element taken from the io_jobs queue and
996cb5f7 427 * puts the result of the operation in the io_done list. While the
428 * job is being processed, it's put on io_processing queue.
 * NOTE(review): io_jobs / io_done do not exist as fields; they presumably
 * refer to io_newjobs / io_processed below -- confirm. */
429 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
430 list *io_processing; /* List of VM I/O jobs being processed */
431 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 432 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 433 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 434 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
435 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 436 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 437 int io_active_threads; /* Number of running I/O threads */
438 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 439 /* Our main thread is blocked on the event loop, looking for sockets ready
440 * to be read or written, so when a threaded I/O operation is ready to be
441 * processed by the main thread, the I/O thread will use a unix pipe to
442 * awake the main thread. The following are the two pipe FDs. */
443 int io_ready_pipe_read;
444 int io_ready_pipe_write;
7d98e08c 445 /* Virtual memory stats */
446 unsigned long long vm_stats_used_pages;
447 unsigned long long vm_stats_swapped_objects;
448 unsigned long long vm_stats_swapouts;
449 unsigned long long vm_stats_swapins;
befec3cd 450 /* Pubsub */
ffc6b7f8 451 dict *pubsub_channels; /* Map channels to list of subscribed clients */
452 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 453 /* Misc */
b9bc0eef 454 FILE *devnull;
ed9b544e 455};
456
/* Pairs a client with a pattern object.
 * NOTE(review): presumably the element type of the pubsub_patterns lists --
 * confirm against the subscription code. */
ffc6b7f8 457typedef struct pubsubPattern {
458 redisClient *client;
459 robj *pattern;
460} pubsubPattern;
461
/* Function types referenced by struct redisCommand: the command
 * implementation itself and the optional VM preload hook. */
ed9b544e 462typedef void redisCommandProc(redisClient *c);
ca1788b5 463typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
ed9b544e 464struct redisCommand {
465 char *name;
466 redisCommandProc *proc;
467 int arity;
468 int flags; /* presumably a mask of REDIS_CMD_* flags -- TODO confirm */
76583ea4
PN
469 /* Use a function to determine which keys need to be loaded
470 * in the background prior to executing this command. Takes precedence
471 * over vm_firstkey and others, ignored when NULL */
ca1788b5 472 redisVmPreloadProc *vm_preload_proc;
7c775e09 473 /* What keys should be loaded in background when calling this command? */
474 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
475 int vm_lastkey; /* The last argument that's a key */
476 int vm_keystep; /* The step between first and last key */
ed9b544e 477};
478
/* A function name paired with an unsigned long value.
 * NOTE(review): 'pointer' presumably holds the function address, used to map
 * addresses back to names -- confirm. */
de96dbfe 479struct redisFunctionSym {
480 char *name;
56906eef 481 unsigned long pointer;
de96dbfe 482};
483
/* An object plus a union holding either a double score or another object.
 * NOTE(review): presumably one element being sorted by SORT, with 'u' as
 * the comparison key (numeric vs. object) -- confirm. */
ed9b544e 484typedef struct _redisSortObject {
485 robj *obj;
486 union {
487 double score;
488 robj *cmpobj;
489 } u;
490} redisSortObject;
491
/* An operation type together with a pattern object.
 * NOTE(review): 'type' presumably takes REDIS_SORT_* values such as
 * REDIS_SORT_GET -- confirm. */
492typedef struct _redisSortOperation {
493 int type;
494 robj *pattern;
495} redisSortOperation;
496
6b47e12e 497/* ZSETs use a specialized version of Skiplists */
498
/* Skiplist node: a score/object pair plus link fields.
 * NOTE(review): 'forward' and 'span' are presumably per-level arrays --
 * confirm against the allocation code. */
499typedef struct zskiplistNode {
500 struct zskiplistNode **forward;
e3870fab 501 struct zskiplistNode *backward;
912b9165 502 unsigned int *span;
6b47e12e 503 double score;
504 robj *obj;
505} zskiplistNode;
506
/* Skiplist handle: header and tail node pointers, a length and a level.
 * NOTE(review): 'length' is presumably the element count and 'level' the
 * highest level currently in use -- confirm. */
507typedef struct zskiplist {
e3870fab 508 struct zskiplistNode *header, *tail;
d13f767c 509 unsigned long length;
6b47e12e 510 int level;
511} zskiplist;
512
/* Sorted set value: a dictionary combined with a skiplist. */
1812e024 513typedef struct zset {
514 dict *dict;
6b47e12e 515 zskiplist *zsl;
1812e024 516} zset;
517
6b47e12e 518/* Our shared "common" objects */
519
/* Number of slots in the 'integers' array of the shared objects struct. */
05df7621 520#define REDIS_SHARED_INTEGERS 10000
/* Pointers to objects collected in a single struct; the only instance is the
 * file-scope variable 'shared' defined right after the closing brace. */
ed9b544e 521struct sharedObjectsStruct {
c937aa89 522 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 523 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 524 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
525 *outofrangeerr, *plus,
ed9b544e 526 *select0, *select1, *select2, *select3, *select4,
befec3cd 527 *select5, *select6, *select7, *select8, *select9,
c8d0ea0e 528 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
529 *mbulk4, *psubscribebulk, *punsubscribebulk,
530 *integers[REDIS_SHARED_INTEGERS];
ed9b544e 531} shared;
532
a7866db6 533/* Global vars that are actually used as constants. The following double
534 * values are used for double on-disk serialization, and are initialized
535 * at runtime to avoid strange compiler optimizations. */
536
537static double R_Zero, R_PosInf, R_NegInf, R_Nan;
538
92f8e882 539/* VM threaded I/O request message */
b9bc0eef 540#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
541#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
542#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 543typedef struct iojob {
996cb5f7 544 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 545 redisDb *db;/* Redis database */
92f8e882 546 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 547 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
92f8e882 548 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
549 off_t page; /* Swap page where to read/write the object */
248ea310 550 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 551 int canceled; /* True if this command was canceled by blocking side of VM */
552 pthread_t thread; /* ID of the thread processing this entry */
553} iojob;
92f8e882 554
ed9b544e 555/*================================ Prototypes =============================== */
556
557static void freeStringObject(robj *o);
558static void freeListObject(robj *o);
559static void freeSetObject(robj *o);
560static void decrRefCount(void *o);
561static robj *createObject(int type, void *ptr);
562static void freeClient(redisClient *c);
f78fd11b 563static int rdbLoad(char *filename);
ed9b544e 564static void addReply(redisClient *c, robj *obj);
565static void addReplySds(redisClient *c, sds s);
566static void incrRefCount(robj *o);
f78fd11b 567static int rdbSaveBackground(char *filename);
ed9b544e 568static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 569static robj *dupStringObject(robj *o);
248ea310 570static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
dd142b9c 571static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
28ed1f33 572static void flushAppendOnlyFile(void);
44b38ef4 573static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 574static int syncWithMaster(void);
05df7621 575static robj *tryObjectEncoding(robj *o);
9d65a1bb 576static robj *getDecodedObject(robj *o);
3305306f 577static int removeExpire(redisDb *db, robj *key);
578static int expireIfNeeded(redisDb *db, robj *key);
579static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 580static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 581static int deleteKey(redisDb *db, robj *key);
bb32ede5 582static time_t getExpire(redisDb *db, robj *key);
583static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 584static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 585static void freeMemoryIfNeeded(void);
de96dbfe 586static int processCommand(redisClient *c);
56906eef 587static void setupSigSegvAction(void);
a3b21203 588static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 589static void aofRemoveTempFile(pid_t childpid);
0ea663ea 590static size_t stringObjectLen(robj *o);
638e42ac 591static void processInputBuffer(redisClient *c);
6b47e12e 592static zskiplist *zslCreate(void);
fd8ccf44 593static void zslFree(zskiplist *zsl);
2b59cfdf 594static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 595static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 596static void initClientMultiState(redisClient *c);
597static void freeClientMultiState(redisClient *c);
598static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 599static void unblockClientWaitingData(redisClient *c);
4409877e 600static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 601static void vmInit(void);
a35ddf12 602static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 603static robj *vmLoadObject(robj *key);
7e69548d 604static robj *vmPreviewObject(robj *key);
a69a0c9c 605static int vmSwapOneObjectBlocking(void);
606static int vmSwapOneObjectThreaded(void);
7e69548d 607static int vmCanSwapOut(void);
a5819310 608static int tryFreeOneObjectFromFreelist(void);
996cb5f7 609static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
610static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
611static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 612static void lockThreadedIO(void);
613static void unlockThreadedIO(void);
614static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
615static void freeIOJob(iojob *j);
616static void queueIOJob(iojob *j);
a5819310 617static int vmWriteObjectOnSwap(robj *o, off_t page);
618static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 619static void waitEmptyIOJobsQueue(void);
620static void vmReopenSwapFile(void);
970e10bb 621static int vmFreePage(off_t page);
ca1788b5 622static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
3805e04f 623static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
0a6f3f0f 624static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
d5d55fc3 625static int dontWaitForSwappedKey(redisClient *c, robj *key);
626static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
627static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
628static struct redisCommand *lookupCommand(char *name);
629static void call(redisClient *c, struct redisCommand *cmd);
630static void resetClient(redisClient *c);
ada386b2 631static void convertToRealHash(robj *o);
ffc6b7f8 632static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
633static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
634static void freePubsubPattern(void *p);
635static int listMatchPubsubPattern(void *a, void *b);
636static int compareStringObjects(robj *a, robj *b);
bf028098 637static int equalStringObjects(robj *a, robj *b);
static void usage(void); /* takes no arguments: (void) makes this a real prototype */
8f63ddca 639static int rewriteAppendOnlyFileBackground(void);
242a64f3 640static int vmSwapObjectBlocking(robj *key, robj *val);
static int prepareForShutdown(void); /* takes no arguments: (void) makes this a real prototype */
37ab76c9 642static void touchWatchedKey(redisDb *db, robj *key);
9b30e1a2 643static void touchWatchedKeysOnFlush(int dbid);
37ab76c9 644static void unwatchAllKeys(redisClient *c);
ed9b544e 645
abcb223e 646static void authCommand(redisClient *c);
ed9b544e 647static void pingCommand(redisClient *c);
648static void echoCommand(redisClient *c);
649static void setCommand(redisClient *c);
650static void setnxCommand(redisClient *c);
526d00a5 651static void setexCommand(redisClient *c);
ed9b544e 652static void getCommand(redisClient *c);
653static void delCommand(redisClient *c);
654static void existsCommand(redisClient *c);
655static void incrCommand(redisClient *c);
656static void decrCommand(redisClient *c);
657static void incrbyCommand(redisClient *c);
658static void decrbyCommand(redisClient *c);
659static void selectCommand(redisClient *c);
660static void randomkeyCommand(redisClient *c);
661static void keysCommand(redisClient *c);
662static void dbsizeCommand(redisClient *c);
663static void lastsaveCommand(redisClient *c);
664static void saveCommand(redisClient *c);
665static void bgsaveCommand(redisClient *c);
9d65a1bb 666static void bgrewriteaofCommand(redisClient *c);
ed9b544e 667static void shutdownCommand(redisClient *c);
668static void moveCommand(redisClient *c);
669static void renameCommand(redisClient *c);
670static void renamenxCommand(redisClient *c);
671static void lpushCommand(redisClient *c);
672static void rpushCommand(redisClient *c);
673static void lpopCommand(redisClient *c);
674static void rpopCommand(redisClient *c);
675static void llenCommand(redisClient *c);
676static void lindexCommand(redisClient *c);
677static void lrangeCommand(redisClient *c);
678static void ltrimCommand(redisClient *c);
679static void typeCommand(redisClient *c);
680static void lsetCommand(redisClient *c);
681static void saddCommand(redisClient *c);
682static void sremCommand(redisClient *c);
a4460ef4 683static void smoveCommand(redisClient *c);
ed9b544e 684static void sismemberCommand(redisClient *c);
685static void scardCommand(redisClient *c);
12fea928 686static void spopCommand(redisClient *c);
2abb95a9 687static void srandmemberCommand(redisClient *c);
ed9b544e 688static void sinterCommand(redisClient *c);
689static void sinterstoreCommand(redisClient *c);
40d224a9 690static void sunionCommand(redisClient *c);
691static void sunionstoreCommand(redisClient *c);
f4f56e1d 692static void sdiffCommand(redisClient *c);
693static void sdiffstoreCommand(redisClient *c);
ed9b544e 694static void syncCommand(redisClient *c);
695static void flushdbCommand(redisClient *c);
696static void flushallCommand(redisClient *c);
697static void sortCommand(redisClient *c);
698static void lremCommand(redisClient *c);
0f5f7e9a 699static void rpoplpushcommand(redisClient *c);
ed9b544e 700static void infoCommand(redisClient *c);
70003d28 701static void mgetCommand(redisClient *c);
87eca727 702static void monitorCommand(redisClient *c);
3305306f 703static void expireCommand(redisClient *c);
802e8373 704static void expireatCommand(redisClient *c);
f6b141c5 705static void getsetCommand(redisClient *c);
fd88489a 706static void ttlCommand(redisClient *c);
321b0e13 707static void slaveofCommand(redisClient *c);
7f957c92 708static void debugCommand(redisClient *c);
f6b141c5 709static void msetCommand(redisClient *c);
710static void msetnxCommand(redisClient *c);
fd8ccf44 711static void zaddCommand(redisClient *c);
7db723ad 712static void zincrbyCommand(redisClient *c);
cc812361 713static void zrangeCommand(redisClient *c);
50c55df5 714static void zrangebyscoreCommand(redisClient *c);
f44dd428 715static void zcountCommand(redisClient *c);
e3870fab 716static void zrevrangeCommand(redisClient *c);
3c41331e 717static void zcardCommand(redisClient *c);
1b7106e7 718static void zremCommand(redisClient *c);
6e333bbe 719static void zscoreCommand(redisClient *c);
1807985b 720static void zremrangebyscoreCommand(redisClient *c);
6e469882 721static void multiCommand(redisClient *c);
722static void execCommand(redisClient *c);
18b6cb76 723static void discardCommand(redisClient *c);
4409877e 724static void blpopCommand(redisClient *c);
725static void brpopCommand(redisClient *c);
4b00bebd 726static void appendCommand(redisClient *c);
39191553 727static void substrCommand(redisClient *c);
69d95c3e 728static void zrankCommand(redisClient *c);
798d9e55 729static void zrevrankCommand(redisClient *c);
978c2c94 730static void hsetCommand(redisClient *c);
1f1c7695 731static void hsetnxCommand(redisClient *c);
978c2c94 732static void hgetCommand(redisClient *c);
09aeb579
PN
733static void hmsetCommand(redisClient *c);
734static void hmgetCommand(redisClient *c);
07efaf74 735static void hdelCommand(redisClient *c);
92b27fe9 736static void hlenCommand(redisClient *c);
9212eafd 737static void zremrangebyrankCommand(redisClient *c);
5d373da9 738static void zunionstoreCommand(redisClient *c);
739static void zinterstoreCommand(redisClient *c);
78409a0f 740static void hkeysCommand(redisClient *c);
741static void hvalsCommand(redisClient *c);
742static void hgetallCommand(redisClient *c);
a86f14b1 743static void hexistsCommand(redisClient *c);
500ece7c 744static void configCommand(redisClient *c);
01426b05 745static void hincrbyCommand(redisClient *c);
befec3cd 746static void subscribeCommand(redisClient *c);
747static void unsubscribeCommand(redisClient *c);
ffc6b7f8 748static void psubscribeCommand(redisClient *c);
749static void punsubscribeCommand(redisClient *c);
befec3cd 750static void publishCommand(redisClient *c);
37ab76c9 751static void watchCommand(redisClient *c);
752static void unwatchCommand(redisClient *c);
f6b141c5 753
ed9b544e 754/*================================= Globals ================================= */
755
756/* Global vars */
757static struct redisServer server; /* server global state */
/* NOTE(review): presumably the command table actually consulted for lookups,
 * with readonlyCommandTable below as its static source; the assignment is
 * outside this chunk, so confirm before relying on this. */
1a132bbc 758static struct redisCommand *commandTable;
/* Static table of the commands known to the server, one initializer per
 * command. Each entry lists, in order: the command name, the handler
 * function, an integer, a REDIS_CMD_* flag mask, a callback pointer (NULL for
 * every command except zunionstore, zinterstore and exec) and three trailing
 * integers.
 *
 * NOTE(review): struct redisCommand is declared outside this chunk. The
 * integer after the handler is presumably the arity (negative meaning "at
 * least that many arguments") and the trailing triple presumably describes
 * first key / last key / key step for the callback-less key preloading;
 * confirm against the struct definition.
 * NOTE(review): the rpoplpush handler is spelled rpoplpushcommand (lower-case
 * 'c'), unlike every other handler name in the table. */
1a132bbc 759static struct redisCommand readonlyCommandTable[] = {
76583ea4
PN
760 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
761 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
762 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
526d00a5 763 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
764 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
765 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
766 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
767 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
768 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
769 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
770 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
771 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
772 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
773 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
780 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
781 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
782 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
783 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
784 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
785 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
786 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
787 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
788 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
790 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
791 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
792 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
793 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
794 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
795 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
796 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
797 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
799 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
800 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
801 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
802 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
5d373da9 803 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
804 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
805 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
806 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
807 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
808 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
809 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
810 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
811 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
812 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
813 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
1f1c7695 814 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 815 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
d33278d1 816 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 817 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
01426b05 818 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
819 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
820 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
821 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
822 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
823 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 824 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
825 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
826 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
827 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
828 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
829 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
830 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
831 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
833 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
834 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
835 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
837 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
838 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
840 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
842 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
843 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
844 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
845 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
846 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
847 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
848 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
3805e04f 849 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
850 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
851 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
852 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
853 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
854 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
855 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
856 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
857 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
858 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
859 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 860 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 861 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
862 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 863 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
864 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 865 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
37ab76c9 866 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
d55d5c5d 867 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
ed9b544e 868};
bcfc686d 869
ed9b544e 870/*============================ Utility functions ============================ */
871
/* Glob-style pattern matching over length-delimited buffers.
 *
 * Supported syntax:
 *   *        matches any run of characters (including none)
 *   ?        matches exactly one character
 *   [abc]    matches one character of the set; [^abc] negates the set;
 *            [a-z] is an inclusive byte range; a backslash inside the
 *            brackets takes the next character literally
 *   \x       matches the character x literally
 *
 * pattern/patternLen and string/stringLen describe the two buffers; neither
 * has to be NUL terminated and no byte at or past the given lengths is read.
 * When nocase is non-zero, literal characters and ranges are compared after
 * tolower().
 *
 * Returns 1 when the whole string matches the whole pattern, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*', never looking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int negate, match;

            /* A character class always consumes one character. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            negate = patternLen > 0 && pattern[0] == '^';
            if (negate) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * pattern++ below lands exactly on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    /* Compare as byte values so that the range does not
                     * depend on the signedness of plain char. */
                    int start = (unsigned char)pattern[0];
                    int end = (unsigned char)pattern[2];
                    int c = (unsigned char)string[0];

                    /* Fold case BEFORE ordering the endpoints: folding can
                     * invert their order (e.g. "a-Z"). */
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (negate)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing lone backslash matches itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is exhausted: only trailing '*' may remain. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
994
/* Convenience wrapper around stringmatchlen() for NUL terminated strings:
 * both lengths are taken with strlen(). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
998
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-' followed by decimal digits and an optional,
 * case insensitive unit: "b" (bytes), "k"/"m"/"g" (powers of 1000) or
 * "kb"/"mb"/"gb" (powers of 1024).
 *
 * If err is not NULL, *err is set to 1 on parsing error and to 0 otherwise.
 * Errors are: an unknown unit (the number is then returned unscaled), a
 * numeric part of 128 characters or more (LLONG_MAX is returned), and a
 * value that does not fit in a long long (the result saturates to
 * LLONG_MAX or LLONG_MIN). */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* <ctype.h> functions need an unsigned char value. */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';

    errno = 0;
    val = strtoll(buf,NULL,10);
    if (errno == ERANGE) {
        /* strtoll() already saturated the value. */
        if (err) *err = 1;
        return val;
    }
    /* mul is always >= 1: refuse to overflow the multiplication. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1045
/* Convert a long long into its decimal string representation.
 *
 * At most len-1 characters are stored in s, always followed by a nul
 * terminator; when the buffer is too small the leading characters of the
 * number are kept and the rest is dropped. Returns the number of characters
 * actually written (terminator excluded), which is 0 when len is 0 (in that
 * case s is not touched). */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Negate in unsigned arithmetic: -value overflows for LLONG_MIN. */
    v = (value < 0) ? 0ULL - (unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(size_t)(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return (int)l;
}
1069
56906eef 1070static void redisLog(int level, const char *fmt, ...) {
ed9b544e 1071 va_list ap;
1072 FILE *fp;
1073
1074 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1075 if (!fp) return;
1076
1077 va_start(ap, fmt);
1078 if (level >= server.verbosity) {
6766f45e 1079 char *c = ".-*#";
1904ecc1 1080 char buf[64];
1081 time_t now;
1082
1083 now = time(NULL);
6c9385e0 1084 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 1085 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 1086 vfprintf(fp, fmt, ap);
1087 fprintf(fp,"\n");
1088 fflush(fp);
1089 }
1090 va_end(ap);
1091
1092 if (server.logfile) fclose(fp);
1093}
1094
1095/*====================== Hash table type implementation ==================== */
1096
1097/* This is a hash table type that uses SDS dynamic strings as
1098 * keys and redis objects as values (objects can hold SDS strings,
1099 * lists, sets). */
1100
/* Dictionary destructor for values that are plain zmalloc()ed buffers:
 * the memory is handed straight to zfree(). privdata is ignored. */
static void dictVanillaFree(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    zfree(val);
}
1106
4409877e 1107static void dictListDestructor(void *privdata, void *val)
1108{
1109 DICT_NOTUSED(privdata);
1110 listRelease((list*)val);
1111}
1112
ed9b544e 1113static int sdsDictKeyCompare(void *privdata, const void *key1,
1114 const void *key2)
1115{
1116 int l1,l2;
1117 DICT_NOTUSED(privdata);
1118
1119 l1 = sdslen((sds)key1);
1120 l2 = sdslen((sds)key2);
1121 if (l1 != l2) return 0;
1122 return memcmp(key1, key2, l1) == 0;
1123}
1124
1125static void dictRedisObjectDestructor(void *privdata, void *val)
1126{
1127 DICT_NOTUSED(privdata);
1128
a35ddf12 1129 if (val == NULL) return; /* Values of swapped out keys as set to NULL */
ed9b544e 1130 decrRefCount(val);
1131}
1132
942a3961 1133static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1134 const void *key2)
1135{
1136 const robj *o1 = key1, *o2 = key2;
1137 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1138}
1139
942a3961 1140static unsigned int dictObjHash(const void *key) {
ed9b544e 1141 const robj *o = key;
1142 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1143}
1144
942a3961 1145static int dictEncObjKeyCompare(void *privdata, const void *key1,
1146 const void *key2)
1147{
9d65a1bb 1148 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1149 int cmp;
942a3961 1150
2a1198b4 1151 if (o1->encoding == REDIS_ENCODING_INT &&
dc05abde 1152 o2->encoding == REDIS_ENCODING_INT)
1153 return o1->ptr == o2->ptr;
2a1198b4 1154
9d65a1bb 1155 o1 = getDecodedObject(o1);
1156 o2 = getDecodedObject(o2);
1157 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1158 decrRefCount(o1);
1159 decrRefCount(o2);
1160 return cmp;
942a3961 1161}
1162
1163static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1164 robj *o = (robj*) key;
942a3961 1165
ed9e4966 1166 if (o->encoding == REDIS_ENCODING_RAW) {
1167 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1168 } else {
1169 if (o->encoding == REDIS_ENCODING_INT) {
1170 char buf[32];
1171 int len;
1172
ee14da56 1173 len = ll2string(buf,32,(long)o->ptr);
ed9e4966 1174 return dictGenHashFunction((unsigned char*)buf, len);
1175 } else {
1176 unsigned int hash;
1177
1178 o = getDecodedObject(o);
1179 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1180 decrRefCount(o);
1181 return hash;
1182 }
1183 }
942a3961 1184}
1185
f2d9f50f 1186/* Set type dictionary: keys are hashed and compared with the
 * encoding-aware helpers and released with dictRedisObjectDestructor;
 * no value destructor is installed.
 * NOTE(review): the previous wording was "Sets type and expires", but the
 * "Db->expires" comment in this file sits on keyptrDictType; confirm that
 * expires no longer uses this type. */
ed9b544e 1187static dictType setDictType = {
942a3961 1188 dictEncObjHash, /* hash function */
ed9b544e 1189 NULL, /* key dup */
1190 NULL, /* val dup */
942a3961 1191 dictEncObjKeyCompare, /* key compare */
ed9b544e 1192 dictRedisObjectDestructor, /* key destructor */
1193 NULL /* val destructor */
1194};
1195
f2d9f50f 1196/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys use the encoding-aware hash/compare helpers; values are freed with
 * dictVanillaFree, i.e. zfree(). */
1812e024 1197static dictType zsetDictType = {
1198 dictEncObjHash, /* hash function */
1199 NULL, /* key dup */
1200 NULL, /* val dup */
1201 dictEncObjKeyCompare, /* key compare */
1202 dictRedisObjectDestructor, /* key destructor */
da0a1620 1203 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1204};
1205
f2d9f50f 1206/* Db->dict: keys are hashed and compared through their sds string
 * (dictObjHash / dictObjKeyCompare); both keys and values are released with
 * dictRedisObjectDestructor, which tolerates NULL values. */
5234952b 1207static dictType dbDictType = {
942a3961 1208 dictObjHash, /* hash function */
ed9b544e 1209 NULL, /* key dup */
1210 NULL, /* val dup */
942a3961 1211 dictObjKeyCompare, /* key compare */
ed9b544e 1212 dictRedisObjectDestructor, /* key destructor */
1213 dictRedisObjectDestructor /* val destructor */
1214};
1215
f2d9f50f 1216/* Db->expires: same key handling as Db->dict, but no value destructor is
 * installed (serverCron reads the entry value back with a cast to time_t,
 * so the value is not a heap object). */
1217static dictType keyptrDictType = {
1218 dictObjHash, /* hash function */
1219 NULL, /* key dup */
1220 NULL, /* val dup */
1221 dictObjKeyCompare, /* key compare */
1222 dictRedisObjectDestructor, /* key destructor */
1223 NULL /* val destructor */
1224};
1225
5234952b 1226/* Hash type hash table (note that small hashes are represented with zipmaps).
 * Keys use the encoding-aware hash/compare helpers; keys and values are both
 * released with dictRedisObjectDestructor. */
1227static dictType hashDictType = {
1228 dictEncObjHash, /* hash function */
1229 NULL, /* key dup */
1230 NULL, /* val dup */
1231 dictEncObjKeyCompare, /* key compare */
1232 dictRedisObjectDestructor, /* key destructor */
1233 dictRedisObjectDestructor /* val destructor */
1234};
1235
4409877e 1236/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1237 * lists as values. It's used for blocking operations (BLPOP) and to
1238 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * The list stored as value is released with listRelease() through
 * dictListDestructor. */
4409877e 1239static dictType keylistDictType = {
1240 dictObjHash, /* hash function */
1241 NULL, /* key dup */
1242 NULL, /* val dup */
1243 dictObjKeyCompare, /* key compare */
1244 dictRedisObjectDestructor, /* key destructor */
1245 dictListDestructor /* val destructor */
1246};
1247
42ab0172
AO
1248static void version();
1249
ed9b544e 1250/* ========================= Random utility functions ======================= */
1251
1252/* Redis generally does not try to recover from out of memory conditions
1253 * when allocating objects or strings, it is not clear if it will be possible
1254 * to report this condition to the client since the networking layer itself
1255 * is based on heap allocation for send buffers, so we simply abort.
1256 * At least the code will be simpler to read... */
1257static void oom(const char *msg) {
71c54b21 1258 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1259 sleep(1);
1260 abort();
1261}
1262
1263/* ====================== Redis server networking stuff ===================== */
56906eef 1264static void closeTimedoutClients(void) {
ed9b544e 1265 redisClient *c;
ed9b544e 1266 listNode *ln;
1267 time_t now = time(NULL);
c7df85a4 1268 listIter li;
ed9b544e 1269
c7df85a4 1270 listRewind(server.clients,&li);
1271 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1272 c = listNodeValue(ln);
f86a74e9 1273 if (server.maxidletime &&
1274 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1275 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1276 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1277 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1278 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1279 {
f870935d 1280 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1281 freeClient(c);
f86a74e9 1282 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1283 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1284 addReply(c,shared.nullmultibulk);
b0d8747d 1285 unblockClientWaitingData(c);
f86a74e9 1286 }
ed9b544e 1287 }
1288 }
ed9b544e 1289}
1290
12fea928 1291static int htNeedsResize(dict *dict) {
1292 long long size, used;
1293
1294 size = dictSlots(dict);
1295 used = dictSize(dict);
1296 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1297 (used*100/size < REDIS_HT_MINFILL));
1298}
1299
0bc03378 1300/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1301 * we resize the hash table to save memory */
56906eef 1302static void tryResizeHashTables(void) {
0bc03378 1303 int j;
1304
1305 for (j = 0; j < server.dbnum; j++) {
5413c40d 1306 if (htNeedsResize(server.db[j].dict))
0bc03378 1307 dictResize(server.db[j].dict);
12fea928 1308 if (htNeedsResize(server.db[j].expires))
1309 dictResize(server.db[j].expires);
0bc03378 1310 }
1311}
1312
8ca3e9d1 1313/* Our hash table implementation performs rehashing incrementally while
1314 * we write/read from the hash table. Still if the server is idle, the hash
1315 * table will use two tables for a long time. So we try to use 1 millisecond
1316 * of CPU time at every serverCron() loop in order to rehash some key. */
1317static void incrementallyRehash(void) {
1318 int j;
1319
1320 for (j = 0; j < server.dbnum; j++) {
1321 if (dictIsRehashing(server.db[j].dict)) {
1322 dictRehashMilliseconds(server.db[j].dict,1);
1323 break; /* already used our millisecond for this loop... */
1324 }
1325 }
1326}
1327
9d65a1bb 1328/* A background saving child (BGSAVE) terminated its work. Handle this. */
1329void backgroundSaveDoneHandler(int statloc) {
1330 int exitcode = WEXITSTATUS(statloc);
1331 int bysignal = WIFSIGNALED(statloc);
1332
1333 if (!bysignal && exitcode == 0) {
1334 redisLog(REDIS_NOTICE,
1335 "Background saving terminated with success");
1336 server.dirty = 0;
1337 server.lastsave = time(NULL);
1338 } else if (!bysignal && exitcode != 0) {
1339 redisLog(REDIS_WARNING, "Background saving error");
1340 } else {
1341 redisLog(REDIS_WARNING,
454eea7c 1342 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1343 rdbRemoveTempFile(server.bgsavechildpid);
1344 }
1345 server.bgsavechildpid = -1;
1346 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1347 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1348 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1349}
1350
1351/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1352 * Handle this. */
1353void backgroundRewriteDoneHandler(int statloc) {
1354 int exitcode = WEXITSTATUS(statloc);
1355 int bysignal = WIFSIGNALED(statloc);
1356
1357 if (!bysignal && exitcode == 0) {
1358 int fd;
1359 char tmpfile[256];
1360
1361 redisLog(REDIS_NOTICE,
1362 "Background append only file rewriting terminated with success");
1363 /* Now it's time to flush the differences accumulated by the parent */
1364 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1365 fd = open(tmpfile,O_WRONLY|O_APPEND);
1366 if (fd == -1) {
1367 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1368 goto cleanup;
1369 }
1370 /* Flush our data... */
1371 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1372 (signed) sdslen(server.bgrewritebuf)) {
1373 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1374 close(fd);
1375 goto cleanup;
1376 }
b32627cd 1377 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1378 /* Now our work is to rename the temp file into the stable file. And
1379 * switch the file descriptor used by the server for append only. */
1380 if (rename(tmpfile,server.appendfilename) == -1) {
1381 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1382 close(fd);
1383 goto cleanup;
1384 }
1385 /* Mission completed... almost */
1386 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1387 if (server.appendfd != -1) {
1388 /* If append only is actually enabled... */
1389 close(server.appendfd);
1390 server.appendfd = fd;
1391 fsync(fd);
85a83172 1392 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1393 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1394 } else {
1395 /* If append only is disabled we just generate a dump in this
1396 * format. Why not? */
1397 close(fd);
1398 }
1399 } else if (!bysignal && exitcode != 0) {
1400 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1401 } else {
1402 redisLog(REDIS_WARNING,
454eea7c 1403 "Background append only file rewriting terminated by signal %d",
1404 WTERMSIG(statloc));
9d65a1bb 1405 }
1406cleanup:
1407 sdsfree(server.bgrewritebuf);
1408 server.bgrewritebuf = sdsempty();
1409 aofRemoveTempFile(server.bgrewritechildpid);
1410 server.bgrewritechildpid = -1;
1411}
1412
884d4b39 1413/* This function is called once a background process of some kind terminates,
1414 * as we want to avoid resizing the hash tables when there is a child in order
1415 * to play well with copy-on-write (otherwise when a resize happens lots of
1416 * memory pages are copied). The goal of this function is to update the ability
1417 * for dict.c to resize the hash tables accordingly to the fact we have o not
1418 * running childs. */
1419static void updateDictResizePolicy(void) {
1420 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1421 dictEnableResize();
1422 else
1423 dictDisableResize();
1424}
1425
56906eef 1426static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1427 int j, loops = server.cronloops++;
ed9b544e 1428 REDIS_NOTUSED(eventLoop);
1429 REDIS_NOTUSED(id);
1430 REDIS_NOTUSED(clientData);
1431
3a66edc7 1432 /* We take a cached value of the unix time in the global state because
1433 * with virtual memory and aging there is to store the current time
1434 * in objects at every object access, and accuracy is not needed.
1435 * To access a global var is faster than calling time(NULL) */
1436 server.unixtime = time(NULL);
1437
fab43727 1438 /* We received a SIGTERM, shutting down here in a safe way, as it is
1439 * not ok doing so inside the signal handler. */
1440 if (server.shutdown_asap) {
1441 if (prepareForShutdown() == REDIS_OK) exit(0);
1442 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1443 }
1444
0bc03378 1445 /* Show some info about non-empty databases */
ed9b544e 1446 for (j = 0; j < server.dbnum; j++) {
dec423d9 1447 long long size, used, vkeys;
94754ccc 1448
3305306f 1449 size = dictSlots(server.db[j].dict);
1450 used = dictSize(server.db[j].dict);
94754ccc 1451 vkeys = dictSize(server.db[j].expires);
1763929f 1452 if (!(loops % 50) && (used || vkeys)) {
f870935d 1453 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1454 /* dictPrintStats(server.dict); */
ed9b544e 1455 }
ed9b544e 1456 }
1457
0bc03378 1458 /* We don't want to resize the hash tables while a bacground saving
1459 * is in progress: the saving child is created using fork() that is
1460 * implemented with a copy-on-write semantic in most modern systems, so
1461 * if we resize the HT while there is the saving child at work actually
1462 * a lot of memory movements in the parent will cause a lot of pages
1463 * copied. */
8ca3e9d1 1464 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1465 if (!(loops % 10)) tryResizeHashTables();
1466 if (server.activerehashing) incrementallyRehash();
884d4b39 1467 }
0bc03378 1468
ed9b544e 1469 /* Show information about connected clients */
1763929f 1470 if (!(loops % 50)) {
bdcb92f2 1471 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1472 listLength(server.clients)-listLength(server.slaves),
1473 listLength(server.slaves),
bdcb92f2 1474 zmalloc_used_memory());
ed9b544e 1475 }
1476
1477 /* Close connections of timedout clients */
1763929f 1478 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1479 closeTimedoutClients();
1480
9d65a1bb 1481 /* Check if a background saving or AOF rewrite in progress terminated */
1482 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1483 int statloc;
9d65a1bb 1484 pid_t pid;
1485
1486 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1487 if (pid == server.bgsavechildpid) {
1488 backgroundSaveDoneHandler(statloc);
ed9b544e 1489 } else {
9d65a1bb 1490 backgroundRewriteDoneHandler(statloc);
ed9b544e 1491 }
884d4b39 1492 updateDictResizePolicy();
ed9b544e 1493 }
1494 } else {
1495 /* If there is not a background saving in progress check if
1496 * we have to save now */
1497 time_t now = time(NULL);
1498 for (j = 0; j < server.saveparamslen; j++) {
1499 struct saveparam *sp = server.saveparams+j;
1500
1501 if (server.dirty >= sp->changes &&
1502 now-server.lastsave > sp->seconds) {
1503 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1504 sp->changes, sp->seconds);
f78fd11b 1505 rdbSaveBackground(server.dbfilename);
ed9b544e 1506 break;
1507 }
1508 }
1509 }
94754ccc 1510
f2324293 1511 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1512 * will use few CPU cycles if there are few expiring keys, otherwise
1513 * it will get more aggressive to avoid that too much memory is used by
1514 * keys that can be removed from the keyspace. */
94754ccc 1515 for (j = 0; j < server.dbnum; j++) {
f2324293 1516 int expired;
94754ccc 1517 redisDb *db = server.db+j;
94754ccc 1518
f2324293 1519 /* Continue to expire if at the end of the cycle more than 25%
1520 * of the keys were expired. */
1521 do {
4ef8de8a 1522 long num = dictSize(db->expires);
94754ccc 1523 time_t now = time(NULL);
1524
f2324293 1525 expired = 0;
94754ccc 1526 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1527 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1528 while (num--) {
1529 dictEntry *de;
1530 time_t t;
1531
1532 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1533 t = (time_t) dictGetEntryVal(de);
1534 if (now > t) {
1535 deleteKey(db,dictGetEntryKey(de));
f2324293 1536 expired++;
2a6a2ed1 1537 server.stat_expiredkeys++;
94754ccc 1538 }
1539 }
f2324293 1540 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1541 }
1542
4ef8de8a 1543 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1544 * is enbled. Try to free objects from the free list first. */
7e69548d 1545 if (vmCanSwapOut()) {
1546 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1547 server.vm_max_memory)
1548 {
72e9fd40 1549 int retval;
1550
a5819310 1551 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1552 retval = (server.vm_max_threads == 0) ?
1553 vmSwapOneObjectBlocking() :
1554 vmSwapOneObjectThreaded();
1763929f 1555 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1556 zmalloc_used_memory() >
1557 (server.vm_max_memory+server.vm_max_memory/10))
1558 {
1559 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1560 }
72e9fd40 1561 /* Note that when using threade I/O we free just one object,
1562 * because anyway when the I/O thread in charge to swap this
1563 * object out will finish, the handler of completed jobs
1564 * will try to swap more objects if we are still out of memory. */
1565 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1566 }
1567 }
1568
ed9b544e 1569 /* Check if we should connect to a MASTER */
1763929f 1570 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1571 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1572 if (syncWithMaster() == REDIS_OK) {
1573 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
8f63ddca 1574 if (server.appendonly) rewriteAppendOnlyFileBackground();
ed9b544e 1575 }
1576 }
1763929f 1577 return 100;
ed9b544e 1578}
1579
d5d55fc3 1580/* This function gets called every time Redis is entering the
1581 * main loop of the event driven library, that is, before to sleep
1582 * for ready file descriptors. */
1583static void beforeSleep(struct aeEventLoop *eventLoop) {
1584 REDIS_NOTUSED(eventLoop);
1585
28ed1f33 1586 /* Awake clients that got all the swapped keys they requested */
d5d55fc3 1587 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1588 listIter li;
1589 listNode *ln;
1590
1591 listRewind(server.io_ready_clients,&li);
1592 while((ln = listNext(&li))) {
1593 redisClient *c = ln->value;
1594 struct redisCommand *cmd;
1595
1596 /* Resume the client. */
1597 listDelNode(server.io_ready_clients,ln);
1598 c->flags &= (~REDIS_IO_WAIT);
1599 server.vm_blocked_clients--;
1600 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1601 readQueryFromClient, c);
1602 cmd = lookupCommand(c->argv[0]->ptr);
1603 assert(cmd != NULL);
1604 call(c,cmd);
1605 resetClient(c);
1606 /* There may be more data to process in the input buffer. */
1607 if (c->querybuf && sdslen(c->querybuf) > 0)
1608 processInputBuffer(c);
1609 }
1610 }
28ed1f33 1611 /* Write the AOF buffer on disk */
1612 flushAppendOnlyFile();
d5d55fc3 1613}
1614
ed9b544e 1615static void createSharedObjects(void) {
05df7621 1616 int j;
1617
ed9b544e 1618 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1619 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1620 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1621 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1622 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1623 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1624 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1625 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1626 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1627 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1628 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1629 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1630 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1631 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1632 "-ERR no such key\r\n"));
ed9b544e 1633 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1634 "-ERR syntax error\r\n"));
c937aa89 1635 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1636 "-ERR source and destination objects are the same\r\n"));
1637 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1638 "-ERR index out of range\r\n"));
ed9b544e 1639 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1640 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1641 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1642 shared.select0 = createStringObject("select 0\r\n",10);
1643 shared.select1 = createStringObject("select 1\r\n",10);
1644 shared.select2 = createStringObject("select 2\r\n",10);
1645 shared.select3 = createStringObject("select 3\r\n",10);
1646 shared.select4 = createStringObject("select 4\r\n",10);
1647 shared.select5 = createStringObject("select 5\r\n",10);
1648 shared.select6 = createStringObject("select 6\r\n",10);
1649 shared.select7 = createStringObject("select 7\r\n",10);
1650 shared.select8 = createStringObject("select 8\r\n",10);
1651 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1652 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
c8d0ea0e 1653 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
befec3cd 1654 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1655 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1656 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1657 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1658 shared.mbulk3 = createStringObject("*3\r\n",4);
c8d0ea0e 1659 shared.mbulk4 = createStringObject("*4\r\n",4);
05df7621 1660 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1661 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1662 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1663 }
ed9b544e 1664}
1665
1666static void appendServerSaveParams(time_t seconds, int changes) {
1667 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1668 server.saveparams[server.saveparamslen].seconds = seconds;
1669 server.saveparams[server.saveparamslen].changes = changes;
1670 server.saveparamslen++;
1671}
1672
bcfc686d 1673static void resetServerSaveParams() {
ed9b544e 1674 zfree(server.saveparams);
1675 server.saveparams = NULL;
1676 server.saveparamslen = 0;
1677}
1678
1679static void initServerConfig() {
1680 server.dbnum = REDIS_DEFAULT_DBNUM;
1681 server.port = REDIS_SERVERPORT;
f870935d 1682 server.verbosity = REDIS_VERBOSE;
ed9b544e 1683 server.maxidletime = REDIS_MAXIDLETIME;
1684 server.saveparams = NULL;
1685 server.logfile = NULL; /* NULL = log on standard output */
1686 server.bindaddr = NULL;
1687 server.glueoutputbuf = 1;
1688 server.daemonize = 0;
44b38ef4 1689 server.appendonly = 0;
1b677732 1690 server.appendfsync = APPENDFSYNC_EVERYSEC;
48f0308a 1691 server.lastfsync = time(NULL);
44b38ef4 1692 server.appendfd = -1;
1693 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1694 server.pidfile = zstrdup("/var/run/redis.pid");
1695 server.dbfilename = zstrdup("dump.rdb");
1696 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1697 server.requirepass = NULL;
b0553789 1698 server.rdbcompression = 1;
8ca3e9d1 1699 server.activerehashing = 1;
285add55 1700 server.maxclients = 0;
d5d55fc3 1701 server.blpop_blocked_clients = 0;
3fd78bcd 1702 server.maxmemory = 0;
75680a3c 1703 server.vm_enabled = 0;
054e426d 1704 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1705 server.vm_page_size = 256; /* 256 bytes per page */
1706 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1707 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1708 server.vm_max_threads = 4;
d5d55fc3 1709 server.vm_blocked_clients = 0;
cbba7dd7 1710 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1711 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
fab43727 1712 server.shutdown_asap = 0;
75680a3c 1713
bcfc686d 1714 resetServerSaveParams();
ed9b544e 1715
1716 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1717 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1718 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1719 /* Replication related */
1720 server.isslave = 0;
d0ccebcf 1721 server.masterauth = NULL;
ed9b544e 1722 server.masterhost = NULL;
1723 server.masterport = 6379;
1724 server.master = NULL;
1725 server.replstate = REDIS_REPL_NONE;
a7866db6 1726
1727 /* Double constants initialization */
1728 R_Zero = 0.0;
1729 R_PosInf = 1.0/R_Zero;
1730 R_NegInf = -1.0/R_Zero;
1731 R_Nan = R_Zero/R_Zero;
ed9b544e 1732}
1733
1734static void initServer() {
1735 int j;
1736
1737 signal(SIGHUP, SIG_IGN);
1738 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1739 setupSigSegvAction();
ed9b544e 1740
b9bc0eef 1741 server.devnull = fopen("/dev/null","w");
1742 if (server.devnull == NULL) {
1743 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1744 exit(1);
1745 }
ed9b544e 1746 server.clients = listCreate();
1747 server.slaves = listCreate();
87eca727 1748 server.monitors = listCreate();
ed9b544e 1749 server.objfreelist = listCreate();
1750 createSharedObjects();
1751 server.el = aeCreateEventLoop();
3305306f 1752 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1753 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1754 if (server.fd == -1) {
1755 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1756 exit(1);
1757 }
3305306f 1758 for (j = 0; j < server.dbnum; j++) {
5234952b 1759 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1760 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
37ab76c9 1761 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1762 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1763 if (server.vm_enabled)
1764 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1765 server.db[j].id = j;
1766 }
ffc6b7f8 1767 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1768 server.pubsub_patterns = listCreate();
1769 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1770 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1771 server.cronloops = 0;
9f3c422c 1772 server.bgsavechildpid = -1;
9d65a1bb 1773 server.bgrewritechildpid = -1;
1774 server.bgrewritebuf = sdsempty();
28ed1f33 1775 server.aofbuf = sdsempty();
ed9b544e 1776 server.lastsave = time(NULL);
1777 server.dirty = 0;
ed9b544e 1778 server.stat_numcommands = 0;
1779 server.stat_numconnections = 0;
2a6a2ed1 1780 server.stat_expiredkeys = 0;
ed9b544e 1781 server.stat_starttime = time(NULL);
3a66edc7 1782 server.unixtime = time(NULL);
d8f8b666 1783 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1784 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1785 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1786
1787 if (server.appendonly) {
3bb225d6 1788 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1789 if (server.appendfd == -1) {
1790 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1791 strerror(errno));
1792 exit(1);
1793 }
1794 }
75680a3c 1795
1796 if (server.vm_enabled) vmInit();
ed9b544e 1797}
1798
1799/* Empty the whole database */
ca37e9cd 1800static long long emptyDb() {
ed9b544e 1801 int j;
ca37e9cd 1802 long long removed = 0;
ed9b544e 1803
3305306f 1804 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1805 removed += dictSize(server.db[j].dict);
3305306f 1806 dictEmpty(server.db[j].dict);
1807 dictEmpty(server.db[j].expires);
1808 }
ca37e9cd 1809 return removed;
ed9b544e 1810}
1811
/* Translate a "yes"/"no" string (compared ignoring case) into 1 or 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    return (strcasecmp(s,"no") == 0) ? 0 : -1;
}
1817
ed9b544e 1818/* I agree, this is a very rudimental way to load a configuration...
1819 will improve later if the config gets more complex */
1820static void loadServerConfig(char *filename) {
c9a111ac 1821 FILE *fp;
ed9b544e 1822 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1823 int linenum = 0;
1824 sds line = NULL;
c9a111ac 1825
1826 if (filename[0] == '-' && filename[1] == '\0')
1827 fp = stdin;
1828 else {
1829 if ((fp = fopen(filename,"r")) == NULL) {
9a22de82 1830 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
c9a111ac 1831 exit(1);
1832 }
ed9b544e 1833 }
c9a111ac 1834
ed9b544e 1835 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1836 sds *argv;
1837 int argc, j;
1838
1839 linenum++;
1840 line = sdsnew(buf);
1841 line = sdstrim(line," \t\r\n");
1842
1843 /* Skip comments and blank lines*/
1844 if (line[0] == '#' || line[0] == '\0') {
1845 sdsfree(line);
1846 continue;
1847 }
1848
1849 /* Split into arguments */
1850 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1851 sdstolower(argv[0]);
1852
1853 /* Execute config directives */
bb0b03a3 1854 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1855 server.maxidletime = atoi(argv[1]);
0150db36 1856 if (server.maxidletime < 0) {
ed9b544e 1857 err = "Invalid timeout value"; goto loaderr;
1858 }
bb0b03a3 1859 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1860 server.port = atoi(argv[1]);
1861 if (server.port < 1 || server.port > 65535) {
1862 err = "Invalid port"; goto loaderr;
1863 }
bb0b03a3 1864 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1865 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1866 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1867 int seconds = atoi(argv[1]);
1868 int changes = atoi(argv[2]);
1869 if (seconds < 1 || changes < 0) {
1870 err = "Invalid save parameters"; goto loaderr;
1871 }
1872 appendServerSaveParams(seconds,changes);
bb0b03a3 1873 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1874 if (chdir(argv[1]) == -1) {
1875 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1876 argv[1], strerror(errno));
1877 exit(1);
1878 }
bb0b03a3 1879 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1880 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1881 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1882 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1883 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1884 else {
1885 err = "Invalid log level. Must be one of debug, notice, warning";
1886 goto loaderr;
1887 }
bb0b03a3 1888 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1889 FILE *logfp;
ed9b544e 1890
1891 server.logfile = zstrdup(argv[1]);
bb0b03a3 1892 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1893 zfree(server.logfile);
1894 server.logfile = NULL;
1895 }
1896 if (server.logfile) {
1897 /* Test if we are able to open the file. The server will not
1898 * be able to abort just for this problem later... */
c9a111ac 1899 logfp = fopen(server.logfile,"a");
1900 if (logfp == NULL) {
ed9b544e 1901 err = sdscatprintf(sdsempty(),
1902 "Can't open the log file: %s", strerror(errno));
1903 goto loaderr;
1904 }
c9a111ac 1905 fclose(logfp);
ed9b544e 1906 }
bb0b03a3 1907 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1908 server.dbnum = atoi(argv[1]);
1909 if (server.dbnum < 1) {
1910 err = "Invalid number of databases"; goto loaderr;
1911 }
b3f83f12
JZ
1912 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1913 loadServerConfig(argv[1]);
285add55 1914 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1915 server.maxclients = atoi(argv[1]);
3fd78bcd 1916 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
2b619329 1917 server.maxmemory = memtoll(argv[1],NULL);
bb0b03a3 1918 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1919 server.masterhost = sdsnew(argv[1]);
1920 server.masterport = atoi(argv[2]);
1921 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1922 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1923 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1924 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1925 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1926 err = "argument must be 'yes' or 'no'"; goto loaderr;
1927 }
121f70cf 1928 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1929 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
8ca3e9d1 1930 err = "argument must be 'yes' or 'no'"; goto loaderr;
1931 }
1932 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1933 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
121f70cf 1934 err = "argument must be 'yes' or 'no'"; goto loaderr;
1935 }
bb0b03a3 1936 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1937 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1938 err = "argument must be 'yes' or 'no'"; goto loaderr;
1939 }
44b38ef4 1940 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1941 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1942 err = "argument must be 'yes' or 'no'"; goto loaderr;
1943 }
f3b52411
PN
1944 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1945 zfree(server.appendfilename);
1946 server.appendfilename = zstrdup(argv[1]);
48f0308a 1947 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1948 if (!strcasecmp(argv[1],"no")) {
48f0308a 1949 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1950 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1951 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1952 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1953 server.appendfsync = APPENDFSYNC_EVERYSEC;
1954 } else {
1955 err = "argument must be 'no', 'always' or 'everysec'";
1956 goto loaderr;
1957 }
bb0b03a3 1958 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1959 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1960 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 1961 zfree(server.pidfile);
054e426d 1962 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1963 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 1964 zfree(server.dbfilename);
054e426d 1965 server.dbfilename = zstrdup(argv[1]);
75680a3c 1966 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1967 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1968 err = "argument must be 'yes' or 'no'"; goto loaderr;
1969 }
054e426d 1970 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1971 zfree(server.vm_swap_file);
054e426d 1972 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1973 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2b619329 1974 server.vm_max_memory = memtoll(argv[1],NULL);
4ef8de8a 1975 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2b619329 1976 server.vm_page_size = memtoll(argv[1], NULL);
4ef8de8a 1977 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2b619329 1978 server.vm_pages = memtoll(argv[1], NULL);
92f8e882 1979 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1980 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 1981 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2b619329 1982 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
cbba7dd7 1983 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2b619329 1984 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
ed9b544e 1985 } else {
1986 err = "Bad directive or wrong number of arguments"; goto loaderr;
1987 }
1988 for (j = 0; j < argc; j++)
1989 sdsfree(argv[j]);
1990 zfree(argv);
1991 sdsfree(line);
1992 }
c9a111ac 1993 if (fp != stdin) fclose(fp);
ed9b544e 1994 return;
1995
1996loaderr:
1997 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1998 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1999 fprintf(stderr, ">>> '%s'\n", line);
2000 fprintf(stderr, "%s\n", err);
2001 exit(1);
2002}
2003
2004static void freeClientArgv(redisClient *c) {
2005 int j;
2006
2007 for (j = 0; j < c->argc; j++)
2008 decrRefCount(c->argv[j]);
e8a74421 2009 for (j = 0; j < c->mbargc; j++)
2010 decrRefCount(c->mbargv[j]);
ed9b544e 2011 c->argc = 0;
e8a74421 2012 c->mbargc = 0;
ed9b544e 2013}
2014
2015static void freeClient(redisClient *c) {
2016 listNode *ln;
2017
4409877e 2018 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 2019 * call, we have to set querybuf to NULL *before* to call
2020 * unblockClientWaitingData() to avoid processInputBuffer() will get
2021 * called. Also it is important to remove the file events after
2022 * this, because this call adds the READABLE event. */
4409877e 2023 sdsfree(c->querybuf);
2024 c->querybuf = NULL;
2025 if (c->flags & REDIS_BLOCKED)
b0d8747d 2026 unblockClientWaitingData(c);
4409877e 2027
37ab76c9 2028 /* UNWATCH all the keys */
2029 unwatchAllKeys(c);
2030 listRelease(c->watched_keys);
ffc6b7f8 2031 /* Unsubscribe from all the pubsub channels */
2032 pubsubUnsubscribeAllChannels(c,0);
2033 pubsubUnsubscribeAllPatterns(c,0);
2034 dictRelease(c->pubsub_channels);
2035 listRelease(c->pubsub_patterns);
befec3cd 2036 /* Obvious cleanup */
ed9b544e 2037 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2038 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 2039 listRelease(c->reply);
2040 freeClientArgv(c);
2041 close(c->fd);
92f8e882 2042 /* Remove from the list of clients */
ed9b544e 2043 ln = listSearchKey(server.clients,c);
dfc5e96c 2044 redisAssert(ln != NULL);
ed9b544e 2045 listDelNode(server.clients,ln);
37ab76c9 2046 /* Remove from the list of clients that are now ready to be restarted
2047 * after waiting for swapped keys */
d5d55fc3 2048 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2049 ln = listSearchKey(server.io_ready_clients,c);
2050 if (ln) {
2051 listDelNode(server.io_ready_clients,ln);
2052 server.vm_blocked_clients--;
2053 }
2054 }
37ab76c9 2055 /* Remove from the list of clients waiting for swapped keys */
d5d55fc3 2056 while (server.vm_enabled && listLength(c->io_keys)) {
2057 ln = listFirst(c->io_keys);
2058 dontWaitForSwappedKey(c,ln->value);
92f8e882 2059 }
b3e3d0d7 2060 listRelease(c->io_keys);
befec3cd 2061 /* Master/slave cleanup */
ed9b544e 2062 if (c->flags & REDIS_SLAVE) {
6208b3a7 2063 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2064 close(c->repldbfd);
87eca727 2065 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2066 ln = listSearchKey(l,c);
dfc5e96c 2067 redisAssert(ln != NULL);
87eca727 2068 listDelNode(l,ln);
ed9b544e 2069 }
2070 if (c->flags & REDIS_MASTER) {
2071 server.master = NULL;
2072 server.replstate = REDIS_REPL_CONNECT;
2073 }
befec3cd 2074 /* Release memory */
93ea3759 2075 zfree(c->argv);
e8a74421 2076 zfree(c->mbargv);
6e469882 2077 freeClientMultiState(c);
ed9b544e 2078 zfree(c);
2079}
2080
cc30e368 2081#define GLUEREPLY_UP_TO (1024)
ed9b544e 2082static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 2083 int copylen = 0;
2084 char buf[GLUEREPLY_UP_TO];
6208b3a7 2085 listNode *ln;
c7df85a4 2086 listIter li;
ed9b544e 2087 robj *o;
2088
c7df85a4 2089 listRewind(c->reply,&li);
2090 while((ln = listNext(&li))) {
c28b42ac 2091 int objlen;
2092
ed9b544e 2093 o = ln->value;
c28b42ac 2094 objlen = sdslen(o->ptr);
2095 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2096 memcpy(buf+copylen,o->ptr,objlen);
2097 copylen += objlen;
ed9b544e 2098 listDelNode(c->reply,ln);
c28b42ac 2099 } else {
2100 if (copylen == 0) return;
2101 break;
ed9b544e 2102 }
ed9b544e 2103 }
c28b42ac 2104 /* Now the output buffer is empty, add the new single element */
2105 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2106 listAddNodeHead(c->reply,o);
ed9b544e 2107}
2108
2109static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2110 redisClient *c = privdata;
2111 int nwritten = 0, totwritten = 0, objlen;
2112 robj *o;
2113 REDIS_NOTUSED(el);
2114 REDIS_NOTUSED(mask);
2115
2895e862 2116 /* Use writev() if we have enough buffers to send */
7ea870c0 2117 if (!server.glueoutputbuf &&
e0a62c7f 2118 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 2119 !(c->flags & REDIS_MASTER))
2895e862 2120 {
2121 sendReplyToClientWritev(el, fd, privdata, mask);
2122 return;
2123 }
2895e862 2124
ed9b544e 2125 while(listLength(c->reply)) {
c28b42ac 2126 if (server.glueoutputbuf && listLength(c->reply) > 1)
2127 glueReplyBuffersIfNeeded(c);
2128
ed9b544e 2129 o = listNodeValue(listFirst(c->reply));
2130 objlen = sdslen(o->ptr);
2131
2132 if (objlen == 0) {
2133 listDelNode(c->reply,listFirst(c->reply));
2134 continue;
2135 }
2136
2137 if (c->flags & REDIS_MASTER) {
6f376729 2138 /* Don't reply to a master */
ed9b544e 2139 nwritten = objlen - c->sentlen;
2140 } else {
a4d1ba9a 2141 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2142 if (nwritten <= 0) break;
2143 }
2144 c->sentlen += nwritten;
2145 totwritten += nwritten;
2146 /* If we fully sent the object on head go to the next one */
2147 if (c->sentlen == objlen) {
2148 listDelNode(c->reply,listFirst(c->reply));
2149 c->sentlen = 0;
2150 }
6f376729 2151 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2152 * bytes, in a single threaded server it's a good idea to serve
6f376729 2153 * other clients as well, even if a very large request comes from
2154 * super fast link that is always able to accept data (in real world
12f9d551 2155 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2156 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2157 }
2158 if (nwritten == -1) {
2159 if (errno == EAGAIN) {
2160 nwritten = 0;
2161 } else {
f870935d 2162 redisLog(REDIS_VERBOSE,
ed9b544e 2163 "Error writing to client: %s", strerror(errno));
2164 freeClient(c);
2165 return;
2166 }
2167 }
2168 if (totwritten > 0) c->lastinteraction = time(NULL);
2169 if (listLength(c->reply) == 0) {
2170 c->sentlen = 0;
2171 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2172 }
2173}
2174
2895e862 2175static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2176{
2177 redisClient *c = privdata;
2178 int nwritten = 0, totwritten = 0, objlen, willwrite;
2179 robj *o;
2180 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2181 int offset, ion = 0;
2182 REDIS_NOTUSED(el);
2183 REDIS_NOTUSED(mask);
2184
2185 listNode *node;
2186 while (listLength(c->reply)) {
2187 offset = c->sentlen;
2188 ion = 0;
2189 willwrite = 0;
2190
2191 /* fill-in the iov[] array */
2192 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2193 o = listNodeValue(node);
2194 objlen = sdslen(o->ptr);
2195
e0a62c7f 2196 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2197 break;
2198
2199 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2200 break; /* no more iovecs */
2201
2202 iov[ion].iov_base = ((char*)o->ptr) + offset;
2203 iov[ion].iov_len = objlen - offset;
2204 willwrite += objlen - offset;
2205 offset = 0; /* just for the first item */
2206 ion++;
2207 }
2208
2209 if(willwrite == 0)
2210 break;
2211
2212 /* write all collected blocks at once */
2213 if((nwritten = writev(fd, iov, ion)) < 0) {
2214 if (errno != EAGAIN) {
f870935d 2215 redisLog(REDIS_VERBOSE,
2895e862 2216 "Error writing to client: %s", strerror(errno));
2217 freeClient(c);
2218 return;
2219 }
2220 break;
2221 }
2222
2223 totwritten += nwritten;
2224 offset = c->sentlen;
2225
2226 /* remove written robjs from c->reply */
2227 while (nwritten && listLength(c->reply)) {
2228 o = listNodeValue(listFirst(c->reply));
2229 objlen = sdslen(o->ptr);
2230
2231 if(nwritten >= objlen - offset) {
2232 listDelNode(c->reply, listFirst(c->reply));
2233 nwritten -= objlen - offset;
2234 c->sentlen = 0;
2235 } else {
2236 /* partial write */
2237 c->sentlen += nwritten;
2238 break;
2239 }
2240 offset = 0;
2241 }
2242 }
2243
e0a62c7f 2244 if (totwritten > 0)
2895e862 2245 c->lastinteraction = time(NULL);
2246
2247 if (listLength(c->reply) == 0) {
2248 c->sentlen = 0;
2249 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2250 }
2251}
2252
1a132bbc
PN
2253static int qsortRedisCommands(const void *r1, const void *r2) {
2254 return strcasecmp(
2255 ((struct redisCommand*)r1)->name,
2256 ((struct redisCommand*)r2)->name);
2257}
2258
2259static void sortCommandTable() {
1a132bbc
PN
2260 /* Copy and sort the read-only version of the command table */
2261 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2262 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
d55d5c5d 2263 qsort(commandTable,
2264 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2265 sizeof(struct redisCommand),qsortRedisCommands);
1a132bbc
PN
2266}
2267
ed9b544e 2268static struct redisCommand *lookupCommand(char *name) {
1a132bbc
PN
2269 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2270 return bsearch(
2271 &tmp,
2272 commandTable,
d55d5c5d 2273 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
1a132bbc
PN
2274 sizeof(struct redisCommand),
2275 qsortRedisCommands);
ed9b544e 2276}
2277
2278/* resetClient prepare the client to process the next command */
2279static void resetClient(redisClient *c) {
2280 freeClientArgv(c);
2281 c->bulklen = -1;
e8a74421 2282 c->multibulk = 0;
ed9b544e 2283}
2284
6e469882 2285/* Call() is the core of Redis execution of a command */
2286static void call(redisClient *c, struct redisCommand *cmd) {
2287 long long dirty;
2288
2289 dirty = server.dirty;
2290 cmd->proc(c);
4005fef1 2291 dirty = server.dirty-dirty;
2292
2293 if (server.appendonly && dirty)
6e469882 2294 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2295 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2296 listLength(server.slaves))
248ea310 2297 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2298 if (listLength(server.monitors))
dd142b9c 2299 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2300 server.stat_numcommands++;
2301}
2302
ed9b544e 2303/* If this function gets called we already read a whole
2304 * command, arguments are in the client argv/argc fields.
2305 * processCommand() executes the command or prepares the
2306 * server for a bulk read from the client.
2307 *
2308 * If 1 is returned the client is still alive and valid and
2309 * other operations can be performed by the caller. Otherwise
2310 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2311static int processCommand(redisClient *c) {
2312 struct redisCommand *cmd;
ed9b544e 2313
3fd78bcd 2314 /* Free some memory if needed (maxmemory setting) */
2315 if (server.maxmemory) freeMemoryIfNeeded();
2316
e8a74421 2317 /* Handle the multi bulk command type. This is an alternative protocol
2318 * supported by Redis in order to receive commands that are composed of
2319 * multiple binary-safe "bulk" arguments. The latency of processing is
2320 * a bit higher but this allows things like multi-sets, so if this
2321 * protocol is used only for MSET and similar commands this is a big win. */
2322 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2323 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2324 if (c->multibulk <= 0) {
2325 resetClient(c);
2326 return 1;
2327 } else {
2328 decrRefCount(c->argv[c->argc-1]);
2329 c->argc--;
2330 return 1;
2331 }
2332 } else if (c->multibulk) {
2333 if (c->bulklen == -1) {
2334 if (((char*)c->argv[0]->ptr)[0] != '$') {
2335 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2336 resetClient(c);
2337 return 1;
2338 } else {
2339 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2340 decrRefCount(c->argv[0]);
2341 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2342 c->argc--;
2343 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2344 resetClient(c);
2345 return 1;
2346 }
2347 c->argc--;
2348 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2349 return 1;
2350 }
2351 } else {
2352 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2353 c->mbargv[c->mbargc] = c->argv[0];
2354 c->mbargc++;
2355 c->argc--;
2356 c->multibulk--;
2357 if (c->multibulk == 0) {
2358 robj **auxargv;
2359 int auxargc;
2360
2361 /* Here we need to swap the multi-bulk argc/argv with the
2362 * normal argc/argv of the client structure. */
2363 auxargv = c->argv;
2364 c->argv = c->mbargv;
2365 c->mbargv = auxargv;
2366
2367 auxargc = c->argc;
2368 c->argc = c->mbargc;
2369 c->mbargc = auxargc;
2370
2371 /* We need to set bulklen to something different than -1
2372 * in order for the code below to process the command without
2373 * to try to read the last argument of a bulk command as
2374 * a special argument. */
2375 c->bulklen = 0;
2376 /* continue below and process the command */
2377 } else {
2378 c->bulklen = -1;
2379 return 1;
2380 }
2381 }
2382 }
2383 /* -- end of multi bulk commands processing -- */
2384
ed9b544e 2385 /* The QUIT command is handled as a special case. Normal command
2386 * procs are unable to close the client connection safely */
bb0b03a3 2387 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2388 freeClient(c);
2389 return 0;
2390 }
d5d55fc3 2391
2392 /* Now lookup the command and check ASAP about trivial error conditions
2393 * such wrong arity, bad command name and so forth. */
ed9b544e 2394 cmd = lookupCommand(c->argv[0]->ptr);
2395 if (!cmd) {
2c14807b 2396 addReplySds(c,
2397 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2398 (char*)c->argv[0]->ptr));
ed9b544e 2399 resetClient(c);
2400 return 1;
2401 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2402 (c->argc < -cmd->arity)) {
454d4e43 2403 addReplySds(c,
2404 sdscatprintf(sdsempty(),
2405 "-ERR wrong number of arguments for '%s' command\r\n",
2406 cmd->name));
ed9b544e 2407 resetClient(c);
2408 return 1;
2409 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2410 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2411 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2412
2413 decrRefCount(c->argv[c->argc-1]);
2414 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2415 c->argc--;
2416 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2417 resetClient(c);
2418 return 1;
2419 }
2420 c->argc--;
2421 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2422 /* It is possible that the bulk read is already in the
8d0490e7 2423 * buffer. Check this condition and handle it accordingly.
2424 * This is just a fast path, alternative to call processInputBuffer().
2425 * It's a good idea since the code is small and this condition
2426 * happens most of the times. */
ed9b544e 2427 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2428 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2429 c->argc++;
2430 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2431 } else {
d5d55fc3 2432 /* Otherwise return... there is to read the last argument
2433 * from the socket. */
ed9b544e 2434 return 1;
2435 }
2436 }
942a3961 2437 /* Let's try to encode the bulk object to save space. */
2438 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2439 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2440
e63943a4 2441 /* Check if the user is authenticated */
2442 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2443 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2444 resetClient(c);
2445 return 1;
2446 }
2447
b61a28fe 2448 /* Handle the maxmemory directive */
2449 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2450 zmalloc_used_memory() > server.maxmemory)
2451 {
2452 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2453 resetClient(c);
2454 return 1;
2455 }
2456
d6cc8867 2457 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
e6cca5db 2458 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2459 &&
ffc6b7f8 2460 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2461 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2462 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2463 resetClient(c);
2464 return 1;
2465 }
2466
ed9b544e 2467 /* Exec the command */
6531c94d 2468 if (c->flags & REDIS_MULTI &&
2469 cmd->proc != execCommand && cmd->proc != discardCommand &&
2470 cmd->proc != multiCommand && cmd->proc != watchCommand)
2471 {
6e469882 2472 queueMultiCommand(c,cmd);
2473 addReply(c,shared.queued);
2474 } else {
d5d55fc3 2475 if (server.vm_enabled && server.vm_max_threads > 0 &&
0a6f3f0f 2476 blockClientOnSwappedKeys(c,cmd)) return 1;
6e469882 2477 call(c,cmd);
2478 }
ed9b544e 2479
2480 /* Prepare the client for the next command */
ed9b544e 2481 resetClient(c);
2482 return 1;
2483}
2484
248ea310 2485static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2486 listNode *ln;
c7df85a4 2487 listIter li;
ed9b544e 2488 int outc = 0, j;
93ea3759 2489 robj **outv;
248ea310 2490 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2491 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2492 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2493 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2494 robj *lenobj;
93ea3759 2495
2496 if (argc <= REDIS_STATIC_ARGS) {
2497 outv = static_outv;
2498 } else {
248ea310 2499 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2500 }
248ea310 2501
2502 lenobj = createObject(REDIS_STRING,
2503 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2504 lenobj->refcount = 0;
2505 outv[outc++] = lenobj;
ed9b544e 2506 for (j = 0; j < argc; j++) {
248ea310 2507 lenobj = createObject(REDIS_STRING,
2508 sdscatprintf(sdsempty(),"$%lu\r\n",
2509 (unsigned long) stringObjectLen(argv[j])));
2510 lenobj->refcount = 0;
2511 outv[outc++] = lenobj;
ed9b544e 2512 outv[outc++] = argv[j];
248ea310 2513 outv[outc++] = shared.crlf;
ed9b544e 2514 }
ed9b544e 2515
40d224a9 2516 /* Increment all the refcounts at start and decrement at end in order to
2517 * be sure to free objects if there is no slave in a replication state
2518 * able to be feed with commands */
2519 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2520 listRewind(slaves,&li);
2521 while((ln = listNext(&li))) {
ed9b544e 2522 redisClient *slave = ln->value;
40d224a9 2523
2524 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2525 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2526
2527 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2528 if (slave->slaveseldb != dictid) {
2529 robj *selectcmd;
2530
2531 switch(dictid) {
2532 case 0: selectcmd = shared.select0; break;
2533 case 1: selectcmd = shared.select1; break;
2534 case 2: selectcmd = shared.select2; break;
2535 case 3: selectcmd = shared.select3; break;
2536 case 4: selectcmd = shared.select4; break;
2537 case 5: selectcmd = shared.select5; break;
2538 case 6: selectcmd = shared.select6; break;
2539 case 7: selectcmd = shared.select7; break;
2540 case 8: selectcmd = shared.select8; break;
2541 case 9: selectcmd = shared.select9; break;
2542 default:
2543 selectcmd = createObject(REDIS_STRING,
2544 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2545 selectcmd->refcount = 0;
2546 break;
2547 }
2548 addReply(slave,selectcmd);
2549 slave->slaveseldb = dictid;
2550 }
2551 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2552 }
40d224a9 2553 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2554 if (outv != static_outv) zfree(outv);
ed9b544e 2555}
2556
dd142b9c 2557static sds sdscatrepr(sds s, char *p, size_t len) {
2558 s = sdscatlen(s,"\"",1);
2559 while(len--) {
2560 switch(*p) {
2561 case '\\':
2562 case '"':
2563 s = sdscatprintf(s,"\\%c",*p);
2564 break;
2565 case '\n': s = sdscatlen(s,"\\n",1); break;
2566 case '\r': s = sdscatlen(s,"\\r",1); break;
2567 case '\t': s = sdscatlen(s,"\\t",1); break;
2568 case '\a': s = sdscatlen(s,"\\a",1); break;
2569 case '\b': s = sdscatlen(s,"\\b",1); break;
2570 default:
2571 if (isprint(*p))
2572 s = sdscatprintf(s,"%c",*p);
2573 else
2574 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2575 break;
2576 }
2577 p++;
2578 }
2579 return sdscatlen(s,"\"",1);
2580}
2581
2582static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2583 listNode *ln;
2584 listIter li;
2585 int j;
2586 sds cmdrepr = sdsnew("+");
2587 robj *cmdobj;
2588 struct timeval tv;
2589
2590 gettimeofday(&tv,NULL);
2591 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2592 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2593
2594 for (j = 0; j < argc; j++) {
2595 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2596 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2597 } else {
2598 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2599 sdslen(argv[j]->ptr));
2600 }
2601 if (j != argc-1)
2602 cmdrepr = sdscatlen(cmdrepr," ",1);
2603 }
2604 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2605 cmdobj = createObject(REDIS_STRING,cmdrepr);
2606
2607 listRewind(monitors,&li);
2608 while((ln = listNext(&li))) {
2609 redisClient *monitor = ln->value;
2610 addReply(monitor,cmdobj);
2611 }
2612 decrRefCount(cmdobj);
2613}
2614
638e42ac 2615static void processInputBuffer(redisClient *c) {
ed9b544e 2616again:
4409877e 2617 /* Before to process the input buffer, make sure the client is not
2618 * waitig for a blocking operation such as BLPOP. Note that the first
2619 * iteration the client is never blocked, otherwise the processInputBuffer
2620 * would not be called at all, but after the execution of the first commands
2621 * in the input buffer the client may be blocked, and the "goto again"
2622 * will try to reiterate. The following line will make it return asap. */
92f8e882 2623 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2624 if (c->bulklen == -1) {
2625 /* Read the first line of the query */
2626 char *p = strchr(c->querybuf,'\n');
2627 size_t querylen;
644fafa3 2628
ed9b544e 2629 if (p) {
2630 sds query, *argv;
2631 int argc, j;
e0a62c7f 2632
ed9b544e 2633 query = c->querybuf;
2634 c->querybuf = sdsempty();
2635 querylen = 1+(p-(query));
2636 if (sdslen(query) > querylen) {
2637 /* leave data after the first line of the query in the buffer */
2638 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2639 }
2640 *p = '\0'; /* remove "\n" */
2641 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2642 sdsupdatelen(query);
2643
2644 /* Now we can split the query in arguments */
ed9b544e 2645 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2646 sdsfree(query);
2647
2648 if (c->argv) zfree(c->argv);
2649 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2650
2651 for (j = 0; j < argc; j++) {
ed9b544e 2652 if (sdslen(argv[j])) {
2653 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2654 c->argc++;
2655 } else {
2656 sdsfree(argv[j]);
2657 }
2658 }
2659 zfree(argv);
7c49733c 2660 if (c->argc) {
2661 /* Execute the command. If the client is still valid
2662 * after processCommand() return and there is something
2663 * on the query buffer try to process the next command. */
2664 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2665 } else {
2666 /* Nothing to process, argc == 0. Just process the query
2667 * buffer if it's not empty or return to the caller */
2668 if (sdslen(c->querybuf)) goto again;
2669 }
ed9b544e 2670 return;
644fafa3 2671 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2672 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2673 freeClient(c);
2674 return;
2675 }
2676 } else {
2677 /* Bulk read handling. Note that if we are at this point
2678 the client already sent a command terminated with a newline,
2679 we are reading the bulk data that is actually the last
2680 argument of the command. */
2681 int qbl = sdslen(c->querybuf);
2682
2683 if (c->bulklen <= qbl) {
2684 /* Copy everything but the final CRLF as final argument */
2685 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2686 c->argc++;
2687 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2688 /* Process the command. If the client is still valid after
2689 * the processing and there is more data in the buffer
2690 * try to parse it. */
2691 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2692 return;
2693 }
2694 }
2695}
2696
638e42ac 2697static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2698 redisClient *c = (redisClient*) privdata;
2699 char buf[REDIS_IOBUF_LEN];
2700 int nread;
2701 REDIS_NOTUSED(el);
2702 REDIS_NOTUSED(mask);
2703
2704 nread = read(fd, buf, REDIS_IOBUF_LEN);
2705 if (nread == -1) {
2706 if (errno == EAGAIN) {
2707 nread = 0;
2708 } else {
f870935d 2709 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2710 freeClient(c);
2711 return;
2712 }
2713 } else if (nread == 0) {
f870935d 2714 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2715 freeClient(c);
2716 return;
2717 }
2718 if (nread) {
2719 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2720 c->lastinteraction = time(NULL);
2721 } else {
2722 return;
2723 }
168ac5c6 2724 processInputBuffer(c);
638e42ac 2725}
2726
ed9b544e 2727static int selectDb(redisClient *c, int id) {
2728 if (id < 0 || id >= server.dbnum)
2729 return REDIS_ERR;
3305306f 2730 c->db = &server.db[id];
ed9b544e 2731 return REDIS_OK;
2732}
2733
40d224a9 2734static void *dupClientReplyValue(void *o) {
2735 incrRefCount((robj*)o);
12d090d2 2736 return o;
40d224a9 2737}
2738
/* Match method for lists of string objects: forwards to
 * equalStringObjects() and returns its result. */
static int listMatchObjects(void *a, void *b) {
    int same = equalStringObjects(a,b);

    return same;
}
2742
/* Allocate and initialize the client state for the connected socket 'fd',
 * register the readable event handler for it and append the new client to
 * server.clients.
 *
 * Returns the new client, or NULL if the allocation fails or the file
 * event can't be created. */
static redisClient *createClient(int fd) {
    redisClient *c = zmalloc(sizeof(*c));

    /* The socket options are set before the allocation result is checked. */
    anetNonBlock(NULL,fd);
    anetTcpNoDelay(NULL,fd);
    if (!c) return NULL;
    selectDb(c,0);
    c->fd = fd;
    c->querybuf = sdsempty();
    c->argc = 0;
    c->argv = NULL;
    c->bulklen = -1; /* no bulk read pending */
    c->multibulk = 0;
    c->mbargc = 0;
    c->mbargv = NULL;
    c->sentlen = 0;
    c->flags = 0;
    c->lastinteraction = time(NULL);
    c->authenticated = 0;
    c->replstate = REDIS_REPL_NONE;
    c->reply = listCreate();
    listSetFreeMethod(c->reply,decrRefCount);
    listSetDupMethod(c->reply,dupClientReplyValue);
    c->blocking_keys = NULL;
    c->blocking_keys_num = 0;
    c->io_keys = listCreate();
    c->watched_keys = listCreate();
    listSetFreeMethod(c->io_keys,decrRefCount);
    c->pubsub_channels = dictCreate(&setDictType,NULL);
    c->pubsub_patterns = listCreate();
    listSetFreeMethod(c->pubsub_patterns,decrRefCount);
    listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
    /* NOTE(review): on failure freeClient() runs before the client is
     * added to server.clients and before initClientMultiState(); confirm
     * that freeClient() copes with a client in this state. */
    if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
        readQueryFromClient, c) == AE_ERR) {
        freeClient(c);
        return NULL;
    }
    listAddNodeTail(server.clients,c);
    initClientMultiState(c);
    return c;
}
2784
2785static void addReply(redisClient *c, robj *obj) {
2786 if (listLength(c->reply) == 0 &&
6208b3a7 2787 (c->replstate == REDIS_REPL_NONE ||
2788 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2789 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2790 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2791
2792 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2793 obj = dupStringObject(obj);
2794 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2795 }
9d65a1bb 2796 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2797}
2798
2799static void addReplySds(redisClient *c, sds s) {
2800 robj *o = createObject(REDIS_STRING,s);
2801 addReply(c,o);
2802 decrRefCount(o);
2803}
2804
e2665397 2805static void addReplyDouble(redisClient *c, double d) {
2806 char buf[128];
2807
2808 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2809 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2810 (unsigned long) strlen(buf),buf));
e2665397 2811}
2812
aa7c2934
PN
2813static void addReplyLongLong(redisClient *c, long long ll) {
2814 char buf[128];
2815 size_t len;
2816
2817 if (ll == 0) {
2818 addReply(c,shared.czero);
2819 return;
2820 } else if (ll == 1) {
2821 addReply(c,shared.cone);
2822 return;
2823 }
482b672d 2824 buf[0] = ':';
2825 len = ll2string(buf+1,sizeof(buf)-1,ll);
2826 buf[len+1] = '\r';
2827 buf[len+2] = '\n';
2828 addReplySds(c,sdsnewlen(buf,len+3));
aa7c2934
PN
2829}
2830
92b27fe9 2831static void addReplyUlong(redisClient *c, unsigned long ul) {
2832 char buf[128];
2833 size_t len;
2834
dd88747b 2835 if (ul == 0) {
2836 addReply(c,shared.czero);
2837 return;
2838 } else if (ul == 1) {
2839 addReply(c,shared.cone);
2840 return;
2841 }
92b27fe9 2842 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2843 addReplySds(c,sdsnewlen(buf,len));
2844}
2845
942a3961 2846static void addReplyBulkLen(redisClient *c, robj *obj) {
482b672d 2847 size_t len, intlen;
2848 char buf[128];
942a3961 2849
2850 if (obj->encoding == REDIS_ENCODING_RAW) {
2851 len = sdslen(obj->ptr);
2852 } else {
2853 long n = (long)obj->ptr;
2854
e054afda 2855 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2856 len = 1;
2857 if (n < 0) {
2858 len++;
2859 n = -n;
2860 }
2861 while((n = n/10) != 0) {
2862 len++;
2863 }
2864 }
482b672d 2865 buf[0] = '$';
2866 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2867 buf[intlen+1] = '\r';
2868 buf[intlen+2] = '\n';
2869 addReplySds(c,sdsnewlen(buf,intlen+3));
942a3961 2870}
2871
dd88747b 2872static void addReplyBulk(redisClient *c, robj *obj) {
2873 addReplyBulkLen(c,obj);
2874 addReply(c,obj);
2875 addReply(c,shared.crlf);
2876}
2877
500ece7c 2878/* In the CONFIG command we need to add vanilla C string as bulk replies */
2879static void addReplyBulkCString(redisClient *c, char *s) {
2880 if (s == NULL) {
2881 addReply(c,shared.nullbulk);
2882 } else {
2883 robj *o = createStringObject(s,strlen(s));
2884 addReplyBulk(c,o);
2885 decrRefCount(o);
2886 }
2887}
2888
ed9b544e 2889static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2890 int cport, cfd;
2891 char cip[128];
285add55 2892 redisClient *c;
ed9b544e 2893 REDIS_NOTUSED(el);
2894 REDIS_NOTUSED(mask);
2895 REDIS_NOTUSED(privdata);
2896
2897 cfd = anetAccept(server.neterr, fd, cip, &cport);
2898 if (cfd == AE_ERR) {
f870935d 2899 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2900 return;
2901 }
f870935d 2902 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2903 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2904 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2905 close(cfd); /* May be already closed, just ingore errors */
2906 return;
2907 }
285add55 2908 /* If maxclient directive is set and this is one client more... close the
2909 * connection. Note that we create the client instead to check before
2910 * for this condition, since now the socket is already set in nonblocking
2911 * mode and we can send an error for free using the Kernel I/O */
2912 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2913 char *err = "-ERR max number of clients reached\r\n";
2914
2915 /* That's a best effort error message, don't check write errors */
fee803ba 2916 if (write(c->fd,err,strlen(err)) == -1) {
2917 /* Nothing to do, Just to avoid the warning... */
2918 }
285add55 2919 freeClient(c);
2920 return;
2921 }
ed9b544e 2922 server.stat_numconnections++;
2923}
2924
2925/* ======================= Redis objects implementation ===================== */
2926
2927static robj *createObject(int type, void *ptr) {
2928 robj *o;
2929
a5819310 2930 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2931 if (listLength(server.objfreelist)) {
2932 listNode *head = listFirst(server.objfreelist);
2933 o = listNodeValue(head);
2934 listDelNode(server.objfreelist,head);
a5819310 2935 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2936 } else {
75680a3c 2937 if (server.vm_enabled) {
a5819310 2938 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2939 o = zmalloc(sizeof(*o));
2940 } else {
2941 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2942 }
ed9b544e 2943 }
ed9b544e 2944 o->type = type;
942a3961 2945 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2946 o->ptr = ptr;
2947 o->refcount = 1;
3a66edc7 2948 if (server.vm_enabled) {
1064ef87 2949 /* Note that this code may run in the context of an I/O thread
2950 * and accessing to server.unixtime in theory is an error
2951 * (no locks). But in practice this is safe, and even if we read
2952 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2953 o->vm.atime = server.unixtime;
2954 o->storage = REDIS_VM_MEMORY;
2955 }
ed9b544e 2956 return o;
2957}
2958
2959static robj *createStringObject(char *ptr, size_t len) {
2960 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2961}
2962
3f973463
PN
2963static robj *createStringObjectFromLongLong(long long value) {
2964 robj *o;
2965 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2966 incrRefCount(shared.integers[value]);
2967 o = shared.integers[value];
2968 } else {
3f973463 2969 if (value >= LONG_MIN && value <= LONG_MAX) {
10dea8dc 2970 o = createObject(REDIS_STRING, NULL);
3f973463
PN
2971 o->encoding = REDIS_ENCODING_INT;
2972 o->ptr = (void*)((long)value);
2973 } else {
ee14da56 2974 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3f973463
PN
2975 }
2976 }
2977 return o;
2978}
2979
4ef8de8a 2980static robj *dupStringObject(robj *o) {
b9bc0eef 2981 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2982 return createStringObject(o->ptr,sdslen(o->ptr));
2983}
2984
ed9b544e 2985static robj *createListObject(void) {
2986 list *l = listCreate();
1cd92e7f 2987 robj *o = createObject(REDIS_LIST,l);
ed9b544e 2988 listSetFreeMethod(l,decrRefCount);
1cd92e7f
PN
2989 o->encoding = REDIS_ENCODING_LIST;
2990 return o;
2991}
2992
2993static robj *createZiplistObject(void) {
2994 unsigned char *zl = ziplistNew();
2995 robj *o = createObject(REDIS_LIST,zl);
2996 o->encoding = REDIS_ENCODING_ZIPLIST;
2997 return o;
ed9b544e 2998}
2999
3000static robj *createSetObject(void) {
3001 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 3002 return createObject(REDIS_SET,d);
3003}
3004
5234952b 3005static robj *createHashObject(void) {
3006 /* All the Hashes start as zipmaps. Will be automatically converted
3007 * into hash tables if there are enough elements or big elements
3008 * inside. */
3009 unsigned char *zm = zipmapNew();
3010 robj *o = createObject(REDIS_HASH,zm);
3011 o->encoding = REDIS_ENCODING_ZIPMAP;
3012 return o;
3013}
3014
1812e024 3015static robj *createZsetObject(void) {
6b47e12e 3016 zset *zs = zmalloc(sizeof(*zs));
3017
3018 zs->dict = dictCreate(&zsetDictType,NULL);
3019 zs->zsl = zslCreate();
3020 return createObject(REDIS_ZSET,zs);
1812e024 3021}
3022
ed9b544e 3023static void freeStringObject(robj *o) {
942a3961 3024 if (o->encoding == REDIS_ENCODING_RAW) {
3025 sdsfree(o->ptr);
3026 }
ed9b544e 3027}
3028
3029static void freeListObject(robj *o) {
c7d9d662
PN
3030 switch (o->encoding) {
3031 case REDIS_ENCODING_LIST:
3032 listRelease((list*) o->ptr);
3033 break;
3034 case REDIS_ENCODING_ZIPLIST:
3035 zfree(o->ptr);
3036 break;
3037 default:
3038 redisPanic("Unknown list encoding type");
3039 }
ed9b544e 3040}
3041
3042static void freeSetObject(robj *o) {
3043 dictRelease((dict*) o->ptr);
3044}
3045
fd8ccf44 3046static void freeZsetObject(robj *o) {
3047 zset *zs = o->ptr;
3048
3049 dictRelease(zs->dict);
3050 zslFree(zs->zsl);
3051 zfree(zs);
3052}
3053
ed9b544e 3054static void freeHashObject(robj *o) {
cbba7dd7 3055 switch (o->encoding) {
3056 case REDIS_ENCODING_HT:
3057 dictRelease((dict*) o->ptr);
3058 break;
3059 case REDIS_ENCODING_ZIPMAP:
3060 zfree(o->ptr);
3061 break;
3062 default:
f83c6cb5 3063 redisPanic("Unknown hash encoding type");
cbba7dd7 3064 break;
3065 }
ed9b544e 3066}
3067
3068static void incrRefCount(robj *o) {
3069 o->refcount++;
3070}
3071
/* Drop one reference to the object 'obj'. When the count reaches zero the
 * payload is released according to the object type and the object
 * structure is either parked on server.objfreelist for reuse or freed.
 *
 * Objects whose storage is REDIS_VM_SWAPPED or REDIS_VM_LOADING are
 * released immediately, without looking at the new reference count value.
 * Panics if called against an object with refcount <= 0. */
static void decrRefCount(void *obj) {
    robj *o = obj;

    if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
    /* Object is a key of a swapped out value, or in the process of being
     * loaded. */
    if (server.vm_enabled &&
        (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
    {
        if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
        redisAssert(o->type == REDIS_STRING);
        freeStringObject(o);
        vmMarkPagesFree(o->vm.page,o->vm.usedpages);
        /* Recycle the structure through the free list unless the list is
         * already over REDIS_OBJFREELIST_MAX or the node can't be added. */
        pthread_mutex_lock(&server.obj_freelist_mutex);
        if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
            !listAddNodeHead(server.objfreelist,o))
            zfree(o);
        pthread_mutex_unlock(&server.obj_freelist_mutex);
        server.vm_stats_swapped_objects--;
        return;
    }
    /* Object is in memory, or in the process of being swapped out. */
    if (--(o->refcount) == 0) {
        if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
            vmCancelThreadedIOJob(obj);
        switch(o->type) {
        case REDIS_STRING: freeStringObject(o); break;
        case REDIS_LIST: freeListObject(o); break;
        case REDIS_SET: freeSetObject(o); break;
        case REDIS_ZSET: freeZsetObject(o); break;
        case REDIS_HASH: freeHashObject(o); break;
        default: redisPanic("Unknown object type"); break;
        }
        /* The free list mutex is only taken when VM is enabled. */
        if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
        if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
            !listAddNodeHead(server.objfreelist,o))
            zfree(o);
        if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
    }
}
3112
942a3961 3113static robj *lookupKey(redisDb *db, robj *key) {
3114 dictEntry *de = dictFind(db->dict,key);
3a66edc7 3115 if (de) {
55cf8433 3116 robj *key = dictGetEntryKey(de);
3117 robj *val = dictGetEntryVal(de);
3a66edc7 3118
55cf8433 3119 if (server.vm_enabled) {
996cb5f7 3120 if (key->storage == REDIS_VM_MEMORY ||
3121 key->storage == REDIS_VM_SWAPPING)
3122 {
3123 /* If we were swapping the object out, stop it, this key
3124 * was requested. */
3125 if (key->storage == REDIS_VM_SWAPPING)
3126 vmCancelThreadedIOJob(key);
55cf8433 3127 /* Update the access time of the key for the aging algorithm. */
3128 key->vm.atime = server.unixtime;
3129 } else {
d5d55fc3 3130 int notify = (key->storage == REDIS_VM_LOADING);
3131
55cf8433 3132 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 3133 redisAssert(val == NULL);
55cf8433 3134 val = vmLoadObject(key);
3135 dictGetEntryVal(de) = val;
d5d55fc3 3136
3137 /* Clients blocked by the VM subsystem may be waiting for
3138 * this key... */
3139 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 3140 }
3141 }
3142 return val;
3a66edc7 3143 } else {
3144 return NULL;
3145 }
942a3961 3146}
3147
3148static robj *lookupKeyRead(redisDb *db, robj *key) {
3149 expireIfNeeded(db,key);
3150 return lookupKey(db,key);
3151}
3152
3153static robj *lookupKeyWrite(redisDb *db, robj *key) {
3154 deleteIfVolatile(db,key);
37ab76c9 3155 touchWatchedKey(db,key);
942a3961 3156 return lookupKey(db,key);
3157}
3158
92b27fe9 3159static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3160 robj *o = lookupKeyRead(c->db, key);
3161 if (!o) addReply(c,reply);
3162 return o;
3163}
3164
3165static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3166 robj *o = lookupKeyWrite(c->db, key);
3167 if (!o) addReply(c,reply);
3168 return o;
3169}
3170
3171static int checkType(redisClient *c, robj *o, int type) {
3172 if (o->type != type) {
3173 addReply(c,shared.wrongtypeerr);
3174 return 1;
3175 }
3176 return 0;
3177}
3178
942a3961 3179static int deleteKey(redisDb *db, robj *key) {
3180 int retval;
3181
3182 /* We need to protect key from destruction: after the first dictDelete()
3183 * it may happen that 'key' is no longer valid if we don't increment
3184 * it's count. This may happen when we get the object reference directly
3185 * from the hash table with dictRandomKey() or dict iterators */
3186 incrRefCount(key);
3187 if (dictSize(db->expires)) dictDelete(db->expires,key);
3188 retval = dictDelete(db->dict,key);
3189 decrRefCount(key);
3190
3191 return retval == DICT_OK;
3192}
3193
724a51b1 3194/* Check if the nul-terminated string 's' can be represented by a long
3195 * (that is, is a number that fits into long without any other space or
3196 * character before or after the digits).
3197 *
3198 * If so, the function returns REDIS_OK and *longval is set to the value
3199 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 3200static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 3201 char buf[32], *endptr;
3202 long value;
3203 int slen;
e0a62c7f 3204
724a51b1 3205 value = strtol(s, &endptr, 10);
3206 if (endptr[0] != '\0') return REDIS_ERR;
ee14da56 3207 slen = ll2string(buf,32,value);
724a51b1 3208
3209 /* If the number converted back into a string is not identical
3210 * then it's not possible to encode the string as integer */
f69f2cba 3211 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 3212 if (longval) *longval = value;
3213 return REDIS_OK;
3214}
3215
942a3961 3216/* Try to encode a string object in order to save space */
05df7621 3217static robj *tryObjectEncoding(robj *o) {
942a3961 3218 long value;
942a3961 3219 sds s = o->ptr;
3305306f 3220
942a3961 3221 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 3222 return o; /* Already encoded */
3305306f 3223
05df7621 3224 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 3225 * everywhere in the "object space" of Redis. Encoded objects can only
3226 * appear as "values" (and not, for instance, as keys) */
05df7621 3227 if (o->refcount > 1) return o;
3305306f 3228
942a3961 3229 /* Currently we try to encode only strings */
dfc5e96c 3230 redisAssert(o->type == REDIS_STRING);
94754ccc 3231
724a51b1 3232 /* Check if we can represent this string as a long integer */
05df7621 3233 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 3234
3235 /* Ok, this object can be encoded */
05df7621 3236 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3237 decrRefCount(o);
3238 incrRefCount(shared.integers[value]);
3239 return shared.integers[value];
3240 } else {
3241 o->encoding = REDIS_ENCODING_INT;
3242 sdsfree(o->ptr);
3243 o->ptr = (void*) value;
3244 return o;
3245 }
942a3961 3246}
3247
9d65a1bb 3248/* Get a decoded version of an encoded object (returned as a new object).
3249 * If the object is already raw-encoded just increment the ref count. */
3250static robj *getDecodedObject(robj *o) {
942a3961 3251 robj *dec;
e0a62c7f 3252
9d65a1bb 3253 if (o->encoding == REDIS_ENCODING_RAW) {
3254 incrRefCount(o);
3255 return o;
3256 }
942a3961 3257 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3258 char buf[32];
3259
ee14da56 3260 ll2string(buf,32,(long)o->ptr);
942a3961 3261 dec = createStringObject(buf,strlen(buf));
3262 return dec;
3263 } else {
08ee9b57 3264 redisPanic("Unknown encoding type");
942a3961 3265 }
3305306f 3266}
3267
d7f43c08 3268/* Compare two string objects via strcmp() or alike.
3269 * Note that the objects may be integer-encoded. In such a case we
ee14da56 3270 * use ll2string() to get a string representation of the numbers on the stack
1fd9bc8a 3271 * and compare the strings, it's much faster than calling getDecodedObject().
3272 *
3273 * Important note: if objects are not integer encoded, but binary-safe strings,
3274 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3275 * binary safe. */
724a51b1 3276static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3277 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3278 char bufa[128], bufb[128], *astr, *bstr;
3279 int bothsds = 1;
724a51b1 3280
e197b441 3281 if (a == b) return 0;
d7f43c08 3282 if (a->encoding != REDIS_ENCODING_RAW) {
ee14da56 3283 ll2string(bufa,sizeof(bufa),(long) a->ptr);
d7f43c08 3284 astr = bufa;
3285 bothsds = 0;
724a51b1 3286 } else {
d7f43c08 3287 astr = a->ptr;
724a51b1 3288 }
d7f43c08 3289 if (b->encoding != REDIS_ENCODING_RAW) {
ee14da56 3290 ll2string(bufb,sizeof(bufb),(long) b->ptr);
d7f43c08 3291 bstr = bufb;
3292 bothsds = 0;
3293 } else {
3294 bstr = b->ptr;
3295 }
3296 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3297}
3298
bf028098 3299/* Equal string objects return 1 if the two objects are the same from the
3300 * point of view of a string comparison, otherwise 0 is returned. Note that
3301 * this function is faster then checking for (compareStringObject(a,b) == 0)
3302 * because it can perform some more optimization. */
3303static int equalStringObjects(robj *a, robj *b) {
3304 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3305 return a->ptr == b->ptr;
3306 } else {
3307 return compareStringObjects(a,b) == 0;
3308 }
3309}
3310
0ea663ea 3311static size_t stringObjectLen(robj *o) {
dfc5e96c 3312 redisAssert(o->type == REDIS_STRING);
0ea663ea 3313 if (o->encoding == REDIS_ENCODING_RAW) {
3314 return sdslen(o->ptr);
3315 } else {
3316 char buf[32];
3317
ee14da56 3318 return ll2string(buf,32,(long)o->ptr);
0ea663ea 3319 }
3320}
3321
bd79a6bd
PN
3322static int getDoubleFromObject(robj *o, double *target) {
3323 double value;
682c73e8 3324 char *eptr;
bbe025e0 3325
bd79a6bd
PN
3326 if (o == NULL) {
3327 value = 0;
3328 } else {
3329 redisAssert(o->type == REDIS_STRING);
3330 if (o->encoding == REDIS_ENCODING_RAW) {
3331 value = strtod(o->ptr, &eptr);
682c73e8 3332 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3333 } else if (o->encoding == REDIS_ENCODING_INT) {
3334 value = (long)o->ptr;
3335 } else {
946342c1 3336 redisPanic("Unknown string encoding");
bd79a6bd
PN
3337 }
3338 }
3339
bd79a6bd
PN
3340 *target = value;
3341 return REDIS_OK;
3342}
bbe025e0 3343
bd79a6bd
PN
3344static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3345 double value;
3346 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3347 if (msg != NULL) {
3348 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3349 } else {
3350 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3351 }
bbe025e0
AM
3352 return REDIS_ERR;
3353 }
3354
bd79a6bd 3355 *target = value;
bbe025e0
AM
3356 return REDIS_OK;
3357}
3358
bd79a6bd
PN
3359static int getLongLongFromObject(robj *o, long long *target) {
3360 long long value;
682c73e8 3361 char *eptr;
bbe025e0 3362
bd79a6bd
PN
3363 if (o == NULL) {
3364 value = 0;
3365 } else {
3366 redisAssert(o->type == REDIS_STRING);
3367 if (o->encoding == REDIS_ENCODING_RAW) {
3368 value = strtoll(o->ptr, &eptr, 10);
682c73e8 3369 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3370 } else if (o->encoding == REDIS_ENCODING_INT) {
3371 value = (long)o->ptr;
3372 } else {
946342c1 3373 redisPanic("Unknown string encoding");
bd79a6bd
PN
3374 }
3375 }
3376
bd79a6bd
PN
3377 *target = value;
3378 return REDIS_OK;
3379}
bbe025e0 3380
bd79a6bd
PN
3381static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3382 long long value;
3383 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3384 if (msg != NULL) {
3385 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3386 } else {
3387 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3388 }
bbe025e0
AM
3389 return REDIS_ERR;
3390 }
3391
bd79a6bd 3392 *target = value;
bbe025e0
AM
3393 return REDIS_OK;
3394}
3395
bd79a6bd
PN
3396static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3397 long long value;
bbe025e0 3398
bd79a6bd
PN
3399 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3400 if (value < LONG_MIN || value > LONG_MAX) {
3401 if (msg != NULL) {
3402 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3403 } else {
3404 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3405 }
bbe025e0
AM
3406 return REDIS_ERR;
3407 }
3408
bd79a6bd 3409 *target = value;
bbe025e0
AM
3410 return REDIS_OK;
3411}
3412
06233c45 3413/*============================ RDB saving/loading =========================== */
ed9b544e 3414
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 if the byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3419
/* Write the time 't' to 'fp' as a 4 byte signed integer, in the byte order
 * of the host. The value is converted to int32_t before being written.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;

    return (fwrite(&stamp,4,1,fp) == 0) ? -1 : 0;
}
3425
e3566d4b 3426/* check rdbLoadLen() comments for more info */
f78fd11b 3427static int rdbSaveLen(FILE *fp, uint32_t len) {
3428 unsigned char buf[2];
3429
3430 if (len < (1<<6)) {
3431 /* Save a 6 bit len */
10c43610 3432 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3433 if (fwrite(buf,1,1,fp) == 0) return -1;
3434 } else if (len < (1<<14)) {
3435 /* Save a 14 bit len */
10c43610 3436 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3437 buf[1] = len&0xFF;
17be1a4a 3438 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3439 } else {
3440 /* Save a 32 bit len */
10c43610 3441 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3442 if (fwrite(buf,1,1,fp) == 0) return -1;
3443 len = htonl(len);
3444 if (fwrite(&len,4,1,fp) == 0) return -1;
3445 }
3446 return 0;
3447}
3448
32a66513 3449/* Encode 'value' as an integer if possible (if integer will fit the
3450 * supported range). If the function sucessful encoded the integer
3451 * then the (up to 5 bytes) encoded representation is written in the
3452 * string pointed by 'enc' and the length is returned. Otherwise
3453 * 0 is returned. */
3454static int rdbEncodeInteger(long long value, unsigned char *enc) {
e3566d4b 3455 /* Finally check if it fits in our ranges */
3456 if (value >= -(1<<7) && value <= (1<<7)-1) {
3457 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3458 enc[1] = value&0xFF;
3459 return 2;
3460 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3461 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3462 enc[1] = value&0xFF;
3463 enc[2] = (value>>8)&0xFF;
3464 return 3;
3465 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3466 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3467 enc[1] = value&0xFF;
3468 enc[2] = (value>>8)&0xFF;
3469 enc[3] = (value>>16)&0xFF;
3470 enc[4] = (value>>24)&0xFF;
3471 return 5;
3472 } else {
3473 return 0;
3474 }
3475}
3476
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes that are NOT required to be nul-terminated
 * (callers hand over pointer/length pairs coming from ziplistGet() and
 * zipmapNext(), which carry an explicit length), so the bytes are parsed
 * from a bounded, terminated local copy.
 *
 * Returns the length of the encoding written at 'enc' (see
 * rdbEncodeInteger()), or 0 if the string can't be encoded as an integer. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, in[32], buf[32];

    /* An empty string is not a number, and anything that does not fit in
     * 'in' is longer than the representation of any long long, hence it
     * could never survive the round trip check below. */
    if (len == 0 || len >= sizeof(in)) return 0;
    memcpy(in,s,len);
    in[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(in, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,in,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3495
b1befe6a 3496static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3497 size_t comprlen, outlen;
774e3047 3498 unsigned char byte;
3499 void *out;
3500
3501 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3502 if (len <= 4) return 0;
3503 outlen = len-4;
3a2694c4 3504 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3505 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3506 if (comprlen == 0) {
88e85998 3507 zfree(out);
774e3047 3508 return 0;
3509 }
3510 /* Data compressed! Let's save it on disk */
3511 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3512 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3513 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3514 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3515 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3516 zfree(out);
774e3047 3517 return comprlen;
3518
3519writeerr:
88e85998 3520 zfree(out);
774e3047 3521 return -1;
3522}
3523
e3566d4b 3524/* Save a string objet as [len][data] on disk. If the object is a string
3525 * representation of an integer value we try to safe it in a special form */
b1befe6a 3526static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3527 int enclen;
10c43610 3528
774e3047 3529 /* Try integer encoding */
e3566d4b 3530 if (len <= 11) {
3531 unsigned char buf[5];
b1befe6a 3532 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3533 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3534 return 0;
3535 }
3536 }
774e3047 3537
3538 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3539 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3540 if (server.rdbcompression && len > 20) {
774e3047 3541 int retval;
3542
b1befe6a 3543 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3544 if (retval == -1) return -1;
3545 if (retval > 0) return 0;
3546 /* retval == 0 means data can't be compressed, save the old way */
3547 }
3548
3549 /* Store verbatim */
10c43610 3550 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3551 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3552 return 0;
3553}
3554
2796f6da
PN
/* Save a long long value either in the compact integer encoding, when
 * rdbEncodeInteger() accepts it, or as a length-prefixed decimal string.
 * Returns 0 on success, -1 on write error. */
static int rdbSaveLongLongAsStringObject(FILE *fp, long long value) {
    unsigned char buf[32];
    int n = rdbEncodeInteger(value,buf);

    if (n <= 0) {
        /* Out of the encodable range: emit the digits, length first. */
        n = ll2string((char*)buf,32,value);
        redisAssert(n < 32);
        if (rdbSaveLen(fp,n) == -1) return -1;
    }
    return (fwrite(buf,n,1,fp) == 0) ? -1 : 0;
}
3570
942a3961 3571/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3572static int rdbSaveStringObject(FILE *fp, robj *obj) {
32a66513 3573 /* Avoid to decode the object, then encode it again, if the
3574 * object is alrady integer encoded. */
3575 if (obj->encoding == REDIS_ENCODING_INT) {
2796f6da 3576 return rdbSaveLongLongAsStringObject(fp,(long)obj->ptr);
996cb5f7 3577 } else {
2796f6da
PN
3578 redisAssert(obj->encoding == REDIS_ENCODING_RAW);
3579 return rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3580 }
942a3961 3581}
3582
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -(2^52)+1 */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so only sizeof(buf)-1 bytes are
             * available to the printing function. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3625
06233c45 3626/* Save a Redis object. */
3627static int rdbSaveObject(FILE *fp, robj *o) {
3628 if (o->type == REDIS_STRING) {
3629 /* Save a string value */
3630 if (rdbSaveStringObject(fp,o) == -1) return -1;
3631 } else if (o->type == REDIS_LIST) {
3632 /* Save a list value */
23f96494
PN
3633 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
3634 unsigned char *p;
3635 unsigned char *vstr;
3636 unsigned int vlen;
3637 long long vlong;
3638
3639 if (rdbSaveLen(fp,ziplistLen(o->ptr)) == -1) return -1;
3640 p = ziplistIndex(o->ptr,0);
3641 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
3642 if (vstr) {
3643 if (rdbSaveRawString(fp,vstr,vlen) == -1)
3644 return -1;
3645 } else {
3646 if (rdbSaveLongLongAsStringObject(fp,vlong) == -1)
3647 return -1;
3648 }
3649 p = ziplistNext(o->ptr,p);
3650 }
3651 } else if (o->encoding == REDIS_ENCODING_LIST) {
3652 list *list = o->ptr;
3653 listIter li;
3654 listNode *ln;
3655
3656 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3657 listRewind(list,&li);
3658 while((ln = listNext(&li))) {
3659 robj *eleobj = listNodeValue(ln);
3660 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3661 }
3662 } else {
3663 redisPanic("Unknown list encoding");
06233c45 3664 }
3665 } else if (o->type == REDIS_SET) {
3666 /* Save a set value */
3667 dict *set = o->ptr;
3668 dictIterator *di = dictGetIterator(set);
3669 dictEntry *de;
3670
3671 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3672 while((de = dictNext(di)) != NULL) {
3673 robj *eleobj = dictGetEntryKey(de);
3674
3675 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3676 }
3677 dictReleaseIterator(di);
3678 } else if (o->type == REDIS_ZSET) {
3679 /* Save a set value */
3680 zset *zs = o->ptr;
3681 dictIterator *di = dictGetIterator(zs->dict);
3682 dictEntry *de;
3683
3684 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3685 while((de = dictNext(di)) != NULL) {
3686 robj *eleobj = dictGetEntryKey(de);
3687 double *score = dictGetEntryVal(de);
3688
3689 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3690 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3691 }
3692 dictReleaseIterator(di);
b1befe6a 3693 } else if (o->type == REDIS_HASH) {
3694 /* Save a hash value */
3695 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3696 unsigned char *p = zipmapRewind(o->ptr);
3697 unsigned int count = zipmapLen(o->ptr);
3698 unsigned char *key, *val;
3699 unsigned int klen, vlen;
3700
3701 if (rdbSaveLen(fp,count) == -1) return -1;
3702 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3703 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3704 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3705 }
3706 } else {
3707 dictIterator *di = dictGetIterator(o->ptr);
3708 dictEntry *de;
3709
3710 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3711 while((de = dictNext(di)) != NULL) {
3712 robj *key = dictGetEntryKey(de);
3713 robj *val = dictGetEntryVal(de);
3714
3715 if (rdbSaveStringObject(fp,key) == -1) return -1;
3716 if (rdbSaveStringObject(fp,val) == -1) return -1;
3717 }
3718 dictReleaseIterator(di);
3719 }
06233c45 3720 } else {
f83c6cb5 3721 redisPanic("Unknown object type");
06233c45 3722 }
3723 return 0;
3724}
3725
3726/* Return the length the object will have on disk if saved with
3727 * the rdbSaveObject() function. Currently we use a trick to get
3728 * this length with very little changes to the code. In the future
3729 * we could switch to a faster solution. */
b9bc0eef 3730static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3731 if (fp == NULL) fp = server.devnull;
06233c45 3732 rewind(fp);
3733 assert(rdbSaveObject(fp,o) != 1);
3734 return ftello(fp);
3735}
3736
06224fec 3737/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3738static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3739 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3740
06224fec 3741 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3742}
3743
ed9b544e 3744/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3745static int rdbSave(char *filename) {
ed9b544e 3746 dictIterator *di = NULL;
3747 dictEntry *de;
ed9b544e 3748 FILE *fp;
3749 char tmpfile[256];
3750 int j;
bb32ede5 3751 time_t now = time(NULL);
ed9b544e 3752
2316bb3b 3753 /* Wait for I/O therads to terminate, just in case this is a
3754 * foreground-saving, to avoid seeking the swap file descriptor at the
3755 * same time. */
3756 if (server.vm_enabled)
3757 waitEmptyIOJobsQueue();
3758
a3b21203 3759 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3760 fp = fopen(tmpfile,"w");
3761 if (!fp) {
3762 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3763 return REDIS_ERR;
3764 }
f78fd11b 3765 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3766 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3767 redisDb *db = server.db+j;
3768 dict *d = db->dict;
3305306f 3769 if (dictSize(d) == 0) continue;
ed9b544e 3770 di = dictGetIterator(d);
3771 if (!di) {
3772 fclose(fp);
3773 return REDIS_ERR;
3774 }
3775
3776 /* Write the SELECT DB opcode */
f78fd11b 3777 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3778 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3779
3780 /* Iterate this DB writing every entry */
3781 while((de = dictNext(di)) != NULL) {
3782 robj *key = dictGetEntryKey(de);
3783 robj *o = dictGetEntryVal(de);
bb32ede5 3784 time_t expiretime = getExpire(db,key);
3785
3786 /* Save the expire time */
3787 if (expiretime != -1) {
3788 /* If this key is already expired skip it */
3789 if (expiretime < now) continue;
3790 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3791 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3792 }
7e69548d 3793 /* Save the key and associated value. This requires special
3794 * handling if the value is swapped out. */
996cb5f7 3795 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3796 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3797 /* Save type, key, value */
3798 if (rdbSaveType(fp,o->type) == -1) goto werr;
3799 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3800 if (rdbSaveObject(fp,o) == -1) goto werr;
3801 } else {
996cb5f7 3802 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3803 robj *po;
7e69548d 3804 /* Get a preview of the object in memory */
3805 po = vmPreviewObject(key);
7e69548d 3806 /* Save type, key, value */
3807 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3808 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3809 if (rdbSaveObject(fp,po) == -1) goto werr;
3810 /* Remove the loaded object from memory */
3811 decrRefCount(po);
7e69548d 3812 }
ed9b544e 3813 }
3814 dictReleaseIterator(di);
3815 }
3816 /* EOF opcode */
f78fd11b 3817 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3818
3819 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3820 fflush(fp);
3821 fsync(fileno(fp));
3822 fclose(fp);
e0a62c7f 3823
ed9b544e 3824 /* Use RENAME to make sure the DB file is changed atomically only
3825 * if the generate DB file is ok. */
3826 if (rename(tmpfile,filename) == -1) {
325d1eb4 3827 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3828 unlink(tmpfile);
3829 return REDIS_ERR;
3830 }
3831 redisLog(REDIS_NOTICE,"DB saved on disk");
3832 server.dirty = 0;
3833 server.lastsave = time(NULL);
3834 return REDIS_OK;
3835
3836werr:
3837 fclose(fp);
3838 unlink(tmpfile);
3839 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3840 if (di) dictReleaseIterator(di);
3841 return REDIS_ERR;
3842}
3843
f78fd11b 3844static int rdbSaveBackground(char *filename) {
ed9b544e 3845 pid_t childpid;
3846
9d65a1bb 3847 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3848 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3849 if ((childpid = fork()) == 0) {
3850 /* Child */
054e426d 3851 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3852 close(server.fd);
f78fd11b 3853 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3854 _exit(0);
ed9b544e 3855 } else {
478c2c6f 3856 _exit(1);
ed9b544e 3857 }
3858 } else {
3859 /* Parent */
5a7c647e 3860 if (childpid == -1) {
3861 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3862 strerror(errno));
3863 return REDIS_ERR;
3864 }
ed9b544e 3865 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3866 server.bgsavechildpid = childpid;
884d4b39 3867 updateDictResizePolicy();
ed9b544e 3868 return REDIS_OK;
3869 }
3870 return REDIS_OK; /* unreached */
3871}
3872
/* Remove the temporary dump file named after process id 'childpid' (the
 * same "temp-<pid>.rdb" name rdbSave() builds from its own pid). */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3879
/* Read a single type byte from 'fp'.
 * Returns the byte as a value in the range 0-255, or -1 if it could not be
 * read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3885
/* Read a time saved by rdbSaveTime(): a 4 byte signed integer in the byte
 * order of the host. Returns the time, or -1 if it could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;
    size_t got = fread(&stamp,4,1,fp);

    if (got == 0) return -1;
    return (time_t) stamp;
}
3891
e3566d4b 3892/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3893 * of this file for a description of how this are stored on disk.
3894 *
3895 * isencoded is set to 1 if the readed length is not actually a length but
3896 * an "encoding type", check the above comments for more info */
c78a8ccc 3897static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3898 unsigned char buf[2];
3899 uint32_t len;
c78a8ccc 3900 int type;
f78fd11b 3901
e3566d4b 3902 if (isencoded) *isencoded = 0;
c78a8ccc 3903 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3904 type = (buf[0]&0xC0)>>6;
3905 if (type == REDIS_RDB_6BITLEN) {
3906 /* Read a 6 bit len */
3907 return buf[0]&0x3F;
3908 } else if (type == REDIS_RDB_ENCVAL) {
3909 /* Read a 6 bit len encoding type */
3910 if (isencoded) *isencoded = 1;
3911 return buf[0]&0x3F;
3912 } else if (type == REDIS_RDB_14BITLEN) {
3913 /* Read a 14 bit len */
3914 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3915 return ((buf[0]&0x3F)<<8)|buf[1];
3916 } else {
3917 /* Read a 32 bit len */
f78fd11b 3918 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3919 return ntohl(len);
f78fd11b 3920 }
f78fd11b 3921}
3922
ad30aa60 3923/* Load an integer-encoded object from file 'fp', with the specified
3924 * encoding type 'enctype'. If encode is true the function may return
3925 * an integer-encoded object as reply, otherwise the returned object
3926 * will always be encoded as a raw string. */
3927static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
e3566d4b 3928 unsigned char enc[4];
3929 long long val;
3930
3931 if (enctype == REDIS_RDB_ENC_INT8) {
3932 if (fread(enc,1,1,fp) == 0) return NULL;
3933 val = (signed char)enc[0];
3934 } else if (enctype == REDIS_RDB_ENC_INT16) {
3935 uint16_t v;
3936 if (fread(enc,2,1,fp) == 0) return NULL;
3937 v = enc[0]|(enc[1]<<8);
3938 val = (int16_t)v;
3939 } else if (enctype == REDIS_RDB_ENC_INT32) {
3940 uint32_t v;
3941 if (fread(enc,4,1,fp) == 0) return NULL;
3942 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3943 val = (int32_t)v;
3944 } else {
3945 val = 0; /* anti-warning */
f83c6cb5 3946 redisPanic("Unknown RDB integer encoding type");
e3566d4b 3947 }
ad30aa60 3948 if (encode)
3949 return createStringObjectFromLongLong(val);
3950 else
3951 return createObject(REDIS_STRING,sdsfromlonglong(val));
e3566d4b 3952}
3953
c78a8ccc 3954static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3955 unsigned int len, clen;
3956 unsigned char *c = NULL;
3957 sds val = NULL;
3958
c78a8ccc 3959 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3960 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3961 if ((c = zmalloc(clen)) == NULL) goto err;
3962 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3963 if (fread(c,clen,1,fp) == 0) goto err;
3964 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3965 zfree(c);
88e85998 3966 return createObject(REDIS_STRING,val);
3967err:
3968 zfree(c);
3969 sdsfree(val);
3970 return NULL;
3971}
3972
ad30aa60 3973static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
e3566d4b 3974 int isencoded;
3975 uint32_t len;
f78fd11b 3976 sds val;
3977
c78a8ccc 3978 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3979 if (isencoded) {
3980 switch(len) {
3981 case REDIS_RDB_ENC_INT8:
3982 case REDIS_RDB_ENC_INT16:
3983 case REDIS_RDB_ENC_INT32:
ad30aa60 3984 return rdbLoadIntegerObject(fp,len,encode);
88e85998 3985 case REDIS_RDB_ENC_LZF:
bdcb92f2 3986 return rdbLoadLzfStringObject(fp);
e3566d4b 3987 default:
f83c6cb5 3988 redisPanic("Unknown RDB encoding type");
e3566d4b 3989 }
3990 }
3991
f78fd11b 3992 if (len == REDIS_RDB_LENERR) return NULL;
3993 val = sdsnewlen(NULL,len);
3994 if (len && fread(val,len,1,fp) == 0) {
3995 sdsfree(val);
3996 return NULL;
3997 }
bdcb92f2 3998 return createObject(REDIS_STRING,val);
f78fd11b 3999}
4000
ad30aa60 4001static robj *rdbLoadStringObject(FILE *fp) {
4002 return rdbGenericLoadStringObject(fp,0);
4003}
4004
4005static robj *rdbLoadEncodedStringObject(FILE *fp) {
4006 return rdbGenericLoadStringObject(fp,1);
4007}
4008
a7866db6 4009/* For information about double serialization check rdbSaveDoubleValue() */
4010static int rdbLoadDoubleValue(FILE *fp, double *val) {
4011 char buf[128];
4012 unsigned char len;
4013
4014 if (fread(&len,1,1,fp) == 0) return -1;
4015 switch(len) {
4016 case 255: *val = R_NegInf; return 0;
4017 case 254: *val = R_PosInf; return 0;
4018 case 253: *val = R_Nan; return 0;
4019 default:
4020 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 4021 buf[len] = '\0';
a7866db6 4022 sscanf(buf, "%lg", val);
4023 return 0;
4024 }
4025}
4026
c78a8ccc 4027/* Load a Redis object of the specified type from the specified file.
4028 * On success a newly allocated object is returned, otherwise NULL. */
4029static robj *rdbLoadObject(int type, FILE *fp) {
23f96494
PN
4030 robj *o, *ele, *dec;
4031 size_t len;
c78a8ccc 4032
bcd11906 4033 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 4034 if (type == REDIS_STRING) {
4035 /* Read string value */
ad30aa60 4036 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4037 o = tryObjectEncoding(o);
23f96494
PN
4038 } else if (type == REDIS_LIST) {
4039 /* Read list value */
4040 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4041
1cd92e7f 4042 o = createZiplistObject();
c78a8ccc 4043
23f96494
PN
4044 /* Load every single element of the list */
4045 while(len--) {
4046 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4047
4048 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
4049 dec = getDecodedObject(ele);
4050 o->ptr = ziplistPush(o->ptr,dec->ptr,sdslen(dec->ptr),REDIS_TAIL);
4051 decrRefCount(dec);
4052 decrRefCount(ele);
4053 } else {
4054 ele = tryObjectEncoding(ele);
4055 listAddNodeTail(o->ptr,ele);
4056 incrRefCount(ele);
4057 }
4058 }
4059 } else if (type == REDIS_SET) {
4060 /* Read list/set value */
4061 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4062 o = createSetObject();
3c68de9b 4063 /* It's faster to expand the dict to the right size asap in order
4064 * to avoid rehashing */
23f96494
PN
4065 if (len > DICT_HT_INITIAL_SIZE)
4066 dictExpand(o->ptr,len);
c78a8ccc 4067 /* Load every single element of the list/set */
23f96494 4068 while(len--) {
ad30aa60 4069 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4070 ele = tryObjectEncoding(ele);
23f96494 4071 dictAdd((dict*)o->ptr,ele,NULL);
c78a8ccc 4072 }
4073 } else if (type == REDIS_ZSET) {
4074 /* Read list/set value */
ada386b2 4075 size_t zsetlen;
c78a8ccc 4076 zset *zs;
4077
4078 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4079 o = createZsetObject();
4080 zs = o->ptr;
4081 /* Load every single element of the list/set */
4082 while(zsetlen--) {
4083 robj *ele;
4084 double *score = zmalloc(sizeof(double));
4085
ad30aa60 4086 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4087 ele = tryObjectEncoding(ele);
c78a8ccc 4088 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4089 dictAdd(zs->dict,ele,score);
4090 zslInsert(zs->zsl,*score,ele);
4091 incrRefCount(ele); /* added to skiplist */
4092 }
ada386b2 4093 } else if (type == REDIS_HASH) {
4094 size_t hashlen;
4095
4096 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4097 o = createHashObject();
4098 /* Too many entries? Use an hash table. */
4099 if (hashlen > server.hash_max_zipmap_entries)
4100 convertToRealHash(o);
4101 /* Load every key/value, then set it into the zipmap or hash
4102 * table, as needed. */
4103 while(hashlen--) {
4104 robj *key, *val;
4105
4106 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4107 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4108 /* If we are using a zipmap and there are too big values
4109 * the object is converted to real hash table encoding. */
4110 if (o->encoding != REDIS_ENCODING_HT &&
4111 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4112 sdslen(val->ptr) > server.hash_max_zipmap_value))
4113 {
4114 convertToRealHash(o);
4115 }
4116
4117 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4118 unsigned char *zm = o->ptr;
4119
4120 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4121 val->ptr,sdslen(val->ptr),NULL);
4122 o->ptr = zm;
4123 decrRefCount(key);
4124 decrRefCount(val);
4125 } else {
05df7621 4126 key = tryObjectEncoding(key);
4127 val = tryObjectEncoding(val);
ada386b2 4128 dictAdd((dict*)o->ptr,key,val);
ada386b2 4129 }
4130 }
c78a8ccc 4131 } else {
f83c6cb5 4132 redisPanic("Unknown object type");
c78a8ccc 4133 }
4134 return o;
4135}
4136
f78fd11b 4137static int rdbLoad(char *filename) {
ed9b544e 4138 FILE *fp;
f78fd11b 4139 uint32_t dbid;
bb32ede5 4140 int type, retval, rdbver;
585af7e2 4141 int swap_all_values = 0;
3305306f 4142 dict *d = server.db[0].dict;
bb32ede5 4143 redisDb *db = server.db+0;
f78fd11b 4144 char buf[1024];
242a64f3 4145 time_t expiretime, now = time(NULL);
b492cf00 4146 long long loadedkeys = 0;
bb32ede5 4147
ed9b544e 4148 fp = fopen(filename,"r");
4149 if (!fp) return REDIS_ERR;
4150 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 4151 buf[9] = '\0';
4152 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 4153 fclose(fp);
4154 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4155 return REDIS_ERR;
4156 }
f78fd11b 4157 rdbver = atoi(buf+5);
c78a8ccc 4158 if (rdbver != 1) {
f78fd11b 4159 fclose(fp);
4160 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4161 return REDIS_ERR;
4162 }
ed9b544e 4163 while(1) {
585af7e2 4164 robj *key, *val;
ed9b544e 4165
585af7e2 4166 expiretime = -1;
ed9b544e 4167 /* Read type. */
f78fd11b 4168 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 4169 if (type == REDIS_EXPIRETIME) {
4170 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4171 /* We read the time so we need to read the object type again */
4172 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4173 }
ed9b544e 4174 if (type == REDIS_EOF) break;
4175 /* Handle SELECT DB opcode as a special case */
4176 if (type == REDIS_SELECTDB) {
c78a8ccc 4177 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 4178 goto eoferr;
ed9b544e 4179 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 4180 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 4181 exit(1);
4182 }
bb32ede5 4183 db = server.db+dbid;
4184 d = db->dict;
ed9b544e 4185 continue;
4186 }
4187 /* Read key */
585af7e2 4188 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
c78a8ccc 4189 /* Read value */
585af7e2 4190 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
89e689c5 4191 /* Check if the key already expired */
4192 if (expiretime != -1 && expiretime < now) {
4193 decrRefCount(key);
4194 decrRefCount(val);
4195 continue;
4196 }
ed9b544e 4197 /* Add the new object in the hash table */
585af7e2 4198 retval = dictAdd(d,key,val);
ed9b544e 4199 if (retval == DICT_ERR) {
585af7e2 4200 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
ed9b544e 4201 exit(1);
4202 }
242a64f3 4203 loadedkeys++;
bb32ede5 4204 /* Set the expire time if needed */
89e689c5 4205 if (expiretime != -1) setExpire(db,key,expiretime);
242a64f3 4206
b492cf00 4207 /* Handle swapping while loading big datasets when VM is on */
242a64f3 4208
4209 /* If we detecter we are hopeless about fitting something in memory
4210 * we just swap every new key on disk. Directly...
4211 * Note that's important to check for this condition before resorting
4212 * to random sampling, otherwise we may try to swap already
4213 * swapped keys. */
585af7e2 4214 if (swap_all_values) {
4215 dictEntry *de = dictFind(d,key);
242a64f3 4216
4217 /* de may be NULL since the key already expired */
4218 if (de) {
585af7e2 4219 key = dictGetEntryKey(de);
4220 val = dictGetEntryVal(de);
242a64f3 4221
585af7e2 4222 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
242a64f3 4223 dictGetEntryVal(de) = NULL;
4224 }
4225 }
4226 continue;
4227 }
4228
4229 /* If we have still some hope of having some value fitting memory
4230 * then we try random sampling. */
585af7e2 4231 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
b492cf00 4232 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 4233 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 4234 }
242a64f3 4235 if (zmalloc_used_memory() > server.vm_max_memory)
585af7e2 4236 swap_all_values = 1; /* We are already using too much mem */
b492cf00 4237 }
ed9b544e 4238 }
4239 fclose(fp);
4240 return REDIS_OK;
4241
4242eoferr: /* unexpected end of file is handled here with a fatal exit */
f80dff62 4243 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 4244 exit(1);
4245 return REDIS_ERR; /* Just to avoid warning */
4246}
4247
b58ba105 4248/*================================== Shutdown =============================== */
fab43727 4249static int prepareForShutdown() {
b58ba105
AM
4250 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4251 /* Kill the saving child if there is a background saving in progress.
4252 We want to avoid race conditions, for instance our saving child may
4253 overwrite the synchronous saving did by SHUTDOWN. */
4254 if (server.bgsavechildpid != -1) {
4255 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4256 kill(server.bgsavechildpid,SIGKILL);
4257 rdbRemoveTempFile(server.bgsavechildpid);
4258 }
4259 if (server.appendonly) {
4260 /* Append only file: fsync() the AOF and exit */
4261 fsync(server.appendfd);
4262 if (server.vm_enabled) unlink(server.vm_swap_file);
b58ba105
AM
4263 } else {
4264 /* Snapshotting. Perform a SYNC SAVE and exit */
4265 if (rdbSave(server.dbfilename) == REDIS_OK) {
4266 if (server.daemonize)
4267 unlink(server.pidfile);
4268 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
b58ba105
AM
4269 } else {
4270 /* Ooops.. error saving! The best we can do is to continue
4271 * operating. Note that if there was a background saving process,
4272 * in the next cron() Redis will be notified that the background
4273 * saving aborted, handling special stuff like slaves pending for
4274 * synchronization... */
4275 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
fab43727 4276 return REDIS_ERR;
b58ba105
AM
4277 }
4278 }
8513a757 4279 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
fab43727 4280 return REDIS_OK;
b58ba105
AM
4281}
4282
ed9b544e 4283/*================================== Commands =============================== */
4284
abcb223e 4285static void authCommand(redisClient *c) {
2e77c2ee 4286 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
4287 c->authenticated = 1;
4288 addReply(c,shared.ok);
4289 } else {
4290 c->authenticated = 0;
fa4c0aba 4291 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
4292 }
4293}
4294
ed9b544e 4295static void pingCommand(redisClient *c) {
4296 addReply(c,shared.pong);
4297}
4298
4299static void echoCommand(redisClient *c) {
dd88747b 4300 addReplyBulk(c,c->argv[1]);
ed9b544e 4301}
4302
4303/*=================================== Strings =============================== */
4304
526d00a5 4305static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
ed9b544e 4306 int retval;
10ce1276 4307 long seconds = 0; /* initialized to avoid an harmness warning */
ed9b544e 4308
526d00a5 4309 if (expire) {
4310 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4311 return;
4312 if (seconds <= 0) {
4313 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4314 return;
4315 }
4316 }
4317
37ab76c9 4318 touchWatchedKey(c->db,key);
526d00a5 4319 if (nx) deleteIfVolatile(c->db,key);
4320 retval = dictAdd(c->db->dict,key,val);
ed9b544e 4321 if (retval == DICT_ERR) {
4322 if (!nx) {
1b03836c 4323 /* If the key is about a swapped value, we want a new key object
4324 * to overwrite the old. So we delete the old key in the database.
4325 * This will also make sure that swap pages about the old object
4326 * will be marked as free. */
526d00a5 4327 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4328 incrRefCount(key);
4329 dictReplace(c->db->dict,key,val);
4330 incrRefCount(val);
ed9b544e 4331 } else {
c937aa89 4332 addReply(c,shared.czero);
ed9b544e 4333 return;
4334 }
4335 } else {
526d00a5 4336 incrRefCount(key);
4337 incrRefCount(val);
ed9b544e 4338 }
4339 server.dirty++;
526d00a5 4340 removeExpire(c->db,key);
4341 if (expire) setExpire(c->db,key,time(NULL)+seconds);
c937aa89 4342 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 4343}
4344
4345static void setCommand(redisClient *c) {
526d00a5 4346 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
ed9b544e 4347}
4348
4349static void setnxCommand(redisClient *c) {
526d00a5 4350 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4351}
4352
4353static void setexCommand(redisClient *c) {
4354 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
ed9b544e 4355}
4356
322fc7d8 4357static int getGenericCommand(redisClient *c) {
dd88747b 4358 robj *o;
e0a62c7f 4359
dd88747b 4360 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 4361 return REDIS_OK;
dd88747b 4362
4363 if (o->type != REDIS_STRING) {
4364 addReply(c,shared.wrongtypeerr);
4365 return REDIS_ERR;
ed9b544e 4366 } else {
dd88747b 4367 addReplyBulk(c,o);
4368 return REDIS_OK;
ed9b544e 4369 }
4370}
4371
322fc7d8 4372static void getCommand(redisClient *c) {
4373 getGenericCommand(c);
4374}
4375
f6b141c5 4376static void getsetCommand(redisClient *c) {
322fc7d8 4377 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 4378 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4379 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4380 } else {
4381 incrRefCount(c->argv[1]);
4382 }
4383 incrRefCount(c->argv[2]);
4384 server.dirty++;
4385 removeExpire(c->db,c->argv[1]);
4386}
4387
70003d28 4388static void mgetCommand(redisClient *c) {
70003d28 4389 int j;
e0a62c7f 4390
c937aa89 4391 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 4392 for (j = 1; j < c->argc; j++) {
3305306f 4393 robj *o = lookupKeyRead(c->db,c->argv[j]);
4394 if (o == NULL) {
c937aa89 4395 addReply(c,shared.nullbulk);
70003d28 4396 } else {
70003d28 4397 if (o->type != REDIS_STRING) {
c937aa89 4398 addReply(c,shared.nullbulk);
70003d28 4399 } else {
dd88747b 4400 addReplyBulk(c,o);
70003d28 4401 }
4402 }
4403 }
4404}
4405
6c446631 4406static void msetGenericCommand(redisClient *c, int nx) {
906573e7 4407 int j, busykeys = 0;
6c446631 4408
4409 if ((c->argc % 2) == 0) {
454d4e43 4410 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 4411 return;
4412 }
4413 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4414 * set nothing at all if at least one already key exists. */
4415 if (nx) {
4416 for (j = 1; j < c->argc; j += 2) {
906573e7 4417 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4418 busykeys++;
6c446631 4419 }
4420 }
4421 }
906573e7 4422 if (busykeys) {
4423 addReply(c, shared.czero);
4424 return;
4425 }
6c446631 4426
4427 for (j = 1; j < c->argc; j += 2) {
4428 int retval;
4429
05df7621 4430 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
6c446631 4431 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4432 if (retval == DICT_ERR) {
4433 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4434 incrRefCount(c->argv[j+1]);
4435 } else {
4436 incrRefCount(c->argv[j]);
4437 incrRefCount(c->argv[j+1]);
4438 }
4439 removeExpire(c->db,c->argv[j]);
4440 }
4441 server.dirty += (c->argc-1)/2;
4442 addReply(c, nx ? shared.cone : shared.ok);
4443}
4444
4445static void msetCommand(redisClient *c) {
4446 msetGenericCommand(c,0);
4447}
4448
4449static void msetnxCommand(redisClient *c) {
4450 msetGenericCommand(c,1);
4451}
4452
d68ed120 4453static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4454 long long value;
4455 int retval;
4456 robj *o;
e0a62c7f 4457
3305306f 4458 o = lookupKeyWrite(c->db,c->argv[1]);
6485f293
PN
4459 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4460 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
ed9b544e 4461
4462 value += incr;
d6f4c262 4463 o = createStringObjectFromLongLong(value);
3305306f 4464 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 4465 if (retval == DICT_ERR) {
3305306f 4466 dictReplace(c->db->dict,c->argv[1],o);
4467 removeExpire(c->db,c->argv[1]);
ed9b544e 4468 } else {
4469 incrRefCount(c->argv[1]);
4470 }
4471 server.dirty++;
c937aa89 4472 addReply(c,shared.colon);
ed9b544e 4473 addReply(c,o);
4474 addReply(c,shared.crlf);
4475}
4476
4477static void incrCommand(redisClient *c) {
a4d1ba9a 4478 incrDecrCommand(c,1);
ed9b544e 4479}
4480
4481static void decrCommand(redisClient *c) {
a4d1ba9a 4482 incrDecrCommand(c,-1);
ed9b544e 4483}
4484
4485static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4486 long long incr;
4487
bd79a6bd 4488 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4489 incrDecrCommand(c,incr);
ed9b544e 4490}
4491
4492static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4493 long long incr;
4494
bd79a6bd 4495 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4496 incrDecrCommand(c,-incr);
ed9b544e 4497}
4498
4b00bebd 4499static void appendCommand(redisClient *c) {
4500 int retval;
4501 size_t totlen;
4502 robj *o;
4503
4504 o = lookupKeyWrite(c->db,c->argv[1]);
4505 if (o == NULL) {
4506 /* Create the key */
4507 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4508 incrRefCount(c->argv[1]);
4509 incrRefCount(c->argv[2]);
4510 totlen = stringObjectLen(c->argv[2]);
4511 } else {
4512 dictEntry *de;
e0a62c7f 4513
4b00bebd 4514 de = dictFind(c->db->dict,c->argv[1]);
4515 assert(de != NULL);
4516
4517 o = dictGetEntryVal(de);
4518 if (o->type != REDIS_STRING) {
4519 addReply(c,shared.wrongtypeerr);
4520 return;
4521 }
4522 /* If the object is specially encoded or shared we have to make
4523 * a copy */
4524 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4525 robj *decoded = getDecodedObject(o);
4526
4527 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4528 decrRefCount(decoded);
4529 dictReplace(c->db->dict,c->argv[1],o);
4530 }
4531 /* APPEND! */
4532 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4533 o->ptr = sdscatlen(o->ptr,
4534 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4535 } else {
4536 o->ptr = sdscatprintf(o->ptr, "%ld",
4537 (unsigned long) c->argv[2]->ptr);
4538 }
4539 totlen = sdslen(o->ptr);
4540 }
4541 server.dirty++;
4542 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4543}
4544
39191553 4545static void substrCommand(redisClient *c) {
4546 robj *o;
4547 long start = atoi(c->argv[2]->ptr);
4548 long end = atoi(c->argv[3]->ptr);
dd88747b 4549 size_t rangelen, strlen;
4550 sds range;
39191553 4551
dd88747b 4552 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4553 checkType(c,o,REDIS_STRING)) return;
39191553 4554
dd88747b 4555 o = getDecodedObject(o);
4556 strlen = sdslen(o->ptr);
8fe7fad7 4557
dd88747b 4558 /* convert negative indexes */
4559 if (start < 0) start = strlen+start;
4560 if (end < 0) end = strlen+end;
4561 if (start < 0) start = 0;
4562 if (end < 0) end = 0;
39191553 4563
dd88747b 4564 /* indexes sanity checks */
4565 if (start > end || (size_t)start >= strlen) {
4566 /* Out of range start or start > end result in null reply */
4567 addReply(c,shared.nullbulk);
4568 decrRefCount(o);
4569 return;
39191553 4570 }
dd88747b 4571 if ((size_t)end >= strlen) end = strlen-1;
4572 rangelen = (end-start)+1;
4573
4574 /* Return the result */
4575 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4576 range = sdsnewlen((char*)o->ptr+start,rangelen);
4577 addReplySds(c,range);
4578 addReply(c,shared.crlf);
4579 decrRefCount(o);
39191553 4580}
4581
ed9b544e 4582/* ========================= Type agnostic commands ========================= */
4583
4584static void delCommand(redisClient *c) {
5109cdff 4585 int deleted = 0, j;
4586
4587 for (j = 1; j < c->argc; j++) {
4588 if (deleteKey(c->db,c->argv[j])) {
37ab76c9 4589 touchWatchedKey(c->db,c->argv[j]);
5109cdff 4590 server.dirty++;
4591 deleted++;
4592 }
4593 }
482b672d 4594 addReplyLongLong(c,deleted);
ed9b544e 4595}
4596
4597static void existsCommand(redisClient *c) {
f4f06efc
PN
4598 expireIfNeeded(c->db,c->argv[1]);
4599 if (dictFind(c->db->dict,c->argv[1])) {
4600 addReply(c, shared.cone);
4601 } else {
4602 addReply(c, shared.czero);
4603 }
ed9b544e 4604}
4605
4606static void selectCommand(redisClient *c) {
4607 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4608
ed9b544e 4609 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4610 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4611 } else {
4612 addReply(c,shared.ok);
4613 }
4614}
4615
4616static void randomkeyCommand(redisClient *c) {
4617 dictEntry *de;
dc4be23e 4618 robj *key;
e0a62c7f 4619
3305306f 4620 while(1) {
4621 de = dictGetRandomKey(c->db->dict);
ce7bef07 4622 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4623 }
2b619329 4624
ed9b544e 4625 if (de == NULL) {
dc4be23e 4626 addReply(c,shared.nullbulk);
4627 return;
4628 }
4629
4630 key = dictGetEntryKey(de);
4631 if (server.vm_enabled) {
4632 key = dupStringObject(key);
4633 addReplyBulk(c,key);
4634 decrRefCount(key);
ed9b544e 4635 } else {
dc4be23e 4636 addReplyBulk(c,key);
ed9b544e 4637 }
4638}
4639
4640static void keysCommand(redisClient *c) {
4641 dictIterator *di;
4642 dictEntry *de;
4643 sds pattern = c->argv[1]->ptr;
4644 int plen = sdslen(pattern);
a3f9eec2 4645 unsigned long numkeys = 0;
ed9b544e 4646 robj *lenobj = createObject(REDIS_STRING,NULL);
4647
3305306f 4648 di = dictGetIterator(c->db->dict);
ed9b544e 4649 addReply(c,lenobj);
4650 decrRefCount(lenobj);
4651 while((de = dictNext(di)) != NULL) {
4652 robj *keyobj = dictGetEntryKey(de);
3305306f 4653
ed9b544e 4654 sds key = keyobj->ptr;
4655 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4656 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4657 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4658 addReplyBulk(c,keyobj);
3305306f 4659 numkeys++;
3305306f 4660 }
ed9b544e 4661 }
4662 }
4663 dictReleaseIterator(di);
a3f9eec2 4664 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4665}
4666
4667static void dbsizeCommand(redisClient *c) {
4668 addReplySds(c,
3305306f 4669 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4670}
4671
4672static void lastsaveCommand(redisClient *c) {
4673 addReplySds(c,
c937aa89 4674 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4675}
4676
4677static void typeCommand(redisClient *c) {
3305306f 4678 robj *o;
ed9b544e 4679 char *type;
3305306f 4680
4681 o = lookupKeyRead(c->db,c->argv[1]);
4682 if (o == NULL) {
c937aa89 4683 type = "+none";
ed9b544e 4684 } else {
ed9b544e 4685 switch(o->type) {
c937aa89 4686 case REDIS_STRING: type = "+string"; break;
4687 case REDIS_LIST: type = "+list"; break;
4688 case REDIS_SET: type = "+set"; break;
412a8bce 4689 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4690 case REDIS_HASH: type = "+hash"; break;
4691 default: type = "+unknown"; break;
ed9b544e 4692 }
4693 }
4694 addReplySds(c,sdsnew(type));
4695 addReply(c,shared.crlf);
4696}
4697
4698static void saveCommand(redisClient *c) {
9d65a1bb 4699 if (server.bgsavechildpid != -1) {
05557f6d 4700 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4701 return;
4702 }
f78fd11b 4703 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4704 addReply(c,shared.ok);
4705 } else {
4706 addReply(c,shared.err);
4707 }
4708}
4709
4710static void bgsaveCommand(redisClient *c) {
9d65a1bb 4711 if (server.bgsavechildpid != -1) {
ed9b544e 4712 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4713 return;
4714 }
f78fd11b 4715 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4716 char *status = "+Background saving started\r\n";
4717 addReplySds(c,sdsnew(status));
ed9b544e 4718 } else {
4719 addReply(c,shared.err);
4720 }
4721}
4722
4723static void shutdownCommand(redisClient *c) {
fab43727 4724 if (prepareForShutdown() == REDIS_OK)
4725 exit(0);
4726 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
ed9b544e 4727}
4728
4729static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4730 robj *o;
4731
4732 /* To use the same key as src and dst is probably an error */
4733 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4734 addReply(c,shared.sameobjecterr);
ed9b544e 4735 return;
4736 }
4737
dd88747b 4738 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4739 return;
dd88747b 4740
ed9b544e 4741 incrRefCount(o);
3305306f 4742 deleteIfVolatile(c->db,c->argv[2]);
4743 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4744 if (nx) {
4745 decrRefCount(o);
c937aa89 4746 addReply(c,shared.czero);
ed9b544e 4747 return;
4748 }
3305306f 4749 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4750 } else {
4751 incrRefCount(c->argv[2]);
4752 }
3305306f 4753 deleteKey(c->db,c->argv[1]);
b167f877 4754 touchWatchedKey(c->db,c->argv[2]);
ed9b544e 4755 server.dirty++;
c937aa89 4756 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4757}
4758
4759static void renameCommand(redisClient *c) {
4760 renameGenericCommand(c,0);
4761}
4762
4763static void renamenxCommand(redisClient *c) {
4764 renameGenericCommand(c,1);
4765}
4766
4767static void moveCommand(redisClient *c) {
3305306f 4768 robj *o;
4769 redisDb *src, *dst;
ed9b544e 4770 int srcid;
4771
4772 /* Obtain source and target DB pointers */
3305306f 4773 src = c->db;
4774 srcid = c->db->id;
ed9b544e 4775 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4776 addReply(c,shared.outofrangeerr);
ed9b544e 4777 return;
4778 }
3305306f 4779 dst = c->db;
4780 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4781
4782 /* If the user is moving using as target the same
4783 * DB as the source DB it is probably an error. */
4784 if (src == dst) {
c937aa89 4785 addReply(c,shared.sameobjecterr);
ed9b544e 4786 return;
4787 }
4788
4789 /* Check if the element exists and get a reference */
3305306f 4790 o = lookupKeyWrite(c->db,c->argv[1]);
4791 if (!o) {
c937aa89 4792 addReply(c,shared.czero);
ed9b544e 4793 return;
4794 }
4795
4796 /* Try to add the element to the target DB */
3305306f 4797 deleteIfVolatile(dst,c->argv[1]);
4798 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4799 addReply(c,shared.czero);
ed9b544e 4800 return;
4801 }
3305306f 4802 incrRefCount(c->argv[1]);
ed9b544e 4803 incrRefCount(o);
4804
4805 /* OK! key moved, free the entry in the source DB */
3305306f 4806 deleteKey(src,c->argv[1]);
ed9b544e 4807 server.dirty++;
c937aa89 4808 addReply(c,shared.cone);
ed9b544e 4809}
4810
4811/* =================================== Lists ================================ */
c7d9d662
PN
4812static void lPush(robj *subject, robj *value, int where) {
4813 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4814 int pos = (where == REDIS_HEAD) ? ZIPLIST_HEAD : ZIPLIST_TAIL;
4815 value = getDecodedObject(value);
4816 subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),pos);
4817 decrRefCount(value);
4818 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4819 if (where == REDIS_HEAD) {
4820 listAddNodeHead(subject->ptr,value);
4821 } else {
4822 listAddNodeTail(subject->ptr,value);
4823 }
4824 incrRefCount(value);
4825 } else {
4826 redisPanic("Unknown list encoding");
4827 }
4828}
4829
d72562f7
PN
4830static robj *lPop(robj *subject, int where) {
4831 robj *value = NULL;
4832 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4833 unsigned char *p;
b6eb9703 4834 unsigned char *vstr;
d72562f7 4835 unsigned int vlen;
b6eb9703 4836 long long vlong;
d72562f7
PN
4837 int pos = (where == REDIS_HEAD) ? 0 : -1;
4838 p = ziplistIndex(subject->ptr,pos);
b6eb9703
PN
4839 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
4840 if (vstr) {
4841 value = createStringObject((char*)vstr,vlen);
d72562f7 4842 } else {
b6eb9703 4843 value = createStringObjectFromLongLong(vlong);
d72562f7 4844 }
0f62e177
PN
4845 /* We only need to delete an element when it exists */
4846 subject->ptr = ziplistDelete(subject->ptr,&p);
d72562f7 4847 }
d72562f7
PN
4848 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4849 list *list = subject->ptr;
4850 listNode *ln;
4851 if (where == REDIS_HEAD) {
4852 ln = listFirst(list);
4853 } else {
4854 ln = listLast(list);
4855 }
4856 if (ln != NULL) {
4857 value = listNodeValue(ln);
4858 incrRefCount(value);
4859 listDelNode(list,ln);
4860 }
4861 } else {
4862 redisPanic("Unknown list encoding");
4863 }
4864 return value;
4865}
4866
4867static unsigned long lLength(robj *subject) {
4868 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4869 return ziplistLen(subject->ptr);
4870 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4871 return listLength((list*)subject->ptr);
4872 } else {
4873 redisPanic("Unknown list encoding");
4874 }
4875}
4876
a6dd455b
PN
4877/* Structure to hold set iteration abstraction. */
4878typedef struct {
4879 robj *subject;
4880 unsigned char encoding;
be02a7c0 4881 unsigned char direction; /* Iteration direction */
a6dd455b
PN
4882 unsigned char *zi;
4883 listNode *ln;
4884} lIterator;
4885
be02a7c0
PN
4886/* Structure for an entry while iterating over a list. */
4887typedef struct {
4888 lIterator *li;
4889 unsigned char *zi; /* Entry in ziplist */
4890 listNode *ln; /* Entry in linked list */
4891} lEntry;
4892
a6dd455b 4893/* Initialize an iterator at the specified index. */
be02a7c0 4894static lIterator *lInitIterator(robj *subject, int index, unsigned char direction) {
a6dd455b
PN
4895 lIterator *li = zmalloc(sizeof(lIterator));
4896 li->subject = subject;
4897 li->encoding = subject->encoding;
be02a7c0 4898 li->direction = direction;
a6dd455b
PN
4899 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
4900 li->zi = ziplistIndex(subject->ptr,index);
4901 } else if (li->encoding == REDIS_ENCODING_LIST) {
4902 li->ln = listIndex(subject->ptr,index);
4903 } else {
4904 redisPanic("Unknown list encoding");
4905 }
4906 return li;
4907}
4908
4909/* Clean up the iterator. */
4910static void lReleaseIterator(lIterator *li) {
4911 zfree(li);
4912}
4913
be02a7c0
PN
4914/* Stores pointer to current the entry in the provided entry structure
4915 * and advances the position of the iterator. Returns 1 when the current
4916 * entry is in fact an entry, 0 otherwise. */
4917static int lNext(lIterator *li, lEntry *entry) {
4918 entry->li = li;
d2ee16ab 4919 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
be02a7c0
PN
4920 entry->zi = li->zi;
4921 if (entry->zi != NULL) {
4922 if (li->direction == REDIS_TAIL)
4923 li->zi = ziplistNext(li->subject->ptr,li->zi);
4924 else
4925 li->zi = ziplistPrev(li->subject->ptr,li->zi);
4926 return 1;
4927 }
d2ee16ab 4928 } else if (li->encoding == REDIS_ENCODING_LIST) {
be02a7c0
PN
4929 entry->ln = li->ln;
4930 if (entry->ln != NULL) {
4931 if (li->direction == REDIS_TAIL)
4932 li->ln = li->ln->next;
4933 else
4934 li->ln = li->ln->prev;
4935 return 1;
4936 }
d2ee16ab
PN
4937 } else {
4938 redisPanic("Unknown list encoding");
4939 }
be02a7c0 4940 return 0;
d2ee16ab
PN
4941}
4942
a6dd455b 4943/* Return entry or NULL at the current position of the iterator. */
be02a7c0
PN
4944static robj *lGet(lEntry *entry) {
4945 lIterator *li = entry->li;
a6dd455b
PN
4946 robj *value = NULL;
4947 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
b6eb9703 4948 unsigned char *vstr;
a6dd455b 4949 unsigned int vlen;
b6eb9703 4950 long long vlong;
be02a7c0 4951 redisAssert(entry->zi != NULL);
b6eb9703
PN
4952 if (ziplistGet(entry->zi,&vstr,&vlen,&vlong)) {
4953 if (vstr) {
4954 value = createStringObject((char*)vstr,vlen);
a6dd455b 4955 } else {
b6eb9703 4956 value = createStringObjectFromLongLong(vlong);
a6dd455b
PN
4957 }
4958 }
4959 } else if (li->encoding == REDIS_ENCODING_LIST) {
be02a7c0
PN
4960 redisAssert(entry->ln != NULL);
4961 value = listNodeValue(entry->ln);
a6dd455b
PN
4962 incrRefCount(value);
4963 } else {
4964 redisPanic("Unknown list encoding");
4965 }
4966 return value;
4967}
4968
d2ee16ab 4969/* Compare the given object with the entry at the current position. */
be02a7c0
PN
4970static int lEqual(lEntry *entry, robj *o) {
4971 lIterator *li = entry->li;
d2ee16ab
PN
4972 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
4973 redisAssert(o->encoding == REDIS_ENCODING_RAW);
be02a7c0 4974 return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr));
d2ee16ab 4975 } else if (li->encoding == REDIS_ENCODING_LIST) {
be02a7c0 4976 return equalStringObjects(o,listNodeValue(entry->ln));
d2ee16ab
PN
4977 } else {
4978 redisPanic("Unknown list encoding");
4979 }
4980}
4981
be02a7c0
PN
4982/* Delete the element pointed to. */
4983static void lDelete(lEntry *entry) {
4984 lIterator *li = entry->li;
a6dd455b 4985 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
be02a7c0
PN
4986 unsigned char *p = entry->zi;
4987 li->subject->ptr = ziplistDelete(li->subject->ptr,&p);
4988
4989 /* Update position of the iterator depending on the direction */
4990 if (li->direction == REDIS_TAIL)
4991 li->zi = p;
a6dd455b 4992 else
be02a7c0
PN
4993 li->zi = ziplistPrev(li->subject->ptr,p);
4994 } else if (entry->li->encoding == REDIS_ENCODING_LIST) {
4995 listNode *next;
4996 if (li->direction == REDIS_TAIL)
4997 next = entry->ln->next;
a6dd455b 4998 else
be02a7c0
PN
4999 next = entry->ln->prev;
5000 listDelNode(li->subject->ptr,entry->ln);
5001 li->ln = next;
a6dd455b
PN
5002 } else {
5003 redisPanic("Unknown list encoding");
5004 }
5005}
3305306f 5006
c7d9d662
PN
5007static void pushGenericCommand(redisClient *c, int where) {
5008 robj *lobj = lookupKeyWrite(c->db,c->argv[1]);
3305306f 5009 if (lobj == NULL) {
95242ab5 5010 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 5011 addReply(c,shared.cone);
95242ab5 5012 return;
5013 }
1cd92e7f 5014 lobj = createZiplistObject();
3305306f 5015 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 5016 incrRefCount(c->argv[1]);
ed9b544e 5017 } else {
ed9b544e 5018 if (lobj->type != REDIS_LIST) {
5019 addReply(c,shared.wrongtypeerr);
5020 return;
5021 }
95242ab5 5022 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 5023 addReply(c,shared.cone);
95242ab5 5024 return;
5025 }
ed9b544e 5026 }
c7d9d662
PN
5027 lPush(lobj,c->argv[2],where);
5028 addReplyLongLong(c,lLength(lobj));
ed9b544e 5029 server.dirty++;
ed9b544e 5030}
5031
5032static void lpushCommand(redisClient *c) {
5033 pushGenericCommand(c,REDIS_HEAD);
5034}
5035
5036static void rpushCommand(redisClient *c) {
5037 pushGenericCommand(c,REDIS_TAIL);
5038}
5039
5040static void llenCommand(redisClient *c) {
d72562f7
PN
5041 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
5042 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
5043 addReplyUlong(c,lLength(o));
ed9b544e 5044}
5045
5046static void lindexCommand(redisClient *c) {
697bd567
PN
5047 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
5048 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
ed9b544e 5049 int index = atoi(c->argv[2]->ptr);
bd8db0ad 5050 robj *value = NULL;
dd88747b 5051
697bd567
PN
5052 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5053 unsigned char *p;
b6eb9703 5054 unsigned char *vstr;
697bd567 5055 unsigned int vlen;
b6eb9703 5056 long long vlong;
697bd567 5057 p = ziplistIndex(o->ptr,index);
b6eb9703
PN
5058 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
5059 if (vstr) {
5060 value = createStringObject((char*)vstr,vlen);
697bd567 5061 } else {
b6eb9703 5062 value = createStringObjectFromLongLong(vlong);
697bd567 5063 }
bd8db0ad
PN
5064 addReplyBulk(c,value);
5065 decrRefCount(value);
697bd567
PN
5066 } else {
5067 addReply(c,shared.nullbulk);
5068 }
5069 } else if (o->encoding == REDIS_ENCODING_LIST) {
5070 listNode *ln = listIndex(o->ptr,index);
5071 if (ln != NULL) {
bd8db0ad
PN
5072 value = listNodeValue(ln);
5073 addReplyBulk(c,value);
697bd567
PN
5074 } else {
5075 addReply(c,shared.nullbulk);
5076 }
ed9b544e 5077 } else {
697bd567 5078 redisPanic("Unknown list encoding");
ed9b544e 5079 }
5080}
5081
5082static void lsetCommand(redisClient *c) {
697bd567
PN
5083 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr);
5084 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
ed9b544e 5085 int index = atoi(c->argv[2]->ptr);
697bd567 5086 robj *value = c->argv[3];
dd88747b 5087
697bd567
PN
5088 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5089 unsigned char *p, *zl = o->ptr;
5090 p = ziplistIndex(zl,index);
5091 if (p == NULL) {
5092 addReply(c,shared.outofrangeerr);
5093 } else {
be02a7c0 5094 o->ptr = ziplistDelete(o->ptr,&p);
697bd567
PN
5095 value = getDecodedObject(value);
5096 o->ptr = ziplistInsert(o->ptr,p,value->ptr,sdslen(value->ptr));
5097 decrRefCount(value);
5098 addReply(c,shared.ok);
5099 server.dirty++;
5100 }
5101 } else if (o->encoding == REDIS_ENCODING_LIST) {
5102 listNode *ln = listIndex(o->ptr,index);
5103 if (ln == NULL) {
5104 addReply(c,shared.outofrangeerr);
5105 } else {
5106 decrRefCount((robj*)listNodeValue(ln));
5107 listNodeValue(ln) = value;
5108 incrRefCount(value);
5109 addReply(c,shared.ok);
5110 server.dirty++;
5111 }
ed9b544e 5112 } else {
697bd567 5113 redisPanic("Unknown list encoding");
ed9b544e 5114 }
5115}
5116
5117static void popGenericCommand(redisClient *c, int where) {
d72562f7
PN
5118 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
5119 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
3305306f 5120
d72562f7
PN
5121 robj *value = lPop(o,where);
5122 if (value == NULL) {
dd88747b 5123 addReply(c,shared.nullbulk);
5124 } else {
d72562f7
PN
5125 addReplyBulk(c,value);
5126 decrRefCount(value);
5127 if (lLength(o) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5128 server.dirty++;
ed9b544e 5129 }
5130}
5131
5132static void lpopCommand(redisClient *c) {
5133 popGenericCommand(c,REDIS_HEAD);
5134}
5135
5136static void rpopCommand(redisClient *c) {
5137 popGenericCommand(c,REDIS_TAIL);
5138}
5139
5140static void lrangeCommand(redisClient *c) {
a6dd455b 5141 robj *o, *value;
ed9b544e 5142 int start = atoi(c->argv[2]->ptr);
5143 int end = atoi(c->argv[3]->ptr);
dd88747b 5144 int llen;
5145 int rangelen, j;
be02a7c0 5146 lEntry entry;
dd88747b 5147
4e27f268 5148 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5149 || checkType(c,o,REDIS_LIST)) return;
a6dd455b 5150 llen = lLength(o);
dd88747b 5151
5152 /* convert negative indexes */
5153 if (start < 0) start = llen+start;
5154 if (end < 0) end = llen+end;
5155 if (start < 0) start = 0;
5156 if (end < 0) end = 0;
5157
5158 /* indexes sanity checks */
5159 if (start > end || start >= llen) {
5160 /* Out of range start or start > end result in empty list */
5161 addReply(c,shared.emptymultibulk);
5162 return;
5163 }
5164 if (end >= llen) end = llen-1;
5165 rangelen = (end-start)+1;
3305306f 5166
dd88747b 5167 /* Return the result in form of a multi-bulk reply */
dd88747b 5168 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
be02a7c0 5169 lIterator *li = lInitIterator(o,start,REDIS_TAIL);
dd88747b 5170 for (j = 0; j < rangelen; j++) {
be02a7c0
PN
5171 redisAssert(lNext(li,&entry));
5172 value = lGet(&entry);
a6dd455b 5173 addReplyBulk(c,value);
be02a7c0 5174 decrRefCount(value);
ed9b544e 5175 }
a6dd455b 5176 lReleaseIterator(li);
ed9b544e 5177}
5178
5179static void ltrimCommand(redisClient *c) {
3305306f 5180 robj *o;
ed9b544e 5181 int start = atoi(c->argv[2]->ptr);
5182 int end = atoi(c->argv[3]->ptr);
dd88747b 5183 int llen;
5184 int j, ltrim, rtrim;
5185 list *list;
5186 listNode *ln;
5187
5188 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
5189 checkType(c,o,REDIS_LIST)) return;
9ae6b0be 5190 llen = lLength(o);
dd88747b 5191
5192 /* convert negative indexes */
5193 if (start < 0) start = llen+start;
5194 if (end < 0) end = llen+end;
5195 if (start < 0) start = 0;
5196 if (end < 0) end = 0;
5197
5198 /* indexes sanity checks */
5199 if (start > end || start >= llen) {
5200 /* Out of range start or start > end result in empty list */
5201 ltrim = llen;
5202 rtrim = 0;
ed9b544e 5203 } else {
dd88747b 5204 if (end >= llen) end = llen-1;
5205 ltrim = start;
5206 rtrim = llen-end-1;
5207 }
ed9b544e 5208
dd88747b 5209 /* Remove list elements to perform the trim */
9ae6b0be
PN
5210 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5211 o->ptr = ziplistDeleteRange(o->ptr,0,ltrim);
5212 o->ptr = ziplistDeleteRange(o->ptr,-rtrim,rtrim);
5213 } else if (o->encoding == REDIS_ENCODING_LIST) {
5214 list = o->ptr;
5215 for (j = 0; j < ltrim; j++) {
5216 ln = listFirst(list);
5217 listDelNode(list,ln);
5218 }
5219 for (j = 0; j < rtrim; j++) {
5220 ln = listLast(list);
5221 listDelNode(list,ln);
5222 }
5223 } else {
5224 redisPanic("Unknown list encoding");
ed9b544e 5225 }
9ae6b0be 5226 if (lLength(o) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5227 server.dirty++;
5228 addReply(c,shared.ok);
ed9b544e 5229}
5230
5231static void lremCommand(redisClient *c) {
d2ee16ab 5232 robj *subject, *obj = c->argv[3];
dd88747b 5233 int toremove = atoi(c->argv[2]->ptr);
5234 int removed = 0;
be02a7c0 5235 lEntry entry;
a4d1ba9a 5236
d2ee16ab
PN
5237 subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
5238 if (subject == NULL || checkType(c,subject,REDIS_LIST)) return;
dd88747b 5239
d2ee16ab
PN
5240 /* Make sure obj is raw when we're dealing with a ziplist */
5241 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5242 obj = getDecodedObject(obj);
5243
5244 lIterator *li;
dd88747b 5245 if (toremove < 0) {
5246 toremove = -toremove;
be02a7c0 5247 li = lInitIterator(subject,-1,REDIS_HEAD);
d2ee16ab 5248 } else {
be02a7c0 5249 li = lInitIterator(subject,0,REDIS_TAIL);
dd88747b 5250 }
dd88747b 5251
be02a7c0
PN
5252 while (lNext(li,&entry)) {
5253 if (lEqual(&entry,obj)) {
5254 lDelete(&entry);
dd88747b 5255 server.dirty++;
5256 removed++;
3fbf9001 5257 if (toremove && removed == toremove) break;
ed9b544e 5258 }
5259 }
d2ee16ab
PN
5260 lReleaseIterator(li);
5261
5262 /* Clean up raw encoded object */
5263 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5264 decrRefCount(obj);
5265
5266 if (lLength(subject) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5267 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 5268}
5269
12f9d551 5270/* This is the semantic of this command:
0f5f7e9a 5271 * RPOPLPUSH srclist dstlist:
12f9d551 5272 * IF LLEN(srclist) > 0
5273 * element = RPOP srclist
5274 * LPUSH dstlist element
5275 * RETURN element
5276 * ELSE
5277 * RETURN nil
5278 * END
5279 * END
5280 *
5281 * The idea is to be able to get an element from a list in a reliable way
5282 * since the element is not just returned but pushed against another list
5283 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5284 */
0f5f7e9a 5285static void rpoplpushcommand(redisClient *c) {
0f62e177 5286 robj *sobj, *value;
dd88747b 5287 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5288 checkType(c,sobj,REDIS_LIST)) return;
12f9d551 5289
0f62e177 5290 if (lLength(sobj) == 0) {
12f9d551 5291 addReply(c,shared.nullbulk);
5292 } else {
dd88747b 5293 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
0f62e177
PN
5294 if (dobj && checkType(c,dobj,REDIS_LIST)) return;
5295 value = lPop(sobj,REDIS_TAIL);
12f9d551 5296
dd88747b 5297 /* Add the element to the target list (unless it's directly
5298 * passed to some BLPOP-ing client */
0f62e177
PN
5299 if (!handleClientsWaitingListPush(c,c->argv[2],value)) {
5300 /* Create the list if the key does not exist */
5301 if (!dobj) {
1cd92e7f 5302 dobj = createZiplistObject();
dd88747b 5303 dictAdd(c->db->dict,c->argv[2],dobj);
5304 incrRefCount(c->argv[2]);
12f9d551 5305 }
0f62e177 5306 lPush(dobj,value,REDIS_HEAD);
12f9d551 5307 }
dd88747b 5308
5309 /* Send the element to the client as reply as well */
0f62e177
PN
5310 addReplyBulk(c,value);
5311
5312 /* lPop returns an object with its refcount incremented */
5313 decrRefCount(value);
dd88747b 5314
0f62e177
PN
5315 /* Delete the source list when it is empty */
5316 if (lLength(sobj) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5317 server.dirty++;
12f9d551 5318 }
5319}
5320
ed9b544e 5321/* ==================================== Sets ================================ */
5322
5323static void saddCommand(redisClient *c) {
ed9b544e 5324 robj *set;
5325
3305306f 5326 set = lookupKeyWrite(c->db,c->argv[1]);
5327 if (set == NULL) {
ed9b544e 5328 set = createSetObject();
3305306f 5329 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 5330 incrRefCount(c->argv[1]);
5331 } else {
ed9b544e 5332 if (set->type != REDIS_SET) {
c937aa89 5333 addReply(c,shared.wrongtypeerr);
ed9b544e 5334 return;
5335 }
5336 }
5337 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5338 incrRefCount(c->argv[2]);
5339 server.dirty++;
c937aa89 5340 addReply(c,shared.cone);
ed9b544e 5341 } else {
c937aa89 5342 addReply(c,shared.czero);
ed9b544e 5343 }
5344}
5345
5346static void sremCommand(redisClient *c) {
3305306f 5347 robj *set;
ed9b544e 5348
dd88747b 5349 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5350 checkType(c,set,REDIS_SET)) return;
5351
5352 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5353 server.dirty++;
5354 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5355 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5356 addReply(c,shared.cone);
ed9b544e 5357 } else {
dd88747b 5358 addReply(c,shared.czero);
ed9b544e 5359 }
5360}
5361
a4460ef4 5362static void smoveCommand(redisClient *c) {
5363 robj *srcset, *dstset;
5364
5365 srcset = lookupKeyWrite(c->db,c->argv[1]);
5366 dstset = lookupKeyWrite(c->db,c->argv[2]);
5367
5368 /* If the source key does not exist return 0, if it's of the wrong type
5369 * raise an error */
5370 if (srcset == NULL || srcset->type != REDIS_SET) {
5371 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5372 return;
5373 }
5374 /* Error if the destination key is not a set as well */
5375 if (dstset && dstset->type != REDIS_SET) {
5376 addReply(c,shared.wrongtypeerr);
5377 return;
5378 }
5379 /* Remove the element from the source set */
5380 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5381 /* Key not found in the src set! return zero */
5382 addReply(c,shared.czero);
5383 return;
5384 }
3ea27d37 5385 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5386 deleteKey(c->db,c->argv[1]);
a4460ef4 5387 server.dirty++;
5388 /* Add the element to the destination set */
5389 if (!dstset) {
5390 dstset = createSetObject();
5391 dictAdd(c->db->dict,c->argv[2],dstset);
5392 incrRefCount(c->argv[2]);
5393 }
5394 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5395 incrRefCount(c->argv[3]);
5396 addReply(c,shared.cone);
5397}
5398
ed9b544e 5399static void sismemberCommand(redisClient *c) {
3305306f 5400 robj *set;
ed9b544e 5401
dd88747b 5402 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5403 checkType(c,set,REDIS_SET)) return;
5404
5405 if (dictFind(set->ptr,c->argv[2]))
5406 addReply(c,shared.cone);
5407 else
c937aa89 5408 addReply(c,shared.czero);
ed9b544e 5409}
5410
5411static void scardCommand(redisClient *c) {
3305306f 5412 robj *o;
ed9b544e 5413 dict *s;
dd88747b 5414
5415 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5416 checkType(c,o,REDIS_SET)) return;
e0a62c7f 5417
dd88747b 5418 s = o->ptr;
5419 addReplyUlong(c,dictSize(s));
ed9b544e 5420}
5421
12fea928 5422static void spopCommand(redisClient *c) {
5423 robj *set;
5424 dictEntry *de;
5425
dd88747b 5426 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5427 checkType(c,set,REDIS_SET)) return;
5428
5429 de = dictGetRandomKey(set->ptr);
5430 if (de == NULL) {
12fea928 5431 addReply(c,shared.nullbulk);
5432 } else {
dd88747b 5433 robj *ele = dictGetEntryKey(de);
12fea928 5434
dd88747b 5435 addReplyBulk(c,ele);
5436 dictDelete(set->ptr,ele);
5437 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5438 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5439 server.dirty++;
12fea928 5440 }
5441}
5442
2abb95a9 5443static void srandmemberCommand(redisClient *c) {
5444 robj *set;
5445 dictEntry *de;
5446
dd88747b 5447 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5448 checkType(c,set,REDIS_SET)) return;
5449
5450 de = dictGetRandomKey(set->ptr);
5451 if (de == NULL) {
2abb95a9 5452 addReply(c,shared.nullbulk);
5453 } else {
dd88747b 5454 robj *ele = dictGetEntryKey(de);
2abb95a9 5455
dd88747b 5456 addReplyBulk(c,ele);
2abb95a9 5457 }
5458}
5459
ed9b544e 5460static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5461 dict **d1 = (void*) s1, **d2 = (void*) s2;
5462
3305306f 5463 return dictSize(*d1)-dictSize(*d2);
ed9b544e 5464}
5465
682ac724 5466static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 5467 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5468 dictIterator *di;
5469 dictEntry *de;
5470 robj *lenobj = NULL, *dstset = NULL;
682ac724 5471 unsigned long j, cardinality = 0;
ed9b544e 5472
ed9b544e 5473 for (j = 0; j < setsnum; j++) {
5474 robj *setobj;
3305306f 5475
5476 setobj = dstkey ?
5477 lookupKeyWrite(c->db,setskeys[j]) :
5478 lookupKeyRead(c->db,setskeys[j]);
5479 if (!setobj) {
ed9b544e 5480 zfree(dv);
5faa6025 5481 if (dstkey) {
fdcaae84 5482 if (deleteKey(c->db,dstkey))
5483 server.dirty++;
0d36ded0 5484 addReply(c,shared.czero);
5faa6025 5485 } else {
4e27f268 5486 addReply(c,shared.emptymultibulk);
5faa6025 5487 }
ed9b544e 5488 return;
5489 }
ed9b544e 5490 if (setobj->type != REDIS_SET) {
5491 zfree(dv);
c937aa89 5492 addReply(c,shared.wrongtypeerr);
ed9b544e 5493 return;
5494 }
5495 dv[j] = setobj->ptr;
5496 }
5497 /* Sort sets from the smallest to largest, this will improve our
5498 * algorithm's performace */
5499 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5500
5501 /* The first thing we should output is the total number of elements...
5502 * since this is a multi-bulk write, but at this stage we don't know
5503 * the intersection set size, so we use a trick, append an empty object
5504 * to the output list and save the pointer to later modify it with the
5505 * right length */
5506 if (!dstkey) {
5507 lenobj = createObject(REDIS_STRING,NULL);
5508 addReply(c,lenobj);
5509 decrRefCount(lenobj);
5510 } else {
5511 /* If we have a target key where to store the resulting set
5512 * create this key with an empty set inside */
5513 dstset = createSetObject();
ed9b544e 5514 }
5515
5516 /* Iterate all the elements of the first (smallest) set, and test
5517 * the element against all the other sets, if at least one set does
5518 * not include the element it is discarded */
5519 di = dictGetIterator(dv[0]);
ed9b544e 5520
5521 while((de = dictNext(di)) != NULL) {
5522 robj *ele;
5523
5524 for (j = 1; j < setsnum; j++)
5525 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5526 if (j != setsnum)
5527 continue; /* at least one set does not contain the member */
5528 ele = dictGetEntryKey(de);
5529 if (!dstkey) {
dd88747b 5530 addReplyBulk(c,ele);
ed9b544e 5531 cardinality++;
5532 } else {
5533 dictAdd(dstset->ptr,ele,NULL);
5534 incrRefCount(ele);
5535 }
5536 }
5537 dictReleaseIterator(di);
5538
83cdfe18 5539 if (dstkey) {
3ea27d37 5540 /* Store the resulting set into the target, if the intersection
5541 * is not an empty set. */
83cdfe18 5542 deleteKey(c->db,dstkey);
3ea27d37 5543 if (dictSize((dict*)dstset->ptr) > 0) {
5544 dictAdd(c->db->dict,dstkey,dstset);
5545 incrRefCount(dstkey);
482b672d 5546 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5547 } else {
5548 decrRefCount(dstset);
d36c4e97 5549 addReply(c,shared.czero);
3ea27d37 5550 }
40d224a9 5551 server.dirty++;
d36c4e97 5552 } else {
5553 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 5554 }
ed9b544e 5555 zfree(dv);
5556}
5557
5558static void sinterCommand(redisClient *c) {
5559 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5560}
5561
5562static void sinterstoreCommand(redisClient *c) {
5563 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5564}
5565
/* Operation selectors passed as the 'op' argument of
 * sunionDiffGenericCommand(). REDIS_OP_INTER is not referenced by the
 * set commands defined in this section. */
f4f56e1d 5566#define REDIS_OP_UNION 0
5567#define REDIS_OP_DIFF 1
2830ca53 5568#define REDIS_OP_INTER 2
f4f56e1d 5569
5570static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 5571 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5572 dictIterator *di;
5573 dictEntry *de;
f4f56e1d 5574 robj *dstset = NULL;
40d224a9 5575 int j, cardinality = 0;
5576
40d224a9 5577 for (j = 0; j < setsnum; j++) {
5578 robj *setobj;
5579
5580 setobj = dstkey ?
5581 lookupKeyWrite(c->db,setskeys[j]) :
5582 lookupKeyRead(c->db,setskeys[j]);
5583 if (!setobj) {
5584 dv[j] = NULL;
5585 continue;
5586 }
5587 if (setobj->type != REDIS_SET) {
5588 zfree(dv);
5589 addReply(c,shared.wrongtypeerr);
5590 return;
5591 }
5592 dv[j] = setobj->ptr;
5593 }
5594
5595 /* We need a temp set object to store our union. If the dstkey
5596 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5597 * this set object will be the resulting object to set into the target key*/
5598 dstset = createSetObject();
5599
40d224a9 5600 /* Iterate all the elements of all the sets, add every element a single
5601 * time to the result set */
5602 for (j = 0; j < setsnum; j++) {
51829ed3 5603 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 5604 if (!dv[j]) continue; /* non existing keys are like empty sets */
5605
5606 di = dictGetIterator(dv[j]);
40d224a9 5607
5608 while((de = dictNext(di)) != NULL) {
5609 robj *ele;
5610
5611 /* dictAdd will not add the same element multiple times */
5612 ele = dictGetEntryKey(de);
f4f56e1d 5613 if (op == REDIS_OP_UNION || j == 0) {
5614 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5615 incrRefCount(ele);
40d224a9 5616 cardinality++;
5617 }
f4f56e1d 5618 } else if (op == REDIS_OP_DIFF) {
5619 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5620 cardinality--;
5621 }
40d224a9 5622 }
5623 }
5624 dictReleaseIterator(di);
51829ed3 5625
d36c4e97 5626 /* result set is empty? Exit asap. */
5627 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5628 }
5629
f4f56e1d 5630 /* Output the content of the resulting set, if not in STORE mode */
5631 if (!dstkey) {
5632 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5633 di = dictGetIterator(dstset->ptr);
f4f56e1d 5634 while((de = dictNext(di)) != NULL) {
5635 robj *ele;
5636
5637 ele = dictGetEntryKey(de);
dd88747b 5638 addReplyBulk(c,ele);
f4f56e1d 5639 }
5640 dictReleaseIterator(di);
d36c4e97 5641 decrRefCount(dstset);
83cdfe18
AG
5642 } else {
5643 /* If we have a target key where to store the resulting set
5644 * create this key with the result set inside */
5645 deleteKey(c->db,dstkey);
3ea27d37 5646 if (dictSize((dict*)dstset->ptr) > 0) {
5647 dictAdd(c->db->dict,dstkey,dstset);
5648 incrRefCount(dstkey);
482b672d 5649 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5650 } else {
5651 decrRefCount(dstset);
d36c4e97 5652 addReply(c,shared.czero);
3ea27d37 5653 }
40d224a9 5654 server.dirty++;
5655 }
5656 zfree(dv);
5657}
5658
5659static void sunionCommand(redisClient *c) {
f4f56e1d 5660 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 5661}
5662
5663static void sunionstoreCommand(redisClient *c) {
f4f56e1d 5664 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5665}
5666
5667static void sdiffCommand(redisClient *c) {
5668 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5669}
5670
5671static void sdiffstoreCommand(redisClient *c) {
5672 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 5673}
5674
6b47e12e 5675/* ==================================== ZSets =============================== */
5676
5677/* ZSETs are ordered sets using two data structures to hold the same elements
5678 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5679 * data structure.
5680 *
5681 * The elements are added to a hash table mapping Redis objects to scores.
5682 * At the same time the elements are added to a skip list mapping scores
5683 * to Redis objects (so objects are sorted by scores in this "view"). */
5684
5685/* This skiplist implementation is almost a C translation of the original
5686 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5687 * Alternative to Balanced Trees", modified in three ways:
5688 * a) this implementation allows for repeated values.
5689 * b) the comparison is not just by key (our 'score') but by satellite data.
5690 * c) there is a back pointer, so it's a doubly linked list with the back
5691 * pointers being only at "level 1". This makes it possible to traverse the list
5692 * from tail to head, useful for ZREVRANGE. */
5693
5694static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5695 zskiplistNode *zn = zmalloc(sizeof(*zn));
5696
5697 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2f4dd7e0 5698 if (level > 1)
2b37892e 5699 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
2f4dd7e0 5700 else
5701 zn->span = NULL;
6b47e12e 5702 zn->score = score;
5703 zn->obj = obj;
5704 return zn;
5705}
5706
5707static zskiplist *zslCreate(void) {
5708 int j;
5709 zskiplist *zsl;
e0a62c7f 5710
6b47e12e 5711 zsl = zmalloc(sizeof(*zsl));
5712 zsl->level = 1;
cc812361 5713 zsl->length = 0;
6b47e12e 5714 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5715 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5716 zsl->header->forward[j] = NULL;
94e543b5 5717
5718 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5719 if (j < ZSKIPLIST_MAXLEVEL-1)
5720 zsl->header->span[j] = 0;
69d95c3e 5721 }
e3870fab 5722 zsl->header->backward = NULL;
5723 zsl->tail = NULL;
6b47e12e 5724 return zsl;
5725}
5726
fd8ccf44 5727static void zslFreeNode(zskiplistNode *node) {
5728 decrRefCount(node->obj);
ad807e6f 5729 zfree(node->forward);
69d95c3e 5730 zfree(node->span);
fd8ccf44 5731 zfree(node);
5732}
5733
5734static void zslFree(zskiplist *zsl) {
ad807e6f 5735 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5736
ad807e6f 5737 zfree(zsl->header->forward);
69d95c3e 5738 zfree(zsl->header->span);
ad807e6f 5739 zfree(zsl->header);
fd8ccf44 5740 while(node) {
599379dd 5741 next = node->forward[0];
fd8ccf44 5742 zslFreeNode(node);
5743 node = next;
5744 }
ad807e6f 5745 zfree(zsl);
fd8ccf44 5746}
5747
6b47e12e 5748static int zslRandomLevel(void) {
5749 int level = 1;
5750 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5751 level += 1;
10c2baa5 5752 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5753}
5754
5755static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5756 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5757 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5758 int i, level;
5759
5760 x = zsl->header;
5761 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5762 /* store rank that is crossed to reach the insert position */
5763 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5764
9d60e6e4 5765 while (x->forward[i] &&
5766 (x->forward[i]->score < score ||
5767 (x->forward[i]->score == score &&
69d95c3e 5768 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5769 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5770 x = x->forward[i];
69d95c3e 5771 }
6b47e12e 5772 update[i] = x;
5773 }
6b47e12e 5774 /* we assume the key is not already inside, since we allow duplicated
5775 * scores, and the re-insertion of score and redis object should never
5776 * happpen since the caller of zslInsert() should test in the hash table
5777 * if the element is already inside or not. */
5778 level = zslRandomLevel();
5779 if (level > zsl->level) {
69d95c3e 5780 for (i = zsl->level; i < level; i++) {
2b37892e 5781 rank[i] = 0;
6b47e12e 5782 update[i] = zsl->header;
2b37892e 5783 update[i]->span[i-1] = zsl->length;
69d95c3e 5784 }
6b47e12e 5785 zsl->level = level;
5786 }
5787 x = zslCreateNode(level,score,obj);
5788 for (i = 0; i < level; i++) {
5789 x->forward[i] = update[i]->forward[i];
5790 update[i]->forward[i] = x;
69d95c3e
PN
5791
5792 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5793 if (i > 0) {
5794 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5795 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5796 }
6b47e12e 5797 }
69d95c3e
PN
5798
5799 /* increment span for untouched levels */
5800 for (i = level; i < zsl->level; i++) {
2b37892e 5801 update[i]->span[i-1]++;
69d95c3e
PN
5802 }
5803
bb975144 5804 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5805 if (x->forward[0])
5806 x->forward[0]->backward = x;
5807 else
5808 zsl->tail = x;
cc812361 5809 zsl->length++;
6b47e12e 5810}
5811
84105336
PN
5812/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5813void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5814 int i;
5815 for (i = 0; i < zsl->level; i++) {
5816 if (update[i]->forward[i] == x) {
5817 if (i > 0) {
5818 update[i]->span[i-1] += x->span[i-1] - 1;
5819 }
5820 update[i]->forward[i] = x->forward[i];
5821 } else {
5822 /* invariant: i > 0, because update[0]->forward[0]
5823 * is always equal to x */
5824 update[i]->span[i-1] -= 1;
5825 }
5826 }
5827 if (x->forward[0]) {
5828 x->forward[0]->backward = x->backward;
5829 } else {
5830 zsl->tail = x->backward;
5831 }
5832 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5833 zsl->level--;
5834 zsl->length--;
5835}
5836
50c55df5 5837/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5838static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5839 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5840 int i;
5841
5842 x = zsl->header;
5843 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5844 while (x->forward[i] &&
5845 (x->forward[i]->score < score ||
5846 (x->forward[i]->score == score &&
5847 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5848 x = x->forward[i];
5849 update[i] = x;
5850 }
5851 /* We may have multiple elements with the same score, what we need
5852 * is to find the element with both the right score and object. */
5853 x = x->forward[0];
bf028098 5854 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
84105336 5855 zslDeleteNode(zsl, x, update);
9d60e6e4 5856 zslFreeNode(x);
9d60e6e4 5857 return 1;
5858 } else {
5859 return 0; /* not found */
e197b441 5860 }
5861 return 0; /* not found */
fd8ccf44 5862}
5863
1807985b 5864/* Delete all the elements with score between min and max from the skiplist.
5865 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5866 * Note that this function takes the reference to the hash table view of the
5867 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5868static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5869 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5870 unsigned long removed = 0;
5871 int i;
5872
5873 x = zsl->header;
5874 for (i = zsl->level-1; i >= 0; i--) {
5875 while (x->forward[i] && x->forward[i]->score < min)
5876 x = x->forward[i];
5877 update[i] = x;
5878 }
5879 /* We may have multiple elements with the same score, what we need
5880 * is to find the element with both the right score and object. */
5881 x = x->forward[0];
5882 while (x && x->score <= max) {
84105336
PN
5883 zskiplistNode *next = x->forward[0];
5884 zslDeleteNode(zsl, x, update);
1807985b 5885 dictDelete(dict,x->obj);
5886 zslFreeNode(x);
1807985b 5887 removed++;
5888 x = next;
5889 }
5890 return removed; /* not found */
5891}
1807985b 5892
9212eafd 5893/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5894 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5895static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5896 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5897 unsigned long traversed = 0, removed = 0;
5898 int i;
5899
9212eafd
PN
5900 x = zsl->header;
5901 for (i = zsl->level-1; i >= 0; i--) {
5902 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5903 traversed += i > 0 ? x->span[i-1] : 1;
5904 x = x->forward[i];
1807985b 5905 }
9212eafd
PN
5906 update[i] = x;
5907 }
5908
5909 traversed++;
5910 x = x->forward[0];
5911 while (x && traversed <= end) {
84105336
PN
5912 zskiplistNode *next = x->forward[0];
5913 zslDeleteNode(zsl, x, update);
1807985b 5914 dictDelete(dict,x->obj);
5915 zslFreeNode(x);
1807985b 5916 removed++;
9212eafd 5917 traversed++;
1807985b 5918 x = next;
5919 }
9212eafd 5920 return removed;
1807985b 5921}
5922
50c55df5 5923/* Find the first node having a score equal or greater than the specified one.
5924 * Returns NULL if there is no match. */
5925static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5926 zskiplistNode *x;
5927 int i;
5928
5929 x = zsl->header;
5930 for (i = zsl->level-1; i >= 0; i--) {
5931 while (x->forward[i] && x->forward[i]->score < score)
5932 x = x->forward[i];
5933 }
5934 /* We may have multiple elements with the same score, what we need
5935 * is to find the element with both the right score and object. */
5936 return x->forward[0];
5937}
5938
27b0ccca
PN
5939/* Find the rank for an element by both score and key.
5940 * Returns 0 when the element cannot be found, rank otherwise.
5941 * Note that the rank is 1-based due to the span of zsl->header to the
5942 * first element. */
5943static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5944 zskiplistNode *x;
5945 unsigned long rank = 0;
5946 int i;
5947
5948 x = zsl->header;
5949 for (i = zsl->level-1; i >= 0; i--) {
5950 while (x->forward[i] &&
5951 (x->forward[i]->score < score ||
5952 (x->forward[i]->score == score &&
5953 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5954 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5955 x = x->forward[i];
5956 }
5957
5958 /* x might be equal to zsl->header, so test if obj is non-NULL */
bf028098 5959 if (x->obj && equalStringObjects(x->obj,o)) {
27b0ccca
PN
5960 return rank;
5961 }
5962 }
5963 return 0;
5964}
5965
e74825c2
PN
5966/* Finds an element by its rank. The rank argument needs to be 1-based. */
5967zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5968 zskiplistNode *x;
5969 unsigned long traversed = 0;
5970 int i;
5971
5972 x = zsl->header;
5973 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 5974 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5975 {
a50ea45c 5976 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5977 x = x->forward[i];
5978 }
e74825c2
PN
5979 if (traversed == rank) {
5980 return x;
5981 }
5982 }
5983 return NULL;
5984}
5985
fd8ccf44 5986/* The actual Z-commands implementations */
5987
7db723ad 5988/* This generic command implements both ZADD and ZINCRBY.
e2665397 5989 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5990 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5991static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5992 robj *zsetobj;
5993 zset *zs;
5994 double *score;
5995
5fc9229c 5996 if (isnan(scoreval)) {
5997 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
5998 return;
5999 }
6000
e2665397 6001 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 6002 if (zsetobj == NULL) {
6003 zsetobj = createZsetObject();
e2665397 6004 dictAdd(c->db->dict,key,zsetobj);
6005 incrRefCount(key);
fd8ccf44 6006 } else {
6007 if (zsetobj->type != REDIS_ZSET) {
6008 addReply(c,shared.wrongtypeerr);
6009 return;
6010 }
6011 }
fd8ccf44 6012 zs = zsetobj->ptr;
e2665397 6013
7db723ad 6014 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 6015 * needs to handle the two different conditions. It's all about setting
6016 * '*score', that is, the new score to set, to the right value. */
6017 score = zmalloc(sizeof(double));
6018 if (doincrement) {
6019 dictEntry *de;
6020
6021 /* Read the old score. If the element was not present starts from 0 */
6022 de = dictFind(zs->dict,ele);
6023 if (de) {
6024 double *oldscore = dictGetEntryVal(de);
6025 *score = *oldscore + scoreval;
6026 } else {
6027 *score = scoreval;
6028 }
5fc9229c 6029 if (isnan(*score)) {
6030 addReplySds(c,
6031 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
6032 zfree(score);
6033 /* Note that we don't need to check if the zset may be empty and
6034 * should be removed here, as we can only obtain Nan as score if
6035 * there was already an element in the sorted set. */
6036 return;
6037 }
e2665397 6038 } else {
6039 *score = scoreval;
6040 }
6041
6042 /* What follows is a simple remove and re-insert operation that is common
7db723ad 6043 * to both ZADD and ZINCRBY... */
e2665397 6044 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 6045 /* case 1: New element */
e2665397 6046 incrRefCount(ele); /* added to hash */
6047 zslInsert(zs->zsl,*score,ele);
6048 incrRefCount(ele); /* added to skiplist */
fd8ccf44 6049 server.dirty++;
e2665397 6050 if (doincrement)
e2665397 6051 addReplyDouble(c,*score);
91d71bfc 6052 else
6053 addReply(c,shared.cone);
fd8ccf44 6054 } else {
6055 dictEntry *de;
6056 double *oldscore;
e0a62c7f 6057
fd8ccf44 6058 /* case 2: Score update operation */
e2665397 6059 de = dictFind(zs->dict,ele);
dfc5e96c 6060 redisAssert(de != NULL);
fd8ccf44 6061 oldscore = dictGetEntryVal(de);
6062 if (*score != *oldscore) {
6063 int deleted;
6064
e2665397 6065 /* Remove and insert the element in the skip list with new score */
6066 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 6067 redisAssert(deleted != 0);
e2665397 6068 zslInsert(zs->zsl,*score,ele);
6069 incrRefCount(ele);
6070 /* Update the score in the hash table */
6071 dictReplace(zs->dict,ele,score);
fd8ccf44 6072 server.dirty++;
2161a965 6073 } else {
6074 zfree(score);
fd8ccf44 6075 }
e2665397 6076 if (doincrement)
6077 addReplyDouble(c,*score);
6078 else
6079 addReply(c,shared.czero);
fd8ccf44 6080 }
6081}
6082
e2665397 6083static void zaddCommand(redisClient *c) {
6084 double scoreval;
6085
bd79a6bd 6086 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 6087 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
6088}
6089
7db723ad 6090static void zincrbyCommand(redisClient *c) {
e2665397 6091 double scoreval;
6092
bd79a6bd 6093 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 6094 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
6095}
6096
1b7106e7 6097static void zremCommand(redisClient *c) {
6098 robj *zsetobj;
6099 zset *zs;
dd88747b 6100 dictEntry *de;
6101 double *oldscore;
6102 int deleted;
1b7106e7 6103
dd88747b 6104 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6105 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 6106
dd88747b 6107 zs = zsetobj->ptr;
6108 de = dictFind(zs->dict,c->argv[2]);
6109 if (de == NULL) {
6110 addReply(c,shared.czero);
6111 return;
1b7106e7 6112 }
dd88747b 6113 /* Delete from the skiplist */
6114 oldscore = dictGetEntryVal(de);
6115 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
6116 redisAssert(deleted != 0);
6117
6118 /* Delete from the hash table */
6119 dictDelete(zs->dict,c->argv[2]);
6120 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 6121 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 6122 server.dirty++;
6123 addReply(c,shared.cone);
1b7106e7 6124}
6125
1807985b 6126static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
6127 double min;
6128 double max;
dd88747b 6129 long deleted;
1807985b 6130 robj *zsetobj;
6131 zset *zs;
6132
bd79a6bd
PN
6133 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
6134 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
bbe025e0 6135
dd88747b 6136 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6137 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 6138
dd88747b 6139 zs = zsetobj->ptr;
6140 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
6141 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 6142 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 6143 server.dirty += deleted;
482b672d 6144 addReplyLongLong(c,deleted);
1807985b 6145}
6146
9212eafd 6147static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
6148 long start;
6149 long end;
dd88747b 6150 int llen;
6151 long deleted;
9212eafd
PN
6152 robj *zsetobj;
6153 zset *zs;
6154
bd79a6bd
PN
6155 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6156 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6157
dd88747b 6158 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6159 checkType(c,zsetobj,REDIS_ZSET)) return;
6160 zs = zsetobj->ptr;
6161 llen = zs->zsl->length;
9212eafd 6162
dd88747b 6163 /* convert negative indexes */
6164 if (start < 0) start = llen+start;
6165 if (end < 0) end = llen+end;
6166 if (start < 0) start = 0;
6167 if (end < 0) end = 0;
9212eafd 6168
dd88747b 6169 /* indexes sanity checks */
6170 if (start > end || start >= llen) {
6171 addReply(c,shared.czero);
6172 return;
9212eafd 6173 }
dd88747b 6174 if (end >= llen) end = llen-1;
6175
6176 /* increment start and end because zsl*Rank functions
6177 * use 1-based rank */
6178 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
6179 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 6180 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 6181 server.dirty += deleted;
482b672d 6182 addReplyLongLong(c, deleted);
9212eafd
PN
6183}
6184
8f92e768
PN
6185typedef struct {
6186 dict *dict;
6187 double weight;
6188} zsetopsrc;
6189
6190static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
6191 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
6192 unsigned long size1, size2;
6193 size1 = d1->dict ? dictSize(d1->dict) : 0;
6194 size2 = d2->dict ? dictSize(d2->dict) : 0;
6195 return size1 - size2;
6196}
6197
d2764cd6
PN
/* Aggregation modes accepted by zunionInterAggregate() */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score carried by a dict entry: the double its value points to, or 1.0
 * when the entry has no value (NULL). */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))
d2764cd6
PN
6202
6203inline static void zunionInterAggregate(double *target, double val, int aggregate) {
6204 if (aggregate == REDIS_AGGR_SUM) {
6205 *target = *target + val;
6206 } else if (aggregate == REDIS_AGGR_MIN) {
6207 *target = val < *target ? val : *target;
6208 } else if (aggregate == REDIS_AGGR_MAX) {
6209 *target = val > *target ? val : *target;
6210 } else {
6211 /* safety net */
f83c6cb5 6212 redisPanic("Unknown ZUNION/INTER aggregate type");
d2764cd6
PN
6213 }
6214}
6215
2830ca53 6216static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
bc000c1d 6217 int i, j, setnum;
d2764cd6 6218 int aggregate = REDIS_AGGR_SUM;
8f92e768 6219 zsetopsrc *src;
2830ca53
PN
6220 robj *dstobj;
6221 zset *dstzset;
b287c9bb
PN
6222 dictIterator *di;
6223 dictEntry *de;
6224
bc000c1d
JC
6225 /* expect setnum input keys to be given */
6226 setnum = atoi(c->argv[2]->ptr);
6227 if (setnum < 1) {
5d373da9 6228 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
2830ca53 6229 return;
b287c9bb 6230 }
2830ca53
PN
6231
6232 /* test if the expected number of keys would overflow */
bc000c1d 6233 if (3+setnum > c->argc) {
b287c9bb
PN
6234 addReply(c,shared.syntaxerr);
6235 return;
6236 }
6237
2830ca53 6238 /* read keys to be used for input */
bc000c1d
JC
6239 src = zmalloc(sizeof(zsetopsrc) * setnum);
6240 for (i = 0, j = 3; i < setnum; i++, j++) {
6241 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6242 if (!obj) {
8f92e768 6243 src[i].dict = NULL;
b287c9bb 6244 } else {
bc000c1d
JC
6245 if (obj->type == REDIS_ZSET) {
6246 src[i].dict = ((zset*)obj->ptr)->dict;
6247 } else if (obj->type == REDIS_SET) {
6248 src[i].dict = (obj->ptr);
6249 } else {
8f92e768 6250 zfree(src);
b287c9bb
PN
6251 addReply(c,shared.wrongtypeerr);
6252 return;
6253 }
b287c9bb 6254 }
2830ca53
PN
6255
6256 /* default all weights to 1 */
8f92e768 6257 src[i].weight = 1.0;
b287c9bb
PN
6258 }
6259
2830ca53
PN
6260 /* parse optional extra arguments */
6261 if (j < c->argc) {
d2764cd6 6262 int remaining = c->argc - j;
b287c9bb 6263
2830ca53 6264 while (remaining) {
bc000c1d 6265 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 6266 j++; remaining--;
bc000c1d 6267 for (i = 0; i < setnum; i++, j++, remaining--) {
bd79a6bd 6268 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
bbe025e0 6269 return;
2830ca53 6270 }
d2764cd6
PN
6271 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6272 j++; remaining--;
6273 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6274 aggregate = REDIS_AGGR_SUM;
6275 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6276 aggregate = REDIS_AGGR_MIN;
6277 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6278 aggregate = REDIS_AGGR_MAX;
6279 } else {
6280 zfree(src);
6281 addReply(c,shared.syntaxerr);
6282 return;
6283 }
6284 j++; remaining--;
2830ca53 6285 } else {
8f92e768 6286 zfree(src);
2830ca53
PN
6287 addReply(c,shared.syntaxerr);
6288 return;
6289 }
6290 }
6291 }
b287c9bb 6292
d2764cd6
PN
6293 /* sort sets from the smallest to largest, this will improve our
6294 * algorithm's performance */
bc000c1d 6295 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
d2764cd6 6296
2830ca53
PN
6297 dstobj = createZsetObject();
6298 dstzset = dstobj->ptr;
6299
6300 if (op == REDIS_OP_INTER) {
8f92e768
PN
6301 /* skip going over all entries if the smallest zset is NULL or empty */
6302 if (src[0].dict && dictSize(src[0].dict) > 0) {
6303 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6304 * from small to large, all src[i > 0].dict are non-empty too */
6305 di = dictGetIterator(src[0].dict);
2830ca53 6306 while((de = dictNext(di)) != NULL) {
d2764cd6 6307 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6308 *score = src[0].weight * zunionInterDictValue(de);
2830ca53 6309
bc000c1d 6310 for (j = 1; j < setnum; j++) {
d2764cd6 6311 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6312 if (other) {
bc000c1d 6313 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6314 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6315 } else {
6316 break;
6317 }
6318 }
b287c9bb 6319
2830ca53 6320 /* skip entry when not present in every source dict */
bc000c1d 6321 if (j != setnum) {
2830ca53
PN
6322 zfree(score);
6323 } else {
6324 robj *o = dictGetEntryKey(de);
6325 dictAdd(dstzset->dict,o,score);
6326 incrRefCount(o); /* added to dictionary */
6327 zslInsert(dstzset->zsl,*score,o);
6328 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
6329 }
6330 }
2830ca53
PN
6331 dictReleaseIterator(di);
6332 }
6333 } else if (op == REDIS_OP_UNION) {
bc000c1d 6334 for (i = 0; i < setnum; i++) {
8f92e768 6335 if (!src[i].dict) continue;
2830ca53 6336
8f92e768 6337 di = dictGetIterator(src[i].dict);
2830ca53
PN
6338 while((de = dictNext(di)) != NULL) {
6339 /* skip key when already processed */
6340 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6341
d2764cd6 6342 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6343 *score = src[i].weight * zunionInterDictValue(de);
2830ca53 6344
d2764cd6
PN
6345 /* because the zsets are sorted by size, its only possible
6346 * for sets at larger indices to hold this entry */
bc000c1d 6347 for (j = (i+1); j < setnum; j++) {
d2764cd6 6348 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6349 if (other) {
bc000c1d 6350 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6351 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6352 }
6353 }
b287c9bb 6354
2830ca53
PN
6355 robj *o = dictGetEntryKey(de);
6356 dictAdd(dstzset->dict,o,score);
6357 incrRefCount(o); /* added to dictionary */
6358 zslInsert(dstzset->zsl,*score,o);
6359 incrRefCount(o); /* added to skiplist */
6360 }
6361 dictReleaseIterator(di);
b287c9bb 6362 }
2830ca53
PN
6363 } else {
6364 /* unknown operator */
6365 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
6366 }
6367
6368 deleteKey(c->db,dstkey);
3ea27d37 6369 if (dstzset->zsl->length) {
6370 dictAdd(c->db->dict,dstkey,dstobj);
6371 incrRefCount(dstkey);
482b672d 6372 addReplyLongLong(c, dstzset->zsl->length);
3ea27d37 6373 server.dirty++;
6374 } else {
8bca8773 6375 decrRefCount(dstobj);
3ea27d37 6376 addReply(c, shared.czero);
6377 }
8f92e768 6378 zfree(src);
b287c9bb
PN
6379}
6380
5d373da9 6381static void zunionstoreCommand(redisClient *c) {
2830ca53 6382 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
6383}
6384
5d373da9 6385static void zinterstoreCommand(redisClient *c) {
2830ca53 6386 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
6387}
6388
e3870fab 6389static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 6390 robj *o;
bbe025e0
AM
6391 long start;
6392 long end;
752da584 6393 int withscores = 0;
dd88747b 6394 int llen;
6395 int rangelen, j;
6396 zset *zsetobj;
6397 zskiplist *zsl;
6398 zskiplistNode *ln;
6399 robj *ele;
752da584 6400
bd79a6bd
PN
6401 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6402 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6403
752da584 6404 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6405 withscores = 1;
6406 } else if (c->argc >= 5) {
6407 addReply(c,shared.syntaxerr);
6408 return;
6409 }
cc812361 6410
4e27f268 6411 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6412 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 6413 zsetobj = o->ptr;
6414 zsl = zsetobj->zsl;
6415 llen = zsl->length;
cc812361 6416
dd88747b 6417 /* convert negative indexes */
6418 if (start < 0) start = llen+start;
6419 if (end < 0) end = llen+end;
6420 if (start < 0) start = 0;
6421 if (end < 0) end = 0;
cc812361 6422
dd88747b 6423 /* indexes sanity checks */
6424 if (start > end || start >= llen) {
6425 /* Out of range start or start > end result in empty list */
6426 addReply(c,shared.emptymultibulk);
6427 return;
6428 }
6429 if (end >= llen) end = llen-1;
6430 rangelen = (end-start)+1;
cc812361 6431
dd88747b 6432 /* check if starting point is trivial, before searching
6433 * the element in log(N) time */
6434 if (reverse) {
6435 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6436 } else {
6437 ln = start == 0 ?
6438 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6439 }
cc812361 6440
dd88747b 6441 /* Return the result in form of a multi-bulk reply */
6442 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6443 withscores ? (rangelen*2) : rangelen));
6444 for (j = 0; j < rangelen; j++) {
6445 ele = ln->obj;
6446 addReplyBulk(c,ele);
6447 if (withscores)
6448 addReplyDouble(c,ln->score);
6449 ln = reverse ? ln->backward : ln->forward[0];
cc812361 6450 }
6451}
6452
e3870fab 6453static void zrangeCommand(redisClient *c) {
6454 zrangeGenericCommand(c,0);
6455}
6456
6457static void zrevrangeCommand(redisClient *c) {
6458 zrangeGenericCommand(c,1);
6459}
6460
f44dd428 6461/* This command implements both ZRANGEBYSCORE and ZCOUNT.
6462 * If justcount is non-zero, just the count is returned. */
6463static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 6464 robj *o;
f44dd428 6465 double min, max;
6466 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 6467 int offset = 0, limit = -1;
0500ef27
SH
6468 int withscores = 0;
6469 int badsyntax = 0;
6470
f44dd428 6471 /* Parse the min-max interval. If one of the values is prefixed
6472 * by the "(" character, it's considered "open". For instance
6473 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6474 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6475 if (((char*)c->argv[2]->ptr)[0] == '(') {
6476 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6477 minex = 1;
6478 } else {
6479 min = strtod(c->argv[2]->ptr,NULL);
6480 }
6481 if (((char*)c->argv[3]->ptr)[0] == '(') {
6482 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6483 maxex = 1;
6484 } else {
6485 max = strtod(c->argv[3]->ptr,NULL);
6486 }
6487
6488 /* Parse "WITHSCORES": note that if the command was called with
6489 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6490 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 6491 if (c->argc == 5 || c->argc == 8) {
3a3978b1 6492 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6493 withscores = 1;
6494 else
6495 badsyntax = 1;
0500ef27 6496 }
3a3978b1 6497 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 6498 badsyntax = 1;
0500ef27 6499 if (badsyntax) {
454d4e43 6500 addReplySds(c,
6501 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 6502 return;
0500ef27
SH
6503 }
6504
f44dd428 6505 /* Parse "LIMIT" */
0500ef27 6506 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 6507 addReply(c,shared.syntaxerr);
6508 return;
0500ef27 6509 } else if (c->argc == (7 + withscores)) {
80181f78 6510 offset = atoi(c->argv[5]->ptr);
6511 limit = atoi(c->argv[6]->ptr);
0b13687c 6512 if (offset < 0) offset = 0;
80181f78 6513 }
50c55df5 6514
f44dd428 6515 /* Ok, lookup the key and get the range */
50c55df5 6516 o = lookupKeyRead(c->db,c->argv[1]);
6517 if (o == NULL) {
4e27f268 6518 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6519 } else {
6520 if (o->type != REDIS_ZSET) {
6521 addReply(c,shared.wrongtypeerr);
6522 } else {
6523 zset *zsetobj = o->ptr;
6524 zskiplist *zsl = zsetobj->zsl;
6525 zskiplistNode *ln;
f44dd428 6526 robj *ele, *lenobj = NULL;
6527 unsigned long rangelen = 0;
50c55df5 6528
f44dd428 6529 /* Get the first node with the score >= min, or with
6530 * score > min if 'minex' is true. */
50c55df5 6531 ln = zslFirstWithScore(zsl,min);
f44dd428 6532 while (minex && ln && ln->score == min) ln = ln->forward[0];
6533
50c55df5 6534 if (ln == NULL) {
6535 /* No element matching the speciifed interval */
f44dd428 6536 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6537 return;
6538 }
6539
6540 /* We don't know in advance how many matching elements there
6541 * are in the list, so we push this object that will represent
6542 * the multi-bulk length in the output buffer, and will "fix"
6543 * it later */
f44dd428 6544 if (!justcount) {
6545 lenobj = createObject(REDIS_STRING,NULL);
6546 addReply(c,lenobj);
6547 decrRefCount(lenobj);
6548 }
50c55df5 6549
f44dd428 6550 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 6551 if (offset) {
6552 offset--;
6553 ln = ln->forward[0];
6554 continue;
6555 }
6556 if (limit == 0) break;
f44dd428 6557 if (!justcount) {
6558 ele = ln->obj;
dd88747b 6559 addReplyBulk(c,ele);
f44dd428 6560 if (withscores)
6561 addReplyDouble(c,ln->score);
6562 }
50c55df5 6563 ln = ln->forward[0];
6564 rangelen++;
80181f78 6565 if (limit > 0) limit--;
50c55df5 6566 }
f44dd428 6567 if (justcount) {
482b672d 6568 addReplyLongLong(c,(long)rangelen);
f44dd428 6569 } else {
6570 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6571 withscores ? (rangelen*2) : rangelen);
6572 }
50c55df5 6573 }
6574 }
6575}
6576
f44dd428 6577static void zrangebyscoreCommand(redisClient *c) {
6578 genericZrangebyscoreCommand(c,0);
6579}
6580
6581static void zcountCommand(redisClient *c) {
6582 genericZrangebyscoreCommand(c,1);
6583}
6584
3c41331e 6585static void zcardCommand(redisClient *c) {
e197b441 6586 robj *o;
6587 zset *zs;
dd88747b 6588
6589 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6590 checkType(c,o,REDIS_ZSET)) return;
6591
6592 zs = o->ptr;
6593 addReplyUlong(c,zs->zsl->length);
e197b441 6594}
6595
6e333bbe 6596static void zscoreCommand(redisClient *c) {
6597 robj *o;
6598 zset *zs;
dd88747b 6599 dictEntry *de;
6600
6601 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6602 checkType(c,o,REDIS_ZSET)) return;
6603
6604 zs = o->ptr;
6605 de = dictFind(zs->dict,c->argv[2]);
6606 if (!de) {
96d8b4ee 6607 addReply(c,shared.nullbulk);
6e333bbe 6608 } else {
dd88747b 6609 double *score = dictGetEntryVal(de);
6e333bbe 6610
dd88747b 6611 addReplyDouble(c,*score);
6e333bbe 6612 }
6613}
6614
798d9e55 6615static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 6616 robj *o;
dd88747b 6617 zset *zs;
6618 zskiplist *zsl;
6619 dictEntry *de;
6620 unsigned long rank;
6621 double *score;
6622
6623 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6624 checkType(c,o,REDIS_ZSET)) return;
6625
6626 zs = o->ptr;
6627 zsl = zs->zsl;
6628 de = dictFind(zs->dict,c->argv[2]);
6629 if (!de) {
69d95c3e
PN
6630 addReply(c,shared.nullbulk);
6631 return;
6632 }
69d95c3e 6633
dd88747b 6634 score = dictGetEntryVal(de);
6635 rank = zslGetRank(zsl, *score, c->argv[2]);
6636 if (rank) {
6637 if (reverse) {
482b672d 6638 addReplyLongLong(c, zsl->length - rank);
27b0ccca 6639 } else {
482b672d 6640 addReplyLongLong(c, rank-1);
69d95c3e 6641 }
dd88747b 6642 } else {
6643 addReply(c,shared.nullbulk);
978c2c94 6644 }
6645}
6646
798d9e55
PN
6647static void zrankCommand(redisClient *c) {
6648 zrankGenericCommand(c, 0);
6649}
6650
6651static void zrevrankCommand(redisClient *c) {
6652 zrankGenericCommand(c, 1);
6653}
6654
7fb16bac
PN
6655/* ========================= Hashes utility functions ======================= */
/* Selectors telling hashCurrent() which half of an entry to return */
#define REDIS_HASH_KEY 1    /* the field */
#define REDIS_HASH_VALUE 2  /* the value */
978c2c94 6658
7fb16bac
PN
6659/* Check the length of a number of objects to see if we need to convert a
6660 * zipmap to a real hash. Note that we only check string encoded objects
6661 * as their string length can be queried in constant time. */
6662static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6663 int i;
6664 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
978c2c94 6665
7fb16bac
PN
6666 for (i = start; i <= end; i++) {
6667 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6668 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6669 {
6670 convertToRealHash(subject);
978c2c94 6671 return;
6672 }
6673 }
7fb16bac 6674}
bae2c7ec 6675
97224de7
PN
6676/* Encode given objects in-place when the hash uses a dict. */
6677static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6678 if (subject->encoding == REDIS_ENCODING_HT) {
3f973463
PN
6679 if (o1) *o1 = tryObjectEncoding(*o1);
6680 if (o2) *o2 = tryObjectEncoding(*o2);
97224de7
PN
6681 }
6682}
6683
7fb16bac 6684/* Get the value from a hash identified by key. Returns either a string
a3f3af86
PN
6685 * object or NULL if the value cannot be found. The refcount of the object
6686 * is always increased by 1 when the value was found. */
7fb16bac
PN
6687static robj *hashGet(robj *o, robj *key) {
6688 robj *value = NULL;
978c2c94 6689 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7fb16bac
PN
6690 unsigned char *v;
6691 unsigned int vlen;
6692 key = getDecodedObject(key);
6693 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6694 value = createStringObject((char*)v,vlen);
6695 }
6696 decrRefCount(key);
6697 } else {
6698 dictEntry *de = dictFind(o->ptr,key);
6699 if (de != NULL) {
6700 value = dictGetEntryVal(de);
a3f3af86 6701 incrRefCount(value);
7fb16bac
PN
6702 }
6703 }
6704 return value;
6705}
978c2c94 6706
7fb16bac
PN
6707/* Test if the key exists in the given hash. Returns 1 if the key
6708 * exists and 0 when it doesn't. */
6709static int hashExists(robj *o, robj *key) {
6710 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6711 key = getDecodedObject(key);
6712 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6713 decrRefCount(key);
6714 return 1;
6715 }
6716 decrRefCount(key);
6717 } else {
6718 if (dictFind(o->ptr,key) != NULL) {
6719 return 1;
6720 }
6721 }
6722 return 0;
6723}
bae2c7ec 6724
7fb16bac
PN
6725/* Add an element, discard the old if the key already exists.
6726 * Return 0 on insert and 1 on update. */
feb8d7e6 6727static int hashSet(robj *o, robj *key, robj *value) {
7fb16bac
PN
6728 int update = 0;
6729 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6730 key = getDecodedObject(key);
6731 value = getDecodedObject(value);
6732 o->ptr = zipmapSet(o->ptr,
6733 key->ptr,sdslen(key->ptr),
6734 value->ptr,sdslen(value->ptr), &update);
6735 decrRefCount(key);
6736 decrRefCount(value);
6737
6738 /* Check if the zipmap needs to be upgraded to a real hash table */
6739 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
bae2c7ec 6740 convertToRealHash(o);
978c2c94 6741 } else {
7fb16bac
PN
6742 if (dictReplace(o->ptr,key,value)) {
6743 /* Insert */
6744 incrRefCount(key);
978c2c94 6745 } else {
7fb16bac 6746 /* Update */
978c2c94 6747 update = 1;
6748 }
7fb16bac 6749 incrRefCount(value);
978c2c94 6750 }
7fb16bac 6751 return update;
978c2c94 6752}
6753
7fb16bac
PN
6754/* Delete an element from a hash.
6755 * Return 1 on deleted and 0 on not found. */
6756static int hashDelete(robj *o, robj *key) {
6757 int deleted = 0;
6758 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6759 key = getDecodedObject(key);
6760 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6761 decrRefCount(key);
6762 } else {
6763 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6764 /* Always check if the dictionary needs a resize after a delete. */
6765 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
d33278d1 6766 }
7fb16bac
PN
6767 return deleted;
6768}
d33278d1 6769
7fb16bac 6770/* Return the number of elements in a hash. */
c811bb38 6771static unsigned long hashLength(robj *o) {
7fb16bac
PN
6772 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6773 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6774}
6775
6776/* Structure to hold hash iteration abstration. Note that iteration over
6777 * hashes involves both fields and values. Because it is possible that
6778 * not both are required, store pointers in the iterator to avoid
6779 * unnecessary memory allocation for fields/values. */
6780typedef struct {
6781 int encoding;
6782 unsigned char *zi;
6783 unsigned char *zk, *zv;
6784 unsigned int zklen, zvlen;
6785
6786 dictIterator *di;
6787 dictEntry *de;
6788} hashIterator;
6789
c44d3b56
PN
6790static hashIterator *hashInitIterator(robj *subject) {
6791 hashIterator *hi = zmalloc(sizeof(hashIterator));
7fb16bac
PN
6792 hi->encoding = subject->encoding;
6793 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6794 hi->zi = zipmapRewind(subject->ptr);
6795 } else if (hi->encoding == REDIS_ENCODING_HT) {
6796 hi->di = dictGetIterator(subject->ptr);
d33278d1 6797 } else {
7fb16bac 6798 redisAssert(NULL);
d33278d1 6799 }
c44d3b56 6800 return hi;
7fb16bac 6801}
d33278d1 6802
7fb16bac
PN
6803static void hashReleaseIterator(hashIterator *hi) {
6804 if (hi->encoding == REDIS_ENCODING_HT) {
6805 dictReleaseIterator(hi->di);
d33278d1 6806 }
c44d3b56 6807 zfree(hi);
7fb16bac 6808}
d33278d1 6809
7fb16bac
PN
6810/* Move to the next entry in the hash. Return REDIS_OK when the next entry
6811 * could be found and REDIS_ERR when the iterator reaches the end. */
c811bb38 6812static int hashNext(hashIterator *hi) {
7fb16bac
PN
6813 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6814 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6815 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6816 } else {
6817 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6818 }
6819 return REDIS_OK;
6820}
d33278d1 6821
0c390abc 6822/* Get key or value object at current iteration position.
a3f3af86 6823 * This increases the refcount of the field object by 1. */
c811bb38 6824static robj *hashCurrent(hashIterator *hi, int what) {
7fb16bac
PN
6825 robj *o;
6826 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6827 if (what & REDIS_HASH_KEY) {
6828 o = createStringObject((char*)hi->zk,hi->zklen);
6829 } else {
6830 o = createStringObject((char*)hi->zv,hi->zvlen);
d33278d1 6831 }
d33278d1 6832 } else {
7fb16bac
PN
6833 if (what & REDIS_HASH_KEY) {
6834 o = dictGetEntryKey(hi->de);
6835 } else {
6836 o = dictGetEntryVal(hi->de);
d33278d1 6837 }
a3f3af86 6838 incrRefCount(o);
d33278d1 6839 }
7fb16bac 6840 return o;
d33278d1
PN
6841}
6842
7fb16bac
PN
6843static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6844 robj *o = lookupKeyWrite(c->db,key);
01426b05
PN
6845 if (o == NULL) {
6846 o = createHashObject();
7fb16bac
PN
6847 dictAdd(c->db->dict,key,o);
6848 incrRefCount(key);
01426b05
PN
6849 } else {
6850 if (o->type != REDIS_HASH) {
6851 addReply(c,shared.wrongtypeerr);
7fb16bac 6852 return NULL;
01426b05
PN
6853 }
6854 }
7fb16bac
PN
6855 return o;
6856}
01426b05 6857
7fb16bac
PN
6858/* ============================= Hash commands ============================== */
6859static void hsetCommand(redisClient *c) {
6e9e463f 6860 int update;
7fb16bac 6861 robj *o;
bbe025e0 6862
7fb16bac
PN
6863 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6864 hashTryConversion(o,c->argv,2,3);
97224de7 6865 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6866 update = hashSet(o,c->argv[2],c->argv[3]);
6e9e463f 6867 addReply(c, update ? shared.czero : shared.cone);
7fb16bac
PN
6868 server.dirty++;
6869}
01426b05 6870
1f1c7695
PN
6871static void hsetnxCommand(redisClient *c) {
6872 robj *o;
6873 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6874 hashTryConversion(o,c->argv,2,3);
6875
6876 if (hashExists(o, c->argv[2])) {
6877 addReply(c, shared.czero);
01426b05 6878 } else {
97224de7 6879 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6880 hashSet(o,c->argv[2],c->argv[3]);
1f1c7695
PN
6881 addReply(c, shared.cone);
6882 server.dirty++;
6883 }
6884}
01426b05 6885
7fb16bac
PN
6886static void hmsetCommand(redisClient *c) {
6887 int i;
6888 robj *o;
01426b05 6889
7fb16bac
PN
6890 if ((c->argc % 2) == 1) {
6891 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6892 return;
6893 }
01426b05 6894
7fb16bac
PN
6895 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6896 hashTryConversion(o,c->argv,2,c->argc-1);
6897 for (i = 2; i < c->argc; i += 2) {
97224de7 6898 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
feb8d7e6 6899 hashSet(o,c->argv[i],c->argv[i+1]);
7fb16bac
PN
6900 }
6901 addReply(c, shared.ok);
edc2f63a 6902 server.dirty++;
7fb16bac
PN
6903}
6904
6905static void hincrbyCommand(redisClient *c) {
6906 long long value, incr;
6907 robj *o, *current, *new;
6908
bd79a6bd 6909 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7fb16bac
PN
6910 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6911 if ((current = hashGet(o,c->argv[2])) != NULL) {
946342c1
PN
6912 if (getLongLongFromObjectOrReply(c,current,&value,
6913 "hash value is not an integer") != REDIS_OK) {
6914 decrRefCount(current);
6915 return;
6916 }
a3f3af86 6917 decrRefCount(current);
7fb16bac
PN
6918 } else {
6919 value = 0;
01426b05
PN
6920 }
6921
7fb16bac 6922 value += incr;
3f973463
PN
6923 new = createStringObjectFromLongLong(value);
6924 hashTryObjectEncoding(o,&c->argv[2],NULL);
feb8d7e6 6925 hashSet(o,c->argv[2],new);
7fb16bac
PN
6926 decrRefCount(new);
6927 addReplyLongLong(c,value);
01426b05 6928 server.dirty++;
01426b05
PN
6929}
6930
978c2c94 6931static void hgetCommand(redisClient *c) {
7fb16bac 6932 robj *o, *value;
dd88747b 6933 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6934 checkType(c,o,REDIS_HASH)) return;
6935
7fb16bac
PN
6936 if ((value = hashGet(o,c->argv[2])) != NULL) {
6937 addReplyBulk(c,value);
a3f3af86 6938 decrRefCount(value);
dd88747b 6939 } else {
7fb16bac 6940 addReply(c,shared.nullbulk);
69d95c3e 6941 }
69d95c3e
PN
6942}
6943
09aeb579
PN
6944static void hmgetCommand(redisClient *c) {
6945 int i;
7fb16bac
PN
6946 robj *o, *value;
6947 o = lookupKeyRead(c->db,c->argv[1]);
6948 if (o != NULL && o->type != REDIS_HASH) {
6949 addReply(c,shared.wrongtypeerr);
09aeb579
PN
6950 }
6951
7fb16bac
PN
6952 /* Note the check for o != NULL happens inside the loop. This is
6953 * done because objects that cannot be found are considered to be
6954 * an empty hash. The reply should then be a series of NULLs. */
09aeb579 6955 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7fb16bac
PN
6956 for (i = 2; i < c->argc; i++) {
6957 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6958 addReplyBulk(c,value);
a3f3af86 6959 decrRefCount(value);
7fb16bac
PN
6960 } else {
6961 addReply(c,shared.nullbulk);
09aeb579
PN
6962 }
6963 }
6964}
6965
07efaf74 6966static void hdelCommand(redisClient *c) {
dd88747b 6967 robj *o;
dd88747b 6968 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6969 checkType(c,o,REDIS_HASH)) return;
07efaf74 6970
7fb16bac
PN
6971 if (hashDelete(o,c->argv[2])) {
6972 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6973 addReply(c,shared.cone);
6974 server.dirty++;
dd88747b 6975 } else {
7fb16bac 6976 addReply(c,shared.czero);
07efaf74 6977 }
6978}
6979
92b27fe9 6980static void hlenCommand(redisClient *c) {
6981 robj *o;
dd88747b 6982 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 6983 checkType(c,o,REDIS_HASH)) return;
6984
7fb16bac 6985 addReplyUlong(c,hashLength(o));
92b27fe9 6986}
6987
78409a0f 6988static void genericHgetallCommand(redisClient *c, int flags) {
7fb16bac 6989 robj *o, *lenobj, *obj;
78409a0f 6990 unsigned long count = 0;
c44d3b56 6991 hashIterator *hi;
78409a0f 6992
4e27f268 6993 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 6994 || checkType(c,o,REDIS_HASH)) return;
6995
6996 lenobj = createObject(REDIS_STRING,NULL);
6997 addReply(c,lenobj);
6998 decrRefCount(lenobj);
6999
c44d3b56
PN
7000 hi = hashInitIterator(o);
7001 while (hashNext(hi) != REDIS_ERR) {
7fb16bac 7002 if (flags & REDIS_HASH_KEY) {
c44d3b56 7003 obj = hashCurrent(hi,REDIS_HASH_KEY);
7fb16bac 7004 addReplyBulk(c,obj);
a3f3af86 7005 decrRefCount(obj);
7fb16bac 7006 count++;
78409a0f 7007 }
7fb16bac 7008 if (flags & REDIS_HASH_VALUE) {
c44d3b56 7009 obj = hashCurrent(hi,REDIS_HASH_VALUE);
7fb16bac 7010 addReplyBulk(c,obj);
a3f3af86 7011 decrRefCount(obj);
7fb16bac 7012 count++;
78409a0f 7013 }
78409a0f 7014 }
c44d3b56 7015 hashReleaseIterator(hi);
7fb16bac 7016
78409a0f 7017 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
7018}
7019
7020static void hkeysCommand(redisClient *c) {
7fb16bac 7021 genericHgetallCommand(c,REDIS_HASH_KEY);
78409a0f 7022}
7023
7024static void hvalsCommand(redisClient *c) {
7fb16bac 7025 genericHgetallCommand(c,REDIS_HASH_VALUE);
78409a0f 7026}
7027
7028static void hgetallCommand(redisClient *c) {
7fb16bac 7029 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
78409a0f 7030}
7031
a86f14b1 7032static void hexistsCommand(redisClient *c) {
7033 robj *o;
a86f14b1 7034 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7035 checkType(c,o,REDIS_HASH)) return;
7036
7fb16bac 7037 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
a86f14b1 7038}
7039
ada386b2 7040static void convertToRealHash(robj *o) {
7041 unsigned char *key, *val, *p, *zm = o->ptr;
7042 unsigned int klen, vlen;
7043 dict *dict = dictCreate(&hashDictType,NULL);
7044
7045 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
7046 p = zipmapRewind(zm);
7047 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
7048 robj *keyobj, *valobj;
7049
7050 keyobj = createStringObject((char*)key,klen);
7051 valobj = createStringObject((char*)val,vlen);
05df7621 7052 keyobj = tryObjectEncoding(keyobj);
7053 valobj = tryObjectEncoding(valobj);
ada386b2 7054 dictAdd(dict,keyobj,valobj);
7055 }
7056 o->encoding = REDIS_ENCODING_HT;
7057 o->ptr = dict;
7058 zfree(zm);
7059}
7060
6b47e12e 7061/* ========================= Non type-specific commands ==================== */
7062
ed9b544e 7063static void flushdbCommand(redisClient *c) {
ca37e9cd 7064 server.dirty += dictSize(c->db->dict);
9b30e1a2 7065 touchWatchedKeysOnFlush(c->db->id);
3305306f 7066 dictEmpty(c->db->dict);
7067 dictEmpty(c->db->expires);
ed9b544e 7068 addReply(c,shared.ok);
ed9b544e 7069}
7070
7071static void flushallCommand(redisClient *c) {
9b30e1a2 7072 touchWatchedKeysOnFlush(-1);
ca37e9cd 7073 server.dirty += emptyDb();
ed9b544e 7074 addReply(c,shared.ok);
500ece7c 7075 if (server.bgsavechildpid != -1) {
7076 kill(server.bgsavechildpid,SIGKILL);
7077 rdbRemoveTempFile(server.bgsavechildpid);
7078 }
f78fd11b 7079 rdbSave(server.dbfilename);
ca37e9cd 7080 server.dirty++;
ed9b544e 7081}
7082
56906eef 7083static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 7084 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 7085 so->type = type;
7086 so->pattern = pattern;
7087 return so;
7088}
7089
7090/* Return the value associated to the key with a name obtained
55017f9d
PN
7091 * substituting the first occurence of '*' in 'pattern' with 'subst'.
7092 * The returned object will always have its refcount increased by 1
7093 * when it is non-NULL. */
56906eef 7094static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6d7d1370 7095 char *p, *f;
ed9b544e 7096 sds spat, ssub;
6d7d1370
PN
7097 robj keyobj, fieldobj, *o;
7098 int prefixlen, sublen, postfixlen, fieldlen;
ed9b544e 7099 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
7100 struct {
f1017b3f 7101 long len;
7102 long free;
ed9b544e 7103 char buf[REDIS_SORTKEY_MAX+1];
6d7d1370 7104 } keyname, fieldname;
ed9b544e 7105
28173a49 7106 /* If the pattern is "#" return the substitution object itself in order
7107 * to implement the "SORT ... GET #" feature. */
7108 spat = pattern->ptr;
7109 if (spat[0] == '#' && spat[1] == '\0') {
55017f9d 7110 incrRefCount(subst);
28173a49 7111 return subst;
7112 }
7113
7114 /* The substitution object may be specially encoded. If so we create
9d65a1bb 7115 * a decoded object on the fly. Otherwise getDecodedObject will just
7116 * increment the ref count, that we'll decrement later. */
7117 subst = getDecodedObject(subst);
942a3961 7118
ed9b544e 7119 ssub = subst->ptr;
7120 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
7121 p = strchr(spat,'*');
ed5a857a 7122 if (!p) {
7123 decrRefCount(subst);
7124 return NULL;
7125 }
ed9b544e 7126
6d7d1370
PN
7127 /* Find out if we're dealing with a hash dereference. */
7128 if ((f = strstr(p+1, "->")) != NULL) {
7129 fieldlen = sdslen(spat)-(f-spat);
7130 /* this also copies \0 character */
7131 memcpy(fieldname.buf,f+2,fieldlen-1);
7132 fieldname.len = fieldlen-2;
7133 } else {
7134 fieldlen = 0;
7135 }
7136
ed9b544e 7137 prefixlen = p-spat;
7138 sublen = sdslen(ssub);
6d7d1370 7139 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
ed9b544e 7140 memcpy(keyname.buf,spat,prefixlen);
7141 memcpy(keyname.buf+prefixlen,ssub,sublen);
7142 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
7143 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
7144 keyname.len = prefixlen+sublen+postfixlen;
942a3961 7145 decrRefCount(subst);
7146
6d7d1370
PN
7147 /* Lookup substituted key */
7148 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
7149 o = lookupKeyRead(db,&keyobj);
55017f9d
PN
7150 if (o == NULL) return NULL;
7151
7152 if (fieldlen > 0) {
7153 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6d7d1370 7154
705dad38
PN
7155 /* Retrieve value from hash by the field name. This operation
7156 * already increases the refcount of the returned object. */
6d7d1370
PN
7157 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
7158 o = hashGet(o, &fieldobj);
705dad38 7159 } else {
55017f9d 7160 if (o->type != REDIS_STRING) return NULL;
b6f07345 7161
705dad38
PN
7162 /* Every object that this function returns needs to have its refcount
7163 * increased. sortCommand decreases it again. */
7164 incrRefCount(o);
6d7d1370
PN
7165 }
7166
7167 return o;
ed9b544e 7168}
7169
7170/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
7171 * the additional parameter is not standard but a BSD-specific we have to
7172 * pass sorting parameters via the global 'server' structure */
7173static int sortCompare(const void *s1, const void *s2) {
7174 const redisSortObject *so1 = s1, *so2 = s2;
7175 int cmp;
7176
7177 if (!server.sort_alpha) {
7178 /* Numeric sorting. Here it's trivial as we precomputed scores */
7179 if (so1->u.score > so2->u.score) {
7180 cmp = 1;
7181 } else if (so1->u.score < so2->u.score) {
7182 cmp = -1;
7183 } else {
7184 cmp = 0;
7185 }
7186 } else {
7187 /* Alphanumeric sorting */
7188 if (server.sort_bypattern) {
7189 if (!so1->u.cmpobj || !so2->u.cmpobj) {
7190 /* At least one compare object is NULL */
7191 if (so1->u.cmpobj == so2->u.cmpobj)
7192 cmp = 0;
7193 else if (so1->u.cmpobj == NULL)
7194 cmp = -1;
7195 else
7196 cmp = 1;
7197 } else {
7198 /* We have both the objects, use strcoll */
7199 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
7200 }
7201 } else {
08ee9b57 7202 /* Compare elements directly. */
7203 cmp = compareStringObjects(so1->obj,so2->obj);
ed9b544e 7204 }
7205 }
7206 return server.sort_desc ? -cmp : cmp;
7207}
7208
7209/* The SORT command is the most complex command in Redis. Warning: this code
7210 * is optimized for speed and a bit less for readability */
7211static void sortCommand(redisClient *c) {
ed9b544e 7212 list *operations;
a03611e1 7213 unsigned int outputlen = 0;
ed9b544e 7214 int desc = 0, alpha = 0;
7215 int limit_start = 0, limit_count = -1, start, end;
7216 int j, dontsort = 0, vectorlen;
7217 int getop = 0; /* GET operation counter */
443c6409 7218 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 7219 redisSortObject *vector; /* Resulting vector to sort */
7220
7221 /* Lookup the key to sort. It must be of the right types */
3305306f 7222 sortval = lookupKeyRead(c->db,c->argv[1]);
7223 if (sortval == NULL) {
4e27f268 7224 addReply(c,shared.emptymultibulk);
ed9b544e 7225 return;
7226 }
a5eb649b 7227 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
7228 sortval->type != REDIS_ZSET)
7229 {
c937aa89 7230 addReply(c,shared.wrongtypeerr);
ed9b544e 7231 return;
7232 }
7233
7234 /* Create a list of operations to perform for every sorted element.
7235 * Operations can be GET/DEL/INCR/DECR */
7236 operations = listCreate();
092dac2a 7237 listSetFreeMethod(operations,zfree);
ed9b544e 7238 j = 2;
7239
7240 /* Now we need to protect sortval incrementing its count, in the future
7241 * SORT may have options able to overwrite/delete keys during the sorting
7242 * and the sorted key itself may get destroied */
7243 incrRefCount(sortval);
7244
7245 /* The SORT command has an SQL-alike syntax, parse it */
7246 while(j < c->argc) {
7247 int leftargs = c->argc-j-1;
7248 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7249 desc = 0;
7250 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7251 desc = 1;
7252 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7253 alpha = 1;
7254 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7255 limit_start = atoi(c->argv[j+1]->ptr);
7256 limit_count = atoi(c->argv[j+2]->ptr);
7257 j+=2;
443c6409 7258 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7259 storekey = c->argv[j+1];
7260 j++;
ed9b544e 7261 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7262 sortby = c->argv[j+1];
7263 /* If the BY pattern does not contain '*', i.e. it is constant,
7264 * we don't need to sort nor to lookup the weight keys. */
7265 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7266 j++;
7267 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7268 listAddNodeTail(operations,createSortOperation(
7269 REDIS_SORT_GET,c->argv[j+1]));
7270 getop++;
7271 j++;
ed9b544e 7272 } else {
7273 decrRefCount(sortval);
7274 listRelease(operations);
c937aa89 7275 addReply(c,shared.syntaxerr);
ed9b544e 7276 return;
7277 }
7278 j++;
7279 }
7280
7281 /* Load the sorting vector with all the objects to sort */
a5eb649b 7282 switch(sortval->type) {
a03611e1 7283 case REDIS_LIST: vectorlen = lLength(sortval); break;
a5eb649b 7284 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7285 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
f83c6cb5 7286 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
a5eb649b 7287 }
ed9b544e 7288 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 7289 j = 0;
a5eb649b 7290
ed9b544e 7291 if (sortval->type == REDIS_LIST) {
a03611e1
PN
7292 lIterator *li = lInitIterator(sortval,0,REDIS_TAIL);
7293 lEntry entry;
7294 while(lNext(li,&entry)) {
7295 vector[j].obj = lGet(&entry);
ed9b544e 7296 vector[j].u.score = 0;
7297 vector[j].u.cmpobj = NULL;
ed9b544e 7298 j++;
7299 }
a03611e1 7300 lReleaseIterator(li);
ed9b544e 7301 } else {
a5eb649b 7302 dict *set;
ed9b544e 7303 dictIterator *di;
7304 dictEntry *setele;
7305
a5eb649b 7306 if (sortval->type == REDIS_SET) {
7307 set = sortval->ptr;
7308 } else {
7309 zset *zs = sortval->ptr;
7310 set = zs->dict;
7311 }
7312
ed9b544e 7313 di = dictGetIterator(set);
ed9b544e 7314 while((setele = dictNext(di)) != NULL) {
7315 vector[j].obj = dictGetEntryKey(setele);
7316 vector[j].u.score = 0;
7317 vector[j].u.cmpobj = NULL;
7318 j++;
7319 }
7320 dictReleaseIterator(di);
7321 }
dfc5e96c 7322 redisAssert(j == vectorlen);
ed9b544e 7323
7324 /* Now it's time to load the right scores in the sorting vector */
7325 if (dontsort == 0) {
7326 for (j = 0; j < vectorlen; j++) {
6d7d1370 7327 robj *byval;
ed9b544e 7328 if (sortby) {
6d7d1370 7329 /* lookup value to sort by */
3305306f 7330 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
705dad38 7331 if (!byval) continue;
ed9b544e 7332 } else {
6d7d1370
PN
7333 /* use object itself to sort by */
7334 byval = vector[j].obj;
7335 }
7336
7337 if (alpha) {
08ee9b57 7338 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6d7d1370
PN
7339 } else {
7340 if (byval->encoding == REDIS_ENCODING_RAW) {
7341 vector[j].u.score = strtod(byval->ptr,NULL);
16fa22f1 7342 } else if (byval->encoding == REDIS_ENCODING_INT) {
6d7d1370
PN
7343 /* Don't need to decode the object if it's
7344 * integer-encoded (the only encoding supported) so
7345 * far. We can just cast it */
16fa22f1
PN
7346 vector[j].u.score = (long)byval->ptr;
7347 } else {
7348 redisAssert(1 != 1);
942a3961 7349 }
ed9b544e 7350 }
6d7d1370 7351
705dad38
PN
7352 /* when the object was retrieved using lookupKeyByPattern,
7353 * its refcount needs to be decreased. */
7354 if (sortby) {
7355 decrRefCount(byval);
ed9b544e 7356 }
7357 }
7358 }
7359
7360 /* We are ready to sort the vector... perform a bit of sanity check
7361 * on the LIMIT option too. We'll use a partial version of quicksort. */
7362 start = (limit_start < 0) ? 0 : limit_start;
7363 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7364 if (start >= vectorlen) {
7365 start = vectorlen-1;
7366 end = vectorlen-2;
7367 }
7368 if (end >= vectorlen) end = vectorlen-1;
7369
7370 if (dontsort == 0) {
7371 server.sort_desc = desc;
7372 server.sort_alpha = alpha;
7373 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 7374 if (sortby && (start != 0 || end != vectorlen-1))
7375 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7376 else
7377 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 7378 }
7379
7380 /* Send command output to the output buffer, performing the specified
7381 * GET/DEL/INCR/DECR operations if any. */
7382 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 7383 if (storekey == NULL) {
7384 /* STORE option not specified, sent the sorting result to client */
7385 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7386 for (j = start; j <= end; j++) {
7387 listNode *ln;
c7df85a4 7388 listIter li;
7389
dd88747b 7390 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 7391 listRewind(operations,&li);
7392 while((ln = listNext(&li))) {
443c6409 7393 redisSortOperation *sop = ln->value;
7394 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7395 vector[j].obj);
7396
7397 if (sop->type == REDIS_SORT_GET) {
55017f9d 7398 if (!val) {
443c6409 7399 addReply(c,shared.nullbulk);
7400 } else {
dd88747b 7401 addReplyBulk(c,val);
55017f9d 7402 decrRefCount(val);
443c6409 7403 }
7404 } else {
dfc5e96c 7405 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 7406 }
7407 }
ed9b544e 7408 }
443c6409 7409 } else {
74e0f445 7410 robj *sobj = createZiplistObject();
443c6409 7411
7412 /* STORE option specified, set the sorting result as a List object */
7413 for (j = start; j <= end; j++) {
7414 listNode *ln;
c7df85a4 7415 listIter li;
7416
443c6409 7417 if (!getop) {
a03611e1
PN
7418 lPush(sobj,vector[j].obj,REDIS_TAIL);
7419 } else {
7420 listRewind(operations,&li);
7421 while((ln = listNext(&li))) {
7422 redisSortOperation *sop = ln->value;
7423 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7424 vector[j].obj);
7425
7426 if (sop->type == REDIS_SORT_GET) {
7427 if (!val) val = createStringObject("",0);
7428
7429 /* lPush does an incrRefCount, so we should take care
7430 * care of the incremented refcount caused by either
7431 * lookupKeyByPattern or createStringObject("",0) */
7432 lPush(sobj,val,REDIS_TAIL);
7433 decrRefCount(val);
443c6409 7434 } else {
a03611e1
PN
7435 /* always fails */
7436 redisAssert(sop->type == REDIS_SORT_GET);
443c6409 7437 }
ed9b544e 7438 }
ed9b544e 7439 }
ed9b544e 7440 }
a03611e1 7441 if (dictReplace(c->db->dict,storekey,sobj)) {
121796f7 7442 incrRefCount(storekey);
7443 }
443c6409 7444 /* Note: we add 1 because the DB is dirty anyway since even if the
7445 * SORT result is empty a new key is set and maybe the old content
7446 * replaced. */
7447 server.dirty += 1+outputlen;
7448 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 7449 }
7450
7451 /* Cleanup */
a03611e1
PN
7452 if (sortval->type == REDIS_LIST)
7453 for (j = 0; j < vectorlen; j++)
7454 decrRefCount(vector[j].obj);
ed9b544e 7455 decrRefCount(sortval);
7456 listRelease(operations);
7457 for (j = 0; j < vectorlen; j++) {
16fa22f1 7458 if (alpha && vector[j].u.cmpobj)
ed9b544e 7459 decrRefCount(vector[j].u.cmpobj);
7460 }
7461 zfree(vector);
7462}
7463
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' must point to a buffer large enough for the result (the longest
 * output is a 20 digit number followed by 'B' and the terminator).
 * Every value of 'n' produces output, so the buffer is never left
 * uninitialized. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Beyond the petabyte range: fall back to the raw byte count. */
        sprintf(s,"%lluB",n);
    }
}
7484
1c85b79f 7485/* Create the string returned by the INFO command. This is decoupled
7486 * by the INFO command itself as we need to report the same information
7487 * on memory corruption problems. */
7488static sds genRedisInfoString(void) {
ed9b544e 7489 sds info;
7490 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 7491 int j;
ec6c7a1d 7492 char hmem[64];
55a8298f 7493
b72f6a4b 7494 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 7495 info = sdscatprintf(sdsempty(),
7496 "redis_version:%s\r\n"
5436146c
PN
7497 "redis_git_sha1:%s\r\n"
7498 "redis_git_dirty:%d\r\n"
f1017b3f 7499 "arch_bits:%s\r\n"
7a932b74 7500 "multiplexing_api:%s\r\n"
0d7170a4 7501 "process_id:%ld\r\n"
682ac724 7502 "uptime_in_seconds:%ld\r\n"
7503 "uptime_in_days:%ld\r\n"
ed9b544e 7504 "connected_clients:%d\r\n"
7505 "connected_slaves:%d\r\n"
f86a74e9 7506 "blocked_clients:%d\r\n"
5fba9f71 7507 "used_memory:%zu\r\n"
ec6c7a1d 7508 "used_memory_human:%s\r\n"
ed9b544e 7509 "changes_since_last_save:%lld\r\n"
be2bb6b0 7510 "bgsave_in_progress:%d\r\n"
682ac724 7511 "last_save_time:%ld\r\n"
b3fad521 7512 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 7513 "total_connections_received:%lld\r\n"
7514 "total_commands_processed:%lld\r\n"
2a6a2ed1 7515 "expired_keys:%lld\r\n"
3be2c9d7 7516 "hash_max_zipmap_entries:%zu\r\n"
7517 "hash_max_zipmap_value:%zu\r\n"
ffc6b7f8 7518 "pubsub_channels:%ld\r\n"
7519 "pubsub_patterns:%u\r\n"
7d98e08c 7520 "vm_enabled:%d\r\n"
a0f643ea 7521 "role:%s\r\n"
ed9b544e 7522 ,REDIS_VERSION,
5436146c 7523 REDIS_GIT_SHA1,
274e45e3 7524 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
f1017b3f 7525 (sizeof(long) == 8) ? "64" : "32",
7a932b74 7526 aeGetApiName(),
0d7170a4 7527 (long) getpid(),
a0f643ea 7528 uptime,
7529 uptime/(3600*24),
ed9b544e 7530 listLength(server.clients)-listLength(server.slaves),
7531 listLength(server.slaves),
d5d55fc3 7532 server.blpop_blocked_clients,
b72f6a4b 7533 zmalloc_used_memory(),
ec6c7a1d 7534 hmem,
ed9b544e 7535 server.dirty,
9d65a1bb 7536 server.bgsavechildpid != -1,
ed9b544e 7537 server.lastsave,
b3fad521 7538 server.bgrewritechildpid != -1,
ed9b544e 7539 server.stat_numconnections,
7540 server.stat_numcommands,
2a6a2ed1 7541 server.stat_expiredkeys,
55a8298f 7542 server.hash_max_zipmap_entries,
7543 server.hash_max_zipmap_value,
ffc6b7f8 7544 dictSize(server.pubsub_channels),
7545 listLength(server.pubsub_patterns),
7d98e08c 7546 server.vm_enabled != 0,
a0f643ea 7547 server.masterhost == NULL ? "master" : "slave"
ed9b544e 7548 );
a0f643ea 7549 if (server.masterhost) {
7550 info = sdscatprintf(info,
7551 "master_host:%s\r\n"
7552 "master_port:%d\r\n"
7553 "master_link_status:%s\r\n"
7554 "master_last_io_seconds_ago:%d\r\n"
7555 ,server.masterhost,
7556 server.masterport,
7557 (server.replstate == REDIS_REPL_CONNECTED) ?
7558 "up" : "down",
f72b934d 7559 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 7560 );
7561 }
7d98e08c 7562 if (server.vm_enabled) {
1064ef87 7563 lockThreadedIO();
7d98e08c 7564 info = sdscatprintf(info,
7565 "vm_conf_max_memory:%llu\r\n"
7566 "vm_conf_page_size:%llu\r\n"
7567 "vm_conf_pages:%llu\r\n"
7568 "vm_stats_used_pages:%llu\r\n"
7569 "vm_stats_swapped_objects:%llu\r\n"
7570 "vm_stats_swappin_count:%llu\r\n"
7571 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 7572 "vm_stats_io_newjobs_len:%lu\r\n"
7573 "vm_stats_io_processing_len:%lu\r\n"
7574 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 7575 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 7576 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 7577 ,(unsigned long long) server.vm_max_memory,
7578 (unsigned long long) server.vm_page_size,
7579 (unsigned long long) server.vm_pages,
7580 (unsigned long long) server.vm_stats_used_pages,
7581 (unsigned long long) server.vm_stats_swapped_objects,
7582 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 7583 (unsigned long long) server.vm_stats_swapouts,
7584 (unsigned long) listLength(server.io_newjobs),
7585 (unsigned long) listLength(server.io_processing),
7586 (unsigned long) listLength(server.io_processed),
d5d55fc3 7587 (unsigned long) server.io_active_threads,
7588 (unsigned long) server.vm_blocked_clients
7d98e08c 7589 );
1064ef87 7590 unlockThreadedIO();
7d98e08c 7591 }
c3cb078d 7592 for (j = 0; j < server.dbnum; j++) {
7593 long long keys, vkeys;
7594
7595 keys = dictSize(server.db[j].dict);
7596 vkeys = dictSize(server.db[j].expires);
7597 if (keys || vkeys) {
9d65a1bb 7598 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 7599 j, keys, vkeys);
7600 }
7601 }
1c85b79f 7602 return info;
7603}
7604
7605static void infoCommand(redisClient *c) {
7606 sds info = genRedisInfoString();
83c6a618 7607 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7608 (unsigned long)sdslen(info)));
ed9b544e 7609 addReplySds(c,info);
70003d28 7610 addReply(c,shared.crlf);
ed9b544e 7611}
7612
3305306f 7613static void monitorCommand(redisClient *c) {
7614 /* ignore MONITOR if aleady slave or in monitor mode */
7615 if (c->flags & REDIS_SLAVE) return;
7616
7617 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7618 c->slaveseldb = 0;
6b47e12e 7619 listAddNodeTail(server.monitors,c);
3305306f 7620 addReply(c,shared.ok);
7621}
7622
7623/* ================================= Expire ================================= */
7624static int removeExpire(redisDb *db, robj *key) {
7625 if (dictDelete(db->expires,key) == DICT_OK) {
7626 return 1;
7627 } else {
7628 return 0;
7629 }
7630}
7631
7632static int setExpire(redisDb *db, robj *key, time_t when) {
7633 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7634 return 0;
7635 } else {
7636 incrRefCount(key);
7637 return 1;
7638 }
7639}
7640
bb32ede5 7641/* Return the expire time of the specified key, or -1 if no expire
7642 * is associated with this key (i.e. the key is non volatile) */
7643static time_t getExpire(redisDb *db, robj *key) {
7644 dictEntry *de;
7645
7646 /* No expire? return ASAP */
7647 if (dictSize(db->expires) == 0 ||
7648 (de = dictFind(db->expires,key)) == NULL) return -1;
7649
7650 return (time_t) dictGetEntryVal(de);
7651}
7652
3305306f 7653static int expireIfNeeded(redisDb *db, robj *key) {
7654 time_t when;
7655 dictEntry *de;
7656
7657 /* No expire? return ASAP */
7658 if (dictSize(db->expires) == 0 ||
7659 (de = dictFind(db->expires,key)) == NULL) return 0;
7660
7661 /* Lookup the expire */
7662 when = (time_t) dictGetEntryVal(de);
7663 if (time(NULL) <= when) return 0;
7664
7665 /* Delete the key */
7666 dictDelete(db->expires,key);
2a6a2ed1 7667 server.stat_expiredkeys++;
3305306f 7668 return dictDelete(db->dict,key) == DICT_OK;
7669}
7670
7671static int deleteIfVolatile(redisDb *db, robj *key) {
7672 dictEntry *de;
7673
7674 /* No expire? return ASAP */
7675 if (dictSize(db->expires) == 0 ||
7676 (de = dictFind(db->expires,key)) == NULL) return 0;
7677
7678 /* Delete the key */
0c66a471 7679 server.dirty++;
2a6a2ed1 7680 server.stat_expiredkeys++;
3305306f 7681 dictDelete(db->expires,key);
7682 return dictDelete(db->dict,key) == DICT_OK;
7683}
7684
bbe025e0 7685static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 7686 dictEntry *de;
bbe025e0
AM
7687 time_t seconds;
7688
bd79a6bd 7689 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
bbe025e0
AM
7690
7691 seconds -= offset;
3305306f 7692
802e8373 7693 de = dictFind(c->db->dict,key);
3305306f 7694 if (de == NULL) {
7695 addReply(c,shared.czero);
7696 return;
7697 }
d4dd6556 7698 if (seconds <= 0) {
43e5ccdf 7699 if (deleteKey(c->db,key)) server.dirty++;
7700 addReply(c, shared.cone);
3305306f 7701 return;
7702 } else {
7703 time_t when = time(NULL)+seconds;
802e8373 7704 if (setExpire(c->db,key,when)) {
3305306f 7705 addReply(c,shared.cone);
77423026 7706 server.dirty++;
7707 } else {
3305306f 7708 addReply(c,shared.czero);
77423026 7709 }
3305306f 7710 return;
7711 }
7712}
7713
802e8373 7714static void expireCommand(redisClient *c) {
bbe025e0 7715 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 7716}
7717
7718static void expireatCommand(redisClient *c) {
bbe025e0 7719 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 7720}
7721
fd88489a 7722static void ttlCommand(redisClient *c) {
7723 time_t expire;
7724 int ttl = -1;
7725
7726 expire = getExpire(c->db,c->argv[1]);
7727 if (expire != -1) {
7728 ttl = (int) (expire-time(NULL));
7729 if (ttl < 0) ttl = -1;
7730 }
7731 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7732}
7733
6e469882 7734/* ================================ MULTI/EXEC ============================== */
7735
7736/* Client state initialization for MULTI/EXEC */
7737static void initClientMultiState(redisClient *c) {
7738 c->mstate.commands = NULL;
7739 c->mstate.count = 0;
7740}
7741
7742/* Release all the resources associated with MULTI/EXEC state */
7743static void freeClientMultiState(redisClient *c) {
7744 int j;
7745
7746 for (j = 0; j < c->mstate.count; j++) {
7747 int i;
7748 multiCmd *mc = c->mstate.commands+j;
7749
7750 for (i = 0; i < mc->argc; i++)
7751 decrRefCount(mc->argv[i]);
7752 zfree(mc->argv);
7753 }
7754 zfree(c->mstate.commands);
7755}
7756
7757/* Add a new command into the MULTI commands queue */
7758static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7759 multiCmd *mc;
7760 int j;
7761
7762 c->mstate.commands = zrealloc(c->mstate.commands,
7763 sizeof(multiCmd)*(c->mstate.count+1));
7764 mc = c->mstate.commands+c->mstate.count;
7765 mc->cmd = cmd;
7766 mc->argc = c->argc;
7767 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7768 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7769 for (j = 0; j < c->argc; j++)
7770 incrRefCount(mc->argv[j]);
7771 c->mstate.count++;
7772}
7773
7774static void multiCommand(redisClient *c) {
6531c94d 7775 if (c->flags & REDIS_MULTI) {
7776 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7777 return;
7778 }
6e469882 7779 c->flags |= REDIS_MULTI;
36c548f0 7780 addReply(c,shared.ok);
6e469882 7781}
7782
18b6cb76
DJ
7783static void discardCommand(redisClient *c) {
7784 if (!(c->flags & REDIS_MULTI)) {
7785 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7786 return;
7787 }
7788
7789 freeClientMultiState(c);
7790 initClientMultiState(c);
7791 c->flags &= (~REDIS_MULTI);
7792 addReply(c,shared.ok);
7793}
7794
66c8853f 7795/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7796 * implememntation for more information. */
7797static void execCommandReplicateMulti(redisClient *c) {
7798 struct redisCommand *cmd;
7799 robj *multistring = createStringObject("MULTI",5);
7800
7801 cmd = lookupCommand("multi");
7802 if (server.appendonly)
7803 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7804 if (listLength(server.slaves))
7805 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7806 decrRefCount(multistring);
7807}
7808
6e469882 7809static void execCommand(redisClient *c) {
7810 int j;
7811 robj **orig_argv;
7812 int orig_argc;
7813
7814 if (!(c->flags & REDIS_MULTI)) {
7815 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7816 return;
7817 }
7818
37ab76c9 7819 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7820 * A failed EXEC will return a multi bulk nil object. */
7821 if (c->flags & REDIS_DIRTY_CAS) {
7822 freeClientMultiState(c);
7823 initClientMultiState(c);
7824 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7825 unwatchAllKeys(c);
7826 addReply(c,shared.nullmultibulk);
7827 return;
7828 }
7829
66c8853f 7830 /* Replicate a MULTI request now that we are sure the block is executed.
7831 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7832 * both the AOF and the replication link will have the same consistency
7833 * and atomicity guarantees. */
7834 execCommandReplicateMulti(c);
7835
7836 /* Exec all the queued commands */
1ad4d316 7837 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
6e469882 7838 orig_argv = c->argv;
7839 orig_argc = c->argc;
7840 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7841 for (j = 0; j < c->mstate.count; j++) {
7842 c->argc = c->mstate.commands[j].argc;
7843 c->argv = c->mstate.commands[j].argv;
7844 call(c,c->mstate.commands[j].cmd);
7845 }
7846 c->argv = orig_argv;
7847 c->argc = orig_argc;
7848 freeClientMultiState(c);
7849 initClientMultiState(c);
1ad4d316 7850 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
66c8853f 7851 /* Make sure the EXEC command is always replicated / AOF, since we
7852 * always send the MULTI command (we can't know beforehand if the
7853 * next operations will contain at least a modification to the DB). */
7854 server.dirty++;
6e469882 7855}
7856
4409877e 7857/* =========================== Blocking Operations ========================= */
7858
7859/* Currently Redis blocking operations support is limited to list POP ops,
7860 * so the current implementation is not fully generic, but it is also not
7861 * completely specific so it will not require a rewrite to support new
7862 * kind of blocking operations in the future.
7863 *
7864 * Still it's important to note that list blocking operations can be already
7865 * used as a notification mechanism in order to implement other blocking
7866 * operations at application level, so there must be a very strong evidence
7867 * of usefulness and generality before new blocking operations are implemented.
7868 *
7869 * This is how the current blocking POP works, we use BLPOP as example:
7870 * - If the user calls BLPOP and the key exists and contains a non empty list
7871 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7872 * if there is no need to block.
7873 * - If instead BLPOP is called and the key does not exist or the list is
7874 * empty we need to block. In order to do so we remove the notification for
7875 * new data to read in the client socket (so that we'll not serve new
7876 * requests if the blocking request is not served). Also we put the client
37ab76c9 7877 * in a dictionary (db->blocking_keys) mapping keys to a list of clients
4409877e 7878 * blocking for these keys.
7879 * - If a PUSH operation against a key with blocked clients waiting is
7880 * performed, we serve the first in the list: basically instead of pushing
7881 * the new element inside the list we return it to the (first / oldest)
7882 * blocking client, unblock the client, and remove it from the list.
7883 *
7884 * The above comment and the source code should be enough in order to understand
7885 * the implementation and modify / fix it later.
7886 */
7887
7888/* Set a client in blocking mode for the specified key, with the specified
7889 * timeout */
b177fd30 7890static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 7891 dictEntry *de;
7892 list *l;
b177fd30 7893 int j;
4409877e 7894
37ab76c9 7895 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7896 c->blocking_keys_num = numkeys;
4409877e 7897 c->blockingto = timeout;
b177fd30 7898 for (j = 0; j < numkeys; j++) {
7899 /* Add the key in the client structure, to map clients -> keys */
37ab76c9 7900 c->blocking_keys[j] = keys[j];
b177fd30 7901 incrRefCount(keys[j]);
4409877e 7902
b177fd30 7903 /* And in the other "side", to map keys -> clients */
37ab76c9 7904 de = dictFind(c->db->blocking_keys,keys[j]);
b177fd30 7905 if (de == NULL) {
7906 int retval;
7907
7908 /* For every key we take a list of clients blocked for it */
7909 l = listCreate();
37ab76c9 7910 retval = dictAdd(c->db->blocking_keys,keys[j],l);
b177fd30 7911 incrRefCount(keys[j]);
7912 assert(retval == DICT_OK);
7913 } else {
7914 l = dictGetEntryVal(de);
7915 }
7916 listAddNodeTail(l,c);
4409877e 7917 }
b177fd30 7918 /* Mark the client as a blocked client */
4409877e 7919 c->flags |= REDIS_BLOCKED;
d5d55fc3 7920 server.blpop_blocked_clients++;
4409877e 7921}
7922
7923/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 7924static void unblockClientWaitingData(redisClient *c) {
4409877e 7925 dictEntry *de;
7926 list *l;
b177fd30 7927 int j;
4409877e 7928
37ab76c9 7929 assert(c->blocking_keys != NULL);
b177fd30 7930 /* The client may wait for multiple keys, so unblock it for every key. */
37ab76c9 7931 for (j = 0; j < c->blocking_keys_num; j++) {
b177fd30 7932 /* Remove this client from the list of clients waiting for this key. */
37ab76c9 7933 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
b177fd30 7934 assert(de != NULL);
7935 l = dictGetEntryVal(de);
7936 listDelNode(l,listSearchKey(l,c));
7937 /* If the list is empty we need to remove it to avoid wasting memory */
7938 if (listLength(l) == 0)
37ab76c9 7939 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
7940 decrRefCount(c->blocking_keys[j]);
b177fd30 7941 }
7942 /* Cleanup the client structure */
37ab76c9 7943 zfree(c->blocking_keys);
7944 c->blocking_keys = NULL;
4409877e 7945 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 7946 server.blpop_blocked_clients--;
5921aa36 7947 /* We want to process data if there is some command waiting
b0d8747d 7948 * in the input buffer. Note that this is safe even if
7949 * unblockClientWaitingData() gets called from freeClient() because
7950 * freeClient() will be smart enough to call this function
7951 * *after* c->querybuf was set to NULL. */
4409877e 7952 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7953}
7954
7955/* This should be called from any function PUSHing into lists.
7956 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7957 * 'ele' is the element pushed.
7958 *
7959 * If the function returns 0 there was no client waiting for a list push
7960 * against this key.
7961 *
7962 * If the function returns 1 there was a client waiting for a list push
7963 * against this key, the element was passed to this client thus it's not
7964 * needed to actually add it to the list and the caller should return asap. */
7965static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7966 struct dictEntry *de;
7967 redisClient *receiver;
7968 list *l;
7969 listNode *ln;
7970
37ab76c9 7971 de = dictFind(c->db->blocking_keys,key);
4409877e 7972 if (de == NULL) return 0;
7973 l = dictGetEntryVal(de);
7974 ln = listFirst(l);
7975 assert(ln != NULL);
7976 receiver = ln->value;
4409877e 7977
b177fd30 7978 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 7979 addReplyBulk(receiver,key);
7980 addReplyBulk(receiver,ele);
b0d8747d 7981 unblockClientWaitingData(receiver);
4409877e 7982 return 1;
7983}
7984
7985/* Blocking RPOP/LPOP */
7986static void blockingPopGenericCommand(redisClient *c, int where) {
7987 robj *o;
7988 time_t timeout;
b177fd30 7989 int j;
4409877e 7990
b177fd30 7991 for (j = 1; j < c->argc-1; j++) {
7992 o = lookupKeyWrite(c->db,c->argv[j]);
7993 if (o != NULL) {
7994 if (o->type != REDIS_LIST) {
7995 addReply(c,shared.wrongtypeerr);
4409877e 7996 return;
b177fd30 7997 } else {
7998 list *list = o->ptr;
7999 if (listLength(list) != 0) {
8000 /* If the list contains elements fall back to the usual
8001 * non-blocking POP operation */
8002 robj *argv[2], **orig_argv;
8003 int orig_argc;
e0a62c7f 8004
b177fd30 8005 /* We need to alter the command arguments before to call
8006 * popGenericCommand() as the command takes a single key. */
8007 orig_argv = c->argv;
8008 orig_argc = c->argc;
8009 argv[1] = c->argv[j];
8010 c->argv = argv;
8011 c->argc = 2;
8012
8013 /* Also the return value is different, we need to output
8014 * the multi bulk reply header and the key name. The
8015 * "real" command will add the last element (the value)
8016 * for us. If this souds like an hack to you it's just
8017 * because it is... */
8018 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 8019 addReplyBulk(c,argv[1]);
b177fd30 8020 popGenericCommand(c,where);
8021
8022 /* Fix the client structure with the original stuff */
8023 c->argv = orig_argv;
8024 c->argc = orig_argc;
8025 return;
8026 }
4409877e 8027 }
8028 }
8029 }
8030 /* If the list is empty or the key does not exists we must block */
b177fd30 8031 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 8032 if (timeout > 0) timeout += time(NULL);
b177fd30 8033 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 8034}
8035
8036static void blpopCommand(redisClient *c) {
8037 blockingPopGenericCommand(c,REDIS_HEAD);
8038}
8039
8040static void brpopCommand(redisClient *c) {
8041 blockingPopGenericCommand(c,REDIS_TAIL);
8042}
8043
ed9b544e 8044/* =============================== Replication ============================= */
8045
a4d1ba9a 8046static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 8047 ssize_t nwritten, ret = size;
8048 time_t start = time(NULL);
8049
8050 timeout++;
8051 while(size) {
8052 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
8053 nwritten = write(fd,ptr,size);
8054 if (nwritten == -1) return -1;
8055 ptr += nwritten;
8056 size -= nwritten;
8057 }
8058 if ((time(NULL)-start) > timeout) {
8059 errno = ETIMEDOUT;
8060 return -1;
8061 }
8062 }
8063 return ret;
8064}
8065
a4d1ba9a 8066static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 8067 ssize_t nread, totread = 0;
8068 time_t start = time(NULL);
8069
8070 timeout++;
8071 while(size) {
8072 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
8073 nread = read(fd,ptr,size);
8074 if (nread == -1) return -1;
8075 ptr += nread;
8076 size -= nread;
8077 totread += nread;
8078 }
8079 if ((time(NULL)-start) > timeout) {
8080 errno = ETIMEDOUT;
8081 return -1;
8082 }
8083 }
8084 return totread;
8085}
8086
/* Read a line from 'fd' into 'ptr', one byte at a time via syncRead() (the
 * 'timeout' is passed to every single-byte read).
 *
 * 'size' is the capacity of the buffer at 'ptr': at most size-1 bytes are
 * stored and the buffer is always null terminated when size >= 1. Reading
 * stops at the first '\n', which is not stored; a '\r' right before it is
 * replaced by the terminator. If the buffer fills up before a newline is
 * seen the function returns what was read so far.
 *
 * Returns the number of bytes stored before the newline (a stripped '\r'
 * is still counted), 0 if 'size' leaves no room for data, or -1 if
 * syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0;
    *ptr = '\0';
    size--; /* Reserve room for the null terminator. */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        }
        *ptr++ = c;
        *ptr = '\0';
        nread++;
        /* Account for the byte just stored, so that an overlong line can
         * never be written past the end of the buffer. */
        size--;
    }
    return nread;
}
8107
8108static void syncCommand(redisClient *c) {
40d224a9 8109 /* ignore SYNC if aleady slave or in monitor mode */
8110 if (c->flags & REDIS_SLAVE) return;
8111
8112 /* SYNC can't be issued when the server has pending data to send to
8113 * the client about already issued commands. We need a fresh reply
8114 * buffer registering the differences between the BGSAVE and the current
8115 * dataset, so that we can copy to other slaves if needed. */
8116 if (listLength(c->reply) != 0) {
8117 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
8118 return;
8119 }
8120
8121 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
8122 /* Here we need to check if there is a background saving operation
8123 * in progress, or if it is required to start one */
9d65a1bb 8124 if (server.bgsavechildpid != -1) {
40d224a9 8125 /* Ok a background save is in progress. Let's check if it is a good
8126 * one for replication, i.e. if there is another slave that is
8127 * registering differences since the server forked to save */
8128 redisClient *slave;
8129 listNode *ln;
c7df85a4 8130 listIter li;
40d224a9 8131
c7df85a4 8132 listRewind(server.slaves,&li);
8133 while((ln = listNext(&li))) {
40d224a9 8134 slave = ln->value;
8135 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 8136 }
8137 if (ln) {
8138 /* Perfect, the server is already registering differences for
8139 * another slave. Set the right state, and copy the buffer. */
8140 listRelease(c->reply);
8141 c->reply = listDup(slave->reply);
40d224a9 8142 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8143 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
8144 } else {
8145 /* No way, we need to wait for the next BGSAVE in order to
8146 * register differences */
8147 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8148 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
8149 }
8150 } else {
8151 /* Ok we don't have a BGSAVE in progress, let's start one */
8152 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
8153 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8154 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
8155 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
8156 return;
8157 }
8158 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8159 }
6208b3a7 8160 c->repldbfd = -1;
40d224a9 8161 c->flags |= REDIS_SLAVE;
8162 c->slaveseldb = 0;
6b47e12e 8163 listAddNodeTail(server.slaves,c);
40d224a9 8164 return;
8165}
8166
6208b3a7 8167static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
8168 redisClient *slave = privdata;
8169 REDIS_NOTUSED(el);
8170 REDIS_NOTUSED(mask);
8171 char buf[REDIS_IOBUF_LEN];
8172 ssize_t nwritten, buflen;
8173
8174 if (slave->repldboff == 0) {
8175 /* Write the bulk write count before to transfer the DB. In theory here
8176 * we don't know how much room there is in the output buffer of the
8177 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
8178 * operations) will never be smaller than the few bytes we need. */
8179 sds bulkcount;
8180
8181 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
8182 slave->repldbsize);
8183 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
8184 {
8185 sdsfree(bulkcount);
8186 freeClient(slave);
8187 return;
8188 }
8189 sdsfree(bulkcount);
8190 }
8191 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
8192 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
8193 if (buflen <= 0) {
8194 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
8195 (buflen == 0) ? "premature EOF" : strerror(errno));
8196 freeClient(slave);
8197 return;
8198 }
8199 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 8200 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 8201 strerror(errno));
8202 freeClient(slave);
8203 return;
8204 }
8205 slave->repldboff += nwritten;
8206 if (slave->repldboff == slave->repldbsize) {
8207 close(slave->repldbfd);
8208 slave->repldbfd = -1;
8209 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8210 slave->replstate = REDIS_REPL_ONLINE;
8211 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 8212 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 8213 freeClient(slave);
8214 return;
8215 }
8216 addReplySds(slave,sdsempty());
8217 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
8218 }
8219}
ed9b544e 8220
a3b21203 8221/* This function is called at the end of every backgrond saving.
8222 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
8223 * otherwise REDIS_ERR is passed to the function.
8224 *
8225 * The goal of this function is to handle slaves waiting for a successful
8226 * background saving in order to perform non-blocking synchronization. */
8227static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 8228 listNode *ln;
8229 int startbgsave = 0;
c7df85a4 8230 listIter li;
ed9b544e 8231
c7df85a4 8232 listRewind(server.slaves,&li);
8233 while((ln = listNext(&li))) {
6208b3a7 8234 redisClient *slave = ln->value;
ed9b544e 8235
6208b3a7 8236 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8237 startbgsave = 1;
8238 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8239 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 8240 struct redis_stat buf;
e0a62c7f 8241
6208b3a7 8242 if (bgsaveerr != REDIS_OK) {
8243 freeClient(slave);
8244 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8245 continue;
8246 }
8247 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 8248 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 8249 freeClient(slave);
8250 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8251 continue;
8252 }
8253 slave->repldboff = 0;
8254 slave->repldbsize = buf.st_size;
8255 slave->replstate = REDIS_REPL_SEND_BULK;
8256 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 8257 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 8258 freeClient(slave);
8259 continue;
8260 }
8261 }
ed9b544e 8262 }
6208b3a7 8263 if (startbgsave) {
8264 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 8265 listIter li;
8266
8267 listRewind(server.slaves,&li);
6208b3a7 8268 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 8269 while((ln = listNext(&li))) {
6208b3a7 8270 redisClient *slave = ln->value;
ed9b544e 8271
6208b3a7 8272 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8273 freeClient(slave);
8274 }
8275 }
8276 }
ed9b544e 8277}
8278
8279static int syncWithMaster(void) {
d0ccebcf 8280 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 8281 long dumpsize;
ed9b544e 8282 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 8283 int dfd, maxtries = 5;
ed9b544e 8284
8285 if (fd == -1) {
8286 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8287 strerror(errno));
8288 return REDIS_ERR;
8289 }
d0ccebcf 8290
8291 /* AUTH with the master if required. */
8292 if(server.masterauth) {
8293 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8294 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8295 close(fd);
8296 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8297 strerror(errno));
8298 return REDIS_ERR;
8299 }
8300 /* Read the AUTH result. */
8301 if (syncReadLine(fd,buf,1024,3600) == -1) {
8302 close(fd);
8303 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8304 strerror(errno));
8305 return REDIS_ERR;
8306 }
8307 if (buf[0] != '+') {
8308 close(fd);
8309 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8310 return REDIS_ERR;
8311 }
8312 }
8313
ed9b544e 8314 /* Issue the SYNC command */
8315 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8316 close(fd);
8317 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8318 strerror(errno));
8319 return REDIS_ERR;
8320 }
8321 /* Read the bulk write count */
8c4d91fc 8322 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 8323 close(fd);
8324 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8325 strerror(errno));
8326 return REDIS_ERR;
8327 }
4aa701c1 8328 if (buf[0] != '$') {
8329 close(fd);
8330 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8331 return REDIS_ERR;
8332 }
18e61fa2 8333 dumpsize = strtol(buf+1,NULL,10);
8334 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 8335 /* Read the bulk write data on a temp file */
8c5abee8 8336 while(maxtries--) {
8337 snprintf(tmpfile,256,
8338 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8339 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8340 if (dfd != -1) break;
5de9ad7c 8341 sleep(1);
8c5abee8 8342 }
ed9b544e 8343 if (dfd == -1) {
8344 close(fd);
8345 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8346 return REDIS_ERR;
8347 }
8348 while(dumpsize) {
8349 int nread, nwritten;
8350
8351 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8352 if (nread == -1) {
8353 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8354 strerror(errno));
8355 close(fd);
8356 close(dfd);
8357 return REDIS_ERR;
8358 }
8359 nwritten = write(dfd,buf,nread);
8360 if (nwritten == -1) {
8361 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8362 close(fd);
8363 close(dfd);
8364 return REDIS_ERR;
8365 }
8366 dumpsize -= nread;
8367 }
8368 close(dfd);
8369 if (rename(tmpfile,server.dbfilename) == -1) {
8370 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8371 unlink(tmpfile);
8372 close(fd);
8373 return REDIS_ERR;
8374 }
8375 emptyDb();
f78fd11b 8376 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 8377 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8378 close(fd);
8379 return REDIS_ERR;
8380 }
8381 server.master = createClient(fd);
8382 server.master->flags |= REDIS_MASTER;
179b3952 8383 server.master->authenticated = 1;
ed9b544e 8384 server.replstate = REDIS_REPL_CONNECTED;
8385 return REDIS_OK;
8386}
8387
321b0e13 8388static void slaveofCommand(redisClient *c) {
8389 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8390 !strcasecmp(c->argv[2]->ptr,"one")) {
8391 if (server.masterhost) {
8392 sdsfree(server.masterhost);
8393 server.masterhost = NULL;
8394 if (server.master) freeClient(server.master);
8395 server.replstate = REDIS_REPL_NONE;
8396 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8397 }
8398 } else {
8399 sdsfree(server.masterhost);
8400 server.masterhost = sdsdup(c->argv[1]->ptr);
8401 server.masterport = atoi(c->argv[2]->ptr);
8402 if (server.master) freeClient(server.master);
8403 server.replstate = REDIS_REPL_CONNECT;
8404 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8405 server.masterhost, server.masterport);
8406 }
8407 addReply(c,shared.ok);
8408}
8409
3fd78bcd 8410/* ============================ Maxmemory directive ======================== */
8411
a5819310 8412/* Try to free one object form the pre-allocated objects free list.
8413 * This is useful under low mem conditions as by default we take 1 million
8414 * free objects allocated. On success REDIS_OK is returned, otherwise
8415 * REDIS_ERR. */
8416static int tryFreeOneObjectFromFreelist(void) {
f870935d 8417 robj *o;
8418
a5819310 8419 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8420 if (listLength(server.objfreelist)) {
8421 listNode *head = listFirst(server.objfreelist);
8422 o = listNodeValue(head);
8423 listDelNode(server.objfreelist,head);
8424 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8425 zfree(o);
8426 return REDIS_OK;
8427 } else {
8428 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8429 return REDIS_ERR;
8430 }
f870935d 8431}
8432
3fd78bcd 8433/* This function gets called when 'maxmemory' is set on the config file to limit
8434 * the max memory used by the server, and we are out of memory.
8435 * This function will try to, in order:
8436 *
8437 * - Free objects from the free list
8438 * - Try to remove keys with an EXPIRE set
8439 *
8440 * It is not possible to free enough memory to reach used-memory < maxmemory
8441 * the server will start refusing commands that will enlarge even more the
8442 * memory usage.
8443 */
8444static void freeMemoryIfNeeded(void) {
8445 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 8446 int j, k, freed = 0;
8447
8448 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8449 for (j = 0; j < server.dbnum; j++) {
8450 int minttl = -1;
8451 robj *minkey = NULL;
8452 struct dictEntry *de;
8453
8454 if (dictSize(server.db[j].expires)) {
8455 freed = 1;
8456 /* From a sample of three keys drop the one nearest to
8457 * the natural expire */
8458 for (k = 0; k < 3; k++) {
8459 time_t t;
8460
8461 de = dictGetRandomKey(server.db[j].expires);
8462 t = (time_t) dictGetEntryVal(de);
8463 if (minttl == -1 || t < minttl) {
8464 minkey = dictGetEntryKey(de);
8465 minttl = t;
3fd78bcd 8466 }
3fd78bcd 8467 }
a5819310 8468 deleteKey(server.db+j,minkey);
3fd78bcd 8469 }
3fd78bcd 8470 }
a5819310 8471 if (!freed) return; /* nothing to free... */
3fd78bcd 8472 }
8473}
8474
f80dff62 8475/* ============================== Append Only file ========================== */
8476
28ed1f33 8477/* Write the append only file buffer on disk.
8478 *
8479 * Since we are required to write the AOF before replying to the client,
8480 * and the only way the client socket can get a write is entering when the
8481 * the event loop, we accumulate all the AOF writes in a memory
8482 * buffer and write it on disk using this function just before entering
8483 * the event loop again. */
8484static void flushAppendOnlyFile(void) {
8485 time_t now;
8486 ssize_t nwritten;
8487
8488 if (sdslen(server.aofbuf) == 0) return;
8489
8490 /* We want to perform a single write. This should be guaranteed atomic
8491 * at least if the filesystem we are writing is a real physical one.
8492 * While this will save us against the server being killed I don't think
8493 * there is much to do about the whole server stopping for power problems
8494 * or alike */
8495 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8496 if (nwritten != (signed)sdslen(server.aofbuf)) {
8497 /* Ooops, we are in troubles. The best thing to do for now is
8498 * aborting instead of giving the illusion that everything is
8499 * working as expected. */
8500 if (nwritten == -1) {
8501 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8502 } else {
8503 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8504 }
8505 exit(1);
8506 }
8507 sdsfree(server.aofbuf);
8508 server.aofbuf = sdsempty();
8509
8510 /* Fsync if needed */
8511 now = time(NULL);
8512 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8513 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8514 now-server.lastfsync > 1))
8515 {
8516 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8517 * flushing metadata. */
8518 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8519 server.lastfsync = now;
8520 }
8521}
8522
9376e434
PN
8523static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8524 int j;
8525 buf = sdscatprintf(buf,"*%d\r\n",argc);
8526 for (j = 0; j < argc; j++) {
8527 robj *o = getDecodedObject(argv[j]);
8528 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8529 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8530 buf = sdscatlen(buf,"\r\n",2);
8531 decrRefCount(o);
8532 }
8533 return buf;
8534}
8535
8536static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8537 int argc = 3;
8538 long when;
8539 robj *argv[3];
8540
8541 /* Make sure we can use strtol */
8542 seconds = getDecodedObject(seconds);
8543 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8544 decrRefCount(seconds);
8545
8546 argv[0] = createStringObject("EXPIREAT",8);
8547 argv[1] = key;
8548 argv[2] = createObject(REDIS_STRING,
8549 sdscatprintf(sdsempty(),"%ld",when));
8550 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8551 decrRefCount(argv[0]);
8552 decrRefCount(argv[2]);
8553 return buf;
8554}
8555
f80dff62 8556static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8557 sds buf = sdsempty();
f80dff62 8558 robj *tmpargv[3];
8559
8560 /* The DB this command was targetting is not the same as the last command
8561 * we appendend. To issue a SELECT command is needed. */
8562 if (dictid != server.appendseldb) {
8563 char seldb[64];
8564
8565 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 8566 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 8567 (unsigned long)strlen(seldb),seldb);
f80dff62 8568 server.appendseldb = dictid;
8569 }
8570
f80dff62 8571 if (cmd->proc == expireCommand) {
9376e434
PN
8572 /* Translate EXPIRE into EXPIREAT */
8573 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8574 } else if (cmd->proc == setexCommand) {
8575 /* Translate SETEX to SET and EXPIREAT */
8576 tmpargv[0] = createStringObject("SET",3);
f80dff62 8577 tmpargv[1] = argv[1];
9376e434
PN
8578 tmpargv[2] = argv[3];
8579 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8580 decrRefCount(tmpargv[0]);
8581 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8582 } else {
8583 buf = catAppendOnlyGenericCommand(buf,argc,argv);
f80dff62 8584 }
8585
28ed1f33 8586 /* Append to the AOF buffer. This will be flushed on disk just before
8587 * of re-entering the event loop, so before the client will get a
8588 * positive reply about the operation performed. */
8589 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8590
85a83172 8591 /* If a background append only file rewriting is in progress we want to
8592 * accumulate the differences between the child DB and the current one
8593 * in a buffer, so that when the child process will do its work we
8594 * can append the differences to the new append only file. */
8595 if (server.bgrewritechildpid != -1)
8596 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8597
8598 sdsfree(buf);
f80dff62 8599}
8600
8601/* In Redis commands are always executed in the context of a client, so in
8602 * order to load the append only file we need to create a fake client. */
8603static struct redisClient *createFakeClient(void) {
8604 struct redisClient *c = zmalloc(sizeof(*c));
8605
8606 selectDb(c,0);
8607 c->fd = -1;
8608 c->querybuf = sdsempty();
8609 c->argc = 0;
8610 c->argv = NULL;
8611 c->flags = 0;
9387d17d 8612 /* We set the fake client as a slave waiting for the synchronization
8613 * so that Redis will not try to send replies to this client. */
8614 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 8615 c->reply = listCreate();
8616 listSetFreeMethod(c->reply,decrRefCount);
8617 listSetDupMethod(c->reply,dupClientReplyValue);
4132ad8d 8618 initClientMultiState(c);
f80dff62 8619 return c;
8620}
8621
8622static void freeFakeClient(struct redisClient *c) {
8623 sdsfree(c->querybuf);
8624 listRelease(c->reply);
4132ad8d 8625 freeClientMultiState(c);
f80dff62 8626 zfree(c);
8627}
8628
8629/* Replay the append log file. On error REDIS_OK is returned. On non fatal
8630 * error (the append only file is zero-length) REDIS_ERR is returned. On
8631 * fatal error an error message is logged and the program exists. */
8632int loadAppendOnlyFile(char *filename) {
8633 struct redisClient *fakeClient;
8634 FILE *fp = fopen(filename,"r");
8635 struct redis_stat sb;
b492cf00 8636 unsigned long long loadedkeys = 0;
4132ad8d 8637 int appendonly = server.appendonly;
f80dff62 8638
8639 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8640 return REDIS_ERR;
8641
8642 if (fp == NULL) {
8643 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8644 exit(1);
8645 }
8646
4132ad8d
PN
8647 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8648 * to the same file we're about to read. */
8649 server.appendonly = 0;
8650
f80dff62 8651 fakeClient = createFakeClient();
8652 while(1) {
8653 int argc, j;
8654 unsigned long len;
8655 robj **argv;
8656 char buf[128];
8657 sds argsds;
8658 struct redisCommand *cmd;
8659
8660 if (fgets(buf,sizeof(buf),fp) == NULL) {
8661 if (feof(fp))
8662 break;
8663 else
8664 goto readerr;
8665 }
8666 if (buf[0] != '*') goto fmterr;
8667 argc = atoi(buf+1);
8668 argv = zmalloc(sizeof(robj*)*argc);
8669 for (j = 0; j < argc; j++) {
8670 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8671 if (buf[0] != '$') goto fmterr;
8672 len = strtol(buf+1,NULL,10);
8673 argsds = sdsnewlen(NULL,len);
0f151ef1 8674 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 8675 argv[j] = createObject(REDIS_STRING,argsds);
8676 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8677 }
8678
8679 /* Command lookup */
8680 cmd = lookupCommand(argv[0]->ptr);
8681 if (!cmd) {
8682 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8683 exit(1);
8684 }
bdcb92f2 8685 /* Try object encoding */
f80dff62 8686 if (cmd->flags & REDIS_CMD_BULK)
05df7621 8687 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 8688 /* Run the command in the context of a fake client */
8689 fakeClient->argc = argc;
8690 fakeClient->argv = argv;
8691 cmd->proc(fakeClient);
8692 /* Discard the reply objects list from the fake client */
8693 while(listLength(fakeClient->reply))
8694 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8695 /* Clean up, ready for the next command */
8696 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8697 zfree(argv);
b492cf00 8698 /* Handle swapping while loading big datasets when VM is on */
8699 loadedkeys++;
8700 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8701 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 8702 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 8703 }
8704 }
f80dff62 8705 }
4132ad8d
PN
8706
8707 /* This point can only be reached when EOF is reached without errors.
8708 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8709 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8710
f80dff62 8711 fclose(fp);
8712 freeFakeClient(fakeClient);
4132ad8d 8713 server.appendonly = appendonly;
f80dff62 8714 return REDIS_OK;
8715
8716readerr:
8717 if (feof(fp)) {
8718 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8719 } else {
8720 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8721 }
8722 exit(1);
8723fmterr:
8724 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8725 exit(1);
8726}
8727
/* Emit 'len' bytes starting at 's' to 'fp' as a binary-safe bulk string:
 *
 *   $<count>\r\n<payload>\r\n
 *
 * Returns 1 if every fwrite() succeeded, 0 as soon as one of them fails. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char header[128];
    int hlen = 0;

    /* Build the "$<count>\r\n" prefix. */
    header[hlen++] = '$';
    hlen += ll2string(header+hlen,sizeof(header)-1,len);
    header[hlen++] = '\r';
    header[hlen++] = '\n';

    if (fwrite(header,hlen,1,fp) == 0) return 0;
    /* An empty payload is legal: skip the write instead of asking fwrite()
     * for a zero sized item. */
    if (len > 0) {
        if (fwrite(s,len,1,fp) == 0) return 0;
    }
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8742
/* Emit the double 'd' to 'fp' in bulk format, $<count>\r\n<payload>\r\n,
 * where the payload is the value formatted with "%.17g".
 *
 * Returns 1 on success, 0 if one of the writes fails. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload buffer already carries the trailing CRLF, so the count
     * announced in the header is its length minus those two bytes. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);

    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
    if (fwrite(payload,paylen,1,fp) == 0) return 0;
    return 1;
}
8753
/* Emit the integer 'l' to 'fp' in bulk format, $<count>\r\n<payload>\r\n.
 * Header and payload are assembled in a single buffer and written with one
 * fwrite() call.
 *
 * Returns 1 on success, 0 if the write fails. */
static int fwriteBulkLongLong(FILE *fp, long long l) {
    char digits[128], out[128];
    unsigned int ndigits, outlen;

    ndigits = ll2string(digits,32,l);
    outlen = snprintf(out,sizeof(out),"$%u\r\n%s\r\n",ndigits,digits);
    return (fwrite(out,outlen,1,fp) == 0) ? 0 : 1;
}
8763
9eaef89f
PN
8764/* Delegate writing an object to writing a bulk string or bulk long long. */
8765static int fwriteBulkObject(FILE *fp, robj *obj) {
8766 /* Avoid using getDecodedObject to help copy-on-write (we are often
8767 * in a child process when this function is called). */
8768 if (obj->encoding == REDIS_ENCODING_INT) {
8769 return fwriteBulkLongLong(fp,(long)obj->ptr);
8770 } else if (obj->encoding == REDIS_ENCODING_RAW) {
8771 return fwriteBulkString(fp,obj->ptr,sdslen(obj->ptr));
8772 } else {
8773 redisPanic("Unknown string encoding");
8774 }
8775}
8776
9d65a1bb 8777/* Write a sequence of commands able to fully rebuild the dataset into
8778 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8779static int rewriteAppendOnlyFile(char *filename) {
8780 dictIterator *di = NULL;
8781 dictEntry *de;
8782 FILE *fp;
8783 char tmpfile[256];
8784 int j;
8785 time_t now = time(NULL);
8786
8787 /* Note that we have to use a different temp name here compared to the
8788 * one used by rewriteAppendOnlyFileBackground() function. */
8789 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8790 fp = fopen(tmpfile,"w");
8791 if (!fp) {
8792 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8793 return REDIS_ERR;
8794 }
8795 for (j = 0; j < server.dbnum; j++) {
8796 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8797 redisDb *db = server.db+j;
8798 dict *d = db->dict;
8799 if (dictSize(d) == 0) continue;
8800 di = dictGetIterator(d);
8801 if (!di) {
8802 fclose(fp);
8803 return REDIS_ERR;
8804 }
8805
8806 /* SELECT the new DB */
8807 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
9eaef89f 8808 if (fwriteBulkLongLong(fp,j) == 0) goto werr;
9d65a1bb 8809
8810 /* Iterate this DB writing every entry */
8811 while((de = dictNext(di)) != NULL) {
e7546c63 8812 robj *key, *o;
8813 time_t expiretime;
8814 int swapped;
8815
8816 key = dictGetEntryKey(de);
b9bc0eef 8817 /* If the value for this key is swapped, load a preview in memory.
8818 * We use a "swapped" flag to remember if we need to free the
8819 * value object instead to just increment the ref count anyway
8820 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 8821 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8822 key->storage == REDIS_VM_SWAPPING) {
e7546c63 8823 o = dictGetEntryVal(de);
8824 swapped = 0;
8825 } else {
8826 o = vmPreviewObject(key);
e7546c63 8827 swapped = 1;
8828 }
8829 expiretime = getExpire(db,key);
9d65a1bb 8830
8831 /* Save the key and associated value */
9d65a1bb 8832 if (o->type == REDIS_STRING) {
8833 /* Emit a SET command */
8834 char cmd[]="*3\r\n$3\r\nSET\r\n";
8835 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8836 /* Key and value */
9c8e3cee 8837 if (fwriteBulkObject(fp,key) == 0) goto werr;
8838 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 8839 } else if (o->type == REDIS_LIST) {
8840 /* Emit the RPUSHes needed to rebuild the list */
6ddc908a
PN
8841 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8842 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
8843 unsigned char *zl = o->ptr;
8844 unsigned char *p = ziplistIndex(zl,0);
8845 unsigned char *vstr;
8846 unsigned int vlen;
8847 long long vlong;
8848
8849 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
8850 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8851 if (fwriteBulkObject(fp,key) == 0) goto werr;
8852 if (vstr) {
8853 if (fwriteBulkString(fp,(char*)vstr,vlen) == 0)
8854 goto werr;
8855 } else {
8856 if (fwriteBulkLongLong(fp,vlong) == 0)
8857 goto werr;
8858 }
8859 p = ziplistNext(zl,p);
8860 }
8861 } else if (o->encoding == REDIS_ENCODING_LIST) {
8862 list *list = o->ptr;
8863 listNode *ln;
8864 listIter li;
8865
8866 listRewind(list,&li);
8867 while((ln = listNext(&li))) {
8868 robj *eleobj = listNodeValue(ln);
8869
8870 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8871 if (fwriteBulkObject(fp,key) == 0) goto werr;
8872 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
8873 }
8874 } else {
8875 redisPanic("Unknown list encoding");
9d65a1bb 8876 }
8877 } else if (o->type == REDIS_SET) {
8878 /* Emit the SADDs needed to rebuild the set */
8879 dict *set = o->ptr;
8880 dictIterator *di = dictGetIterator(set);
8881 dictEntry *de;
8882
8883 while((de = dictNext(di)) != NULL) {
8884 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8885 robj *eleobj = dictGetEntryKey(de);
8886
8887 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8888 if (fwriteBulkObject(fp,key) == 0) goto werr;
8889 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8890 }
8891 dictReleaseIterator(di);
8892 } else if (o->type == REDIS_ZSET) {
8893 /* Emit the ZADDs needed to rebuild the sorted set */
8894 zset *zs = o->ptr;
8895 dictIterator *di = dictGetIterator(zs->dict);
8896 dictEntry *de;
8897
8898 while((de = dictNext(di)) != NULL) {
8899 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8900 robj *eleobj = dictGetEntryKey(de);
8901 double *score = dictGetEntryVal(de);
8902
8903 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8904 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8905 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 8906 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8907 }
8908 dictReleaseIterator(di);
9c8e3cee 8909 } else if (o->type == REDIS_HASH) {
8910 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8911
8912 /* Emit the HSETs needed to rebuild the hash */
8913 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8914 unsigned char *p = zipmapRewind(o->ptr);
8915 unsigned char *field, *val;
8916 unsigned int flen, vlen;
8917
8918 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8919 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8920 if (fwriteBulkObject(fp,key) == 0) goto werr;
8921 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8922 return -1;
8923 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8924 return -1;
8925 }
8926 } else {
8927 dictIterator *di = dictGetIterator(o->ptr);
8928 dictEntry *de;
8929
8930 while((de = dictNext(di)) != NULL) {
8931 robj *field = dictGetEntryKey(de);
8932 robj *val = dictGetEntryVal(de);
8933
8934 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8935 if (fwriteBulkObject(fp,key) == 0) goto werr;
8936 if (fwriteBulkObject(fp,field) == -1) return -1;
8937 if (fwriteBulkObject(fp,val) == -1) return -1;
8938 }
8939 dictReleaseIterator(di);
8940 }
9d65a1bb 8941 } else {
f83c6cb5 8942 redisPanic("Unknown object type");
9d65a1bb 8943 }
8944 /* Save the expire time */
8945 if (expiretime != -1) {
e96e4fbf 8946 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 8947 /* If this key is already expired skip it */
8948 if (expiretime < now) continue;
8949 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8950 if (fwriteBulkObject(fp,key) == 0) goto werr;
9eaef89f 8951 if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr;
9d65a1bb 8952 }
b9bc0eef 8953 if (swapped) decrRefCount(o);
9d65a1bb 8954 }
8955 dictReleaseIterator(di);
8956 }
8957
8958 /* Make sure data will not remain on the OS's output buffers */
8959 fflush(fp);
8960 fsync(fileno(fp));
8961 fclose(fp);
e0a62c7f 8962
9d65a1bb 8963 /* Use RENAME to make sure the DB file is changed atomically only
8964 * if the generate DB file is ok. */
8965 if (rename(tmpfile,filename) == -1) {
8966 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8967 unlink(tmpfile);
8968 return REDIS_ERR;
8969 }
8970 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8971 return REDIS_OK;
8972
8973werr:
8974 fclose(fp);
8975 unlink(tmpfile);
e96e4fbf 8976 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 8977 if (di) dictReleaseIterator(di);
8978 return REDIS_ERR;
8979}
8980
8981/* This is how rewriting of the append only file in background works:
8982 *
8983 * 1) The user calls BGREWRITEAOF
8984 * 2) Redis calls this function, that forks():
8985 * 2a) the child rewrite the append only file in a temp file.
8986 * 2b) the parent accumulates differences in server.bgrewritebuf.
8987 * 3) When the child finished '2a' exists.
8988 * 4) The parent will trap the exit code, if it's OK, will append the
8989 * data accumulated into server.bgrewritebuf into the temp file, and
8990 * finally will rename(2) the temp file in the actual file name.
8991 * The the new file is reopened as the new append only file. Profit!
8992 */
8993static int rewriteAppendOnlyFileBackground(void) {
8994 pid_t childpid;
8995
8996 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 8997 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 8998 if ((childpid = fork()) == 0) {
8999 /* Child */
9000 char tmpfile[256];
9d65a1bb 9001
054e426d 9002 if (server.vm_enabled) vmReopenSwapFile();
9003 close(server.fd);
9d65a1bb 9004 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
9005 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 9006 _exit(0);
9d65a1bb 9007 } else {
478c2c6f 9008 _exit(1);
9d65a1bb 9009 }
9010 } else {
9011 /* Parent */
9012 if (childpid == -1) {
9013 redisLog(REDIS_WARNING,
9014 "Can't rewrite append only file in background: fork: %s",
9015 strerror(errno));
9016 return REDIS_ERR;
9017 }
9018 redisLog(REDIS_NOTICE,
9019 "Background append only file rewriting started by pid %d",childpid);
9020 server.bgrewritechildpid = childpid;
884d4b39 9021 updateDictResizePolicy();
85a83172 9022 /* We set appendseldb to -1 in order to force the next call to the
9023 * feedAppendOnlyFile() to issue a SELECT command, so the differences
9024 * accumulated by the parent into server.bgrewritebuf will start
9025 * with a SELECT statement and it will be safe to merge. */
9026 server.appendseldb = -1;
9d65a1bb 9027 return REDIS_OK;
9028 }
9029 return REDIS_OK; /* unreached */
9030}
9031
9032static void bgrewriteaofCommand(redisClient *c) {
9033 if (server.bgrewritechildpid != -1) {
9034 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
9035 return;
9036 }
9037 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 9038 char *status = "+Background append only file rewriting started\r\n";
9039 addReplySds(c,sdsnew(status));
9d65a1bb 9040 } else {
9041 addReply(c,shared.err);
9042 }
9043}
9044
/* Remove the temp file used by the background AOF rewrite performed by the
 * child with pid 'childpid'. The result of unlink() is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
9051
996cb5f7 9052/* Virtual Memory is composed mainly of two subsystems:
9053 * - Blocking Virtual Memory
9054 * - Threaded Virtual Memory I/O
9055 * The two parts are not fully decoupled, but functions are split among two
9056 * different sections of the source code (delimited by comments) in order to
9057 * make more clear what functionality is about the blocking VM and what about
9058 * the threaded (not blocking) VM.
9059 *
9060 * Redis VM design:
9061 *
9062 * Redis VM is a blocking VM (one that blocks reading swapped values from
9063 * disk into memory when a value swapped out is needed in memory) that is made
9064 * unblocking by trying to examine the command argument vector in order to
9065 * load in background values that will likely be needed in order to exec
9066 * the command. The command is executed only once all the relevant keys
9067 * are loaded into memory.
9068 *
9069 * This is basically almost as simple as a blocking VM, but almost as
9070 * parallel as a fully non-blocking VM.
9071 */
9072
2e5eb04e 9073/* Called when the user switches from "appendonly yes" to "appendonly no"
9074 * at runtime using the CONFIG command. */
9075static void stopAppendOnly(void) {
9076 flushAppendOnlyFile();
9077 fsync(server.appendfd);
9078 close(server.appendfd);
9079
9080 server.appendfd = -1;
9081 server.appendseldb = -1;
9082 server.appendonly = 0;
9083 /* rewrite operation in progress? kill it, wait child exit */
9084 if (server.bgsavechildpid != -1) {
9085 int statloc;
9086
30dd89b6 9087 if (kill(server.bgsavechildpid,SIGKILL) != -1)
9088 wait3(&statloc,0,NULL);
2e5eb04e 9089 /* reset the buffer accumulating changes while the child saves */
9090 sdsfree(server.bgrewritebuf);
9091 server.bgrewritebuf = sdsempty();
30dd89b6 9092 server.bgsavechildpid = -1;
2e5eb04e 9093 }
9094}
9095
9096/* Called when the user switches from "appendonly no" to "appendonly yes"
9097 * at runtime using the CONFIG command. */
9098static int startAppendOnly(void) {
9099 server.appendonly = 1;
9100 server.lastfsync = time(NULL);
9101 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
9102 if (server.appendfd == -1) {
9103 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
9104 return REDIS_ERR;
9105 }
9106 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
9107 server.appendonly = 0;
9108 close(server.appendfd);
9109 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
9110 return REDIS_ERR;
9111 }
9112 return REDIS_OK;
9113}
9114
996cb5f7 9115/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 9116
75680a3c 9117static void vmInit(void) {
9118 off_t totsize;
996cb5f7 9119 int pipefds[2];
bcaa7a4f 9120 size_t stacksize;
8b5bb414 9121 struct flock fl;
75680a3c 9122
4ad37480 9123 if (server.vm_max_threads != 0)
9124 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
9125
054e426d 9126 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8b5bb414 9127 /* Try to open the old swap file, otherwise create it */
6fa987e3 9128 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
9129 server.vm_fp = fopen(server.vm_swap_file,"w+b");
9130 }
75680a3c 9131 if (server.vm_fp == NULL) {
6fa987e3 9132 redisLog(REDIS_WARNING,
8b5bb414 9133 "Can't open the swap file: %s. Exiting.",
6fa987e3 9134 strerror(errno));
75680a3c 9135 exit(1);
9136 }
9137 server.vm_fd = fileno(server.vm_fp);
8b5bb414 9138 /* Lock the swap file for writing, this is useful in order to avoid
9139 * another instance to use the same swap file for a config error. */
9140 fl.l_type = F_WRLCK;
9141 fl.l_whence = SEEK_SET;
9142 fl.l_start = fl.l_len = 0;
9143 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
9144 redisLog(REDIS_WARNING,
9145 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
9146 exit(1);
9147 }
9148 /* Initialize */
75680a3c 9149 server.vm_next_page = 0;
9150 server.vm_near_pages = 0;
7d98e08c 9151 server.vm_stats_used_pages = 0;
9152 server.vm_stats_swapped_objects = 0;
9153 server.vm_stats_swapouts = 0;
9154 server.vm_stats_swapins = 0;
75680a3c 9155 totsize = server.vm_pages*server.vm_page_size;
9156 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
9157 if (ftruncate(server.vm_fd,totsize) == -1) {
9158 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
9159 strerror(errno));
9160 exit(1);
9161 } else {
9162 redisLog(REDIS_NOTICE,"Swap file allocated with success");
9163 }
7d30035d 9164 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 9165 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 9166 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 9167 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 9168
996cb5f7 9169 /* Initialize threaded I/O (used by Virtual Memory) */
9170 server.io_newjobs = listCreate();
9171 server.io_processing = listCreate();
9172 server.io_processed = listCreate();
d5d55fc3 9173 server.io_ready_clients = listCreate();
92f8e882 9174 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 9175 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
9176 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 9177 server.io_active_threads = 0;
996cb5f7 9178 if (pipe(pipefds) == -1) {
9179 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
9180 ,strerror(errno));
9181 exit(1);
9182 }
9183 server.io_ready_pipe_read = pipefds[0];
9184 server.io_ready_pipe_write = pipefds[1];
9185 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 9186 /* LZF requires a lot of stack */
9187 pthread_attr_init(&server.io_threads_attr);
9188 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
9189 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
9190 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 9191 /* Listen for events in the threaded I/O pipe */
9192 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
9193 vmThreadedIOCompletedJob, NULL) == AE_ERR)
9194 oom("creating file event");
75680a3c 9195}
9196
06224fec 9197/* Mark the page as used */
9198static void vmMarkPageUsed(off_t page) {
9199 off_t byte = page/8;
9200 int bit = page&7;
970e10bb 9201 redisAssert(vmFreePage(page) == 1);
06224fec 9202 server.vm_bitmap[byte] |= 1<<bit;
9203}
9204
9205/* Mark N contiguous pages as used, with 'page' being the first. */
9206static void vmMarkPagesUsed(off_t page, off_t count) {
9207 off_t j;
9208
9209 for (j = 0; j < count; j++)
7d30035d 9210 vmMarkPageUsed(page+j);
7d98e08c 9211 server.vm_stats_used_pages += count;
7c775e09 9212 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
9213 (long long)count, (long long)page);
06224fec 9214}
9215
9216/* Mark the page as free */
9217static void vmMarkPageFree(off_t page) {
9218 off_t byte = page/8;
9219 int bit = page&7;
970e10bb 9220 redisAssert(vmFreePage(page) == 0);
06224fec 9221 server.vm_bitmap[byte] &= ~(1<<bit);
9222}
9223
9224/* Mark N contiguous pages as free, with 'page' being the first. */
9225static void vmMarkPagesFree(off_t page, off_t count) {
9226 off_t j;
9227
9228 for (j = 0; j < count; j++)
7d30035d 9229 vmMarkPageFree(page+j);
7d98e08c 9230 server.vm_stats_used_pages -= count;
7c775e09 9231 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
9232 (long long)count, (long long)page);
06224fec 9233}
9234
9235/* Test if the page is free */
9236static int vmFreePage(off_t page) {
9237 off_t byte = page/8;
9238 int bit = page&7;
7d30035d 9239 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 9240}
9241
9242/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 9243 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 9244 * REDIS_ERR is returned.
06224fec 9245 *
9246 * This function uses a simple algorithm: we try to allocate
9247 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9248 * again from the start of the swap file searching for free spaces.
9249 *
9250 * If it looks pretty clear that there are no free pages near our offset
9251 * we try to find less populated places doing a forward jump of
9252 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9253 * without hurry, and then we jump again and so forth...
e0a62c7f 9254 *
06224fec 9255 * This function can be improved using a free list to avoid to guess
9256 * too much, since we could collect data about freed pages.
9257 *
9258 * note: I implemented this function just after watching an episode of
9259 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9260 */
c7df85a4 9261static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 9262 off_t base, offset = 0, since_jump = 0, numfree = 0;
9263
9264 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9265 server.vm_near_pages = 0;
9266 server.vm_next_page = 0;
9267 }
9268 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9269 base = server.vm_next_page;
9270
9271 while(offset < server.vm_pages) {
9272 off_t this = base+offset;
9273
9274 /* If we overflow, restart from page zero */
9275 if (this >= server.vm_pages) {
9276 this -= server.vm_pages;
9277 if (this == 0) {
9278 /* Just overflowed, what we found on tail is no longer
9279 * interesting, as it's no longer contiguous. */
9280 numfree = 0;
9281 }
9282 }
9283 if (vmFreePage(this)) {
9284 /* This is a free page */
9285 numfree++;
9286 /* Already got N free pages? Return to the caller, with success */
9287 if (numfree == n) {
7d30035d 9288 *first = this-(n-1);
9289 server.vm_next_page = this+1;
7c775e09 9290 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 9291 return REDIS_OK;
06224fec 9292 }
9293 } else {
9294 /* The current one is not a free page */
9295 numfree = 0;
9296 }
9297
9298 /* Fast-forward if the current page is not free and we already
9299 * searched enough near this place. */
9300 since_jump++;
9301 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9302 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9303 since_jump = 0;
9304 /* Note that even if we rewind after the jump, we are don't need
9305 * to make sure numfree is set to zero as we only jump *if* it
9306 * is set to zero. */
9307 } else {
9308 /* Otherwise just check the next page */
9309 offset++;
9310 }
9311 }
3a66edc7 9312 return REDIS_ERR;
9313}
9314
a5819310 9315/* Write the specified object at the specified page of the swap file */
9316static int vmWriteObjectOnSwap(robj *o, off_t page) {
9317 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9318 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9319 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9320 redisLog(REDIS_WARNING,
9ebed7cf 9321 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 9322 strerror(errno));
9323 return REDIS_ERR;
9324 }
9325 rdbSaveObject(server.vm_fp,o);
ba76a8f9 9326 fflush(server.vm_fp);
a5819310 9327 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9328 return REDIS_OK;
9329}
9330
3a66edc7 9331/* Swap the 'val' object relative to 'key' into disk. Store all the information
9332 * needed to later retrieve the object into the key object.
9333 * If we can't find enough contiguous empty pages to swap the object on disk
9334 * REDIS_ERR is returned. */
a69a0c9c 9335static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 9336 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 9337 off_t page;
9338
9339 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 9340 assert(key->refcount == 1);
3a66edc7 9341 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 9342 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 9343 key->vm.page = page;
9344 key->vm.usedpages = pages;
9345 key->storage = REDIS_VM_SWAPPED;
d894161b 9346 key->vtype = val->type;
3a66edc7 9347 decrRefCount(val); /* Deallocate the object from memory. */
9348 vmMarkPagesUsed(page,pages);
7d30035d 9349 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
9350 (unsigned char*) key->ptr,
9351 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 9352 server.vm_stats_swapped_objects++;
9353 server.vm_stats_swapouts++;
3a66edc7 9354 return REDIS_OK;
9355}
9356
a5819310 9357static robj *vmReadObjectFromSwap(off_t page, int type) {
9358 robj *o;
3a66edc7 9359
a5819310 9360 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9361 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 9362 redisLog(REDIS_WARNING,
d5d55fc3 9363 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 9364 strerror(errno));
478c2c6f 9365 _exit(1);
3a66edc7 9366 }
a5819310 9367 o = rdbLoadObject(type,server.vm_fp);
9368 if (o == NULL) {
d5d55fc3 9369 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 9370 _exit(1);
3a66edc7 9371 }
a5819310 9372 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9373 return o;
9374}
9375
9376/* Load the value object relative to the 'key' object from swap to memory.
9377 * The newly allocated object is returned.
9378 *
9379 * If preview is true the unserialized object is returned to the caller but
9380 * no changes are made to the key object, nor the pages are marked as freed */
9381static robj *vmGenericLoadObject(robj *key, int preview) {
9382 robj *val;
9383
d5d55fc3 9384 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 9385 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 9386 if (!preview) {
9387 key->storage = REDIS_VM_MEMORY;
9388 key->vm.atime = server.unixtime;
9389 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9390 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
9391 (unsigned char*) key->ptr);
7d98e08c 9392 server.vm_stats_swapped_objects--;
38aba9a1 9393 } else {
9394 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
9395 (unsigned char*) key->ptr);
7e69548d 9396 }
7d98e08c 9397 server.vm_stats_swapins++;
3a66edc7 9398 return val;
06224fec 9399}
9400
7e69548d 9401/* Plain object loading, from swap to memory */
9402static robj *vmLoadObject(robj *key) {
996cb5f7 9403 /* If we are loading the object in background, stop it, we
9404 * need to load this object synchronously ASAP. */
9405 if (key->storage == REDIS_VM_LOADING)
9406 vmCancelThreadedIOJob(key);
7e69548d 9407 return vmGenericLoadObject(key,0);
9408}
9409
9410/* Just load the value on disk, without to modify the key.
9411 * This is useful when we want to perform some operation on the value
9412 * without to really bring it from swap to memory, like while saving the
9413 * dataset or rewriting the append only log. */
9414static robj *vmPreviewObject(robj *key) {
9415 return vmGenericLoadObject(key,1);
9416}
9417
4ef8de8a 9418/* How a good candidate is this object for swapping?
9419 * The better candidate it is, the greater the returned value.
9420 *
9421 * Currently we try to perform a fast estimation of the object size in
9422 * memory, and combine it with aging informations.
9423 *
9424 * Basically swappability = idle-time * log(estimated size)
9425 *
9426 * Bigger objects are preferred over smaller objects, but not
9427 * proportionally, this is why we use the logarithm. This algorithm is
9428 * just a first try and will probably be tuned later. */
9429static double computeObjectSwappability(robj *o) {
9430 time_t age = server.unixtime - o->vm.atime;
9431 long asize = 0;
9432 list *l;
9433 dict *d;
9434 struct dictEntry *de;
9435 int z;
9436
9437 if (age <= 0) return 0;
9438 switch(o->type) {
9439 case REDIS_STRING:
9440 if (o->encoding != REDIS_ENCODING_RAW) {
9441 asize = sizeof(*o);
9442 } else {
9443 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9444 }
9445 break;
9446 case REDIS_LIST:
9447 l = o->ptr;
9448 listNode *ln = listFirst(l);
9449
9450 asize = sizeof(list);
9451 if (ln) {
9452 robj *ele = ln->value;
9453 long elesize;
9454
9455 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9456 (sizeof(*o)+sdslen(ele->ptr)) :
9457 sizeof(*o);
9458 asize += (sizeof(listNode)+elesize)*listLength(l);
9459 }
9460 break;
9461 case REDIS_SET:
9462 case REDIS_ZSET:
9463 z = (o->type == REDIS_ZSET);
9464 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9465
9466 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9467 if (z) asize += sizeof(zset)-sizeof(dict);
9468 if (dictSize(d)) {
9469 long elesize;
9470 robj *ele;
9471
9472 de = dictGetRandomKey(d);
9473 ele = dictGetEntryKey(de);
9474 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9475 (sizeof(*o)+sdslen(ele->ptr)) :
9476 sizeof(*o);
9477 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9478 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9479 }
9480 break;
a97b9060 9481 case REDIS_HASH:
9482 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9483 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9484 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9485 unsigned int klen, vlen;
9486 unsigned char *key, *val;
9487
9488 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9489 klen = 0;
9490 vlen = 0;
9491 }
9492 asize = len*(klen+vlen+3);
9493 } else if (o->encoding == REDIS_ENCODING_HT) {
9494 d = o->ptr;
9495 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9496 if (dictSize(d)) {
9497 long elesize;
9498 robj *ele;
9499
9500 de = dictGetRandomKey(d);
9501 ele = dictGetEntryKey(de);
9502 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9503 (sizeof(*o)+sdslen(ele->ptr)) :
9504 sizeof(*o);
9505 ele = dictGetEntryVal(de);
9506 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9507 (sizeof(*o)+sdslen(ele->ptr)) :
9508 sizeof(*o);
9509 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9510 }
9511 }
9512 break;
4ef8de8a 9513 }
c8c72447 9514 return (double)age*log(1+asize);
4ef8de8a 9515}
9516
9517/* Try to swap an object that's a good candidate for swapping.
9518 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 9519 * to swap any object at all.
9520 *
9521 * If 'usethreaded' is true, Redis will try to swap the object in background
9522 * using I/O threads. */
9523static int vmSwapOneObject(int usethreads) {
4ef8de8a 9524 int j, i;
9525 struct dictEntry *best = NULL;
9526 double best_swappability = 0;
b9bc0eef 9527 redisDb *best_db = NULL;
4ef8de8a 9528 robj *key, *val;
9529
9530 for (j = 0; j < server.dbnum; j++) {
9531 redisDb *db = server.db+j;
b72f6a4b 9532 /* Why maxtries is set to 100?
9533 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9534 * are swappable objects */
b0d8747d 9535 int maxtries = 100;
4ef8de8a 9536
9537 if (dictSize(db->dict) == 0) continue;
9538 for (i = 0; i < 5; i++) {
9539 dictEntry *de;
9540 double swappability;
9541
e3cadb8a 9542 if (maxtries) maxtries--;
4ef8de8a 9543 de = dictGetRandomKey(db->dict);
9544 key = dictGetEntryKey(de);
9545 val = dictGetEntryVal(de);
1064ef87 9546 /* Only swap objects that are currently in memory.
9547 *
9548 * Also don't swap shared objects if threaded VM is on, as we
9549 * try to ensure that the main thread does not touch the
9550 * object while the I/O thread is using it, but we can't
9551 * control other keys without adding additional mutex. */
9552 if (key->storage != REDIS_VM_MEMORY ||
9553 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 9554 if (maxtries) i--; /* don't count this try */
9555 continue;
9556 }
4ef8de8a 9557 swappability = computeObjectSwappability(val);
9558 if (!best || swappability > best_swappability) {
9559 best = de;
9560 best_swappability = swappability;
b9bc0eef 9561 best_db = db;
4ef8de8a 9562 }
9563 }
9564 }
7c775e09 9565 if (best == NULL) return REDIS_ERR;
4ef8de8a 9566 key = dictGetEntryKey(best);
9567 val = dictGetEntryVal(best);
9568
e3cadb8a 9569 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 9570 key->ptr, best_swappability);
9571
9572 /* Unshare the key if needed */
9573 if (key->refcount > 1) {
9574 robj *newkey = dupStringObject(key);
9575 decrRefCount(key);
9576 key = dictGetEntryKey(best) = newkey;
9577 }
9578 /* Swap it */
a69a0c9c 9579 if (usethreads) {
b9bc0eef 9580 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 9581 return REDIS_OK;
9582 } else {
a69a0c9c 9583 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9584 dictGetEntryVal(best) = NULL;
9585 return REDIS_OK;
9586 } else {
9587 return REDIS_ERR;
9588 }
4ef8de8a 9589 }
9590}
9591
/* Swap one object out synchronously, in the calling thread.
 * Returns whatever vmSwapOneObject(0) returns (REDIS_OK or REDIS_ERR). */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9595
/* Queue the swap of one object for the I/O threads.
 * Returns whatever vmSwapOneObject(1) returns (REDIS_OK or REDIS_ERR). */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9599
7e69548d 9600/* Return true if it's safe to swap out objects in a given moment.
9601 * Basically we don't want to swap objects out while there is a BGSAVE
9602 * or a BGAEOREWRITE running in backgroud. */
9603static int vmCanSwapOut(void) {
9604 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9605}
9606
1b03836c 9607/* Delete a key if swapped. Returns 1 if the key was found, was swapped
9608 * and was deleted. Otherwise 0 is returned. */
9609static int deleteIfSwapped(redisDb *db, robj *key) {
9610 dictEntry *de;
9611 robj *foundkey;
9612
9613 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9614 foundkey = dictGetEntryKey(de);
9615 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9616 deleteKey(db,key);
9617 return 1;
9618}
9619
996cb5f7 9620/* =================== Virtual Memory - Threaded I/O ======================= */
9621
b9bc0eef 9622static void freeIOJob(iojob *j) {
d5d55fc3 9623 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9624 j->type == REDIS_IOJOB_DO_SWAP ||
9625 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 9626 decrRefCount(j->val);
78ebe4c8 9627 /* We don't decrRefCount the j->key field as we did't incremented
9628 * the count creating IO Jobs. This is because the key field here is
9629 * just used as an indentifier and if a key is removed the Job should
9630 * never be touched again. */
b9bc0eef 9631 zfree(j);
9632}
9633
996cb5f7 9634/* Every time a thread finished a Job, it writes a byte into the write side
9635 * of an unix pipe in order to "awake" the main thread, and this function
9636 * is called. */
9637static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9638 int mask)
9639{
9640 char buf[1];
b0d8747d 9641 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 9642 REDIS_NOTUSED(el);
9643 REDIS_NOTUSED(mask);
9644 REDIS_NOTUSED(privdata);
9645
9646 /* For every byte we read in the read side of the pipe, there is one
9647 * I/O job completed to process. */
9648 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 9649 iojob *j;
9650 listNode *ln;
9651 robj *key;
9652 struct dictEntry *de;
9653
996cb5f7 9654 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 9655
9656 /* Get the processed element (the oldest one) */
9657 lockThreadedIO();
1064ef87 9658 assert(listLength(server.io_processed) != 0);
f6c0bba8 9659 if (toprocess == -1) {
9660 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9661 if (toprocess <= 0) toprocess = 1;
9662 }
b9bc0eef 9663 ln = listFirst(server.io_processed);
9664 j = ln->value;
9665 listDelNode(server.io_processed,ln);
9666 unlockThreadedIO();
9667 /* If this job is marked as canceled, just ignore it */
9668 if (j->canceled) {
9669 freeIOJob(j);
9670 continue;
9671 }
9672 /* Post process it in the main thread, as there are things we
9673 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 9674 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 9675 de = dictFind(j->db->dict,j->key);
9676 assert(de != NULL);
9677 key = dictGetEntryKey(de);
9678 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9679 redisDb *db;
9680
b9bc0eef 9681 /* Key loaded, bring it at home */
9682 key->storage = REDIS_VM_MEMORY;
9683 key->vm.atime = server.unixtime;
9684 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9685 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9686 (unsigned char*) key->ptr);
9687 server.vm_stats_swapped_objects--;
9688 server.vm_stats_swapins++;
d5d55fc3 9689 dictGetEntryVal(de) = j->val;
9690 incrRefCount(j->val);
9691 db = j->db;
b9bc0eef 9692 freeIOJob(j);
d5d55fc3 9693 /* Handle clients waiting for this key to be loaded. */
9694 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 9695 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9696 /* Now we know the amount of pages required to swap this object.
9697 * Let's find some space for it, and queue this task again
9698 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 9699 if (!vmCanSwapOut() ||
9700 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9701 {
9702 /* Ooops... no space or we can't swap as there is
9703 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 9704 freeIOJob(j);
054e426d 9705 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 9706 } else {
c7df85a4 9707 /* Note that we need to mark this pages as used now,
9708 * if the job will be canceled, we'll mark them as freed
9709 * again. */
9710 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 9711 j->type = REDIS_IOJOB_DO_SWAP;
9712 lockThreadedIO();
9713 queueIOJob(j);
9714 unlockThreadedIO();
9715 }
9716 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9717 robj *val;
9718
9719 /* Key swapped. We can finally free some memory. */
6c96ba7d 9720 if (key->storage != REDIS_VM_SWAPPING) {
9721 printf("key->storage: %d\n",key->storage);
9722 printf("key->name: %s\n",(char*)key->ptr);
9723 printf("key->refcount: %d\n",key->refcount);
9724 printf("val: %p\n",(void*)j->val);
9725 printf("val->type: %d\n",j->val->type);
9726 printf("val->ptr: %s\n",(char*)j->val->ptr);
9727 }
9728 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 9729 val = dictGetEntryVal(de);
9730 key->vm.page = j->page;
9731 key->vm.usedpages = j->pages;
9732 key->storage = REDIS_VM_SWAPPED;
9733 key->vtype = j->val->type;
9734 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 9735 dictGetEntryVal(de) = NULL;
b9bc0eef 9736 redisLog(REDIS_DEBUG,
9737 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9738 (unsigned char*) key->ptr,
9739 (unsigned long long) j->page, (unsigned long long) j->pages);
9740 server.vm_stats_swapped_objects++;
9741 server.vm_stats_swapouts++;
9742 freeIOJob(j);
f11b8647 9743 /* Put a few more swap requests in queue if we are still
9744 * out of memory */
b0d8747d 9745 if (trytoswap && vmCanSwapOut() &&
9746 zmalloc_used_memory() > server.vm_max_memory)
9747 {
f11b8647 9748 int more = 1;
9749 while(more) {
9750 lockThreadedIO();
9751 more = listLength(server.io_newjobs) <
9752 (unsigned) server.vm_max_threads;
9753 unlockThreadedIO();
9754 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 9755 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9756 trytoswap = 0;
9757 break;
9758 }
f11b8647 9759 }
9760 }
b9bc0eef 9761 }
c953f24b 9762 processed++;
f6c0bba8 9763 if (processed == toprocess) return;
996cb5f7 9764 }
9765 if (retval < 0 && errno != EAGAIN) {
9766 redisLog(REDIS_WARNING,
9767 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9768 strerror(errno));
9769 }
9770}
9771
9772static void lockThreadedIO(void) {
9773 pthread_mutex_lock(&server.io_mutex);
9774}
9775
9776static void unlockThreadedIO(void) {
9777 pthread_mutex_unlock(&server.io_mutex);
9778}
9779
9780/* Remove the specified object from the threaded I/O queue if still not
9781 * processed, otherwise make sure to flag it as canceled. */
9782static void vmCancelThreadedIOJob(robj *o) {
9783 list *lists[3] = {
6c96ba7d 9784 server.io_newjobs, /* 0 */
9785 server.io_processing, /* 1 */
9786 server.io_processed /* 2 */
996cb5f7 9787 };
9788 int i;
9789
9790 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 9791again:
996cb5f7 9792 lockThreadedIO();
9793 /* Search for a matching key in one of the queues */
9794 for (i = 0; i < 3; i++) {
9795 listNode *ln;
c7df85a4 9796 listIter li;
996cb5f7 9797
c7df85a4 9798 listRewind(lists[i],&li);
9799 while ((ln = listNext(&li)) != NULL) {
996cb5f7 9800 iojob *job = ln->value;
9801
6c96ba7d 9802 if (job->canceled) continue; /* Skip this, already canceled. */
78ebe4c8 9803 if (job->key == o) {
970e10bb 9804 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9805 (void*)job, (char*)o->ptr, job->type, i);
427a2153 9806 /* Mark the pages as free since the swap didn't happened
9807 * or happened but is now discarded. */
970e10bb 9808 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 9809 vmMarkPagesFree(job->page,job->pages);
9810 /* Cancel the job. It depends on the list the job is
9811 * living in. */
996cb5f7 9812 switch(i) {
9813 case 0: /* io_newjobs */
6c96ba7d 9814 /* If the job was yet not processed the best thing to do
996cb5f7 9815 * is to remove it from the queue at all */
6c96ba7d 9816 freeIOJob(job);
996cb5f7 9817 listDelNode(lists[i],ln);
9818 break;
9819 case 1: /* io_processing */
d5d55fc3 9820 /* Oh Shi- the thread is messing with the Job:
9821 *
9822 * Probably it's accessing the object if this is a
9823 * PREPARE_SWAP or DO_SWAP job.
9824 * If it's a LOAD job it may be reading from disk and
9825 * if we don't wait for the job to terminate before to
9826 * cancel it, maybe in a few microseconds data can be
9827 * corrupted in this pages. So the short story is:
9828 *
9829 * Better to wait for the job to move into the
9830 * next queue (processed)... */
9831
9832 /* We try again and again until the job is completed. */
9833 unlockThreadedIO();
9834 /* But let's wait some time for the I/O thread
9835 * to finish with this job. After all this condition
9836 * should be very rare. */
9837 usleep(1);
9838 goto again;
996cb5f7 9839 case 2: /* io_processed */
2e111efe 9840 /* The job was already processed, that's easy...
9841 * just mark it as canceled so that we'll ignore it
9842 * when processing completed jobs. */
996cb5f7 9843 job->canceled = 1;
9844 break;
9845 }
c7df85a4 9846 /* Finally we have to adjust the storage type of the object
9847 * in order to "UNDO" the operaiton. */
996cb5f7 9848 if (o->storage == REDIS_VM_LOADING)
9849 o->storage = REDIS_VM_SWAPPED;
9850 else if (o->storage == REDIS_VM_SWAPPING)
9851 o->storage = REDIS_VM_MEMORY;
9852 unlockThreadedIO();
9853 return;
9854 }
9855 }
9856 }
9857 unlockThreadedIO();
9858 assert(1 != 1); /* We should never reach this */
9859}
9860
b9bc0eef 9861static void *IOThreadEntryPoint(void *arg) {
9862 iojob *j;
9863 listNode *ln;
9864 REDIS_NOTUSED(arg);
9865
9866 pthread_detach(pthread_self());
9867 while(1) {
9868 /* Get a new job to process */
9869 lockThreadedIO();
9870 if (listLength(server.io_newjobs) == 0) {
9871 /* No new jobs in queue, exit. */
9ebed7cf 9872 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9873 (long) pthread_self());
b9bc0eef 9874 server.io_active_threads--;
9875 unlockThreadedIO();
9876 return NULL;
9877 }
9878 ln = listFirst(server.io_newjobs);
9879 j = ln->value;
9880 listDelNode(server.io_newjobs,ln);
9881 /* Add the job in the processing queue */
9882 j->thread = pthread_self();
9883 listAddNodeTail(server.io_processing,j);
9884 ln = listLast(server.io_processing); /* We use ln later to remove it */
9885 unlockThreadedIO();
9ebed7cf 9886 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9887 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 9888
9889 /* Process the Job */
9890 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9891 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 9892 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9893 FILE *fp = fopen("/dev/null","w+");
9894 j->pages = rdbSavedObjectPages(j->val,fp);
9895 fclose(fp);
9896 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 9897 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9898 j->canceled = 1;
b9bc0eef 9899 }
9900
9901 /* Done: insert the job into the processed queue */
9ebed7cf 9902 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9903 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 9904 lockThreadedIO();
9905 listDelNode(server.io_processing,ln);
9906 listAddNodeTail(server.io_processed,j);
9907 unlockThreadedIO();
e0a62c7f 9908
b9bc0eef 9909 /* Signal the main thread there is new stuff to process */
9910 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9911 }
9912 return NULL; /* never reached */
9913}
9914
9915static void spawnIOThread(void) {
9916 pthread_t thread;
478c2c6f 9917 sigset_t mask, omask;
a97b9060 9918 int err;
b9bc0eef 9919
478c2c6f 9920 sigemptyset(&mask);
9921 sigaddset(&mask,SIGCHLD);
9922 sigaddset(&mask,SIGHUP);
9923 sigaddset(&mask,SIGPIPE);
9924 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 9925 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9926 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9927 strerror(err));
9928 usleep(1000000);
9929 }
478c2c6f 9930 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 9931 server.io_active_threads++;
9932}
9933
4ee9488d 9934/* We need to wait for the last thread to exit before we are able to
9935 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 9936static void waitEmptyIOJobsQueue(void) {
4ee9488d 9937 while(1) {
76b7233a 9938 int io_processed_len;
9939
4ee9488d 9940 lockThreadedIO();
054e426d 9941 if (listLength(server.io_newjobs) == 0 &&
9942 listLength(server.io_processing) == 0 &&
9943 server.io_active_threads == 0)
9944 {
4ee9488d 9945 unlockThreadedIO();
9946 return;
9947 }
76b7233a 9948 /* While waiting for empty jobs queue condition we post-process some
9949 * finshed job, as I/O threads may be hanging trying to write against
9950 * the io_ready_pipe_write FD but there are so much pending jobs that
9951 * it's blocking. */
9952 io_processed_len = listLength(server.io_processed);
4ee9488d 9953 unlockThreadedIO();
76b7233a 9954 if (io_processed_len) {
9955 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9956 usleep(1000); /* 1 millisecond */
9957 } else {
9958 usleep(10000); /* 10 milliseconds */
9959 }
4ee9488d 9960 }
9961}
9962
054e426d 9963static void vmReopenSwapFile(void) {
478c2c6f 9964 /* Note: we don't close the old one as we are in the child process
9965 * and don't want to mess at all with the original file object. */
054e426d 9966 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9967 if (server.vm_fp == NULL) {
9968 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9969 server.vm_swap_file);
478c2c6f 9970 _exit(1);
054e426d 9971 }
9972 server.vm_fd = fileno(server.vm_fp);
9973}
9974
b9bc0eef 9975/* This function must be called while with threaded IO locked */
9976static void queueIOJob(iojob *j) {
6c96ba7d 9977 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9978 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 9979 listAddNodeTail(server.io_newjobs,j);
9980 if (server.io_active_threads < server.vm_max_threads)
9981 spawnIOThread();
9982}
9983
9984static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9985 iojob *j;
e0a62c7f 9986
b9bc0eef 9987 assert(key->storage == REDIS_VM_MEMORY);
9988 assert(key->refcount == 1);
9989
9990 j = zmalloc(sizeof(*j));
9991 j->type = REDIS_IOJOB_PREPARE_SWAP;
9992 j->db = db;
78ebe4c8 9993 j->key = key;
b9bc0eef 9994 j->val = val;
9995 incrRefCount(val);
9996 j->canceled = 0;
9997 j->thread = (pthread_t) -1;
f11b8647 9998 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 9999
10000 lockThreadedIO();
10001 queueIOJob(j);
10002 unlockThreadedIO();
10003 return REDIS_OK;
10004}
10005
b0d8747d 10006/* ============ Virtual Memory - Blocking clients on missing keys =========== */
10007
d5d55fc3 10008/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
10009 * If there is not already a job loading the key, it is craeted.
10010 * The key is added to the io_keys list in the client structure, and also
10011 * in the hash table mapping swapped keys to waiting clients, that is,
10012 * server.io_waited_keys. */
10013static int waitForSwappedKey(redisClient *c, robj *key) {
10014 struct dictEntry *de;
10015 robj *o;
10016 list *l;
10017
10018 /* If the key does not exist or is already in RAM we don't need to
10019 * block the client at all. */
10020 de = dictFind(c->db->dict,key);
10021 if (de == NULL) return 0;
10022 o = dictGetEntryKey(de);
10023 if (o->storage == REDIS_VM_MEMORY) {
10024 return 0;
10025 } else if (o->storage == REDIS_VM_SWAPPING) {
10026 /* We were swapping the key, undo it! */
10027 vmCancelThreadedIOJob(o);
10028 return 0;
10029 }
e0a62c7f 10030
d5d55fc3 10031 /* OK: the key is either swapped, or being loaded just now. */
10032
10033 /* Add the key to the list of keys this client is waiting for.
10034 * This maps clients to keys they are waiting for. */
10035 listAddNodeTail(c->io_keys,key);
10036 incrRefCount(key);
10037
10038 /* Add the client to the swapped keys => clients waiting map. */
10039 de = dictFind(c->db->io_keys,key);
10040 if (de == NULL) {
10041 int retval;
10042
10043 /* For every key we take a list of clients blocked for it */
10044 l = listCreate();
10045 retval = dictAdd(c->db->io_keys,key,l);
10046 incrRefCount(key);
10047 assert(retval == DICT_OK);
10048 } else {
10049 l = dictGetEntryVal(de);
10050 }
10051 listAddNodeTail(l,c);
10052
10053 /* Are we already loading the key from disk? If not create a job */
10054 if (o->storage == REDIS_VM_SWAPPED) {
10055 iojob *j;
10056
10057 o->storage = REDIS_VM_LOADING;
10058 j = zmalloc(sizeof(*j));
10059 j->type = REDIS_IOJOB_LOAD;
10060 j->db = c->db;
78ebe4c8 10061 j->key = o;
d5d55fc3 10062 j->key->vtype = o->vtype;
10063 j->page = o->vm.page;
10064 j->val = NULL;
10065 j->canceled = 0;
10066 j->thread = (pthread_t) -1;
10067 lockThreadedIO();
10068 queueIOJob(j);
10069 unlockThreadedIO();
10070 }
10071 return 1;
10072}
10073
6f078746
PN
10074/* Preload keys for any command with first, last and step values for
10075 * the command keys prototype, as defined in the command table. */
10076static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10077 int j, last;
10078 if (cmd->vm_firstkey == 0) return;
10079 last = cmd->vm_lastkey;
10080 if (last < 0) last = argc+last;
10081 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
10082 redisAssert(j < argc);
10083 waitForSwappedKey(c,argv[j]);
10084 }
10085}
10086
5d373da9 10087/* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
739ba0d2
PN
10088 * Note that the number of keys to preload is user-defined, so we need to
10089 * apply a sanity check against argc. */
ca1788b5 10090static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
76583ea4 10091 int i, num;
ca1788b5 10092 REDIS_NOTUSED(cmd);
ca1788b5
PN
10093
10094 num = atoi(argv[2]->ptr);
739ba0d2 10095 if (num > (argc-3)) return;
76583ea4 10096 for (i = 0; i < num; i++) {
ca1788b5 10097 waitForSwappedKey(c,argv[3+i]);
76583ea4
PN
10098 }
10099}
10100
3805e04f
PN
10101/* Preload keys needed to execute the entire MULTI/EXEC block.
10102 *
10103 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
10104 * and will block the client when any command requires a swapped out value. */
10105static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10106 int i, margc;
10107 struct redisCommand *mcmd;
10108 robj **margv;
10109 REDIS_NOTUSED(cmd);
10110 REDIS_NOTUSED(argc);
10111 REDIS_NOTUSED(argv);
10112
10113 if (!(c->flags & REDIS_MULTI)) return;
10114 for (i = 0; i < c->mstate.count; i++) {
10115 mcmd = c->mstate.commands[i].cmd;
10116 margc = c->mstate.commands[i].argc;
10117 margv = c->mstate.commands[i].argv;
10118
10119 if (mcmd->vm_preload_proc != NULL) {
10120 mcmd->vm_preload_proc(c,mcmd,margc,margv);
10121 } else {
10122 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
10123 }
76583ea4
PN
10124 }
10125}
10126
b0d8747d 10127/* Is this client attempting to run a command against swapped keys?
d5d55fc3 10128 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 10129 *
d5d55fc3 10130 * The important idea about this function is that it can fail! If keys will
10131 * still be swapped when the client is resumed, this key lookups will
10132 * just block loading keys from disk. In practical terms this should only
10133 * happen with SORT BY command or if there is a bug in this function.
10134 *
10135 * Return 1 if the client is marked as blocked, 0 if the client can
10136 * continue as the keys it is going to access appear to be in memory. */
0a6f3f0f 10137static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
76583ea4 10138 if (cmd->vm_preload_proc != NULL) {
ca1788b5 10139 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
76583ea4 10140 } else {
6f078746 10141 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
76583ea4
PN
10142 }
10143
d5d55fc3 10144 /* If the client was blocked for at least one key, mark it as blocked. */
10145 if (listLength(c->io_keys)) {
10146 c->flags |= REDIS_IO_WAIT;
10147 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
10148 server.vm_blocked_clients++;
10149 return 1;
10150 } else {
10151 return 0;
10152 }
10153}
10154
10155/* Remove the 'key' from the list of blocked keys for a given client.
10156 *
10157 * The function returns 1 when there are no longer blocking keys after
10158 * the current one was removed (and the client can be unblocked). */
10159static int dontWaitForSwappedKey(redisClient *c, robj *key) {
10160 list *l;
10161 listNode *ln;
10162 listIter li;
10163 struct dictEntry *de;
10164
10165 /* Remove the key from the list of keys this client is waiting for. */
10166 listRewind(c->io_keys,&li);
10167 while ((ln = listNext(&li)) != NULL) {
bf028098 10168 if (equalStringObjects(ln->value,key)) {
d5d55fc3 10169 listDelNode(c->io_keys,ln);
10170 break;
10171 }
10172 }
10173 assert(ln != NULL);
10174
10175 /* Remove the client form the key => waiting clients map. */
10176 de = dictFind(c->db->io_keys,key);
10177 assert(de != NULL);
10178 l = dictGetEntryVal(de);
10179 ln = listSearchKey(l,c);
10180 assert(ln != NULL);
10181 listDelNode(l,ln);
10182 if (listLength(l) == 0)
10183 dictDelete(c->db->io_keys,key);
10184
10185 return listLength(c->io_keys) == 0;
10186}
10187
10188static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
10189 struct dictEntry *de;
10190 list *l;
10191 listNode *ln;
10192 int len;
10193
10194 de = dictFind(db->io_keys,key);
10195 if (!de) return;
10196
10197 l = dictGetEntryVal(de);
10198 len = listLength(l);
10199 /* Note: we can't use something like while(listLength(l)) as the list
10200 * can be freed by the calling function when we remove the last element. */
10201 while (len--) {
10202 ln = listFirst(l);
10203 redisClient *c = ln->value;
10204
10205 if (dontWaitForSwappedKey(c,key)) {
10206 /* Put the client in the list of clients ready to go as we
10207 * loaded all the keys about it. */
10208 listAddNodeTail(server.io_ready_clients,c);
10209 }
10210 }
b0d8747d 10211}
b0d8747d 10212
500ece7c 10213/* =========================== Remote Configuration ========================= */
10214
10215static void configSetCommand(redisClient *c) {
10216 robj *o = getDecodedObject(c->argv[3]);
2e5eb04e 10217 long long ll;
10218
500ece7c 10219 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
10220 zfree(server.dbfilename);
10221 server.dbfilename = zstrdup(o->ptr);
10222 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
10223 zfree(server.requirepass);
10224 server.requirepass = zstrdup(o->ptr);
10225 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
10226 zfree(server.masterauth);
10227 server.masterauth = zstrdup(o->ptr);
10228 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
2e5eb04e 10229 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10230 ll < 0) goto badfmt;
10231 server.maxmemory = ll;
10232 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
10233 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10234 ll < 0 || ll > LONG_MAX) goto badfmt;
10235 server.maxidletime = ll;
1b677732 10236 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
10237 if (!strcasecmp(o->ptr,"no")) {
10238 server.appendfsync = APPENDFSYNC_NO;
10239 } else if (!strcasecmp(o->ptr,"everysec")) {
10240 server.appendfsync = APPENDFSYNC_EVERYSEC;
10241 } else if (!strcasecmp(o->ptr,"always")) {
10242 server.appendfsync = APPENDFSYNC_ALWAYS;
10243 } else {
10244 goto badfmt;
10245 }
2e5eb04e 10246 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10247 int old = server.appendonly;
10248 int new = yesnotoi(o->ptr);
10249
10250 if (new == -1) goto badfmt;
10251 if (old != new) {
10252 if (new == 0) {
10253 stopAppendOnly();
10254 } else {
10255 if (startAppendOnly() == REDIS_ERR) {
10256 addReplySds(c,sdscatprintf(sdsempty(),
10257 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10258 decrRefCount(o);
10259 return;
10260 }
10261 }
10262 }
a34e0a25 10263 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10264 int vlen, j;
10265 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10266
10267 /* Perform sanity check before setting the new config:
10268 * - Even number of args
10269 * - Seconds >= 1, changes >= 0 */
10270 if (vlen & 1) {
10271 sdsfreesplitres(v,vlen);
10272 goto badfmt;
10273 }
10274 for (j = 0; j < vlen; j++) {
10275 char *eptr;
10276 long val;
10277
10278 val = strtoll(v[j], &eptr, 10);
10279 if (eptr[0] != '\0' ||
10280 ((j & 1) == 0 && val < 1) ||
10281 ((j & 1) == 1 && val < 0)) {
10282 sdsfreesplitres(v,vlen);
10283 goto badfmt;
10284 }
10285 }
10286 /* Finally set the new config */
10287 resetServerSaveParams();
10288 for (j = 0; j < vlen; j += 2) {
10289 time_t seconds;
10290 int changes;
10291
10292 seconds = strtoll(v[j],NULL,10);
10293 changes = strtoll(v[j+1],NULL,10);
10294 appendServerSaveParams(seconds, changes);
10295 }
10296 sdsfreesplitres(v,vlen);
500ece7c 10297 } else {
10298 addReplySds(c,sdscatprintf(sdsempty(),
10299 "-ERR not supported CONFIG parameter %s\r\n",
10300 (char*)c->argv[2]->ptr));
10301 decrRefCount(o);
10302 return;
10303 }
10304 decrRefCount(o);
10305 addReply(c,shared.ok);
a34e0a25 10306 return;
10307
10308badfmt: /* Bad format errors */
10309 addReplySds(c,sdscatprintf(sdsempty(),
10310 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10311 (char*)o->ptr,
10312 (char*)c->argv[2]->ptr));
10313 decrRefCount(o);
500ece7c 10314}
10315
10316static void configGetCommand(redisClient *c) {
10317 robj *o = getDecodedObject(c->argv[2]);
10318 robj *lenobj = createObject(REDIS_STRING,NULL);
10319 char *pattern = o->ptr;
10320 int matches = 0;
10321
10322 addReply(c,lenobj);
10323 decrRefCount(lenobj);
10324
10325 if (stringmatch(pattern,"dbfilename",0)) {
10326 addReplyBulkCString(c,"dbfilename");
10327 addReplyBulkCString(c,server.dbfilename);
10328 matches++;
10329 }
10330 if (stringmatch(pattern,"requirepass",0)) {
10331 addReplyBulkCString(c,"requirepass");
10332 addReplyBulkCString(c,server.requirepass);
10333 matches++;
10334 }
10335 if (stringmatch(pattern,"masterauth",0)) {
10336 addReplyBulkCString(c,"masterauth");
10337 addReplyBulkCString(c,server.masterauth);
10338 matches++;
10339 }
10340 if (stringmatch(pattern,"maxmemory",0)) {
10341 char buf[128];
10342
2e5eb04e 10343 ll2string(buf,128,server.maxmemory);
500ece7c 10344 addReplyBulkCString(c,"maxmemory");
10345 addReplyBulkCString(c,buf);
10346 matches++;
10347 }
2e5eb04e 10348 if (stringmatch(pattern,"timeout",0)) {
10349 char buf[128];
10350
10351 ll2string(buf,128,server.maxidletime);
10352 addReplyBulkCString(c,"timeout");
10353 addReplyBulkCString(c,buf);
10354 matches++;
10355 }
10356 if (stringmatch(pattern,"appendonly",0)) {
10357 addReplyBulkCString(c,"appendonly");
10358 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10359 matches++;
10360 }
1b677732 10361 if (stringmatch(pattern,"appendfsync",0)) {
10362 char *policy;
10363
10364 switch(server.appendfsync) {
10365 case APPENDFSYNC_NO: policy = "no"; break;
10366 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10367 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10368 default: policy = "unknown"; break; /* too harmless to panic */
10369 }
10370 addReplyBulkCString(c,"appendfsync");
10371 addReplyBulkCString(c,policy);
10372 matches++;
10373 }
a34e0a25 10374 if (stringmatch(pattern,"save",0)) {
10375 sds buf = sdsempty();
10376 int j;
10377
10378 for (j = 0; j < server.saveparamslen; j++) {
10379 buf = sdscatprintf(buf,"%ld %d",
10380 server.saveparams[j].seconds,
10381 server.saveparams[j].changes);
10382 if (j != server.saveparamslen-1)
10383 buf = sdscatlen(buf," ",1);
10384 }
10385 addReplyBulkCString(c,"save");
10386 addReplyBulkCString(c,buf);
10387 sdsfree(buf);
10388 matches++;
10389 }
500ece7c 10390 decrRefCount(o);
10391 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10392}
10393
10394static void configCommand(redisClient *c) {
10395 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10396 if (c->argc != 4) goto badarity;
10397 configSetCommand(c);
10398 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10399 if (c->argc != 3) goto badarity;
10400 configGetCommand(c);
10401 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10402 if (c->argc != 2) goto badarity;
10403 server.stat_numcommands = 0;
10404 server.stat_numconnections = 0;
10405 server.stat_expiredkeys = 0;
10406 server.stat_starttime = time(NULL);
10407 addReply(c,shared.ok);
10408 } else {
10409 addReplySds(c,sdscatprintf(sdsempty(),
10410 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10411 }
10412 return;
10413
10414badarity:
10415 addReplySds(c,sdscatprintf(sdsempty(),
10416 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10417 (char*) c->argv[1]->ptr));
10418}
10419
befec3cd 10420/* =========================== Pubsub implementation ======================== */
10421
ffc6b7f8 10422static void freePubsubPattern(void *p) {
10423 pubsubPattern *pat = p;
10424
10425 decrRefCount(pat->pattern);
10426 zfree(pat);
10427}
10428
10429static int listMatchPubsubPattern(void *a, void *b) {
10430 pubsubPattern *pa = a, *pb = b;
10431
10432 return (pa->client == pb->client) &&
bf028098 10433 (equalStringObjects(pa->pattern,pb->pattern));
ffc6b7f8 10434}
10435
10436/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10437 * 0 if the client was already subscribed to that channel. */
10438static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 10439 struct dictEntry *de;
10440 list *clients = NULL;
10441 int retval = 0;
10442
ffc6b7f8 10443 /* Add the channel to the client -> channels hash table */
10444 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 10445 retval = 1;
ffc6b7f8 10446 incrRefCount(channel);
10447 /* Add the client to the channel -> list of clients hash table */
10448 de = dictFind(server.pubsub_channels,channel);
befec3cd 10449 if (de == NULL) {
10450 clients = listCreate();
ffc6b7f8 10451 dictAdd(server.pubsub_channels,channel,clients);
10452 incrRefCount(channel);
befec3cd 10453 } else {
10454 clients = dictGetEntryVal(de);
10455 }
10456 listAddNodeTail(clients,c);
10457 }
10458 /* Notify the client */
10459 addReply(c,shared.mbulk3);
10460 addReply(c,shared.subscribebulk);
ffc6b7f8 10461 addReplyBulk(c,channel);
482b672d 10462 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 10463 return retval;
10464}
10465
ffc6b7f8 10466/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10467 * 0 if the client was not subscribed to the specified channel. */
10468static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 10469 struct dictEntry *de;
10470 list *clients;
10471 listNode *ln;
10472 int retval = 0;
10473
ffc6b7f8 10474 /* Remove the channel from the client -> channels hash table */
10475 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 10476 we have in the hash tables. Protect it... */
ffc6b7f8 10477 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 10478 retval = 1;
ffc6b7f8 10479 /* Remove the client from the channel -> clients list hash table */
10480 de = dictFind(server.pubsub_channels,channel);
befec3cd 10481 assert(de != NULL);
10482 clients = dictGetEntryVal(de);
10483 ln = listSearchKey(clients,c);
10484 assert(ln != NULL);
10485 listDelNode(clients,ln);
ff767a75 10486 if (listLength(clients) == 0) {
10487 /* Free the list and associated hash entry at all if this was
10488 * the latest client, so that it will be possible to abuse
ffc6b7f8 10489 * Redis PUBSUB creating millions of channels. */
10490 dictDelete(server.pubsub_channels,channel);
ff767a75 10491 }
befec3cd 10492 }
10493 /* Notify the client */
10494 if (notify) {
10495 addReply(c,shared.mbulk3);
10496 addReply(c,shared.unsubscribebulk);
ffc6b7f8 10497 addReplyBulk(c,channel);
482b672d 10498 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10499 listLength(c->pubsub_patterns));
10500
10501 }
10502 decrRefCount(channel); /* it is finally safe to release it */
10503 return retval;
10504}
10505
10506/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10507static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10508 int retval = 0;
10509
10510 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10511 retval = 1;
10512 pubsubPattern *pat;
10513 listAddNodeTail(c->pubsub_patterns,pattern);
10514 incrRefCount(pattern);
10515 pat = zmalloc(sizeof(*pat));
10516 pat->pattern = getDecodedObject(pattern);
10517 pat->client = c;
10518 listAddNodeTail(server.pubsub_patterns,pat);
10519 }
10520 /* Notify the client */
10521 addReply(c,shared.mbulk3);
10522 addReply(c,shared.psubscribebulk);
10523 addReplyBulk(c,pattern);
482b672d 10524 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
ffc6b7f8 10525 return retval;
10526}
10527
10528/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10529 * 0 if the client was not subscribed to the specified channel. */
10530static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10531 listNode *ln;
10532 pubsubPattern pat;
10533 int retval = 0;
10534
10535 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10536 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10537 retval = 1;
10538 listDelNode(c->pubsub_patterns,ln);
10539 pat.client = c;
10540 pat.pattern = pattern;
10541 ln = listSearchKey(server.pubsub_patterns,&pat);
10542 listDelNode(server.pubsub_patterns,ln);
10543 }
10544 /* Notify the client */
10545 if (notify) {
10546 addReply(c,shared.mbulk3);
10547 addReply(c,shared.punsubscribebulk);
10548 addReplyBulk(c,pattern);
482b672d 10549 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10550 listLength(c->pubsub_patterns));
befec3cd 10551 }
ffc6b7f8 10552 decrRefCount(pattern);
befec3cd 10553 return retval;
10554}
10555
ffc6b7f8 10556/* Unsubscribe from all the channels. Return the number of channels the
10557 * client was subscribed from. */
10558static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10559 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 10560 dictEntry *de;
10561 int count = 0;
10562
10563 while((de = dictNext(di)) != NULL) {
ffc6b7f8 10564 robj *channel = dictGetEntryKey(de);
befec3cd 10565
ffc6b7f8 10566 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 10567 }
10568 dictReleaseIterator(di);
10569 return count;
10570}
10571
ffc6b7f8 10572/* Unsubscribe from all the patterns. Return the number of patterns the
10573 * client was subscribed from. */
10574static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10575 listNode *ln;
10576 listIter li;
10577 int count = 0;
10578
10579 listRewind(c->pubsub_patterns,&li);
10580 while ((ln = listNext(&li)) != NULL) {
10581 robj *pattern = ln->value;
10582
10583 count += pubsubUnsubscribePattern(c,pattern,notify);
10584 }
10585 return count;
10586}
10587
befec3cd 10588/* Publish a message */
ffc6b7f8 10589static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 10590 int receivers = 0;
10591 struct dictEntry *de;
ffc6b7f8 10592 listNode *ln;
10593 listIter li;
befec3cd 10594
ffc6b7f8 10595 /* Send to clients listening for that channel */
10596 de = dictFind(server.pubsub_channels,channel);
befec3cd 10597 if (de) {
10598 list *list = dictGetEntryVal(de);
10599 listNode *ln;
10600 listIter li;
10601
10602 listRewind(list,&li);
10603 while ((ln = listNext(&li)) != NULL) {
10604 redisClient *c = ln->value;
10605
10606 addReply(c,shared.mbulk3);
10607 addReply(c,shared.messagebulk);
ffc6b7f8 10608 addReplyBulk(c,channel);
befec3cd 10609 addReplyBulk(c,message);
10610 receivers++;
10611 }
10612 }
ffc6b7f8 10613 /* Send to clients listening to matching channels */
10614 if (listLength(server.pubsub_patterns)) {
10615 listRewind(server.pubsub_patterns,&li);
10616 channel = getDecodedObject(channel);
10617 while ((ln = listNext(&li)) != NULL) {
10618 pubsubPattern *pat = ln->value;
10619
10620 if (stringmatchlen((char*)pat->pattern->ptr,
10621 sdslen(pat->pattern->ptr),
10622 (char*)channel->ptr,
10623 sdslen(channel->ptr),0)) {
c8d0ea0e 10624 addReply(pat->client,shared.mbulk4);
10625 addReply(pat->client,shared.pmessagebulk);
10626 addReplyBulk(pat->client,pat->pattern);
ffc6b7f8 10627 addReplyBulk(pat->client,channel);
10628 addReplyBulk(pat->client,message);
10629 receivers++;
10630 }
10631 }
10632 decrRefCount(channel);
10633 }
befec3cd 10634 return receivers;
10635}
10636
10637static void subscribeCommand(redisClient *c) {
10638 int j;
10639
10640 for (j = 1; j < c->argc; j++)
ffc6b7f8 10641 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 10642}
10643
10644static void unsubscribeCommand(redisClient *c) {
10645 if (c->argc == 1) {
ffc6b7f8 10646 pubsubUnsubscribeAllChannels(c,1);
10647 return;
10648 } else {
10649 int j;
10650
10651 for (j = 1; j < c->argc; j++)
10652 pubsubUnsubscribeChannel(c,c->argv[j],1);
10653 }
10654}
10655
10656static void psubscribeCommand(redisClient *c) {
10657 int j;
10658
10659 for (j = 1; j < c->argc; j++)
10660 pubsubSubscribePattern(c,c->argv[j]);
10661}
10662
10663static void punsubscribeCommand(redisClient *c) {
10664 if (c->argc == 1) {
10665 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 10666 return;
10667 } else {
10668 int j;
10669
10670 for (j = 1; j < c->argc; j++)
ffc6b7f8 10671 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 10672 }
10673}
10674
10675static void publishCommand(redisClient *c) {
10676 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
482b672d 10677 addReplyLongLong(c,receivers);
befec3cd 10678}
10679
37ab76c9 10680/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10681 *
10682 * The implementation uses a per-DB hash table mapping keys to list of clients
10683 * WATCHing those keys, so that given a key that is going to be modified
10684 * we can mark all the associated clients as dirty.
10685 *
10686 * Also every client contains a list of WATCHed keys so that's possible to
10687 * un-watch such keys when the client is freed or when UNWATCH is called. */
10688
10689/* In the client->watched_keys list we need to use watchedKey structures
10690 * as in order to identify a key in Redis we need both the key name and the
10691 * DB */
10692typedef struct watchedKey {
10693 robj *key;
10694 redisDb *db;
10695} watchedKey;
10696
10697/* Watch for the specified key */
10698static void watchForKey(redisClient *c, robj *key) {
10699 list *clients = NULL;
10700 listIter li;
10701 listNode *ln;
10702 watchedKey *wk;
10703
10704 /* Check if we are already watching for this key */
10705 listRewind(c->watched_keys,&li);
10706 while((ln = listNext(&li))) {
10707 wk = listNodeValue(ln);
10708 if (wk->db == c->db && equalStringObjects(key,wk->key))
10709 return; /* Key already watched */
10710 }
10711 /* This key is not already watched in this DB. Let's add it */
10712 clients = dictFetchValue(c->db->watched_keys,key);
10713 if (!clients) {
10714 clients = listCreate();
10715 dictAdd(c->db->watched_keys,key,clients);
10716 incrRefCount(key);
10717 }
10718 listAddNodeTail(clients,c);
10719 /* Add the new key to the lits of keys watched by this client */
10720 wk = zmalloc(sizeof(*wk));
10721 wk->key = key;
10722 wk->db = c->db;
10723 incrRefCount(key);
10724 listAddNodeTail(c->watched_keys,wk);
10725}
10726
10727/* Unwatch all the keys watched by this client. To clean the EXEC dirty
10728 * flag is up to the caller. */
10729static void unwatchAllKeys(redisClient *c) {
10730 listIter li;
10731 listNode *ln;
10732
10733 if (listLength(c->watched_keys) == 0) return;
10734 listRewind(c->watched_keys,&li);
10735 while((ln = listNext(&li))) {
10736 list *clients;
10737 watchedKey *wk;
10738
10739 /* Lookup the watched key -> clients list and remove the client
10740 * from the list */
10741 wk = listNodeValue(ln);
10742 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10743 assert(clients != NULL);
10744 listDelNode(clients,listSearchKey(clients,c));
10745 /* Kill the entry at all if this was the only client */
10746 if (listLength(clients) == 0)
10747 dictDelete(wk->db->watched_keys, wk->key);
10748 /* Remove this watched key from the client->watched list */
10749 listDelNode(c->watched_keys,ln);
10750 decrRefCount(wk->key);
10751 zfree(wk);
10752 }
10753}
10754
ca3f830b 10755/* "Touch" a key, so that if this key is being WATCHed by some client the
37ab76c9 10756 * next EXEC will fail. */
10757static void touchWatchedKey(redisDb *db, robj *key) {
10758 list *clients;
10759 listIter li;
10760 listNode *ln;
10761
10762 if (dictSize(db->watched_keys) == 0) return;
10763 clients = dictFetchValue(db->watched_keys, key);
10764 if (!clients) return;
10765
10766 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10767 /* Check if we are already watching for this key */
10768 listRewind(clients,&li);
10769 while((ln = listNext(&li))) {
10770 redisClient *c = listNodeValue(ln);
10771
10772 c->flags |= REDIS_DIRTY_CAS;
10773 }
10774}
10775
9b30e1a2 10776/* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10777 * flush but will be deleted as effect of the flushing operation should
10778 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10779 * a FLUSHALL operation (all the DBs flushed). */
10780static void touchWatchedKeysOnFlush(int dbid) {
10781 listIter li1, li2;
10782 listNode *ln;
10783
10784 /* For every client, check all the waited keys */
10785 listRewind(server.clients,&li1);
10786 while((ln = listNext(&li1))) {
10787 redisClient *c = listNodeValue(ln);
10788 listRewind(c->watched_keys,&li2);
10789 while((ln = listNext(&li2))) {
10790 watchedKey *wk = listNodeValue(ln);
10791
10792 /* For every watched key matching the specified DB, if the
10793 * key exists, mark the client as dirty, as the key will be
10794 * removed. */
10795 if (dbid == -1 || wk->db->id == dbid) {
10796 if (dictFind(wk->db->dict, wk->key) != NULL)
10797 c->flags |= REDIS_DIRTY_CAS;
10798 }
10799 }
10800 }
10801}
10802
37ab76c9 10803static void watchCommand(redisClient *c) {
10804 int j;
10805
6531c94d 10806 if (c->flags & REDIS_MULTI) {
10807 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
10808 return;
10809 }
37ab76c9 10810 for (j = 1; j < c->argc; j++)
10811 watchForKey(c,c->argv[j]);
10812 addReply(c,shared.ok);
10813}
10814
10815static void unwatchCommand(redisClient *c) {
10816 unwatchAllKeys(c);
10817 c->flags &= (~REDIS_DIRTY_CAS);
10818 addReply(c,shared.ok);
10819}
10820
7f957c92 10821/* ================================= Debugging ============================== */
10822
ba798261 10823/* Compute the sha1 of string at 's' with 'len' bytes long.
10824 * The SHA1 is then xored againt the string pointed by digest.
10825 * Since xor is commutative, this operation is used in order to
10826 * "add" digests relative to unordered elements.
10827 *
10828 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10829static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10830 SHA1_CTX ctx;
10831 unsigned char hash[20], *s = ptr;
10832 int j;
10833
10834 SHA1Init(&ctx);
10835 SHA1Update(&ctx,s,len);
10836 SHA1Final(hash,&ctx);
10837
10838 for (j = 0; j < 20; j++)
10839 digest[j] ^= hash[j];
10840}
10841
10842static void xorObjectDigest(unsigned char *digest, robj *o) {
10843 o = getDecodedObject(o);
10844 xorDigest(digest,o->ptr,sdslen(o->ptr));
10845 decrRefCount(o);
10846}
10847
10848/* This function instead of just computing the SHA1 and xoring it
10849 * against diget, also perform the digest of "digest" itself and
10850 * replace the old value with the new one.
10851 *
10852 * So the final digest will be:
10853 *
10854 * digest = SHA1(digest xor SHA1(data))
10855 *
10856 * This function is used every time we want to preserve the order so
10857 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10858 *
10859 * Also note that mixdigest("foo") followed by mixdigest("bar")
10860 * will lead to a different digest compared to "fo", "obar".
10861 */
10862static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10863 SHA1_CTX ctx;
10864 char *s = ptr;
10865
10866 xorDigest(digest,s,len);
10867 SHA1Init(&ctx);
10868 SHA1Update(&ctx,digest,20);
10869 SHA1Final(digest,&ctx);
10870}
10871
10872static void mixObjectDigest(unsigned char *digest, robj *o) {
10873 o = getDecodedObject(o);
10874 mixDigest(digest,o->ptr,sdslen(o->ptr));
10875 decrRefCount(o);
10876}
10877
10878/* Compute the dataset digest. Since keys, sets elements, hashes elements
10879 * are not ordered, we use a trick: every aggregate digest is the xor
10880 * of the digests of their elements. This way the order will not change
10881 * the result. For list instead we use a feedback entering the output digest
10882 * as input in order to ensure that a different ordered list will result in
10883 * a different digest. */
10884static void computeDatasetDigest(unsigned char *final) {
10885 unsigned char digest[20];
10886 char buf[128];
10887 dictIterator *di = NULL;
10888 dictEntry *de;
10889 int j;
10890 uint32_t aux;
10891
10892 memset(final,0,20); /* Start with a clean result */
10893
10894 for (j = 0; j < server.dbnum; j++) {
10895 redisDb *db = server.db+j;
10896
10897 if (dictSize(db->dict) == 0) continue;
10898 di = dictGetIterator(db->dict);
10899
10900 /* hash the DB id, so the same dataset moved in a different
10901 * DB will lead to a different digest */
10902 aux = htonl(j);
10903 mixDigest(final,&aux,sizeof(aux));
10904
10905 /* Iterate this DB writing every entry */
10906 while((de = dictNext(di)) != NULL) {
cbae1d34 10907 robj *key, *o, *kcopy;
ba798261 10908 time_t expiretime;
10909
10910 memset(digest,0,20); /* This key-val digest */
10911 key = dictGetEntryKey(de);
cbae1d34 10912
10913 if (!server.vm_enabled) {
10914 mixObjectDigest(digest,key);
ba798261 10915 o = dictGetEntryVal(de);
ba798261 10916 } else {
cbae1d34 10917 /* Don't work with the key directly as when VM is active
10918 * this is unsafe: TODO: fix decrRefCount to check if the
10919 * count really reached 0 to avoid this mess */
10920 kcopy = dupStringObject(key);
10921 mixObjectDigest(digest,kcopy);
10922 o = lookupKeyRead(db,kcopy);
10923 decrRefCount(kcopy);
ba798261 10924 }
10925 aux = htonl(o->type);
10926 mixDigest(digest,&aux,sizeof(aux));
10927 expiretime = getExpire(db,key);
10928
10929 /* Save the key and associated value */
10930 if (o->type == REDIS_STRING) {
10931 mixObjectDigest(digest,o);
10932 } else if (o->type == REDIS_LIST) {
dc845730
PN
10933 lIterator *li = lInitIterator(o,0,REDIS_TAIL);
10934 lEntry entry;
10935 while(lNext(li,&entry)) {
10936 robj *eleobj = lGet(&entry);
ba798261 10937 mixObjectDigest(digest,eleobj);
dc845730 10938 decrRefCount(eleobj);
ba798261 10939 }
dc845730 10940 lReleaseIterator(li);
ba798261 10941 } else if (o->type == REDIS_SET) {
10942 dict *set = o->ptr;
10943 dictIterator *di = dictGetIterator(set);
10944 dictEntry *de;
10945
10946 while((de = dictNext(di)) != NULL) {
10947 robj *eleobj = dictGetEntryKey(de);
10948
10949 xorObjectDigest(digest,eleobj);
10950 }
10951 dictReleaseIterator(di);
10952 } else if (o->type == REDIS_ZSET) {
10953 zset *zs = o->ptr;
10954 dictIterator *di = dictGetIterator(zs->dict);
10955 dictEntry *de;
10956
10957 while((de = dictNext(di)) != NULL) {
10958 robj *eleobj = dictGetEntryKey(de);
10959 double *score = dictGetEntryVal(de);
10960 unsigned char eledigest[20];
10961
10962 snprintf(buf,sizeof(buf),"%.17g",*score);
10963 memset(eledigest,0,20);
10964 mixObjectDigest(eledigest,eleobj);
10965 mixDigest(eledigest,buf,strlen(buf));
10966 xorDigest(digest,eledigest,20);
10967 }
10968 dictReleaseIterator(di);
10969 } else if (o->type == REDIS_HASH) {
10970 hashIterator *hi;
10971 robj *obj;
10972
10973 hi = hashInitIterator(o);
10974 while (hashNext(hi) != REDIS_ERR) {
10975 unsigned char eledigest[20];
10976
10977 memset(eledigest,0,20);
10978 obj = hashCurrent(hi,REDIS_HASH_KEY);
10979 mixObjectDigest(eledigest,obj);
10980 decrRefCount(obj);
10981 obj = hashCurrent(hi,REDIS_HASH_VALUE);
10982 mixObjectDigest(eledigest,obj);
10983 decrRefCount(obj);
10984 xorDigest(digest,eledigest,20);
10985 }
10986 hashReleaseIterator(hi);
10987 } else {
10988 redisPanic("Unknown object type");
10989 }
ba798261 10990 /* If the key has an expire, add it to the mix */
10991 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
10992 /* We can finally xor the key-val digest to the final digest */
10993 xorDigest(final,digest,20);
10994 }
10995 dictReleaseIterator(di);
10996 }
10997}
10998
7f957c92 10999static void debugCommand(redisClient *c) {
11000 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
11001 *((char*)-1) = 'x';
210e29f7 11002 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
11003 if (rdbSave(server.dbfilename) != REDIS_OK) {
11004 addReply(c,shared.err);
11005 return;
11006 }
11007 emptyDb();
11008 if (rdbLoad(server.dbfilename) != REDIS_OK) {
11009 addReply(c,shared.err);
11010 return;
11011 }
11012 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
11013 addReply(c,shared.ok);
71c2b467 11014 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
11015 emptyDb();
11016 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
11017 addReply(c,shared.err);
11018 return;
11019 }
11020 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
11021 addReply(c,shared.ok);
333298da 11022 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
11023 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
11024 robj *key, *val;
11025
11026 if (!de) {
11027 addReply(c,shared.nokeyerr);
11028 return;
11029 }
11030 key = dictGetEntryKey(de);
11031 val = dictGetEntryVal(de);
59146ef3 11032 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
11033 key->storage == REDIS_VM_SWAPPING)) {
07efaf74 11034 char *strenc;
11035 char buf[128];
11036
11037 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
11038 strenc = strencoding[val->encoding];
11039 } else {
11040 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
11041 strenc = buf;
11042 }
ace06542 11043 addReplySds(c,sdscatprintf(sdsempty(),
11044 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 11045 "encoding:%s serializedlength:%lld\r\n",
682ac724 11046 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 11047 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 11048 } else {
11049 addReplySds(c,sdscatprintf(sdsempty(),
11050 "+Key at:%p refcount:%d, value swapped at: page %llu "
11051 "using %llu pages\r\n",
11052 (void*)key, key->refcount, (unsigned long long) key->vm.page,
11053 (unsigned long long) key->vm.usedpages));
11054 }
78ebe4c8 11055 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
11056 lookupKeyRead(c->db,c->argv[2]);
11057 addReply(c,shared.ok);
7d30035d 11058 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
11059 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
11060 robj *key, *val;
11061
11062 if (!server.vm_enabled) {
11063 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
11064 return;
11065 }
11066 if (!de) {
11067 addReply(c,shared.nokeyerr);
11068 return;
11069 }
11070 key = dictGetEntryKey(de);
11071 val = dictGetEntryVal(de);
4ef8de8a 11072 /* If the key is shared we want to create a copy */
11073 if (key->refcount > 1) {
11074 robj *newkey = dupStringObject(key);
11075 decrRefCount(key);
11076 key = dictGetEntryKey(de) = newkey;
11077 }
11078 /* Swap it */
7d30035d 11079 if (key->storage != REDIS_VM_MEMORY) {
11080 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 11081 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 11082 dictGetEntryVal(de) = NULL;
11083 addReply(c,shared.ok);
11084 } else {
11085 addReply(c,shared.err);
11086 }
59305dc7 11087 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
11088 long keys, j;
11089 robj *key, *val;
11090 char buf[128];
11091
11092 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
11093 return;
11094 for (j = 0; j < keys; j++) {
11095 snprintf(buf,sizeof(buf),"key:%lu",j);
11096 key = createStringObject(buf,strlen(buf));
11097 if (lookupKeyRead(c->db,key) != NULL) {
11098 decrRefCount(key);
11099 continue;
11100 }
11101 snprintf(buf,sizeof(buf),"value:%lu",j);
11102 val = createStringObject(buf,strlen(buf));
11103 dictAdd(c->db->dict,key,val);
11104 }
11105 addReply(c,shared.ok);
ba798261 11106 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
11107 unsigned char digest[20];
11108 sds d = sdsnew("+");
11109 int j;
11110
11111 computeDatasetDigest(digest);
11112 for (j = 0; j < 20; j++)
11113 d = sdscatprintf(d, "%02x",digest[j]);
11114
11115 d = sdscatlen(d,"\r\n",2);
11116 addReplySds(c,d);
7f957c92 11117 } else {
333298da 11118 addReplySds(c,sdsnew(
bdcb92f2 11119 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 11120 }
11121}
56906eef 11122
6c96ba7d 11123static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 11124 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
fdfb02e7 11125 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
dfc5e96c 11126#ifdef HAVE_BACKTRACE
11127 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11128 *((char*)-1) = 'x';
11129#endif
11130}
11131
c651fd9e 11132static void _redisPanic(char *msg, char *file, int line) {
11133 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
17772754 11134 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
c651fd9e 11135#ifdef HAVE_BACKTRACE
11136 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11137 *((char*)-1) = 'x';
11138#endif
11139}
11140
bcfc686d 11141/* =================================== Main! ================================ */
56906eef 11142
bcfc686d 11143#ifdef __linux__
/* Return the integer read from the first line of
 * /proc/sys/vm/overcommit_memory, or -1 if the file can't be opened or
 * read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    got = fgets(line,sizeof(line),fp);
    fclose(fp);

    return (got != NULL) ? atoi(line) : -1;
}
11157
11158void linuxOvercommitMemoryWarning(void) {
11159 if (linuxOvercommitMemoryValue() == 0) {
7ccd2d0a 11160 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
bcfc686d 11161 }
11162}
11163#endif /* __linux__ */
11164
11165static void daemonize(void) {
11166 int fd;
11167 FILE *fp;
11168
11169 if (fork() != 0) exit(0); /* parent exits */
11170 setsid(); /* create a new session */
11171
11172 /* Every output goes to /dev/null. If Redis is daemonized but
11173 * the 'logfile' is set to 'stdout' in the configuration file
11174 * it will not log at all. */
11175 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
11176 dup2(fd, STDIN_FILENO);
11177 dup2(fd, STDOUT_FILENO);
11178 dup2(fd, STDERR_FILENO);
11179 if (fd > STDERR_FILENO) close(fd);
11180 }
11181 /* Try to write the pid file */
11182 fp = fopen(server.pidfile,"w");
11183 if (fp) {
11184 fprintf(fp,"%d\n",getpid());
11185 fclose(fp);
56906eef 11186 }
56906eef 11187}
11188
42ab0172 11189static void version() {
8a3b0d2d 11190 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
11191 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
42ab0172
AO
11192 exit(0);
11193}
11194
723fb69b
AO
/* Print the command line usage on stderr and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs("       ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
11200
bcfc686d 11201int main(int argc, char **argv) {
9651a787 11202 time_t start;
11203
bcfc686d 11204 initServerConfig();
1a132bbc 11205 sortCommandTable();
bcfc686d 11206 if (argc == 2) {
44efe66e 11207 if (strcmp(argv[1], "-v") == 0 ||
11208 strcmp(argv[1], "--version") == 0) version();
11209 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 11210 resetServerSaveParams();
11211 loadServerConfig(argv[1]);
723fb69b
AO
11212 } else if ((argc > 2)) {
11213 usage();
bcfc686d 11214 } else {
11215 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
11216 }
bcfc686d 11217 if (server.daemonize) daemonize();
71c54b21 11218 initServer();
bcfc686d 11219 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
11220#ifdef __linux__
11221 linuxOvercommitMemoryWarning();
11222#endif
9651a787 11223 start = time(NULL);
bcfc686d 11224 if (server.appendonly) {
11225 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 11226 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 11227 } else {
11228 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 11229 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 11230 }
bcfc686d 11231 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 11232 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 11233 aeMain(server.el);
11234 aeDeleteEventLoop(server.el);
11235 return 0;
11236}
11237
11238/* ============================= Backtrace support ========================= */
11239
11240#ifdef HAVE_BACKTRACE
11241static char *findFuncName(void *pointer, unsigned long *offset);
11242
56906eef 11243static void *getMcontextEip(ucontext_t *uc) {
11244#if defined(__FreeBSD__)
11245 return (void*) uc->uc_mcontext.mc_eip;
11246#elif defined(__dietlibc__)
11247 return (void*) uc->uc_mcontext.eip;
06db1f50 11248#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 11249 #if __x86_64__
11250 return (void*) uc->uc_mcontext->__ss.__rip;
11251 #else
56906eef 11252 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 11253 #endif
06db1f50 11254#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 11255 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 11256 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 11257 #else
11258 return (void*) uc->uc_mcontext->__ss.__eip;
e0a62c7f 11259 #endif
54bac49d 11260#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 11261 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 11262#elif defined(__ia64__) /* Linux IA64 */
11263 return (void*) uc->uc_mcontext.sc_ip;
11264#else
11265 return NULL;
56906eef 11266#endif
11267}
11268
11269static void segvHandler(int sig, siginfo_t *info, void *secret) {
11270 void *trace[100];
11271 char **messages = NULL;
11272 int i, trace_size = 0;
11273 unsigned long offset=0;
56906eef 11274 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 11275 sds infostring;
56906eef 11276 REDIS_NOTUSED(info);
11277
11278 redisLog(REDIS_WARNING,
11279 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 11280 infostring = genRedisInfoString();
11281 redisLog(REDIS_WARNING, "%s",infostring);
11282 /* It's not safe to sdsfree() the returned string under memory
11283 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 11284
56906eef 11285 trace_size = backtrace(trace, 100);
de96dbfe 11286 /* overwrite sigaction with caller's address */
b91cf5ef 11287 if (getMcontextEip(uc) != NULL) {
11288 trace[1] = getMcontextEip(uc);
11289 }
56906eef 11290 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 11291
d76412d1 11292 for (i=1; i<trace_size; ++i) {
56906eef 11293 char *fn = findFuncName(trace[i], &offset), *p;
11294
11295 p = strchr(messages[i],'+');
11296 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11297 redisLog(REDIS_WARNING,"%s", messages[i]);
11298 } else {
11299 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11300 }
11301 }
b177fd30 11302 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 11303 _exit(0);
fe3bbfbe 11304}
56906eef 11305
fab43727 11306static void sigtermHandler(int sig) {
11307 REDIS_NOTUSED(sig);
b58ba105 11308
fab43727 11309 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11310 server.shutdown_asap = 1;
b58ba105
AM
11311}
11312
56906eef 11313static void setupSigSegvAction(void) {
11314 struct sigaction act;
11315
11316 sigemptyset (&act.sa_mask);
11317 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11318 * is used. Otherwise, sa_handler is used */
11319 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11320 act.sa_sigaction = segvHandler;
11321 sigaction (SIGSEGV, &act, NULL);
11322 sigaction (SIGBUS, &act, NULL);
12fea928 11323 sigaction (SIGFPE, &act, NULL);
11324 sigaction (SIGILL, &act, NULL);
11325 sigaction (SIGBUS, &act, NULL);
b58ba105
AM
11326
11327 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
fab43727 11328 act.sa_handler = sigtermHandler;
b58ba105 11329 sigaction (SIGTERM, &act, NULL);
e65fdc78 11330 return;
56906eef 11331}
e65fdc78 11332
bcfc686d 11333#include "staticsymbols.h"
11334/* This function try to convert a pointer into a function name. It's used in
11335 * oreder to provide a backtrace under segmentation fault that's able to
11336 * display functions declared as static (otherwise the backtrace is useless). */
11337static char *findFuncName(void *pointer, unsigned long *offset){
11338 int i, ret = -1;
11339 unsigned long off, minoff = 0;
ed9b544e 11340
bcfc686d 11341 /* Try to match against the Symbol with the smallest offset */
11342 for (i=0; symsTable[i].pointer; i++) {
11343 unsigned long lp = (unsigned long) pointer;
0bc03378 11344
bcfc686d 11345 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11346 off=lp-symsTable[i].pointer;
11347 if (ret < 0 || off < minoff) {
11348 minoff=off;
11349 ret=i;
11350 }
11351 }
0bc03378 11352 }
bcfc686d 11353 if (ret == -1) return NULL;
11354 *offset = minoff;
11355 return symsTable[ret].name;
0bc03378 11356}
bcfc686d 11357#else /* HAVE_BACKTRACE */
/* Built without HAVE_BACKTRACE: no signal handlers are installed. */
static void setupSigSegvAction(void) {
}
bcfc686d 11360#endif /* HAVE_BACKTRACE */
0bc03378 11361
ed9b544e 11362
ed9b544e 11363
bcfc686d 11364/* The End */
11365
11366
ed9b544e 11367