]> git.saurik.com Git - redis.git/blame - redis.c
intset housekeeping
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
9005896c 30#define REDIS_VERSION "2.1.1"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
40#include <signal.h>
fbf9bcdb 41
42#ifdef HAVE_BACKTRACE
c9468bcf 43#include <execinfo.h>
44#include <ucontext.h>
fbf9bcdb 45#endif /* HAVE_BACKTRACE */
46
ed9b544e 47#include <sys/wait.h>
48#include <errno.h>
49#include <assert.h>
50#include <ctype.h>
51#include <stdarg.h>
52#include <inttypes.h>
53#include <arpa/inet.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#include <sys/time.h>
57#include <sys/resource.h>
2895e862 58#include <sys/uio.h>
f78fd11b 59#include <limits.h>
fb82e75c 60#include <float.h>
a7866db6 61#include <math.h>
92f8e882 62#include <pthread.h>
0bc1b2f6 63
64#if defined(__sun)
5043dff3 65#include "solarisfixes.h"
66#endif
ed9b544e 67
c9468bcf 68#include "redis.h"
ed9b544e 69#include "ae.h" /* Event driven programming library */
70#include "sds.h" /* Dynamic safe strings */
71#include "anet.h" /* Networking the easy way */
72#include "dict.h" /* Hash tables */
73#include "adlist.h" /* Linked lists */
74#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 75#include "lzf.h" /* LZF compression library */
76#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
ba798261 77#include "zipmap.h" /* Compact dictionary-alike data structure */
c7d9d662 78#include "ziplist.h" /* Compact list data structure */
ba798261 79#include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
5436146c 80#include "release.h" /* Release and/or git repository information */
ed9b544e 81
82/* Error codes */
83#define REDIS_OK 0
84#define REDIS_ERR -1
85
86/* Static server configuration */
87#define REDIS_SERVERPORT 6379 /* TCP port */
88#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 89#define REDIS_IOBUF_LEN 1024
ed9b544e 90#define REDIS_LOADBUF_LEN 1024
248ea310 91#define REDIS_STATIC_ARGS 8
ed9b544e 92#define REDIS_DEFAULT_DBNUM 16
93#define REDIS_CONFIGLINE_MAX 1024
94#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
95#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
8ca3e9d1 96#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
6f376729 97#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 98#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
99
100/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
101#define REDIS_WRITEV_THRESHOLD 3
102/* Max number of iovecs used for each writev call */
103#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 104
105/* Hash table parameters */
106#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 107
108/* Command flags */
3fd78bcd 109#define REDIS_CMD_BULK 1 /* Bulk write command */
110#define REDIS_CMD_INLINE 2 /* Inline command */
111/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
112 this flag will return an error when the 'maxmemory' option is set in the
113 config file and the server is using more than maxmemory bytes of memory.
114 In short these commands are denied on low memory conditions. */
115#define REDIS_CMD_DENYOOM 4
4005fef1 116#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 117
118/* Object types */
119#define REDIS_STRING 0
120#define REDIS_LIST 1
121#define REDIS_SET 2
1812e024 122#define REDIS_ZSET 3
123#define REDIS_HASH 4
560db612 124#define REDIS_VMPOINTER 8
f78fd11b 125
5234952b 126/* Object encodings. Some kinds of objects, like Strings and Hashes, can be
127 * internally represented in multiple ways. The 'encoding' field of the object
128 * is set to one of the following values. */
c7d9d662
PN
129#define REDIS_ENCODING_RAW 0 /* Raw representation */
130#define REDIS_ENCODING_INT 1 /* Encoded as integer */
131#define REDIS_ENCODING_HT 2 /* Encoded as hash table */
132#define REDIS_ENCODING_ZIPMAP 3 /* Encoded as zipmap */
133#define REDIS_ENCODING_LIST 4 /* Encoded as list (not zipmap; presumably an adlist linked list - confirm) */
134#define REDIS_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
942a3961 135
/* Printable names of the REDIS_ENCODING_* values, listed in the same order as
 * the numeric defines above (index 0 is "raw", index 5 is "ziplist"). */
07efaf74 136static char* strencoding[] = {
846d8b3e 137 "raw", "int", "hashtable", "zipmap", "list", "ziplist"
07efaf74 138};
139
f78fd11b 140/* Object types only used for dumping to disk */
bb32ede5 141#define REDIS_EXPIRETIME 253
ed9b544e 142#define REDIS_SELECTDB 254
143#define REDIS_EOF 255
144
f78fd11b 145/* Defines related to the dump file format. Storing 32 bit lengths for short
146 * keys requires a lot of space, so we check the most significant 2 bits of
147 * the first byte to interpret the length:
148 *
149 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
150 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
151 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 152 * 11|000000 this means: specially encoded object will follow. The six bit
153 * number specifies the kind of object that follows.
154 * See the REDIS_RDB_ENC_* defines.
f78fd11b 155 *
10c43610 156 * Lengths up to 63 are stored using a single byte, most DB keys, and many
157 * values, will fit inside. */
f78fd11b 158#define REDIS_RDB_6BITLEN 0
159#define REDIS_RDB_14BITLEN 1
160#define REDIS_RDB_32BITLEN 2
17be1a4a 161#define REDIS_RDB_ENCVAL 3
f78fd11b 162#define REDIS_RDB_LENERR UINT_MAX
163
a4d1ba9a 164/* When a length of a string object stored on disk has the first two bits
165 * set, the remaining six bits specify a special encoding for the object
166 * according to the following defines: */
167#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
168#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
169#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 170#define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF (see lzf.h) */
a4d1ba9a 171
75680a3c 172/* Virtual memory object->where field. */
173#define REDIS_VM_MEMORY 0 /* The object is on memory */
174#define REDIS_VM_SWAPPED 1 /* The object is on disk */
175#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
176#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
177
06224fec 178/* Virtual memory static configuration stuff.
179 * Check vmFindContiguousPages() to know more about these magic numbers. */
180#define REDIS_VM_MAX_NEAR_PAGES 65536
181#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 182#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 183#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
f6c0bba8 184/* The following is the *percentage* of completed I/O jobs to process when the
185 * handler is called. While Virtual Memory I/O operations are performed by
186 * threads, these operations must be processed by the main thread when completed
187 * in order to take effect. */
c953f24b 188#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 189
ed9b544e 190/* Client flags */
d5d55fc3 191#define REDIS_SLAVE 1 /* This client is a slave server */
192#define REDIS_MASTER 2 /* This client is a master server */
193#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
194#define REDIS_MULTI 8 /* This client is in a MULTI context */
195#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
196#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
37ab76c9 197#define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
ed9b544e 198
40d224a9 199/* Slave replication state - slave side */
ed9b544e 200#define REDIS_REPL_NONE 0 /* No active replication */
201#define REDIS_REPL_CONNECT 1 /* Must connect to master */
202#define REDIS_REPL_CONNECTED 2 /* Connected to master */
203
40d224a9 204/* Slave replication state - from the point of view of master
205 * Note that in SEND_BULK and ONLINE state the slave receives new updates
206 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
207 * to start the next background saving in order to send updates to it. */
208#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
209#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
210#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
211#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
212
ed9b544e 213/* List related stuff */
214#define REDIS_HEAD 0
215#define REDIS_TAIL 1
216
217/* Sort operations */
218#define REDIS_SORT_GET 0
443c6409 219#define REDIS_SORT_ASC 1
220#define REDIS_SORT_DESC 2
ed9b544e 221#define REDIS_SORTKEY_MAX 1024
222
223/* Log levels */
224#define REDIS_DEBUG 0
f870935d 225#define REDIS_VERBOSE 1
226#define REDIS_NOTICE 2
227#define REDIS_WARNING 3
ed9b544e 228
229/* Anti-warning macro... */
230#define REDIS_NOTUSED(V) ((void) V)
231
6b47e12e 232#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
233#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 234
48f0308a 235/* Append only defines */
236#define APPENDFSYNC_NO 0
237#define APPENDFSYNC_ALWAYS 1
238#define APPENDFSYNC_EVERYSEC 2
239
d0686e07 240/* Zip structure related defaults */
cbba7dd7 241#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
242#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
d0686e07
PN
243#define REDIS_LIST_MAX_ZIPLIST_ENTRIES 1024
244#define REDIS_LIST_MAX_ZIPLIST_VALUE 32
cbba7dd7 245
dfc5e96c 246/* We can print the stacktrace, so our assert is defined this way:
 * redisAssert(_e) evaluates _e and, when it is false, calls _redisAssert()
 * with the stringified expression, file and line, then _exit(1).
 * redisPanic(_e) unconditionally calls _redisPanic() with the stringified
 * argument, file and line, then _exit(1).
 * Both macros expand to a single fully parenthesized expression so that they
 * parse as one operand wherever an expression is allowed. */
478c2c6f 247#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
c651fd9e 248#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
6c96ba7d 249static void _redisAssert(char *estr, char *file, int line);
c651fd9e 250static void _redisPanic(char *msg, char *file, int line);
dfc5e96c 251
ed9b544e 252/*================================= Data types ============================== */
253
254/* A redis object, that is a type able to hold a string / list / set */
75680a3c 255
75680a3c 256/* The actual Redis Object. The four bit-fields add up to 32 bits. */
ed9b544e 257typedef struct redisObject {
560db612 258 unsigned type:4; /* One of the "Object types" defines (REDIS_STRING, ...) */
259 unsigned storage:2; /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
260 unsigned encoding:4; /* One of the REDIS_ENCODING_* defines */
261 unsigned lru:22; /* lru time (relative to server.lruclock) */
ed9b544e 262 int refcount;
560db612 263 void *ptr; /* Payload; presumably interpreted according to type/encoding */
75680a3c 264 /* VM fields, these are only allocated if VM is active, otherwise the
265 * object allocation function will just allocate
266 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
267 * Redis without VM active will not have any overhead.
 * NOTE(review): no VM fields follow in this struct any more, so this
 * comment looks stale - confirm and remove. */
ed9b544e 268} robj;
269
560db612 270/* The VM pointer structure - identifies an object in the swap file.
271 *
272 * This object is stored in place of the value
273 * object in the main key->value hash table representing a database.
274 * Note that the first fields (type, storage) are the same as the redisObject
275 * structure so that vmPointer structures can be accessed even when cast
276 * to redisObject structures.
277 *
278 * This is useful as we don't know if a value object is or not on disk, but we
169dd6b7 279 * are always able to read obj->storage to check this. For vmPointer
560db612 280 * structures "type" is set to REDIS_VMPOINTER (even though, without this
281 * field, it is still possible to check the kind of object from 'storage').*/
282typedef struct vmPointer {
283 unsigned type:4;
284 unsigned storage:2; /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
285 unsigned notused:26;
286 unsigned int vtype; /* type of the object stored in the swap file */
287 off_t page; /* the page at which the object is stored on disk */
288 off_t usedpages; /* number of pages used on disk */
289} vmpointer;
290
dfc5e96c 291/* Macro used to initialize a Redis object allocated on the stack.
292 * Note that this macro is taken near the structure definition to make sure
293 * we'll update it when the structure is changed, to avoid bugs like
294 * bug #85 introduced exactly in this way.
 *
 * _var is the robj lvalue to initialize, _ptr the value stored in its ptr
 * field. The object is set up as a raw-encoded in-memory string with a
 * refcount of 1; the lru bit-field is not touched.
 *
 * The expansion deliberately does NOT end with a semicolon: the caller
 * supplies it, so "if (x) initStaticStringObject(o,p); else ..." parses as
 * a single statement. Arguments are parenthesized so that expressions such
 * as *objp can be passed safely. */
295#define initStaticStringObject(_var,_ptr) do { \
296 (_var).refcount = 1; \
297 (_var).type = REDIS_STRING; \
298 (_var).encoding = REDIS_ENCODING_RAW; \
299 (_var).ptr = (_ptr); \
560db612 300 (_var).storage = REDIS_VM_MEMORY; \
dfc5e96c 301} while(0)
302
3305306f 303typedef struct redisDb {
4409877e 304 dict *dict; /* The keyspace for this DB */
305 dict *expires; /* Timeout of keys with a timeout set */
37ab76c9 306 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 307 dict *io_keys; /* Keys with clients waiting for VM I/O */
37ab76c9 308 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
3305306f 309 int id;
310} redisDb;
311
6e469882 312/* Client MULTI/EXEC state */
313typedef struct multiCmd {
314 robj **argv;
315 int argc;
316 struct redisCommand *cmd;
317} multiCmd;
318
319typedef struct multiState {
320 multiCmd *commands; /* Array of MULTI commands */
321 int count; /* Total number of MULTI commands */
322} multiState;
323
ed9b544e 324/* With multiplexing we need to take per-client state.
325 * Clients are taken in a linked list. */
326typedef struct redisClient {
327 int fd;
3305306f 328 redisDb *db;
ed9b544e 329 int dictid;
330 sds querybuf;
e8a74421 331 robj **argv, **mbargv;
332 int argc, mbargc;
40d224a9 333 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 334 int multibulk; /* multi bulk command format active */
ed9b544e 335 list *reply;
336 int sentlen;
337 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 338 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 339 int slaveseldb; /* slave selected db, if this client is a slave */
340 int authenticated; /* when requirepass is non-NULL */
341 int replstate; /* replication state if this is a slave */
342 int repldbfd; /* replication DB file descriptor */
6e469882 343 long repldboff; /* replication DB file offset.
 * NOTE(review): declared long while repldbsize is off_t;
 * confirm this cannot truncate where off_t is wider. */
40d224a9 344 off_t repldbsize; /* replication DB file size */
6e469882 345 multiState mstate; /* MULTI/EXEC state */
37ab76c9 346 robj **blocking_keys; /* The keys we are waiting on to terminate a blocking
4409877e 347 * operation such as BLPOP. Otherwise NULL. */
37ab76c9 348 int blocking_keys_num; /* Number of blocking keys */
4409877e 349 time_t blockingto; /* Blocking operation timeout. If UNIX current time
350 * is >= blockingto then the operation timed out. */
92f8e882 351 list *io_keys; /* Keys this client is waiting to be loaded from the
352 * swap file in order to continue. */
37ab76c9 353 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
ffc6b7f8 354 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
355 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 356} redisClient;
357
358struct saveparam {
359 time_t seconds;
360 int changes;
361};
362
363/* Global server state structure */
364struct redisServer {
365 int port;
366 int fd;
3305306f 367 redisDb *db;
ed9b544e 368 long long dirty; /* changes to DB from the last save */
369 list *clients;
87eca727 370 list *slaves, *monitors;
ed9b544e 371 char neterr[ANET_ERR_LEN];
372 aeEventLoop *el;
373 int cronloops; /* number of times the cron function ran */
374 list *objfreelist; /* A list of freed objects to avoid malloc() */
375 time_t lastsave; /* Unix time of the last successful save */
ed9b544e 376 /* Fields used only for stats */
377 time_t stat_starttime; /* server start time */
378 long long stat_numcommands; /* number of processed commands */
379 long long stat_numconnections; /* number of connections received */
2a6a2ed1 380 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 381 /* Configuration */
382 int verbosity;
383 int glueoutputbuf;
384 int maxidletime;
385 int dbnum;
386 int daemonize;
44b38ef4 387 int appendonly;
48f0308a 388 int appendfsync;
38db9171 389 int no_appendfsync_on_rewrite;
fab43727 390 int shutdown_asap;
48f0308a 391 time_t lastfsync;
44b38ef4 392 int appendfd;
393 int appendseldb;
ed329fcf 394 char *pidfile;
9f3c422c 395 pid_t bgsavechildpid;
9d65a1bb 396 pid_t bgrewritechildpid;
397 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
28ed1f33 398 sds aofbuf; /* AOF buffer, written before entering the event loop */
ed9b544e 399 struct saveparam *saveparams;
400 int saveparamslen;
401 char *logfile;
402 char *bindaddr;
403 char *dbfilename;
44b38ef4 404 char *appendfilename;
abcb223e 405 char *requirepass;
121f70cf 406 int rdbcompression;
8ca3e9d1 407 int activerehashing;
ed9b544e 408 /* Replication related */
409 int isslave;
d0ccebcf 410 char *masterauth;
ed9b544e 411 char *masterhost;
412 int masterport;
40d224a9 413 redisClient *master; /* client that is master for this slave */
ed9b544e 414 int replstate;
285add55 415 unsigned int maxclients;
4ef8de8a 416 unsigned long long maxmemory;
d5d55fc3 417 unsigned int blpop_blocked_clients;
418 unsigned int vm_blocked_clients;
ed9b544e 419 /* Sort parameters - qsort_r() is only available under BSD so we
420 * have to take this state global, in order to pass it to sortCompare() */
421 int sort_desc;
422 int sort_alpha;
423 int sort_bypattern;
75680a3c 424 /* Virtual memory configuration */
425 int vm_enabled;
054e426d 426 char *vm_swap_file;
75680a3c 427 off_t vm_page_size;
428 off_t vm_pages;
4ef8de8a 429 unsigned long long vm_max_memory;
d0686e07 430 /* Zip structure config */
cbba7dd7 431 size_t hash_max_zipmap_entries;
432 size_t hash_max_zipmap_value;
d0686e07
PN
433 size_t list_max_ziplist_entries;
434 size_t list_max_ziplist_value;
75680a3c 435 /* Virtual memory state */
436 FILE *vm_fp;
437 int vm_fd;
438 off_t vm_next_page; /* Next probably empty page */
439 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 440 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 441 time_t unixtime; /* Unix time sampled every second. */
92f8e882 442 /* Virtual memory I/O threads stuff */
92f8e882 443 /* An I/O thread processes an element taken from the io_newjobs queue and
996cb5f7 444 * puts the result of the operation in the io_processed list. While the
445 * job is being processed, it's put on the io_processing queue.
 * (The original comment called these lists io_jobs and io_done.) */
446 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
447 list *io_processing; /* List of VM I/O jobs being processed */
448 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 449 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 450 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job
 * (presumably the io_* job lists above - confirm) */
a5819310 451 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
452 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 453 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 454 int io_active_threads; /* Number of running I/O threads */
455 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 456 /* Our main thread is blocked on the event loop, looking for sockets ready
457 * to be read or written, so when a threaded I/O operation is ready to be
458 * processed by the main thread, the I/O thread will use a unix pipe to
459 * awake the main thread. The following are the two pipe FDs. */
460 int io_ready_pipe_read;
461 int io_ready_pipe_write;
7d98e08c 462 /* Virtual memory stats */
463 unsigned long long vm_stats_used_pages;
464 unsigned long long vm_stats_swapped_objects;
465 unsigned long long vm_stats_swapouts;
466 unsigned long long vm_stats_swapins;
befec3cd 467 /* Pubsub */
ffc6b7f8 468 dict *pubsub_channels; /* Map channels to list of subscribed clients */
469 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 470 /* Misc */
b9bc0eef 471 FILE *devnull;
560db612 472 unsigned lruclock:22; /* clock incrementing every minute, for LRU */
473 unsigned lruclock_padding:10;
ed9b544e 474};
475
ffc6b7f8 476typedef struct pubsubPattern {
477 redisClient *client;
478 robj *pattern;
479} pubsubPattern;
480
ed9b544e 481typedef void redisCommandProc(redisClient *c);
ca1788b5 482typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
ed9b544e 483struct redisCommand {
484 char *name;
485 redisCommandProc *proc;
486 int arity;
487 int flags; /* presumably a mask of the REDIS_CMD_* command flags - confirm */
76583ea4
PN
488 /* Use a function to determine which keys need to be loaded
489 * in the background prior to executing this command. Takes precedence
490 * over vm_firstkey and others, ignored when NULL */
ca1788b5 491 redisVmPreloadProc *vm_preload_proc;
7c775e09 492 /* What keys should be loaded in background when calling this command? */
493 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
494 int vm_lastkey; /* The last argument that's a key */
495 int vm_keystep; /* The step between first and last key */
ed9b544e 496};
497
de96dbfe 498struct redisFunctionSym {
499 char *name;
56906eef 500 unsigned long pointer;
de96dbfe 501};
502
ed9b544e 503typedef struct _redisSortObject {
504 robj *obj;
505 union {
506 double score;
507 robj *cmpobj;
508 } u;
509} redisSortObject;
510
511typedef struct _redisSortOperation {
512 int type;
513 robj *pattern;
514} redisSortOperation;
515
6b47e12e 516/* ZSETs use a specialized version of Skiplists */
517
518typedef struct zskiplistNode {
519 struct zskiplistNode **forward;
e3870fab 520 struct zskiplistNode *backward;
912b9165 521 unsigned int *span;
6b47e12e 522 double score;
523 robj *obj;
524} zskiplistNode;
525
526typedef struct zskiplist {
e3870fab 527 struct zskiplistNode *header, *tail;
d13f767c 528 unsigned long length;
6b47e12e 529 int level;
530} zskiplist;
531
1812e024 532typedef struct zset {
533 dict *dict;
6b47e12e 534 zskiplist *zsl;
1812e024 535} zset;
536
6b47e12e 537/* Our shared "common" objects */
538
05df7621 539#define REDIS_SHARED_INTEGERS 10000
ed9b544e 540struct sharedObjectsStruct {
c937aa89 541 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 542 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 543 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
544 *outofrangeerr, *plus,
ed9b544e 545 *select0, *select1, *select2, *select3, *select4,
befec3cd 546 *select5, *select6, *select7, *select8, *select9,
c8d0ea0e 547 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
548 *mbulk4, *psubscribebulk, *punsubscribebulk,
549 *integers[REDIS_SHARED_INTEGERS];
ed9b544e 550} shared;
551
a7866db6 552/* Global vars that are actually used as constants. The following double
553 * values are used for double on-disk serialization, and are initialized
554 * at runtime to avoid strange compiler optimizations. */
555
556static double R_Zero, R_PosInf, R_NegInf, R_Nan;
557
92f8e882 558/* VM threaded I/O request message */
b9bc0eef 559#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
560#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
561#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 562typedef struct iojob {
996cb5f7 563 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 564 redisDb *db;/* Redis database */
92f8e882 565 robj *key; /* This I/O request is about swapping this key */
560db612 566 robj *id; /* Unique identifier of this job:
567 this is the object to swap for REDIS_IOJOB_*_SWAP, or the
568 vmpointer object for REDIS_IOJOB_LOAD. */
b9bc0eef 569 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
92f8e882 570 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
571 off_t page; /* Swap page where to read/write the object */
248ea310 572 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 573 int canceled; /* True if this command was canceled by blocking side of VM */
574 pthread_t thread; /* ID of the thread processing this entry */
575} iojob;
92f8e882 576
ed9b544e 577/*================================ Prototypes =============================== */
578
579static void freeStringObject(robj *o);
580static void freeListObject(robj *o);
581static void freeSetObject(robj *o);
582static void decrRefCount(void *o);
583static robj *createObject(int type, void *ptr);
584static void freeClient(redisClient *c);
f78fd11b 585static int rdbLoad(char *filename);
ed9b544e 586static void addReply(redisClient *c, robj *obj);
587static void addReplySds(redisClient *c, sds s);
588static void incrRefCount(robj *o);
f78fd11b 589static int rdbSaveBackground(char *filename);
ed9b544e 590static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 591static robj *dupStringObject(robj *o);
248ea310 592static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
dd142b9c 593static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
28ed1f33 594static void flushAppendOnlyFile(void);
44b38ef4 595static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 596static int syncWithMaster(void);
05df7621 597static robj *tryObjectEncoding(robj *o);
9d65a1bb 598static robj *getDecodedObject(robj *o);
3305306f 599static int removeExpire(redisDb *db, robj *key);
600static int expireIfNeeded(redisDb *db, robj *key);
601static int deleteIfVolatile(redisDb *db, robj *key);
09241813 602static int dbDelete(redisDb *db, robj *key);
bb32ede5 603static time_t getExpire(redisDb *db, robj *key);
604static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 605static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 606static void freeMemoryIfNeeded(void);
de96dbfe 607static int processCommand(redisClient *c);
56906eef 608static void setupSigSegvAction(void);
a3b21203 609static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 610static void aofRemoveTempFile(pid_t childpid);
0ea663ea 611static size_t stringObjectLen(robj *o);
638e42ac 612static void processInputBuffer(redisClient *c);
6b47e12e 613static zskiplist *zslCreate(void);
fd8ccf44 614static void zslFree(zskiplist *zsl);
2b59cfdf 615static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 616static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 617static void initClientMultiState(redisClient *c);
618static void freeClientMultiState(redisClient *c);
619static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 620static void unblockClientWaitingData(redisClient *c);
4409877e 621static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 622static void vmInit(void);
a35ddf12 623static void vmMarkPagesFree(off_t page, off_t count);
560db612 624static robj *vmLoadObject(robj *o);
625static robj *vmPreviewObject(robj *o);
a69a0c9c 626static int vmSwapOneObjectBlocking(void);
627static int vmSwapOneObjectThreaded(void);
7e69548d 628static int vmCanSwapOut(void);
a5819310 629static int tryFreeOneObjectFromFreelist(void);
996cb5f7 630static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
631static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
632static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 633static void lockThreadedIO(void);
634static void unlockThreadedIO(void);
635static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
636static void freeIOJob(iojob *j);
637static void queueIOJob(iojob *j);
a5819310 638static int vmWriteObjectOnSwap(robj *o, off_t page);
639static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 640static void waitEmptyIOJobsQueue(void);
641static void vmReopenSwapFile(void);
970e10bb 642static int vmFreePage(off_t page);
ca1788b5 643static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
3805e04f 644static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
0a6f3f0f 645static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
d5d55fc3 646static int dontWaitForSwappedKey(redisClient *c, robj *key);
647static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
648static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
649static struct redisCommand *lookupCommand(char *name);
650static void call(redisClient *c, struct redisCommand *cmd);
651static void resetClient(redisClient *c);
ada386b2 652static void convertToRealHash(robj *o);
003f0840 653static void listTypeConvert(robj *o, int enc);
ffc6b7f8 654static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
655static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
656static void freePubsubPattern(void *p);
657static int listMatchPubsubPattern(void *a, void *b);
658static int compareStringObjects(robj *a, robj *b);
bf028098 659static int equalStringObjects(robj *a, robj *b);
befec3cd 660static void usage();
8f63ddca 661static int rewriteAppendOnlyFileBackground(void);
560db612 662static vmpointer *vmSwapObjectBlocking(robj *val);
fab43727 663static int prepareForShutdown();
37ab76c9 664static void touchWatchedKey(redisDb *db, robj *key);
9b30e1a2 665static void touchWatchedKeysOnFlush(int dbid);
37ab76c9 666static void unwatchAllKeys(redisClient *c);
ed9b544e 667
abcb223e 668static void authCommand(redisClient *c);
ed9b544e 669static void pingCommand(redisClient *c);
670static void echoCommand(redisClient *c);
671static void setCommand(redisClient *c);
672static void setnxCommand(redisClient *c);
526d00a5 673static void setexCommand(redisClient *c);
ed9b544e 674static void getCommand(redisClient *c);
675static void delCommand(redisClient *c);
676static void existsCommand(redisClient *c);
677static void incrCommand(redisClient *c);
678static void decrCommand(redisClient *c);
679static void incrbyCommand(redisClient *c);
680static void decrbyCommand(redisClient *c);
681static void selectCommand(redisClient *c);
682static void randomkeyCommand(redisClient *c);
683static void keysCommand(redisClient *c);
684static void dbsizeCommand(redisClient *c);
685static void lastsaveCommand(redisClient *c);
686static void saveCommand(redisClient *c);
687static void bgsaveCommand(redisClient *c);
9d65a1bb 688static void bgrewriteaofCommand(redisClient *c);
ed9b544e 689static void shutdownCommand(redisClient *c);
690static void moveCommand(redisClient *c);
691static void renameCommand(redisClient *c);
692static void renamenxCommand(redisClient *c);
693static void lpushCommand(redisClient *c);
694static void rpushCommand(redisClient *c);
695static void lpopCommand(redisClient *c);
696static void rpopCommand(redisClient *c);
697static void llenCommand(redisClient *c);
698static void lindexCommand(redisClient *c);
699static void lrangeCommand(redisClient *c);
700static void ltrimCommand(redisClient *c);
701static void typeCommand(redisClient *c);
702static void lsetCommand(redisClient *c);
703static void saddCommand(redisClient *c);
704static void sremCommand(redisClient *c);
a4460ef4 705static void smoveCommand(redisClient *c);
ed9b544e 706static void sismemberCommand(redisClient *c);
707static void scardCommand(redisClient *c);
12fea928 708static void spopCommand(redisClient *c);
2abb95a9 709static void srandmemberCommand(redisClient *c);
ed9b544e 710static void sinterCommand(redisClient *c);
711static void sinterstoreCommand(redisClient *c);
40d224a9 712static void sunionCommand(redisClient *c);
713static void sunionstoreCommand(redisClient *c);
f4f56e1d 714static void sdiffCommand(redisClient *c);
715static void sdiffstoreCommand(redisClient *c);
ed9b544e 716static void syncCommand(redisClient *c);
717static void flushdbCommand(redisClient *c);
718static void flushallCommand(redisClient *c);
719static void sortCommand(redisClient *c);
720static void lremCommand(redisClient *c);
0f5f7e9a 721static void rpoplpushcommand(redisClient *c);
ed9b544e 722static void infoCommand(redisClient *c);
70003d28 723static void mgetCommand(redisClient *c);
87eca727 724static void monitorCommand(redisClient *c);
3305306f 725static void expireCommand(redisClient *c);
802e8373 726static void expireatCommand(redisClient *c);
f6b141c5 727static void getsetCommand(redisClient *c);
fd88489a 728static void ttlCommand(redisClient *c);
321b0e13 729static void slaveofCommand(redisClient *c);
7f957c92 730static void debugCommand(redisClient *c);
f6b141c5 731static void msetCommand(redisClient *c);
732static void msetnxCommand(redisClient *c);
fd8ccf44 733static void zaddCommand(redisClient *c);
7db723ad 734static void zincrbyCommand(redisClient *c);
cc812361 735static void zrangeCommand(redisClient *c);
50c55df5 736static void zrangebyscoreCommand(redisClient *c);
f44dd428 737static void zcountCommand(redisClient *c);
e3870fab 738static void zrevrangeCommand(redisClient *c);
3c41331e 739static void zcardCommand(redisClient *c);
1b7106e7 740static void zremCommand(redisClient *c);
6e333bbe 741static void zscoreCommand(redisClient *c);
1807985b 742static void zremrangebyscoreCommand(redisClient *c);
6e469882 743static void multiCommand(redisClient *c);
744static void execCommand(redisClient *c);
18b6cb76 745static void discardCommand(redisClient *c);
4409877e 746static void blpopCommand(redisClient *c);
747static void brpopCommand(redisClient *c);
4b00bebd 748static void appendCommand(redisClient *c);
39191553 749static void substrCommand(redisClient *c);
69d95c3e 750static void zrankCommand(redisClient *c);
798d9e55 751static void zrevrankCommand(redisClient *c);
978c2c94 752static void hsetCommand(redisClient *c);
1f1c7695 753static void hsetnxCommand(redisClient *c);
978c2c94 754static void hgetCommand(redisClient *c);
09aeb579
PN
755static void hmsetCommand(redisClient *c);
756static void hmgetCommand(redisClient *c);
07efaf74 757static void hdelCommand(redisClient *c);
92b27fe9 758static void hlenCommand(redisClient *c);
9212eafd 759static void zremrangebyrankCommand(redisClient *c);
5d373da9 760static void zunionstoreCommand(redisClient *c);
761static void zinterstoreCommand(redisClient *c);
78409a0f 762static void hkeysCommand(redisClient *c);
763static void hvalsCommand(redisClient *c);
764static void hgetallCommand(redisClient *c);
a86f14b1 765static void hexistsCommand(redisClient *c);
500ece7c 766static void configCommand(redisClient *c);
01426b05 767static void hincrbyCommand(redisClient *c);
befec3cd 768static void subscribeCommand(redisClient *c);
769static void unsubscribeCommand(redisClient *c);
ffc6b7f8 770static void psubscribeCommand(redisClient *c);
771static void punsubscribeCommand(redisClient *c);
befec3cd 772static void publishCommand(redisClient *c);
37ab76c9 773static void watchCommand(redisClient *c);
774static void unwatchCommand(redisClient *c);
f6b141c5 775
ed9b544e 776/*================================= Globals ================================= */
777
778/* Global vars */
779static struct redisServer server; /* server global state */
1a132bbc 780static struct redisCommand *commandTable;
/* Static table of the commands known to the server.
 *
 * Every initializer lists, in order: the command name as a string, the
 * function implementing it, an integer argument count, a mask of REDIS_CMD_*
 * flags, a function pointer that is NULL for every entry except zunionstore,
 * zinterstore and exec, and three more integers.
 *
 * NOTE(review): the negative counts appear only on commands accepting a
 * variable number of arguments, so they presumably mean "at least N"; the
 * three trailing integers look like first key / last key / key step
 * positions. Confirm both against struct redisCommand and its users. */
1a132bbc 781static struct redisCommand readonlyCommandTable[] = {
76583ea4
PN
782 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
783 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
784 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
526d00a5 785 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
786 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
787 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
789 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
790 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
791 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
792 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
793 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
794 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
795 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
796 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
797 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
800 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
802 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
805 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
806 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
807 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
808 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
809 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
810 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
811 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
812 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
813 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
814 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
815 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
816 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
817 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
818 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
819 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
820 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
821 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
822 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
823 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
824 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
5d373da9 825 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
826 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
827 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
828 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
829 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
830 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
831 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
/* NOTE(review): zscore is the only z* lookup flagged REDIS_CMD_DENYOOM
 * (zrank, zrevrank, zcard, zcount are not) -- confirm this is intended. */
832 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
833 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
834 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
835 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
1f1c7695 836 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 837 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
d33278d1 838 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 839 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
01426b05 840 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
841 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
842 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
843 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
844 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
845 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 846 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
847 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
848 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
849 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
850 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
851 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
852 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
853 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
854 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
855 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
856 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
857 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
858 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
859 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
860 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
861 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
862 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
863 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
864 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
865 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
866 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
867 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
868 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
869 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
870 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
3805e04f 871 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
872 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
873 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
874 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
875 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
876 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
877 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
878 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
879 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
880 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
881 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 882 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 883 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
884 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 885 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
886 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 887 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
37ab76c9 888 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
d55d5c5d 889 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
ed9b544e 890};
bcfc686d 891
ed9b544e 892/*============================ Utility functions ============================ */
893
/* Lowercase a character value coming from a (possibly signed) char.
 * Negative values are returned untouched, since handing them to tolower()
 * is undefined behavior. */
static int stringmatchLower(int ch) {
    return (ch < 0) ? ch : tolower(ch);
}

/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *      any run of bytes, the empty one included
 *   ?      exactly one byte
 *   [...]  one byte out of a class; supports 'a-z' ranges, a leading '^'
 *          to negate the class and '\' to escape the next class byte.
 *          A class missing the closing ']' is treated as if it was closed
 *          at the end of the pattern.
 *   \x     the byte x taken literally
 *
 * When 'nocase' is non zero literals and ranges are compared case
 * insensitively (escaped bytes inside a class are always compared verbatim).
 *
 * Returns 1 if the string matches, 0 otherwise. Both lengths must be >= 0.
 * The buffers do not need to be nul terminated: no byte past the given
 * lengths is read. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse consecutive stars, never looking past the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one byte of the string. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * pattern++ after the switch lands on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (patternLen >= 2 && pattern[0] == '\\') {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = stringmatchLower(start);
                        end = stringmatchLower(end);
                        c = stringmatchLower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (stringmatchLower(pattern[0]) ==
                            stringmatchLower(string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (stringmatchLower(pattern[0]) !=
                    stringmatchLower(string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String consumed: only trailing stars may still match. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
1016
/* Glob-style matching of two nul terminated C strings: measures both with
 * strlen() and hands them to stringmatchlen(). Returns what
 * stringmatchlen() returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
1020
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The input is an optional '-', a run of decimal digits and an optional,
 * case insensitive, unit:
 *
 *   (none), "b"  -> bytes
 *   "k"  -> 1000            "kb" -> 1024
 *   "m"  -> 1000*1000       "mb" -> 1024*1024
 *   "g"  -> 1000*1000*1000  "gb" -> 1024*1024*1024
 *
 * If 'err' is not NULL, *err is set to 0 on success and to 1 on parsing
 * error. The errors are:
 *
 *   - unknown unit: the digits are still converted and returned as bytes.
 *   - no digits at all: 0 is returned.
 *   - more digits than the internal buffer can hold: LLONG_MAX is returned.
 *   - value out of the long long range (before or after applying the unit):
 *     the result saturates to LLONG_MAX or LLONG_MIN. */
static long long memtoll(const char *p, int *err) {
    const char *u, *firstdigit;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    size_t digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    firstdigit = u;
    /* The cast is needed: isdigit() is undefined for negative char values. */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    /* A unit (or a lone '-') without any digit is not a number. */
    if (u == firstdigit && err) *err = 1;
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    errno = 0;
    val = strtoll(buf,NULL,10); /* saturates and sets ERANGE on overflow */
    if (errno == ERANGE && err) *err = 1;
    /* mul is always >= 1: check the product before computing it, as signed
     * overflow is undefined behavior. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1067
/* Convert a long long into a nul terminated decimal string.
 *
 * At most 'len' bytes of 's' are written, terminator included: if the buffer
 * is too small only the leading characters of the number are stored.
 *
 * Returns the number of characters stored in 's', not counting the nul
 * term. Nothing is written and 0 is returned when 'len' is 0. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Negate in the unsigned domain: -value overflows for LLONG_MIN. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    /* Emit the digits right to left, least significant first. */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1091
56906eef 1092static void redisLog(int level, const char *fmt, ...) {
ed9b544e 1093 va_list ap;
1094 FILE *fp;
1095
1096 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1097 if (!fp) return;
1098
1099 va_start(ap, fmt);
1100 if (level >= server.verbosity) {
6766f45e 1101 char *c = ".-*#";
1904ecc1 1102 char buf[64];
1103 time_t now;
1104
1105 now = time(NULL);
6c9385e0 1106 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 1107 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 1108 vfprintf(fp, fmt, ap);
1109 fprintf(fp,"\n");
1110 fflush(fp);
1111 }
1112 va_end(ap);
1113
1114 if (server.logfile) fclose(fp);
1115}
1116
1117/*====================== Hash table type implementation ==================== */
1118
1119/* This is a hash table type that uses the SDS dynamic strings library as
1120 * keys and redis objects as values (objects can hold SDS strings,
1121 * lists, sets). */
1122
/* Dict destructor callback that simply hands the stored pointer to zfree().
 * 'privdata' is ignored. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
1128
4409877e 1129static void dictListDestructor(void *privdata, void *val)
1130{
1131 DICT_NOTUSED(privdata);
1132 listRelease((list*)val);
1133}
1134
09241813 1135static int dictSdsKeyCompare(void *privdata, const void *key1,
ed9b544e 1136 const void *key2)
1137{
1138 int l1,l2;
1139 DICT_NOTUSED(privdata);
1140
1141 l1 = sdslen((sds)key1);
1142 l2 = sdslen((sds)key2);
1143 if (l1 != l2) return 0;
1144 return memcmp(key1, key2, l1) == 0;
1145}
1146
/* Dict destructor callback for Redis objects: drops one reference with
 * decrRefCount(). 'privdata' is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
1154
/* Dict destructor callback for sds strings: the string is released with
 * sdsfree(). 'privdata' is ignored. */
static void dictSdsDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    sdsfree(val);
}
1161
942a3961 1162static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1163 const void *key2)
1164{
1165 const robj *o1 = key1, *o2 = key2;
09241813 1166 return dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
ed9b544e 1167}
1168
942a3961 1169static unsigned int dictObjHash(const void *key) {
ed9b544e 1170 const robj *o = key;
1171 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1172}
1173
/* Dict hash callback for sds string keys: the hash is computed over the
 * whole string, using the length reported by sdslen(). */
static unsigned int dictSdsHash(const void *key) {
    size_t len = sdslen((char*)key);

    return dictGenHashFunction((unsigned char*)key, len);
}
1177
942a3961 1178static int dictEncObjKeyCompare(void *privdata, const void *key1,
1179 const void *key2)
1180{
9d65a1bb 1181 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1182 int cmp;
942a3961 1183
2a1198b4 1184 if (o1->encoding == REDIS_ENCODING_INT &&
dc05abde 1185 o2->encoding == REDIS_ENCODING_INT)
1186 return o1->ptr == o2->ptr;
2a1198b4 1187
9d65a1bb 1188 o1 = getDecodedObject(o1);
1189 o2 = getDecodedObject(o2);
09241813 1190 cmp = dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
9d65a1bb 1191 decrRefCount(o1);
1192 decrRefCount(o2);
1193 return cmp;
942a3961 1194}
1195
1196static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1197 robj *o = (robj*) key;
942a3961 1198
ed9e4966 1199 if (o->encoding == REDIS_ENCODING_RAW) {
1200 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1201 } else {
1202 if (o->encoding == REDIS_ENCODING_INT) {
1203 char buf[32];
1204 int len;
1205
ee14da56 1206 len = ll2string(buf,32,(long)o->ptr);
ed9e4966 1207 return dictGenHashFunction((unsigned char*)buf, len);
1208 } else {
1209 unsigned int hash;
1210
1211 o = getDecodedObject(o);
1212 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1213 decrRefCount(o);
1214 return hash;
1215 }
1216 }
942a3961 1217}
1218
09241813 1219/* Sets type: keys are (possibly encoded) Redis objects, released with
 * decrRefCount() by the key destructor. No value destructor is installed. */
ed9b544e 1220static dictType setDictType = {
942a3961 1221 dictEncObjHash, /* hash function */
ed9b544e 1222 NULL, /* key dup */
1223 NULL, /* val dup */
942a3961 1224 dictEncObjKeyCompare, /* key compare */
ed9b544e 1225 dictRedisObjectDestructor, /* key destructor */
1226 NULL /* val destructor */
1227};
1228
f2d9f50f 1229/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are (possibly encoded) Redis objects; every value is a separately
 * allocated block that the value destructor passes to zfree(). */
1812e024 1230static dictType zsetDictType = {
1231 dictEncObjHash, /* hash function */
1232 NULL, /* key dup */
1233 NULL, /* val dup */
1234 dictEncObjKeyCompare, /* key compare */
1235 dictRedisObjectDestructor, /* key destructor */
da0a1620 1236 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1237};
1238
09241813 1239/* Db->dict, keys are sds strings, vals are Redis objects.
 * Keys are released with sdsfree(), values with decrRefCount() (a NULL
 * value, used for swapped out keys, is skipped by the destructor). */
5234952b 1240static dictType dbDictType = {
09241813 1241 dictSdsHash, /* hash function */
ed9b544e 1242 NULL, /* key dup */
1243 NULL, /* val dup */
09241813 1244 dictSdsKeyCompare, /* key compare */
1245 dictSdsDestructor, /* key destructor */
ed9b544e 1246 dictRedisObjectDestructor /* val destructor */
1247};
1248
f2d9f50f 1249/* Db->expires: keys are sds strings released with sdsfree(); values have no
 * destructor, so whatever is stored there is not freed by the table. */
1250static dictType keyptrDictType = {
09241813 1251 dictSdsHash, /* hash function */
f2d9f50f 1252 NULL, /* key dup */
1253 NULL, /* val dup */
09241813 1254 dictSdsKeyCompare, /* key compare */
1255 dictSdsDestructor, /* key destructor */
f2d9f50f 1256 NULL /* val destructor */
1257};
1258
5234952b 1259/* Hash type hash table (note that small hashes are represented with
 * zipmaps). Both fields and values are (possibly encoded) Redis objects,
 * released with decrRefCount(). */
1260static dictType hashDictType = {
1261 dictEncObjHash, /* hash function */
1262 NULL, /* key dup */
1263 NULL, /* val dup */
1264 dictEncObjKeyCompare, /* key compare */
1265 dictRedisObjectDestructor, /* key destructor */
1266 dictRedisObjectDestructor /* val destructor */
1267};
1268
4409877e 1269/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1270 * lists as values. It's used for blocking operations (BLPOP) and to
1271 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * Keys are released with decrRefCount(), values with listRelease(). */
4409877e 1272static dictType keylistDictType = {
1273 dictObjHash, /* hash function */
1274 NULL, /* key dup */
1275 NULL, /* val dup */
1276 dictObjKeyCompare, /* key compare */
1277 dictRedisObjectDestructor, /* key destructor */
1278 dictListDestructor /* val destructor */
1279};
1280
42ab0172
AO
1281static void version();
1282
ed9b544e 1283/* ========================= Random utility functions ======================= */
1284
1285/* Redis generally does not try to recover from out of memory conditions
1286 * when allocating objects or strings, it is not clear if it will be possible
1287 * to report this condition to the client since the networking layer itself
1288 * is based on heap allocation for send buffers, so we simply abort.
1289 * At least the code will be simpler to read... */
1290static void oom(const char *msg) {
71c54b21 1291 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1292 sleep(1);
1293 abort();
1294}
1295
1296/* ====================== Redis server networking stuff ===================== */
56906eef 1297static void closeTimedoutClients(void) {
ed9b544e 1298 redisClient *c;
ed9b544e 1299 listNode *ln;
1300 time_t now = time(NULL);
c7df85a4 1301 listIter li;
ed9b544e 1302
c7df85a4 1303 listRewind(server.clients,&li);
1304 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1305 c = listNodeValue(ln);
f86a74e9 1306 if (server.maxidletime &&
1307 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1308 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1309 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1310 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1311 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1312 {
f870935d 1313 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1314 freeClient(c);
f86a74e9 1315 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1316 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1317 addReply(c,shared.nullmultibulk);
b0d8747d 1318 unblockClientWaitingData(c);
f86a74e9 1319 }
ed9b544e 1320 }
1321 }
ed9b544e 1322}
1323
12fea928 1324static int htNeedsResize(dict *dict) {
1325 long long size, used;
1326
1327 size = dictSlots(dict);
1328 used = dictSize(dict);
1329 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1330 (used*100/size < REDIS_HT_MINFILL));
1331}
1332
0bc03378 1333/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1334 * we resize the hash table to save memory */
56906eef 1335static void tryResizeHashTables(void) {
0bc03378 1336 int j;
1337
1338 for (j = 0; j < server.dbnum; j++) {
5413c40d 1339 if (htNeedsResize(server.db[j].dict))
0bc03378 1340 dictResize(server.db[j].dict);
12fea928 1341 if (htNeedsResize(server.db[j].expires))
1342 dictResize(server.db[j].expires);
0bc03378 1343 }
1344}
1345
8ca3e9d1 1346/* Our hash table implementation performs rehashing incrementally while
1347 * we write/read from the hash table. Still if the server is idle, the hash
1348 * table will use two tables for a long time. So we try to use 1 millisecond
1349 * of CPU time at every serverCron() loop in order to rehash some key. */
1350static void incrementallyRehash(void) {
1351 int j;
1352
1353 for (j = 0; j < server.dbnum; j++) {
1354 if (dictIsRehashing(server.db[j].dict)) {
1355 dictRehashMilliseconds(server.db[j].dict,1);
1356 break; /* already used our millisecond for this loop... */
1357 }
1358 }
1359}
1360
9d65a1bb 1361/* A background saving child (BGSAVE) terminated its work. Handle this. */
1362void backgroundSaveDoneHandler(int statloc) {
1363 int exitcode = WEXITSTATUS(statloc);
1364 int bysignal = WIFSIGNALED(statloc);
1365
1366 if (!bysignal && exitcode == 0) {
1367 redisLog(REDIS_NOTICE,
1368 "Background saving terminated with success");
1369 server.dirty = 0;
1370 server.lastsave = time(NULL);
1371 } else if (!bysignal && exitcode != 0) {
1372 redisLog(REDIS_WARNING, "Background saving error");
1373 } else {
1374 redisLog(REDIS_WARNING,
454eea7c 1375 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1376 rdbRemoveTempFile(server.bgsavechildpid);
1377 }
1378 server.bgsavechildpid = -1;
1379 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1380 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1381 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1382}
1383
1384/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1385 * Handle this. */
1386void backgroundRewriteDoneHandler(int statloc) {
1387 int exitcode = WEXITSTATUS(statloc);
1388 int bysignal = WIFSIGNALED(statloc);
1389
1390 if (!bysignal && exitcode == 0) {
1391 int fd;
1392 char tmpfile[256];
1393
1394 redisLog(REDIS_NOTICE,
1395 "Background append only file rewriting terminated with success");
1396 /* Now it's time to flush the differences accumulated by the parent */
1397 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1398 fd = open(tmpfile,O_WRONLY|O_APPEND);
1399 if (fd == -1) {
1400 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1401 goto cleanup;
1402 }
1403 /* Flush our data... */
1404 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1405 (signed) sdslen(server.bgrewritebuf)) {
1406 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1407 close(fd);
1408 goto cleanup;
1409 }
b32627cd 1410 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1411 /* Now our work is to rename the temp file into the stable file. And
1412 * switch the file descriptor used by the server for append only. */
1413 if (rename(tmpfile,server.appendfilename) == -1) {
1414 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1415 close(fd);
1416 goto cleanup;
1417 }
1418 /* Mission completed... almost */
1419 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1420 if (server.appendfd != -1) {
1421 /* If append only is actually enabled... */
1422 close(server.appendfd);
1423 server.appendfd = fd;
d5d23dab 1424 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
85a83172 1425 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1426 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1427 } else {
1428 /* If append only is disabled we just generate a dump in this
1429 * format. Why not? */
1430 close(fd);
1431 }
1432 } else if (!bysignal && exitcode != 0) {
1433 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1434 } else {
1435 redisLog(REDIS_WARNING,
454eea7c 1436 "Background append only file rewriting terminated by signal %d",
1437 WTERMSIG(statloc));
9d65a1bb 1438 }
1439cleanup:
1440 sdsfree(server.bgrewritebuf);
1441 server.bgrewritebuf = sdsempty();
1442 aofRemoveTempFile(server.bgrewritechildpid);
1443 server.bgrewritechildpid = -1;
1444}
1445
884d4b39 1446/* This function is called once a background process of some kind terminates,
1447 * as we want to avoid resizing the hash tables when there is a child in order
1448 * to play well with copy-on-write (otherwise when a resize happens lots of
1449 * memory pages are copied). The goal of this function is to update the ability
1450 * for dict.c to resize the hash tables accordingly to the fact we have o not
1451 * running childs. */
1452static void updateDictResizePolicy(void) {
1453 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1454 dictEnableResize();
1455 else
1456 dictDisableResize();
1457}
1458
56906eef 1459static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1460 int j, loops = server.cronloops++;
ed9b544e 1461 REDIS_NOTUSED(eventLoop);
1462 REDIS_NOTUSED(id);
1463 REDIS_NOTUSED(clientData);
1464
3a66edc7 1465 /* We take a cached value of the unix time in the global state because
1466 * with virtual memory and aging there is to store the current time
1467 * in objects at every object access, and accuracy is not needed.
1468 * To access a global var is faster than calling time(NULL) */
1469 server.unixtime = time(NULL);
560db612 1470 /* We have just 21 bits per object for LRU information.
1471 * So we use an (eventually wrapping) LRU clock with minutes resolution.
1472 *
1473 * When we need to select what object to swap, we compute the minimum
1474 * time distance between the current lruclock and the object last access
1475 * lruclock info. Even if clocks will wrap on overflow, there is
1476 * the interesting property that we are sure that at least
1477 * ABS(A-B) minutes passed between current time and timestamp B.
1478 *
1479 * This is not precise but we don't need at all precision, but just
1480 * something statistically reasonable.
1481 */
1482 server.lruclock = (time(NULL)/60)&((1<<21)-1);
3a66edc7 1483
fab43727 1484 /* We received a SIGTERM, shutting down here in a safe way, as it is
1485 * not ok doing so inside the signal handler. */
1486 if (server.shutdown_asap) {
1487 if (prepareForShutdown() == REDIS_OK) exit(0);
1488 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1489 }
1490
0bc03378 1491 /* Show some info about non-empty databases */
ed9b544e 1492 for (j = 0; j < server.dbnum; j++) {
dec423d9 1493 long long size, used, vkeys;
94754ccc 1494
3305306f 1495 size = dictSlots(server.db[j].dict);
1496 used = dictSize(server.db[j].dict);
94754ccc 1497 vkeys = dictSize(server.db[j].expires);
1763929f 1498 if (!(loops % 50) && (used || vkeys)) {
f870935d 1499 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1500 /* dictPrintStats(server.dict); */
ed9b544e 1501 }
ed9b544e 1502 }
1503
0bc03378 1504 /* We don't want to resize the hash tables while a bacground saving
1505 * is in progress: the saving child is created using fork() that is
1506 * implemented with a copy-on-write semantic in most modern systems, so
1507 * if we resize the HT while there is the saving child at work actually
1508 * a lot of memory movements in the parent will cause a lot of pages
1509 * copied. */
8ca3e9d1 1510 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1511 if (!(loops % 10)) tryResizeHashTables();
1512 if (server.activerehashing) incrementallyRehash();
884d4b39 1513 }
0bc03378 1514
ed9b544e 1515 /* Show information about connected clients */
1763929f 1516 if (!(loops % 50)) {
bdcb92f2 1517 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1518 listLength(server.clients)-listLength(server.slaves),
1519 listLength(server.slaves),
bdcb92f2 1520 zmalloc_used_memory());
ed9b544e 1521 }
1522
1523 /* Close connections of timedout clients */
1763929f 1524 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1525 closeTimedoutClients();
1526
9d65a1bb 1527 /* Check if a background saving or AOF rewrite in progress terminated */
1528 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1529 int statloc;
9d65a1bb 1530 pid_t pid;
1531
1532 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1533 if (pid == server.bgsavechildpid) {
1534 backgroundSaveDoneHandler(statloc);
ed9b544e 1535 } else {
9d65a1bb 1536 backgroundRewriteDoneHandler(statloc);
ed9b544e 1537 }
884d4b39 1538 updateDictResizePolicy();
ed9b544e 1539 }
1540 } else {
1541 /* If there is not a background saving in progress check if
1542 * we have to save now */
1543 time_t now = time(NULL);
1544 for (j = 0; j < server.saveparamslen; j++) {
1545 struct saveparam *sp = server.saveparams+j;
1546
1547 if (server.dirty >= sp->changes &&
1548 now-server.lastsave > sp->seconds) {
1549 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1550 sp->changes, sp->seconds);
f78fd11b 1551 rdbSaveBackground(server.dbfilename);
ed9b544e 1552 break;
1553 }
1554 }
1555 }
94754ccc 1556
f2324293 1557 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1558 * will use few CPU cycles if there are few expiring keys, otherwise
1559 * it will get more aggressive to avoid that too much memory is used by
1560 * keys that can be removed from the keyspace. */
94754ccc 1561 for (j = 0; j < server.dbnum; j++) {
f2324293 1562 int expired;
94754ccc 1563 redisDb *db = server.db+j;
94754ccc 1564
f2324293 1565 /* Continue to expire if at the end of the cycle more than 25%
1566 * of the keys were expired. */
1567 do {
4ef8de8a 1568 long num = dictSize(db->expires);
94754ccc 1569 time_t now = time(NULL);
1570
f2324293 1571 expired = 0;
94754ccc 1572 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1573 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1574 while (num--) {
1575 dictEntry *de;
1576 time_t t;
1577
1578 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1579 t = (time_t) dictGetEntryVal(de);
1580 if (now > t) {
09241813 1581 sds key = dictGetEntryKey(de);
1582 robj *keyobj = createStringObject(key,sdslen(key));
1583
1584 dbDelete(db,keyobj);
1585 decrRefCount(keyobj);
f2324293 1586 expired++;
2a6a2ed1 1587 server.stat_expiredkeys++;
94754ccc 1588 }
1589 }
f2324293 1590 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1591 }
1592
4ef8de8a 1593 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1594 * is enbled. Try to free objects from the free list first. */
7e69548d 1595 if (vmCanSwapOut()) {
1596 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1597 server.vm_max_memory)
1598 {
72e9fd40 1599 int retval;
1600
a5819310 1601 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1602 retval = (server.vm_max_threads == 0) ?
1603 vmSwapOneObjectBlocking() :
1604 vmSwapOneObjectThreaded();
1763929f 1605 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1606 zmalloc_used_memory() >
1607 (server.vm_max_memory+server.vm_max_memory/10))
1608 {
1609 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1610 }
72e9fd40 1611 /* Note that when using threade I/O we free just one object,
1612 * because anyway when the I/O thread in charge to swap this
1613 * object out will finish, the handler of completed jobs
1614 * will try to swap more objects if we are still out of memory. */
1615 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1616 }
1617 }
1618
ed9b544e 1619 /* Check if we should connect to a MASTER */
1763929f 1620 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1621 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1622 if (syncWithMaster() == REDIS_OK) {
1623 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
8f63ddca 1624 if (server.appendonly) rewriteAppendOnlyFileBackground();
ed9b544e 1625 }
1626 }
1763929f 1627 return 100;
ed9b544e 1628}
1629
d5d55fc3 1630/* This function gets called every time Redis is entering the
1631 * main loop of the event driven library, that is, before to sleep
1632 * for ready file descriptors. */
1633static void beforeSleep(struct aeEventLoop *eventLoop) {
1634 REDIS_NOTUSED(eventLoop);
1635
28ed1f33 1636 /* Awake clients that got all the swapped keys they requested */
d5d55fc3 1637 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1638 listIter li;
1639 listNode *ln;
1640
1641 listRewind(server.io_ready_clients,&li);
1642 while((ln = listNext(&li))) {
1643 redisClient *c = ln->value;
1644 struct redisCommand *cmd;
1645
1646 /* Resume the client. */
1647 listDelNode(server.io_ready_clients,ln);
1648 c->flags &= (~REDIS_IO_WAIT);
1649 server.vm_blocked_clients--;
1650 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1651 readQueryFromClient, c);
1652 cmd = lookupCommand(c->argv[0]->ptr);
1653 assert(cmd != NULL);
1654 call(c,cmd);
1655 resetClient(c);
1656 /* There may be more data to process in the input buffer. */
1657 if (c->querybuf && sdslen(c->querybuf) > 0)
1658 processInputBuffer(c);
1659 }
1660 }
28ed1f33 1661 /* Write the AOF buffer on disk */
1662 flushAppendOnlyFile();
d5d55fc3 1663}
1664
ed9b544e 1665static void createSharedObjects(void) {
05df7621 1666 int j;
1667
ed9b544e 1668 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1669 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1670 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1671 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1672 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1673 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1674 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1675 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1676 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1677 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1678 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1679 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1680 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1681 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1682 "-ERR no such key\r\n"));
ed9b544e 1683 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1684 "-ERR syntax error\r\n"));
c937aa89 1685 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1686 "-ERR source and destination objects are the same\r\n"));
1687 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1688 "-ERR index out of range\r\n"));
ed9b544e 1689 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1690 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1691 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1692 shared.select0 = createStringObject("select 0\r\n",10);
1693 shared.select1 = createStringObject("select 1\r\n",10);
1694 shared.select2 = createStringObject("select 2\r\n",10);
1695 shared.select3 = createStringObject("select 3\r\n",10);
1696 shared.select4 = createStringObject("select 4\r\n",10);
1697 shared.select5 = createStringObject("select 5\r\n",10);
1698 shared.select6 = createStringObject("select 6\r\n",10);
1699 shared.select7 = createStringObject("select 7\r\n",10);
1700 shared.select8 = createStringObject("select 8\r\n",10);
1701 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1702 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
c8d0ea0e 1703 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
befec3cd 1704 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1705 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1706 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1707 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1708 shared.mbulk3 = createStringObject("*3\r\n",4);
c8d0ea0e 1709 shared.mbulk4 = createStringObject("*4\r\n",4);
05df7621 1710 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1711 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1712 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1713 }
ed9b544e 1714}
1715
1716static void appendServerSaveParams(time_t seconds, int changes) {
1717 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1718 server.saveparams[server.saveparamslen].seconds = seconds;
1719 server.saveparams[server.saveparamslen].changes = changes;
1720 server.saveparamslen++;
1721}
1722
bcfc686d 1723static void resetServerSaveParams() {
ed9b544e 1724 zfree(server.saveparams);
1725 server.saveparams = NULL;
1726 server.saveparamslen = 0;
1727}
1728
1729static void initServerConfig() {
1730 server.dbnum = REDIS_DEFAULT_DBNUM;
1731 server.port = REDIS_SERVERPORT;
f870935d 1732 server.verbosity = REDIS_VERBOSE;
ed9b544e 1733 server.maxidletime = REDIS_MAXIDLETIME;
1734 server.saveparams = NULL;
1735 server.logfile = NULL; /* NULL = log on standard output */
1736 server.bindaddr = NULL;
1737 server.glueoutputbuf = 1;
1738 server.daemonize = 0;
44b38ef4 1739 server.appendonly = 0;
1b677732 1740 server.appendfsync = APPENDFSYNC_EVERYSEC;
38db9171 1741 server.no_appendfsync_on_rewrite = 0;
48f0308a 1742 server.lastfsync = time(NULL);
44b38ef4 1743 server.appendfd = -1;
1744 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1745 server.pidfile = zstrdup("/var/run/redis.pid");
1746 server.dbfilename = zstrdup("dump.rdb");
1747 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1748 server.requirepass = NULL;
b0553789 1749 server.rdbcompression = 1;
8ca3e9d1 1750 server.activerehashing = 1;
285add55 1751 server.maxclients = 0;
d5d55fc3 1752 server.blpop_blocked_clients = 0;
3fd78bcd 1753 server.maxmemory = 0;
75680a3c 1754 server.vm_enabled = 0;
054e426d 1755 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1756 server.vm_page_size = 256; /* 256 bytes per page */
1757 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1758 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1759 server.vm_max_threads = 4;
d5d55fc3 1760 server.vm_blocked_clients = 0;
cbba7dd7 1761 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1762 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
d0686e07
PN
1763 server.list_max_ziplist_entries = REDIS_LIST_MAX_ZIPLIST_ENTRIES;
1764 server.list_max_ziplist_value = REDIS_LIST_MAX_ZIPLIST_VALUE;
fab43727 1765 server.shutdown_asap = 0;
75680a3c 1766
bcfc686d 1767 resetServerSaveParams();
ed9b544e 1768
1769 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1770 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1771 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1772 /* Replication related */
1773 server.isslave = 0;
d0ccebcf 1774 server.masterauth = NULL;
ed9b544e 1775 server.masterhost = NULL;
1776 server.masterport = 6379;
1777 server.master = NULL;
1778 server.replstate = REDIS_REPL_NONE;
a7866db6 1779
1780 /* Double constants initialization */
1781 R_Zero = 0.0;
1782 R_PosInf = 1.0/R_Zero;
1783 R_NegInf = -1.0/R_Zero;
1784 R_Nan = R_Zero/R_Zero;
ed9b544e 1785}
1786
1787static void initServer() {
1788 int j;
1789
1790 signal(SIGHUP, SIG_IGN);
1791 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1792 setupSigSegvAction();
ed9b544e 1793
b9bc0eef 1794 server.devnull = fopen("/dev/null","w");
1795 if (server.devnull == NULL) {
1796 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1797 exit(1);
1798 }
ed9b544e 1799 server.clients = listCreate();
1800 server.slaves = listCreate();
87eca727 1801 server.monitors = listCreate();
ed9b544e 1802 server.objfreelist = listCreate();
1803 createSharedObjects();
1804 server.el = aeCreateEventLoop();
3305306f 1805 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1806 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1807 if (server.fd == -1) {
1808 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1809 exit(1);
1810 }
3305306f 1811 for (j = 0; j < server.dbnum; j++) {
5234952b 1812 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1813 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
37ab76c9 1814 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1815 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1816 if (server.vm_enabled)
1817 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1818 server.db[j].id = j;
1819 }
ffc6b7f8 1820 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1821 server.pubsub_patterns = listCreate();
1822 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1823 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1824 server.cronloops = 0;
9f3c422c 1825 server.bgsavechildpid = -1;
9d65a1bb 1826 server.bgrewritechildpid = -1;
1827 server.bgrewritebuf = sdsempty();
28ed1f33 1828 server.aofbuf = sdsempty();
ed9b544e 1829 server.lastsave = time(NULL);
1830 server.dirty = 0;
ed9b544e 1831 server.stat_numcommands = 0;
1832 server.stat_numconnections = 0;
2a6a2ed1 1833 server.stat_expiredkeys = 0;
ed9b544e 1834 server.stat_starttime = time(NULL);
3a66edc7 1835 server.unixtime = time(NULL);
d8f8b666 1836 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1837 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1838 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1839
1840 if (server.appendonly) {
3bb225d6 1841 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1842 if (server.appendfd == -1) {
1843 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1844 strerror(errno));
1845 exit(1);
1846 }
1847 }
75680a3c 1848
1849 if (server.vm_enabled) vmInit();
ed9b544e 1850}
1851
1852/* Empty the whole database */
ca37e9cd 1853static long long emptyDb() {
ed9b544e 1854 int j;
ca37e9cd 1855 long long removed = 0;
ed9b544e 1856
3305306f 1857 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1858 removed += dictSize(server.db[j].dict);
3305306f 1859 dictEmpty(server.db[j].dict);
1860 dictEmpty(server.db[j].expires);
1861 }
ca37e9cd 1862 return removed;
ed9b544e 1863}
1864
/* Convert a "yes" / "no" string (compared ignoring case) into an integer.
 * Returns 1 for "yes", 0 for "no" and -1 for any other string. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1870
ed9b544e 1871/* I agree, this is a very rudimental way to load a configuration...
1872 will improve later if the config gets more complex */
1873static void loadServerConfig(char *filename) {
c9a111ac 1874 FILE *fp;
ed9b544e 1875 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1876 int linenum = 0;
1877 sds line = NULL;
c9a111ac 1878
1879 if (filename[0] == '-' && filename[1] == '\0')
1880 fp = stdin;
1881 else {
1882 if ((fp = fopen(filename,"r")) == NULL) {
9a22de82 1883 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
c9a111ac 1884 exit(1);
1885 }
ed9b544e 1886 }
c9a111ac 1887
ed9b544e 1888 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1889 sds *argv;
1890 int argc, j;
1891
1892 linenum++;
1893 line = sdsnew(buf);
1894 line = sdstrim(line," \t\r\n");
1895
1896 /* Skip comments and blank lines*/
1897 if (line[0] == '#' || line[0] == '\0') {
1898 sdsfree(line);
1899 continue;
1900 }
1901
1902 /* Split into arguments */
1903 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1904 sdstolower(argv[0]);
1905
1906 /* Execute config directives */
bb0b03a3 1907 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1908 server.maxidletime = atoi(argv[1]);
0150db36 1909 if (server.maxidletime < 0) {
ed9b544e 1910 err = "Invalid timeout value"; goto loaderr;
1911 }
bb0b03a3 1912 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1913 server.port = atoi(argv[1]);
1914 if (server.port < 1 || server.port > 65535) {
1915 err = "Invalid port"; goto loaderr;
1916 }
bb0b03a3 1917 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1918 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1919 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1920 int seconds = atoi(argv[1]);
1921 int changes = atoi(argv[2]);
1922 if (seconds < 1 || changes < 0) {
1923 err = "Invalid save parameters"; goto loaderr;
1924 }
1925 appendServerSaveParams(seconds,changes);
bb0b03a3 1926 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1927 if (chdir(argv[1]) == -1) {
1928 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1929 argv[1], strerror(errno));
1930 exit(1);
1931 }
bb0b03a3 1932 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1933 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1934 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1935 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1936 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1937 else {
1938 err = "Invalid log level. Must be one of debug, notice, warning";
1939 goto loaderr;
1940 }
bb0b03a3 1941 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1942 FILE *logfp;
ed9b544e 1943
1944 server.logfile = zstrdup(argv[1]);
bb0b03a3 1945 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1946 zfree(server.logfile);
1947 server.logfile = NULL;
1948 }
1949 if (server.logfile) {
1950 /* Test if we are able to open the file. The server will not
1951 * be able to abort just for this problem later... */
c9a111ac 1952 logfp = fopen(server.logfile,"a");
1953 if (logfp == NULL) {
ed9b544e 1954 err = sdscatprintf(sdsempty(),
1955 "Can't open the log file: %s", strerror(errno));
1956 goto loaderr;
1957 }
c9a111ac 1958 fclose(logfp);
ed9b544e 1959 }
bb0b03a3 1960 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1961 server.dbnum = atoi(argv[1]);
1962 if (server.dbnum < 1) {
1963 err = "Invalid number of databases"; goto loaderr;
1964 }
b3f83f12
JZ
1965 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1966 loadServerConfig(argv[1]);
285add55 1967 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1968 server.maxclients = atoi(argv[1]);
3fd78bcd 1969 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
2b619329 1970 server.maxmemory = memtoll(argv[1],NULL);
bb0b03a3 1971 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1972 server.masterhost = sdsnew(argv[1]);
1973 server.masterport = atoi(argv[2]);
1974 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1975 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1976 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1977 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1978 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1979 err = "argument must be 'yes' or 'no'"; goto loaderr;
1980 }
121f70cf 1981 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1982 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
8ca3e9d1 1983 err = "argument must be 'yes' or 'no'"; goto loaderr;
1984 }
1985 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1986 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
121f70cf 1987 err = "argument must be 'yes' or 'no'"; goto loaderr;
1988 }
bb0b03a3 1989 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1990 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1991 err = "argument must be 'yes' or 'no'"; goto loaderr;
1992 }
44b38ef4 1993 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1994 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1995 err = "argument must be 'yes' or 'no'"; goto loaderr;
1996 }
f3b52411
PN
1997 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1998 zfree(server.appendfilename);
1999 server.appendfilename = zstrdup(argv[1]);
38db9171 2000 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
2001 && argc == 2) {
2002 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
2003 err = "argument must be 'yes' or 'no'"; goto loaderr;
2004 }
48f0308a 2005 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 2006 if (!strcasecmp(argv[1],"no")) {
48f0308a 2007 server.appendfsync = APPENDFSYNC_NO;
1766c6da 2008 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 2009 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 2010 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 2011 server.appendfsync = APPENDFSYNC_EVERYSEC;
2012 } else {
2013 err = "argument must be 'no', 'always' or 'everysec'";
2014 goto loaderr;
2015 }
bb0b03a3 2016 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 2017 server.requirepass = zstrdup(argv[1]);
bb0b03a3 2018 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 2019 zfree(server.pidfile);
054e426d 2020 server.pidfile = zstrdup(argv[1]);
bb0b03a3 2021 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 2022 zfree(server.dbfilename);
054e426d 2023 server.dbfilename = zstrdup(argv[1]);
75680a3c 2024 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
2025 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
2026 err = "argument must be 'yes' or 'no'"; goto loaderr;
2027 }
054e426d 2028 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 2029 zfree(server.vm_swap_file);
054e426d 2030 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 2031 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2b619329 2032 server.vm_max_memory = memtoll(argv[1],NULL);
4ef8de8a 2033 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2b619329 2034 server.vm_page_size = memtoll(argv[1], NULL);
4ef8de8a 2035 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2b619329 2036 server.vm_pages = memtoll(argv[1], NULL);
92f8e882 2037 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
2038 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 2039 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2b619329 2040 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
cbba7dd7 2041 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2b619329 2042 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
d0686e07
PN
2043 } else if (!strcasecmp(argv[0],"list-max-ziplist-entries") && argc == 2){
2044 server.list_max_ziplist_entries = memtoll(argv[1], NULL);
2045 } else if (!strcasecmp(argv[0],"list-max-ziplist-value") && argc == 2){
2046 server.list_max_ziplist_value = memtoll(argv[1], NULL);
ed9b544e 2047 } else {
2048 err = "Bad directive or wrong number of arguments"; goto loaderr;
2049 }
2050 for (j = 0; j < argc; j++)
2051 sdsfree(argv[j]);
2052 zfree(argv);
2053 sdsfree(line);
2054 }
c9a111ac 2055 if (fp != stdin) fclose(fp);
ed9b544e 2056 return;
2057
2058loaderr:
2059 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2060 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2061 fprintf(stderr, ">>> '%s'\n", line);
2062 fprintf(stderr, "%s\n", err);
2063 exit(1);
2064}
2065
2066static void freeClientArgv(redisClient *c) {
2067 int j;
2068
2069 for (j = 0; j < c->argc; j++)
2070 decrRefCount(c->argv[j]);
e8a74421 2071 for (j = 0; j < c->mbargc; j++)
2072 decrRefCount(c->mbargv[j]);
ed9b544e 2073 c->argc = 0;
e8a74421 2074 c->mbargc = 0;
ed9b544e 2075}
2076
2077static void freeClient(redisClient *c) {
2078 listNode *ln;
2079
4409877e 2080 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 2081 * call, we have to set querybuf to NULL *before* to call
2082 * unblockClientWaitingData() to avoid processInputBuffer() will get
2083 * called. Also it is important to remove the file events after
2084 * this, because this call adds the READABLE event. */
4409877e 2085 sdsfree(c->querybuf);
2086 c->querybuf = NULL;
2087 if (c->flags & REDIS_BLOCKED)
b0d8747d 2088 unblockClientWaitingData(c);
4409877e 2089
37ab76c9 2090 /* UNWATCH all the keys */
2091 unwatchAllKeys(c);
2092 listRelease(c->watched_keys);
ffc6b7f8 2093 /* Unsubscribe from all the pubsub channels */
2094 pubsubUnsubscribeAllChannels(c,0);
2095 pubsubUnsubscribeAllPatterns(c,0);
2096 dictRelease(c->pubsub_channels);
2097 listRelease(c->pubsub_patterns);
befec3cd 2098 /* Obvious cleanup */
ed9b544e 2099 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2100 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 2101 listRelease(c->reply);
2102 freeClientArgv(c);
2103 close(c->fd);
92f8e882 2104 /* Remove from the list of clients */
ed9b544e 2105 ln = listSearchKey(server.clients,c);
dfc5e96c 2106 redisAssert(ln != NULL);
ed9b544e 2107 listDelNode(server.clients,ln);
37ab76c9 2108 /* Remove from the list of clients that are now ready to be restarted
2109 * after waiting for swapped keys */
d5d55fc3 2110 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2111 ln = listSearchKey(server.io_ready_clients,c);
2112 if (ln) {
2113 listDelNode(server.io_ready_clients,ln);
2114 server.vm_blocked_clients--;
2115 }
2116 }
37ab76c9 2117 /* Remove from the list of clients waiting for swapped keys */
d5d55fc3 2118 while (server.vm_enabled && listLength(c->io_keys)) {
2119 ln = listFirst(c->io_keys);
2120 dontWaitForSwappedKey(c,ln->value);
92f8e882 2121 }
b3e3d0d7 2122 listRelease(c->io_keys);
befec3cd 2123 /* Master/slave cleanup */
ed9b544e 2124 if (c->flags & REDIS_SLAVE) {
6208b3a7 2125 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2126 close(c->repldbfd);
87eca727 2127 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2128 ln = listSearchKey(l,c);
dfc5e96c 2129 redisAssert(ln != NULL);
87eca727 2130 listDelNode(l,ln);
ed9b544e 2131 }
2132 if (c->flags & REDIS_MASTER) {
2133 server.master = NULL;
2134 server.replstate = REDIS_REPL_CONNECT;
2135 }
befec3cd 2136 /* Release memory */
93ea3759 2137 zfree(c->argv);
e8a74421 2138 zfree(c->mbargv);
6e469882 2139 freeClientMultiState(c);
ed9b544e 2140 zfree(c);
2141}
2142
cc30e368 2143#define GLUEREPLY_UP_TO (1024)
ed9b544e 2144static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 2145 int copylen = 0;
2146 char buf[GLUEREPLY_UP_TO];
6208b3a7 2147 listNode *ln;
c7df85a4 2148 listIter li;
ed9b544e 2149 robj *o;
2150
c7df85a4 2151 listRewind(c->reply,&li);
2152 while((ln = listNext(&li))) {
c28b42ac 2153 int objlen;
2154
ed9b544e 2155 o = ln->value;
c28b42ac 2156 objlen = sdslen(o->ptr);
2157 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2158 memcpy(buf+copylen,o->ptr,objlen);
2159 copylen += objlen;
ed9b544e 2160 listDelNode(c->reply,ln);
c28b42ac 2161 } else {
2162 if (copylen == 0) return;
2163 break;
ed9b544e 2164 }
ed9b544e 2165 }
c28b42ac 2166 /* Now the output buffer is empty, add the new single element */
2167 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2168 listAddNodeHead(c->reply,o);
ed9b544e 2169}
2170
2171static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2172 redisClient *c = privdata;
2173 int nwritten = 0, totwritten = 0, objlen;
2174 robj *o;
2175 REDIS_NOTUSED(el);
2176 REDIS_NOTUSED(mask);
2177
2895e862 2178 /* Use writev() if we have enough buffers to send */
7ea870c0 2179 if (!server.glueoutputbuf &&
e0a62c7f 2180 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 2181 !(c->flags & REDIS_MASTER))
2895e862 2182 {
2183 sendReplyToClientWritev(el, fd, privdata, mask);
2184 return;
2185 }
2895e862 2186
ed9b544e 2187 while(listLength(c->reply)) {
c28b42ac 2188 if (server.glueoutputbuf && listLength(c->reply) > 1)
2189 glueReplyBuffersIfNeeded(c);
2190
ed9b544e 2191 o = listNodeValue(listFirst(c->reply));
2192 objlen = sdslen(o->ptr);
2193
2194 if (objlen == 0) {
2195 listDelNode(c->reply,listFirst(c->reply));
2196 continue;
2197 }
2198
2199 if (c->flags & REDIS_MASTER) {
6f376729 2200 /* Don't reply to a master */
ed9b544e 2201 nwritten = objlen - c->sentlen;
2202 } else {
a4d1ba9a 2203 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2204 if (nwritten <= 0) break;
2205 }
2206 c->sentlen += nwritten;
2207 totwritten += nwritten;
2208 /* If we fully sent the object on head go to the next one */
2209 if (c->sentlen == objlen) {
2210 listDelNode(c->reply,listFirst(c->reply));
2211 c->sentlen = 0;
2212 }
6f376729 2213 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2214 * bytes, in a single threaded server it's a good idea to serve
6f376729 2215 * other clients as well, even if a very large request comes from
2216 * super fast link that is always able to accept data (in real world
12f9d551 2217 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2218 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2219 }
2220 if (nwritten == -1) {
2221 if (errno == EAGAIN) {
2222 nwritten = 0;
2223 } else {
f870935d 2224 redisLog(REDIS_VERBOSE,
ed9b544e 2225 "Error writing to client: %s", strerror(errno));
2226 freeClient(c);
2227 return;
2228 }
2229 }
2230 if (totwritten > 0) c->lastinteraction = time(NULL);
2231 if (listLength(c->reply) == 0) {
2232 c->sentlen = 0;
2233 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2234 }
2235}
2236
2895e862 2237static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2238{
2239 redisClient *c = privdata;
2240 int nwritten = 0, totwritten = 0, objlen, willwrite;
2241 robj *o;
2242 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2243 int offset, ion = 0;
2244 REDIS_NOTUSED(el);
2245 REDIS_NOTUSED(mask);
2246
2247 listNode *node;
2248 while (listLength(c->reply)) {
2249 offset = c->sentlen;
2250 ion = 0;
2251 willwrite = 0;
2252
2253 /* fill-in the iov[] array */
2254 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2255 o = listNodeValue(node);
2256 objlen = sdslen(o->ptr);
2257
e0a62c7f 2258 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2259 break;
2260
2261 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2262 break; /* no more iovecs */
2263
2264 iov[ion].iov_base = ((char*)o->ptr) + offset;
2265 iov[ion].iov_len = objlen - offset;
2266 willwrite += objlen - offset;
2267 offset = 0; /* just for the first item */
2268 ion++;
2269 }
2270
2271 if(willwrite == 0)
2272 break;
2273
2274 /* write all collected blocks at once */
2275 if((nwritten = writev(fd, iov, ion)) < 0) {
2276 if (errno != EAGAIN) {
f870935d 2277 redisLog(REDIS_VERBOSE,
2895e862 2278 "Error writing to client: %s", strerror(errno));
2279 freeClient(c);
2280 return;
2281 }
2282 break;
2283 }
2284
2285 totwritten += nwritten;
2286 offset = c->sentlen;
2287
2288 /* remove written robjs from c->reply */
2289 while (nwritten && listLength(c->reply)) {
2290 o = listNodeValue(listFirst(c->reply));
2291 objlen = sdslen(o->ptr);
2292
2293 if(nwritten >= objlen - offset) {
2294 listDelNode(c->reply, listFirst(c->reply));
2295 nwritten -= objlen - offset;
2296 c->sentlen = 0;
2297 } else {
2298 /* partial write */
2299 c->sentlen += nwritten;
2300 break;
2301 }
2302 offset = 0;
2303 }
2304 }
2305
e0a62c7f 2306 if (totwritten > 0)
2895e862 2307 c->lastinteraction = time(NULL);
2308
2309 if (listLength(c->reply) == 0) {
2310 c->sentlen = 0;
2311 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2312 }
2313}
2314
1a132bbc
PN
2315static int qsortRedisCommands(const void *r1, const void *r2) {
2316 return strcasecmp(
2317 ((struct redisCommand*)r1)->name,
2318 ((struct redisCommand*)r2)->name);
2319}
2320
2321static void sortCommandTable() {
1a132bbc
PN
2322 /* Copy and sort the read-only version of the command table */
2323 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2324 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
d55d5c5d 2325 qsort(commandTable,
2326 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2327 sizeof(struct redisCommand),qsortRedisCommands);
1a132bbc
PN
2328}
2329
ed9b544e 2330static struct redisCommand *lookupCommand(char *name) {
1a132bbc
PN
2331 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2332 return bsearch(
2333 &tmp,
2334 commandTable,
d55d5c5d 2335 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
1a132bbc
PN
2336 sizeof(struct redisCommand),
2337 qsortRedisCommands);
ed9b544e 2338}
2339
2340/* resetClient prepare the client to process the next command */
2341static void resetClient(redisClient *c) {
2342 freeClientArgv(c);
2343 c->bulklen = -1;
e8a74421 2344 c->multibulk = 0;
ed9b544e 2345}
2346
6e469882 2347/* Call() is the core of Redis execution of a command */
2348static void call(redisClient *c, struct redisCommand *cmd) {
2349 long long dirty;
2350
2351 dirty = server.dirty;
2352 cmd->proc(c);
4005fef1 2353 dirty = server.dirty-dirty;
2354
2355 if (server.appendonly && dirty)
6e469882 2356 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2357 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2358 listLength(server.slaves))
248ea310 2359 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2360 if (listLength(server.monitors))
dd142b9c 2361 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2362 server.stat_numcommands++;
2363}
2364
ed9b544e 2365/* If this function gets called we already read a whole
2366 * command, arguments are in the client argv/argc fields.
2367 * processCommand() execute the command or prepare the
2368 * server for a bulk read from the client.
2369 *
2370 * If 1 is returned the client is still alive and valid and
2371 * other operations can be performed by the caller. Otherwise
2372 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2373static int processCommand(redisClient *c) {
2374 struct redisCommand *cmd;
ed9b544e 2375
3fd78bcd 2376 /* Free some memory if needed (maxmemory setting) */
2377 if (server.maxmemory) freeMemoryIfNeeded();
2378
e8a74421 2379 /* Handle the multi bulk command type. This is an alternative protocol
2380 * supported by Redis in order to receive commands that are composed of
2381 * multiple binary-safe "bulk" arguments. The latency of processing is
2382 * a bit higher but this allows things like multi-sets, so if this
2383 * protocol is used only for MSET and similar commands this is a big win. */
2384 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2385 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2386 if (c->multibulk <= 0) {
2387 resetClient(c);
2388 return 1;
2389 } else {
/* Drop the "*<count>" token itself: it is not a command argument. */
2390 decrRefCount(c->argv[c->argc-1]);
2391 c->argc--;
2392 return 1;
2393 }
2394 } else if (c->multibulk) {
/* bulklen == -1 here means the "$<len>" line of the next argument is
 * expected; otherwise argv[0] holds the argument payload just read. */
2395 if (c->bulklen == -1) {
2396 if (((char*)c->argv[0]->ptr)[0] != '$') {
2397 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2398 resetClient(c);
2399 return 1;
2400 } else {
2401 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2402 decrRefCount(c->argv[0]);
2403 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2404 c->argc--;
2405 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2406 resetClient(c);
2407 return 1;
2408 }
2409 c->argc--;
2410 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2411 return 1;
2412 }
2413 } else {
/* Move the argument just read into the multi-bulk vector and count
 * down the arguments still expected. */
2414 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2415 c->mbargv[c->mbargc] = c->argv[0];
2416 c->mbargc++;
2417 c->argc--;
2418 c->multibulk--;
2419 if (c->multibulk == 0) {
2420 robj **auxargv;
2421 int auxargc;
2422
2423 /* Here we need to swap the multi-bulk argc/argv with the
2424 * normal argc/argv of the client structure. */
2425 auxargv = c->argv;
2426 c->argv = c->mbargv;
2427 c->mbargv = auxargv;
2428
2429 auxargc = c->argc;
2430 c->argc = c->mbargc;
2431 c->mbargc = auxargc;
2432
2433 /* We need to set bulklen to something different than -1
2434 * in order for the code below to process the command without
2435 * to try to read the last argument of a bulk command as
2436 * a special argument. */
2437 c->bulklen = 0;
2438 /* continue below and process the command */
2439 } else {
2440 c->bulklen = -1;
2441 return 1;
2442 }
2443 }
2444 }
2445 /* -- end of multi bulk commands processing -- */
2446
ed9b544e 2447 /* The QUIT command is handled as a special case. Normal command
2448 * procs are unable to close the client connection safely */
bb0b03a3 2449 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2450 freeClient(c);
2451 return 0;
2452 }
d5d55fc3 2453
2454 /* Now lookup the command and check ASAP about trivial error conditions
2455 * such wrong arity, bad command name and so forth. */
ed9b544e 2456 cmd = lookupCommand(c->argv[0]->ptr);
2457 if (!cmd) {
2c14807b 2458 addReplySds(c,
2459 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2460 (char*)c->argv[0]->ptr));
ed9b544e 2461 resetClient(c);
2462 return 1;
/* A positive arity must match argc exactly; a negative arity -N is
 * rejected only when fewer than N arguments were given. */
2463 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2464 (c->argc < -cmd->arity)) {
454d4e43 2465 addReplySds(c,
2466 sdscatprintf(sdsempty(),
2467 "-ERR wrong number of arguments for '%s' command\r\n",
2468 cmd->name));
ed9b544e 2469 resetClient(c);
2470 return 1;
2471 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2472 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2473 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2474
2475 decrRefCount(c->argv[c->argc-1]);
2476 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2477 c->argc--;
2478 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2479 resetClient(c);
2480 return 1;
2481 }
2482 c->argc--;
2483 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2484 /* It is possible that the bulk read is already in the
8d0490e7 2485 * buffer. Check this condition and handle it accordingly.
2486 * This is just a fast path, alternative to call processInputBuffer().
2487 * It's a good idea since the code is small and this condition
2488 * happens most of the times. */
ed9b544e 2489 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2490 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2491 c->argc++;
2492 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2493 } else {
d5d55fc3 2494 /* Otherwise return... there is to read the last argument
2495 * from the socket. */
ed9b544e 2496 return 1;
2497 }
2498 }
942a3961 2499 /* Let's try to encode the bulk object to save space. */
2500 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2501 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2502
e63943a4 2503 /* Check if the user is authenticated */
2504 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2505 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2506 resetClient(c);
2507 return 1;
2508 }
2509
b61a28fe 2510 /* Handle the maxmemory directive */
2511 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2512 zmalloc_used_memory() > server.maxmemory)
2513 {
2514 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2515 resetClient(c);
2516 return 1;
2517 }
2518
d6cc8867 2519 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
e6cca5db 2520 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2521 &&
ffc6b7f8 2522 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2523 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2524 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2525 resetClient(c);
2526 return 1;
2527 }
2528
ed9b544e 2529 /* Exec the command */
/* Inside MULTI every command except EXEC, DISCARD, MULTI and WATCH is
 * queued instead of being executed. */
6531c94d 2530 if (c->flags & REDIS_MULTI &&
2531 cmd->proc != execCommand && cmd->proc != discardCommand &&
2532 cmd->proc != multiCommand && cmd->proc != watchCommand)
2533 {
6e469882 2534 queueMultiCommand(c,cmd);
2535 addReply(c,shared.queued);
2536 } else {
/* When blockClientOnSwappedKeys() returns non-zero we return without
 * calling the command and without resetting the client. */
d5d55fc3 2537 if (server.vm_enabled && server.vm_max_threads > 0 &&
0a6f3f0f 2538 blockClientOnSwappedKeys(c,cmd)) return 1;
6e469882 2539 call(c,cmd);
2540 }
ed9b544e 2541
2542 /* Prepare the client for the next command */
ed9b544e 2543 resetClient(c);
2544 return 1;
2545}
2546
248ea310 2547static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2548 listNode *ln;
c7df85a4 2549 listIter li;
ed9b544e 2550 int outc = 0, j;
93ea3759 2551 robj **outv;
248ea310 2552 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2553 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2554 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2555 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2556 robj *lenobj;
93ea3759 2557
2558 if (argc <= REDIS_STATIC_ARGS) {
2559 outv = static_outv;
2560 } else {
248ea310 2561 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2562 }
248ea310 2563
2564 lenobj = createObject(REDIS_STRING,
2565 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2566 lenobj->refcount = 0;
2567 outv[outc++] = lenobj;
ed9b544e 2568 for (j = 0; j < argc; j++) {
248ea310 2569 lenobj = createObject(REDIS_STRING,
2570 sdscatprintf(sdsempty(),"$%lu\r\n",
2571 (unsigned long) stringObjectLen(argv[j])));
2572 lenobj->refcount = 0;
2573 outv[outc++] = lenobj;
ed9b544e 2574 outv[outc++] = argv[j];
248ea310 2575 outv[outc++] = shared.crlf;
ed9b544e 2576 }
ed9b544e 2577
40d224a9 2578 /* Increment all the refcounts at start and decrement at end in order to
2579 * be sure to free objects if there is no slave in a replication state
2580 * able to be feed with commands */
2581 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2582 listRewind(slaves,&li);
2583 while((ln = listNext(&li))) {
ed9b544e 2584 redisClient *slave = ln->value;
40d224a9 2585
2586 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2587 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2588
2589 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2590 if (slave->slaveseldb != dictid) {
2591 robj *selectcmd;
2592
2593 switch(dictid) {
2594 case 0: selectcmd = shared.select0; break;
2595 case 1: selectcmd = shared.select1; break;
2596 case 2: selectcmd = shared.select2; break;
2597 case 3: selectcmd = shared.select3; break;
2598 case 4: selectcmd = shared.select4; break;
2599 case 5: selectcmd = shared.select5; break;
2600 case 6: selectcmd = shared.select6; break;
2601 case 7: selectcmd = shared.select7; break;
2602 case 8: selectcmd = shared.select8; break;
2603 case 9: selectcmd = shared.select9; break;
2604 default:
2605 selectcmd = createObject(REDIS_STRING,
2606 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2607 selectcmd->refcount = 0;
2608 break;
2609 }
2610 addReply(slave,selectcmd);
2611 slave->slaveseldb = dictid;
2612 }
2613 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2614 }
40d224a9 2615 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2616 if (outv != static_outv) zfree(outv);
ed9b544e 2617}
2618
dd142b9c 2619static sds sdscatrepr(sds s, char *p, size_t len) {
2620 s = sdscatlen(s,"\"",1);
2621 while(len--) {
2622 switch(*p) {
2623 case '\\':
2624 case '"':
2625 s = sdscatprintf(s,"\\%c",*p);
2626 break;
2627 case '\n': s = sdscatlen(s,"\\n",1); break;
2628 case '\r': s = sdscatlen(s,"\\r",1); break;
2629 case '\t': s = sdscatlen(s,"\\t",1); break;
2630 case '\a': s = sdscatlen(s,"\\a",1); break;
2631 case '\b': s = sdscatlen(s,"\\b",1); break;
2632 default:
2633 if (isprint(*p))
2634 s = sdscatprintf(s,"%c",*p);
2635 else
2636 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2637 break;
2638 }
2639 p++;
2640 }
2641 return sdscatlen(s,"\"",1);
2642}
2643
2644static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2645 listNode *ln;
2646 listIter li;
2647 int j;
2648 sds cmdrepr = sdsnew("+");
2649 robj *cmdobj;
2650 struct timeval tv;
2651
2652 gettimeofday(&tv,NULL);
2653 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2654 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2655
2656 for (j = 0; j < argc; j++) {
2657 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2658 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2659 } else {
2660 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2661 sdslen(argv[j]->ptr));
2662 }
2663 if (j != argc-1)
2664 cmdrepr = sdscatlen(cmdrepr," ",1);
2665 }
2666 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2667 cmdobj = createObject(REDIS_STRING,cmdrepr);
2668
2669 listRewind(monitors,&li);
2670 while((ln = listNext(&li))) {
2671 redisClient *monitor = ln->value;
2672 addReply(monitor,cmdobj);
2673 }
2674 decrRefCount(cmdobj);
2675}
2676
638e42ac 2677static void processInputBuffer(redisClient *c) {
ed9b544e 2678again:
4409877e 2679 /* Before to process the input buffer, make sure the client is not
2680 * waitig for a blocking operation such as BLPOP. Note that the first
2681 * iteration the client is never blocked, otherwise the processInputBuffer
2682 * would not be called at all, but after the execution of the first commands
2683 * in the input buffer the client may be blocked, and the "goto again"
2684 * will try to reiterate. The following line will make it return asap. */
92f8e882 2685 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2686 if (c->bulklen == -1) {
2687 /* Read the first line of the query */
2688 char *p = strchr(c->querybuf,'\n');
2689 size_t querylen;
644fafa3 2690
ed9b544e 2691 if (p) {
2692 sds query, *argv;
2693 int argc, j;
e0a62c7f 2694
ed9b544e 2695 query = c->querybuf;
2696 c->querybuf = sdsempty();
2697 querylen = 1+(p-(query));
2698 if (sdslen(query) > querylen) {
2699 /* leave data after the first line of the query in the buffer */
2700 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2701 }
2702 *p = '\0'; /* remove "\n" */
2703 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2704 sdsupdatelen(query);
2705
2706 /* Now we can split the query in arguments */
ed9b544e 2707 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2708 sdsfree(query);
2709
2710 if (c->argv) zfree(c->argv);
2711 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2712
2713 for (j = 0; j < argc; j++) {
ed9b544e 2714 if (sdslen(argv[j])) {
2715 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2716 c->argc++;
2717 } else {
2718 sdsfree(argv[j]);
2719 }
2720 }
2721 zfree(argv);
7c49733c 2722 if (c->argc) {
2723 /* Execute the command. If the client is still valid
2724 * after processCommand() return and there is something
2725 * on the query buffer try to process the next command. */
2726 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2727 } else {
2728 /* Nothing to process, argc == 0. Just process the query
2729 * buffer if it's not empty or return to the caller */
2730 if (sdslen(c->querybuf)) goto again;
2731 }
ed9b544e 2732 return;
644fafa3 2733 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2734 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2735 freeClient(c);
2736 return;
2737 }
2738 } else {
2739 /* Bulk read handling. Note that if we are at this point
2740 the client already sent a command terminated with a newline,
2741 we are reading the bulk data that is actually the last
2742 argument of the command. */
2743 int qbl = sdslen(c->querybuf);
2744
2745 if (c->bulklen <= qbl) {
2746 /* Copy everything but the final CRLF as final argument */
2747 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2748 c->argc++;
2749 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2750 /* Process the command. If the client is still valid after
2751 * the processing and there is more data in the buffer
2752 * try to parse it. */
2753 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2754 return;
2755 }
2756 }
2757}
2758
638e42ac 2759static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2760 redisClient *c = (redisClient*) privdata;
2761 char buf[REDIS_IOBUF_LEN];
2762 int nread;
2763 REDIS_NOTUSED(el);
2764 REDIS_NOTUSED(mask);
2765
2766 nread = read(fd, buf, REDIS_IOBUF_LEN);
2767 if (nread == -1) {
2768 if (errno == EAGAIN) {
2769 nread = 0;
2770 } else {
f870935d 2771 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2772 freeClient(c);
2773 return;
2774 }
2775 } else if (nread == 0) {
f870935d 2776 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2777 freeClient(c);
2778 return;
2779 }
2780 if (nread) {
2781 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2782 c->lastinteraction = time(NULL);
2783 } else {
2784 return;
2785 }
168ac5c6 2786 processInputBuffer(c);
638e42ac 2787}
2788
ed9b544e 2789static int selectDb(redisClient *c, int id) {
2790 if (id < 0 || id >= server.dbnum)
2791 return REDIS_ERR;
3305306f 2792 c->db = &server.db[id];
ed9b544e 2793 return REDIS_OK;
2794}
2795
40d224a9 2796static void *dupClientReplyValue(void *o) {
2797 incrRefCount((robj*)o);
12d090d2 2798 return o;
40d224a9 2799}
2800
/* Match method for lists of objects: forwards the two values to
 * equalStringObjects() and returns its result. */
static int listMatchObjects(void *a, void *b) {
    int same = equalStringObjects(a,b);

    return same;
}
2804
ed9b544e 2805static redisClient *createClient(int fd) {
2806 redisClient *c = zmalloc(sizeof(*c));
2807
2808 anetNonBlock(NULL,fd);
2809 anetTcpNoDelay(NULL,fd);
2810 if (!c) return NULL;
2811 selectDb(c,0);
2812 c->fd = fd;
2813 c->querybuf = sdsempty();
2814 c->argc = 0;
93ea3759 2815 c->argv = NULL;
ed9b544e 2816 c->bulklen = -1;
e8a74421 2817 c->multibulk = 0;
2818 c->mbargc = 0;
2819 c->mbargv = NULL;
ed9b544e 2820 c->sentlen = 0;
2821 c->flags = 0;
2822 c->lastinteraction = time(NULL);
abcb223e 2823 c->authenticated = 0;
40d224a9 2824 c->replstate = REDIS_REPL_NONE;
6b47e12e 2825 c->reply = listCreate();
ed9b544e 2826 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2827 listSetDupMethod(c->reply,dupClientReplyValue);
37ab76c9 2828 c->blocking_keys = NULL;
2829 c->blocking_keys_num = 0;
92f8e882 2830 c->io_keys = listCreate();
87c68815 2831 c->watched_keys = listCreate();
92f8e882 2832 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2833 c->pubsub_channels = dictCreate(&setDictType,NULL);
2834 c->pubsub_patterns = listCreate();
2835 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2836 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2837 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2838 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2839 freeClient(c);
2840 return NULL;
2841 }
6b47e12e 2842 listAddNodeTail(server.clients,c);
6e469882 2843 initClientMultiState(c);
ed9b544e 2844 return c;
2845}
2846
2847static void addReply(redisClient *c, robj *obj) {
2848 if (listLength(c->reply) == 0 &&
6208b3a7 2849 (c->replstate == REDIS_REPL_NONE ||
2850 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2851 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2852 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2853
2854 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2855 obj = dupStringObject(obj);
2856 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2857 }
9d65a1bb 2858 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2859}
2860
2861static void addReplySds(redisClient *c, sds s) {
2862 robj *o = createObject(REDIS_STRING,s);
2863 addReply(c,o);
2864 decrRefCount(o);
2865}
2866
e2665397 2867static void addReplyDouble(redisClient *c, double d) {
2868 char buf[128];
2869
2870 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2871 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2872 (unsigned long) strlen(buf),buf));
e2665397 2873}
2874
aa7c2934
PN
2875static void addReplyLongLong(redisClient *c, long long ll) {
2876 char buf[128];
2877 size_t len;
2878
2879 if (ll == 0) {
2880 addReply(c,shared.czero);
2881 return;
2882 } else if (ll == 1) {
2883 addReply(c,shared.cone);
2884 return;
2885 }
482b672d 2886 buf[0] = ':';
2887 len = ll2string(buf+1,sizeof(buf)-1,ll);
2888 buf[len+1] = '\r';
2889 buf[len+2] = '\n';
2890 addReplySds(c,sdsnewlen(buf,len+3));
aa7c2934
PN
2891}
2892
92b27fe9 2893static void addReplyUlong(redisClient *c, unsigned long ul) {
2894 char buf[128];
2895 size_t len;
2896
dd88747b 2897 if (ul == 0) {
2898 addReply(c,shared.czero);
2899 return;
2900 } else if (ul == 1) {
2901 addReply(c,shared.cone);
2902 return;
2903 }
92b27fe9 2904 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2905 addReplySds(c,sdsnewlen(buf,len));
2906}
2907
942a3961 2908static void addReplyBulkLen(redisClient *c, robj *obj) {
482b672d 2909 size_t len, intlen;
2910 char buf[128];
942a3961 2911
2912 if (obj->encoding == REDIS_ENCODING_RAW) {
2913 len = sdslen(obj->ptr);
2914 } else {
2915 long n = (long)obj->ptr;
2916
e054afda 2917 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2918 len = 1;
2919 if (n < 0) {
2920 len++;
2921 n = -n;
2922 }
2923 while((n = n/10) != 0) {
2924 len++;
2925 }
2926 }
482b672d 2927 buf[0] = '$';
2928 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2929 buf[intlen+1] = '\r';
2930 buf[intlen+2] = '\n';
2931 addReplySds(c,sdsnewlen(buf,intlen+3));
942a3961 2932}
2933
dd88747b 2934static void addReplyBulk(redisClient *c, robj *obj) {
2935 addReplyBulkLen(c,obj);
2936 addReply(c,obj);
2937 addReply(c,shared.crlf);
2938}
2939
09241813 2940static void addReplyBulkSds(redisClient *c, sds s) {
2941 robj *o = createStringObject(s, sdslen(s));
2942 addReplyBulk(c,o);
2943 decrRefCount(o);
2944}
2945
500ece7c 2946/* In the CONFIG command we need to add vanilla C string as bulk replies */
2947static void addReplyBulkCString(redisClient *c, char *s) {
2948 if (s == NULL) {
2949 addReply(c,shared.nullbulk);
2950 } else {
2951 robj *o = createStringObject(s,strlen(s));
2952 addReplyBulk(c,o);
2953 decrRefCount(o);
2954 }
2955}
2956
ed9b544e 2957static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2958 int cport, cfd;
2959 char cip[128];
285add55 2960 redisClient *c;
ed9b544e 2961 REDIS_NOTUSED(el);
2962 REDIS_NOTUSED(mask);
2963 REDIS_NOTUSED(privdata);
2964
2965 cfd = anetAccept(server.neterr, fd, cip, &cport);
2966 if (cfd == AE_ERR) {
f870935d 2967 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2968 return;
2969 }
f870935d 2970 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2971 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2972 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2973 close(cfd); /* May be already closed, just ingore errors */
2974 return;
2975 }
285add55 2976 /* If maxclient directive is set and this is one client more... close the
2977 * connection. Note that we create the client instead to check before
2978 * for this condition, since now the socket is already set in nonblocking
2979 * mode and we can send an error for free using the Kernel I/O */
2980 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2981 char *err = "-ERR max number of clients reached\r\n";
2982
2983 /* That's a best effort error message, don't check write errors */
fee803ba 2984 if (write(c->fd,err,strlen(err)) == -1) {
2985 /* Nothing to do, Just to avoid the warning... */
2986 }
285add55 2987 freeClient(c);
2988 return;
2989 }
ed9b544e 2990 server.stat_numconnections++;
2991}
2992
2993/* ======================= Redis objects implementation ===================== */
2994
2995static robj *createObject(int type, void *ptr) {
2996 robj *o;
2997
a5819310 2998 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2999 if (listLength(server.objfreelist)) {
3000 listNode *head = listFirst(server.objfreelist);
3001 o = listNodeValue(head);
3002 listDelNode(server.objfreelist,head);
a5819310 3003 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3004 } else {
560db612 3005 if (server.vm_enabled)
a5819310 3006 pthread_mutex_unlock(&server.obj_freelist_mutex);
560db612 3007 o = zmalloc(sizeof(*o));
ed9b544e 3008 }
ed9b544e 3009 o->type = type;
942a3961 3010 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 3011 o->ptr = ptr;
3012 o->refcount = 1;
3a66edc7 3013 if (server.vm_enabled) {
1064ef87 3014 /* Note that this code may run in the context of an I/O thread
560db612 3015 * and accessing server.lruclock in theory is an error
1064ef87 3016 * (no locks). But in practice this is safe, and even if we read
560db612 3017 * garbage Redis will not fail. */
3018 o->lru = server.lruclock;
3a66edc7 3019 o->storage = REDIS_VM_MEMORY;
3020 }
ed9b544e 3021 return o;
3022}
3023
3024static robj *createStringObject(char *ptr, size_t len) {
3025 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
3026}
3027
3f973463
PN
3028static robj *createStringObjectFromLongLong(long long value) {
3029 robj *o;
3030 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3031 incrRefCount(shared.integers[value]);
3032 o = shared.integers[value];
3033 } else {
3f973463 3034 if (value >= LONG_MIN && value <= LONG_MAX) {
10dea8dc 3035 o = createObject(REDIS_STRING, NULL);
3f973463
PN
3036 o->encoding = REDIS_ENCODING_INT;
3037 o->ptr = (void*)((long)value);
3038 } else {
ee14da56 3039 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3f973463
PN
3040 }
3041 }
3042 return o;
3043}
3044
4ef8de8a 3045static robj *dupStringObject(robj *o) {
b9bc0eef 3046 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 3047 return createStringObject(o->ptr,sdslen(o->ptr));
3048}
3049
ed9b544e 3050static robj *createListObject(void) {
3051 list *l = listCreate();
1cd92e7f 3052 robj *o = createObject(REDIS_LIST,l);
ed9b544e 3053 listSetFreeMethod(l,decrRefCount);
1cd92e7f
PN
3054 o->encoding = REDIS_ENCODING_LIST;
3055 return o;
3056}
3057
3058static robj *createZiplistObject(void) {
3059 unsigned char *zl = ziplistNew();
3060 robj *o = createObject(REDIS_LIST,zl);
3061 o->encoding = REDIS_ENCODING_ZIPLIST;
3062 return o;
ed9b544e 3063}
3064
3065static robj *createSetObject(void) {
3066 dict *d = dictCreate(&setDictType,NULL);
35cabcb5
PN
3067 robj *o = createObject(REDIS_SET,d);
3068 o->encoding = REDIS_ENCODING_HT;
3069 return o;
ed9b544e 3070}
3071
5234952b 3072static robj *createHashObject(void) {
3073 /* All the Hashes start as zipmaps. Will be automatically converted
3074 * into hash tables if there are enough elements or big elements
3075 * inside. */
3076 unsigned char *zm = zipmapNew();
3077 robj *o = createObject(REDIS_HASH,zm);
3078 o->encoding = REDIS_ENCODING_ZIPMAP;
3079 return o;
3080}
3081
1812e024 3082static robj *createZsetObject(void) {
6b47e12e 3083 zset *zs = zmalloc(sizeof(*zs));
3084
3085 zs->dict = dictCreate(&zsetDictType,NULL);
3086 zs->zsl = zslCreate();
3087 return createObject(REDIS_ZSET,zs);
1812e024 3088}
3089
ed9b544e 3090static void freeStringObject(robj *o) {
942a3961 3091 if (o->encoding == REDIS_ENCODING_RAW) {
3092 sdsfree(o->ptr);
3093 }
ed9b544e 3094}
3095
3096static void freeListObject(robj *o) {
c7d9d662
PN
3097 switch (o->encoding) {
3098 case REDIS_ENCODING_LIST:
3099 listRelease((list*) o->ptr);
3100 break;
3101 case REDIS_ENCODING_ZIPLIST:
3102 zfree(o->ptr);
3103 break;
3104 default:
3105 redisPanic("Unknown list encoding type");
3106 }
ed9b544e 3107}
3108
3109static void freeSetObject(robj *o) {
3110 dictRelease((dict*) o->ptr);
3111}
3112
fd8ccf44 3113static void freeZsetObject(robj *o) {
3114 zset *zs = o->ptr;
3115
3116 dictRelease(zs->dict);
3117 zslFree(zs->zsl);
3118 zfree(zs);
3119}
3120
ed9b544e 3121static void freeHashObject(robj *o) {
cbba7dd7 3122 switch (o->encoding) {
3123 case REDIS_ENCODING_HT:
3124 dictRelease((dict*) o->ptr);
3125 break;
3126 case REDIS_ENCODING_ZIPMAP:
3127 zfree(o->ptr);
3128 break;
3129 default:
f83c6cb5 3130 redisPanic("Unknown hash encoding type");
cbba7dd7 3131 break;
3132 }
ed9b544e 3133}
3134
3135static void incrRefCount(robj *o) {
3136 o->refcount++;
3137}
3138
3139static void decrRefCount(void *obj) {
3140 robj *o = obj;
94754ccc 3141
560db612 3142 /* Object is a swapped out value, or in the process of being loaded. */
996cb5f7 3143 if (server.vm_enabled &&
3144 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3145 {
560db612 3146 vmpointer *vp = obj;
3147 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o);
3148 vmMarkPagesFree(vp->page,vp->usedpages);
7d98e08c 3149 server.vm_stats_swapped_objects--;
560db612 3150 zfree(vp);
a35ddf12 3151 return;
3152 }
560db612 3153
3154 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
e4ed181d 3155 /* Object is in memory, or in the process of being swapped out.
3156 *
3157 * If the object is being swapped out, abort the operation on
3158 * decrRefCount even if the refcount does not drop to 0: the object
3159 * is referenced at least two times, as value of the key AND as
3160 * job->val in the iojob. So if we don't invalidate the iojob, when it is
3161 * done but the relevant key was removed in the meantime, the
3162 * complete jobs handler will not find the key about the job and the
3163 * assert will fail. */
3164 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3165 vmCancelThreadedIOJob(o);
ed9b544e 3166 if (--(o->refcount) == 0) {
3167 switch(o->type) {
3168 case REDIS_STRING: freeStringObject(o); break;
3169 case REDIS_LIST: freeListObject(o); break;
3170 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 3171 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 3172 case REDIS_HASH: freeHashObject(o); break;
f83c6cb5 3173 default: redisPanic("Unknown object type"); break;
ed9b544e 3174 }
a5819310 3175 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 3176 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3177 !listAddNodeHead(server.objfreelist,o))
3178 zfree(o);
a5819310 3179 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3180 }
3181}
3182
92b27fe9 3183static int checkType(redisClient *c, robj *o, int type) {
3184 if (o->type != type) {
3185 addReply(c,shared.wrongtypeerr);
3186 return 1;
3187 }
3188 return 0;
3189}
3190
724a51b1 3191/* Check if the nul-terminated string 's' can be represented by a long
3192 * (that is, is a number that fits into long without any other space or
3193 * character before or after the digits).
3194 *
3195 * If so, the function returns REDIS_OK and *longval is set to the value
3196 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 3197static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 3198 char buf[32], *endptr;
3199 long value;
3200 int slen;
e0a62c7f 3201
724a51b1 3202 value = strtol(s, &endptr, 10);
3203 if (endptr[0] != '\0') return REDIS_ERR;
ee14da56 3204 slen = ll2string(buf,32,value);
724a51b1 3205
3206 /* If the number converted back into a string is not identical
3207 * then it's not possible to encode the string as integer */
f69f2cba 3208 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 3209 if (longval) *longval = value;
3210 return REDIS_OK;
3211}
3212
942a3961 3213/* Try to encode a string object in order to save space */
05df7621 3214static robj *tryObjectEncoding(robj *o) {
942a3961 3215 long value;
942a3961 3216 sds s = o->ptr;
3305306f 3217
942a3961 3218 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 3219 return o; /* Already encoded */
3305306f 3220
05df7621 3221 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 3222 * everywhere in the "object space" of Redis. Encoded objects can only
3223 * appear as "values" (and not, for instance, as keys) */
05df7621 3224 if (o->refcount > 1) return o;
3305306f 3225
942a3961 3226 /* Currently we try to encode only strings */
dfc5e96c 3227 redisAssert(o->type == REDIS_STRING);
94754ccc 3228
724a51b1 3229 /* Check if we can represent this string as a long integer */
05df7621 3230 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 3231
3232 /* Ok, this object can be encoded */
05df7621 3233 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3234 decrRefCount(o);
3235 incrRefCount(shared.integers[value]);
3236 return shared.integers[value];
3237 } else {
3238 o->encoding = REDIS_ENCODING_INT;
3239 sdsfree(o->ptr);
3240 o->ptr = (void*) value;
3241 return o;
3242 }
942a3961 3243}
3244
9d65a1bb 3245/* Get a decoded version of an encoded object (returned as a new object).
3246 * If the object is already raw-encoded just increment the ref count. */
3247static robj *getDecodedObject(robj *o) {
942a3961 3248 robj *dec;
e0a62c7f 3249
9d65a1bb 3250 if (o->encoding == REDIS_ENCODING_RAW) {
3251 incrRefCount(o);
3252 return o;
3253 }
942a3961 3254 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3255 char buf[32];
3256
ee14da56 3257 ll2string(buf,32,(long)o->ptr);
942a3961 3258 dec = createStringObject(buf,strlen(buf));
3259 return dec;
3260 } else {
08ee9b57 3261 redisPanic("Unknown encoding type");
942a3961 3262 }
3305306f 3263}
3264
d7f43c08 3265/* Compare two string objects via strcmp() or alike.
3266 * Note that the objects may be integer-encoded. In such a case we
ee14da56 3267 * use ll2string() to get a string representation of the numbers on the stack
1fd9bc8a 3268 * and compare the strings, it's much faster than calling getDecodedObject().
3269 *
3270 * Important note: if objects are not integer encoded, but binary-safe strings,
3271 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3272 * binary safe. */
724a51b1 3273static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3274 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3275 char bufa[128], bufb[128], *astr, *bstr;
3276 int bothsds = 1;
724a51b1 3277
e197b441 3278 if (a == b) return 0;
d7f43c08 3279 if (a->encoding != REDIS_ENCODING_RAW) {
ee14da56 3280 ll2string(bufa,sizeof(bufa),(long) a->ptr);
d7f43c08 3281 astr = bufa;
3282 bothsds = 0;
724a51b1 3283 } else {
d7f43c08 3284 astr = a->ptr;
724a51b1 3285 }
d7f43c08 3286 if (b->encoding != REDIS_ENCODING_RAW) {
ee14da56 3287 ll2string(bufb,sizeof(bufb),(long) b->ptr);
d7f43c08 3288 bstr = bufb;
3289 bothsds = 0;
3290 } else {
3291 bstr = b->ptr;
3292 }
3293 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3294}
3295
bf028098 3296/* Equal string objects return 1 if the two objects are the same from the
3297 * point of view of a string comparison, otherwise 0 is returned. Note that
3298 * this function is faster then checking for (compareStringObject(a,b) == 0)
3299 * because it can perform some more optimization. */
3300static int equalStringObjects(robj *a, robj *b) {
3301 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3302 return a->ptr == b->ptr;
3303 } else {
3304 return compareStringObjects(a,b) == 0;
3305 }
3306}
3307
0ea663ea 3308static size_t stringObjectLen(robj *o) {
dfc5e96c 3309 redisAssert(o->type == REDIS_STRING);
0ea663ea 3310 if (o->encoding == REDIS_ENCODING_RAW) {
3311 return sdslen(o->ptr);
3312 } else {
3313 char buf[32];
3314
ee14da56 3315 return ll2string(buf,32,(long)o->ptr);
0ea663ea 3316 }
3317}
3318
bd79a6bd
PN
3319static int getDoubleFromObject(robj *o, double *target) {
3320 double value;
682c73e8 3321 char *eptr;
bbe025e0 3322
bd79a6bd
PN
3323 if (o == NULL) {
3324 value = 0;
3325 } else {
3326 redisAssert(o->type == REDIS_STRING);
3327 if (o->encoding == REDIS_ENCODING_RAW) {
3328 value = strtod(o->ptr, &eptr);
682c73e8 3329 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3330 } else if (o->encoding == REDIS_ENCODING_INT) {
3331 value = (long)o->ptr;
3332 } else {
946342c1 3333 redisPanic("Unknown string encoding");
bd79a6bd
PN
3334 }
3335 }
3336
bd79a6bd
PN
3337 *target = value;
3338 return REDIS_OK;
3339}
bbe025e0 3340
bd79a6bd
PN
3341static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3342 double value;
3343 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3344 if (msg != NULL) {
3345 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3346 } else {
3347 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3348 }
bbe025e0
AM
3349 return REDIS_ERR;
3350 }
3351
bd79a6bd 3352 *target = value;
bbe025e0
AM
3353 return REDIS_OK;
3354}
3355
bd79a6bd
PN
3356static int getLongLongFromObject(robj *o, long long *target) {
3357 long long value;
682c73e8 3358 char *eptr;
bbe025e0 3359
bd79a6bd
PN
3360 if (o == NULL) {
3361 value = 0;
3362 } else {
3363 redisAssert(o->type == REDIS_STRING);
3364 if (o->encoding == REDIS_ENCODING_RAW) {
3365 value = strtoll(o->ptr, &eptr, 10);
682c73e8 3366 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3367 } else if (o->encoding == REDIS_ENCODING_INT) {
3368 value = (long)o->ptr;
3369 } else {
946342c1 3370 redisPanic("Unknown string encoding");
bd79a6bd
PN
3371 }
3372 }
3373
bd79a6bd
PN
3374 *target = value;
3375 return REDIS_OK;
3376}
bbe025e0 3377
bd79a6bd
PN
3378static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3379 long long value;
3380 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3381 if (msg != NULL) {
3382 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3383 } else {
3384 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3385 }
bbe025e0
AM
3386 return REDIS_ERR;
3387 }
3388
bd79a6bd 3389 *target = value;
bbe025e0
AM
3390 return REDIS_OK;
3391}
3392
bd79a6bd
PN
3393static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3394 long long value;
bbe025e0 3395
bd79a6bd
PN
3396 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3397 if (value < LONG_MIN || value > LONG_MAX) {
3398 if (msg != NULL) {
3399 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3400 } else {
3401 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3402 }
bbe025e0
AM
3403 return REDIS_ERR;
3404 }
3405
bd79a6bd 3406 *target = value;
bbe025e0
AM
3407 return REDIS_OK;
3408}
3409
612e4de8 3410/* =========================== Keyspace access API ========================== */
3411
3412static robj *lookupKey(redisDb *db, robj *key) {
09241813 3413 dictEntry *de = dictFind(db->dict,key->ptr);
612e4de8 3414 if (de) {
612e4de8 3415 robj *val = dictGetEntryVal(de);
3416
3417 if (server.vm_enabled) {
3418 if (val->storage == REDIS_VM_MEMORY ||
3419 val->storage == REDIS_VM_SWAPPING)
3420 {
3421 /* If we were swapping the object out, cancel the operation */
3422 if (val->storage == REDIS_VM_SWAPPING)
3423 vmCancelThreadedIOJob(val);
09241813 3424 /* Update the access time for the aging algorithm. */
612e4de8 3425 val->lru = server.lruclock;
3426 } else {
3427 int notify = (val->storage == REDIS_VM_LOADING);
3428
3429 /* Our value was swapped on disk. Bring it at home. */
3430 redisAssert(val->type == REDIS_VMPOINTER);
3431 val = vmLoadObject(val);
3432 dictGetEntryVal(de) = val;
3433
3434 /* Clients blocked by the VM subsystem may be waiting for
3435 * this key... */
3436 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3437 }
3438 }
3439 return val;
3440 } else {
3441 return NULL;
3442 }
3443}
3444
3445static robj *lookupKeyRead(redisDb *db, robj *key) {
3446 expireIfNeeded(db,key);
3447 return lookupKey(db,key);
3448}
3449
3450static robj *lookupKeyWrite(redisDb *db, robj *key) {
3451 deleteIfVolatile(db,key);
3452 touchWatchedKey(db,key);
3453 return lookupKey(db,key);
3454}
3455
3456static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3457 robj *o = lookupKeyRead(c->db, key);
3458 if (!o) addReply(c,reply);
3459 return o;
3460}
3461
3462static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3463 robj *o = lookupKeyWrite(c->db, key);
3464 if (!o) addReply(c,reply);
3465 return o;
3466}
3467
09241813 3468/* Add the key to the DB. If the key already exists REDIS_ERR is returned,
3469 * otherwise REDIS_OK is returned, and the caller should increment the
3470 * refcount of 'val'. */
3471static int dbAdd(redisDb *db, robj *key, robj *val) {
3472 /* Perform a lookup before adding the key, as we need to copy the
3473 * key value. */
3474 if (dictFind(db->dict, key->ptr) != NULL) {
3475 return REDIS_ERR;
3476 } else {
3477 sds copy = sdsdup(key->ptr);
3478 dictAdd(db->dict, copy, val);
3479 return REDIS_OK;
3480 }
3481}
3482
3483/* If the key does not exist, this is just like dbAdd(). Otherwise
3484 * the value associated to the key is replaced with the new one.
3485 *
3486 * On update (key already existed) 0 is returned. Otherwise 1. */
3487static int dbReplace(redisDb *db, robj *key, robj *val) {
3488 if (dictFind(db->dict,key->ptr) == NULL) {
3489 sds copy = sdsdup(key->ptr);
3490 dictAdd(db->dict, copy, val);
3491 return 1;
3492 } else {
3493 dictReplace(db->dict, key->ptr, val);
3494 return 0;
3495 }
3496}
3497
3498static int dbExists(redisDb *db, robj *key) {
3499 return dictFind(db->dict,key->ptr) != NULL;
3500}
3501
3502/* Return a random key, in form of a Redis object.
3503 * If there are no keys, NULL is returned.
3504 *
3505 * The function makes sure to return keys not already expired. */
3506static robj *dbRandomKey(redisDb *db) {
3507 struct dictEntry *de;
3508
3509 while(1) {
3510 sds key;
3511 robj *keyobj;
3512
3513 de = dictGetRandomKey(db->dict);
3514 if (de == NULL) return NULL;
3515
3516 key = dictGetEntryKey(de);
3517 keyobj = createStringObject(key,sdslen(key));
3518 if (dictFind(db->expires,key)) {
3519 if (expireIfNeeded(db,keyobj)) {
3520 decrRefCount(keyobj);
3521 continue; /* search for another key. This expired. */
3522 }
3523 }
3524 return keyobj;
3525 }
3526}
3527
3528/* Delete a key, value, and associated expiration entry if any, from the DB */
3529static int dbDelete(redisDb *db, robj *key) {
612e4de8 3530 int retval;
3531
09241813 3532 if (dictSize(db->expires)) dictDelete(db->expires,key->ptr);
3533 retval = dictDelete(db->dict,key->ptr);
612e4de8 3534
3535 return retval == DICT_OK;
3536}
3537
06233c45 3538/*============================ RDB saving/loading =========================== */
ed9b544e 3539
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 if nothing could be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3544
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte order
 * of the host. Returns 0 on success, -1 if the write failed. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t secs = (int32_t) t;

    return fwrite(&secs,4,1,fp) == 0 ? -1 : 0;
}
3550
e3566d4b 3551/* check rdbLoadLen() comments for more info */
f78fd11b 3552static int rdbSaveLen(FILE *fp, uint32_t len) {
3553 unsigned char buf[2];
3554
3555 if (len < (1<<6)) {
3556 /* Save a 6 bit len */
10c43610 3557 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3558 if (fwrite(buf,1,1,fp) == 0) return -1;
3559 } else if (len < (1<<14)) {
3560 /* Save a 14 bit len */
10c43610 3561 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3562 buf[1] = len&0xFF;
17be1a4a 3563 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3564 } else {
3565 /* Save a 32 bit len */
10c43610 3566 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3567 if (fwrite(buf,1,1,fp) == 0) return -1;
3568 len = htonl(len);
3569 if (fwrite(&len,4,1,fp) == 0) return -1;
3570 }
3571 return 0;
3572}
3573
32a66513 3574/* Encode 'value' as an integer if possible (if integer will fit the
3575 * supported range). If the function sucessful encoded the integer
3576 * then the (up to 5 bytes) encoded representation is written in the
3577 * string pointed by 'enc' and the length is returned. Otherwise
3578 * 0 is returned. */
3579static int rdbEncodeInteger(long long value, unsigned char *enc) {
e3566d4b 3580 /* Finally check if it fits in our ranges */
3581 if (value >= -(1<<7) && value <= (1<<7)-1) {
3582 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3583 enc[1] = value&0xFF;
3584 return 2;
3585 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3586 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3587 enc[1] = value&0xFF;
3588 enc[2] = (value>>8)&0xFF;
3589 return 3;
3590 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3591 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3592 enc[1] = value&0xFF;
3593 enc[2] = (value>>8)&0xFF;
3594 enc[3] = (value>>16)&0xFF;
3595 enc[4] = (value>>24)&0xFF;
3596 return 5;
3597 } else {
3598 return 0;
3599 }
3600}
3601
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes that are NOT required to be nul-terminated.
 * On success the encoding is written in 'enc' and its length is returned
 * (see rdbEncodeInteger()), otherwise 0 is returned. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, buf[32], copy[32];

    /* strtoll() needs a nul-terminated string, while the input is only
     * delimited by 'len' (rdbSaveRawString() forwards list and hash entry
     * buffers together with an explicit length), so parse a bounded,
     * terminated copy instead of reading past the end of 's'.
     * An empty input, or one too long to be the canonical form of a
     * long long, can never be encoded. */
    if (len == 0 || len >= sizeof(copy)) return 0;
    memcpy(copy,s,len);
    copy[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(copy, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3620
b1befe6a 3621static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3622 size_t comprlen, outlen;
774e3047 3623 unsigned char byte;
3624 void *out;
3625
3626 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3627 if (len <= 4) return 0;
3628 outlen = len-4;
3a2694c4 3629 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3630 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3631 if (comprlen == 0) {
88e85998 3632 zfree(out);
774e3047 3633 return 0;
3634 }
3635 /* Data compressed! Let's save it on disk */
3636 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3637 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3638 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3639 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3640 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3641 zfree(out);
774e3047 3642 return comprlen;
3643
3644writeerr:
88e85998 3645 zfree(out);
774e3047 3646 return -1;
3647}
3648
e3566d4b 3649/* Save a string objet as [len][data] on disk. If the object is a string
3650 * representation of an integer value we try to safe it in a special form */
b1befe6a 3651static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3652 int enclen;
10c43610 3653
774e3047 3654 /* Try integer encoding */
e3566d4b 3655 if (len <= 11) {
3656 unsigned char buf[5];
b1befe6a 3657 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3658 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3659 return 0;
3660 }
3661 }
774e3047 3662
3663 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3664 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3665 if (server.rdbcompression && len > 20) {
774e3047 3666 int retval;
3667
b1befe6a 3668 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3669 if (retval == -1) return -1;
3670 if (retval > 0) return 0;
3671 /* retval == 0 means data can't be compressed, save the old way */
3672 }
3673
3674 /* Store verbatim */
10c43610 3675 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3676 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3677 return 0;
3678}
3679
2796f6da
PN
/* Save a long long value as a string: in the compact integer encoding when
 * rdbEncodeInteger() can represent it, otherwise as its decimal digits
 * prefixed by their length. Returns 0 on success, -1 on write error. */
static int rdbSaveLongLongAsStringObject(FILE *fp, long long value) {
    unsigned char buf[32];
    int enclen = rdbEncodeInteger(value,buf);

    if (enclen <= 0) {
        /* Out of the encodable range: emit the digits as a plain string,
         * which needs an explicit length header. */
        enclen = ll2string((char*)buf,32,value);
        redisAssert(enclen < 32);
        if (rdbSaveLen(fp,enclen) == -1) return -1;
    }

    /* In both cases 'buf' now holds 'enclen' bytes of payload. */
    if (fwrite(buf,enclen,1,fp) == 0) return -1;
    return 0;
}
3695
942a3961 3696/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3697static int rdbSaveStringObject(FILE *fp, robj *obj) {
32a66513 3698 /* Avoid to decode the object, then encode it again, if the
3699 * object is alrady integer encoded. */
3700 if (obj->encoding == REDIS_ENCODING_INT) {
2796f6da 3701 return rdbSaveLongLongAsStringObject(fp,(long)obj->ptr);
996cb5f7 3702 } else {
2796f6da
PN
3703 redisAssert(obj->encoding == REDIS_ENCODING_RAW);
3704 return rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3705 }
942a3961 3706}
3707
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under these assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * the integer printing function that is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The text starts at buf+1, so only sizeof(buf)-1 bytes are
             * available to the printing function. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3750
06233c45 3751/* Save a Redis object. */
3752static int rdbSaveObject(FILE *fp, robj *o) {
3753 if (o->type == REDIS_STRING) {
3754 /* Save a string value */
3755 if (rdbSaveStringObject(fp,o) == -1) return -1;
3756 } else if (o->type == REDIS_LIST) {
3757 /* Save a list value */
23f96494
PN
3758 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
3759 unsigned char *p;
3760 unsigned char *vstr;
3761 unsigned int vlen;
3762 long long vlong;
3763
3764 if (rdbSaveLen(fp,ziplistLen(o->ptr)) == -1) return -1;
3765 p = ziplistIndex(o->ptr,0);
3766 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
3767 if (vstr) {
3768 if (rdbSaveRawString(fp,vstr,vlen) == -1)
3769 return -1;
3770 } else {
3771 if (rdbSaveLongLongAsStringObject(fp,vlong) == -1)
3772 return -1;
3773 }
3774 p = ziplistNext(o->ptr,p);
3775 }
3776 } else if (o->encoding == REDIS_ENCODING_LIST) {
3777 list *list = o->ptr;
3778 listIter li;
3779 listNode *ln;
3780
3781 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3782 listRewind(list,&li);
3783 while((ln = listNext(&li))) {
3784 robj *eleobj = listNodeValue(ln);
3785 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3786 }
3787 } else {
3788 redisPanic("Unknown list encoding");
06233c45 3789 }
3790 } else if (o->type == REDIS_SET) {
3791 /* Save a set value */
3792 dict *set = o->ptr;
3793 dictIterator *di = dictGetIterator(set);
3794 dictEntry *de;
3795
3796 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3797 while((de = dictNext(di)) != NULL) {
3798 robj *eleobj = dictGetEntryKey(de);
3799
3800 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3801 }
3802 dictReleaseIterator(di);
3803 } else if (o->type == REDIS_ZSET) {
3804 /* Save a set value */
3805 zset *zs = o->ptr;
3806 dictIterator *di = dictGetIterator(zs->dict);
3807 dictEntry *de;
3808
3809 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3810 while((de = dictNext(di)) != NULL) {
3811 robj *eleobj = dictGetEntryKey(de);
3812 double *score = dictGetEntryVal(de);
3813
3814 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3815 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3816 }
3817 dictReleaseIterator(di);
b1befe6a 3818 } else if (o->type == REDIS_HASH) {
3819 /* Save a hash value */
3820 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3821 unsigned char *p = zipmapRewind(o->ptr);
3822 unsigned int count = zipmapLen(o->ptr);
3823 unsigned char *key, *val;
3824 unsigned int klen, vlen;
3825
3826 if (rdbSaveLen(fp,count) == -1) return -1;
3827 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3828 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3829 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3830 }
3831 } else {
3832 dictIterator *di = dictGetIterator(o->ptr);
3833 dictEntry *de;
3834
3835 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3836 while((de = dictNext(di)) != NULL) {
3837 robj *key = dictGetEntryKey(de);
3838 robj *val = dictGetEntryVal(de);
3839
3840 if (rdbSaveStringObject(fp,key) == -1) return -1;
3841 if (rdbSaveStringObject(fp,val) == -1) return -1;
3842 }
3843 dictReleaseIterator(di);
3844 }
06233c45 3845 } else {
f83c6cb5 3846 redisPanic("Unknown object type");
06233c45 3847 }
3848 return 0;
3849}
3850
3851/* Return the length the object will have on disk if saved with
3852 * the rdbSaveObject() function. Currently we use a trick to get
3853 * this length with very little changes to the code. In the future
3854 * we could switch to a faster solution. */
b9bc0eef 3855static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3856 if (fp == NULL) fp = server.devnull;
06233c45 3857 rewind(fp);
3858 assert(rdbSaveObject(fp,o) != 1);
3859 return ftello(fp);
3860}
3861
06224fec 3862/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3863static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3864 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3865
06224fec 3866 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3867}
3868
ed9b544e 3869/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3870static int rdbSave(char *filename) {
ed9b544e 3871 dictIterator *di = NULL;
3872 dictEntry *de;
ed9b544e 3873 FILE *fp;
3874 char tmpfile[256];
3875 int j;
bb32ede5 3876 time_t now = time(NULL);
ed9b544e 3877
2316bb3b 3878 /* Wait for I/O therads to terminate, just in case this is a
3879 * foreground-saving, to avoid seeking the swap file descriptor at the
3880 * same time. */
3881 if (server.vm_enabled)
3882 waitEmptyIOJobsQueue();
3883
a3b21203 3884 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3885 fp = fopen(tmpfile,"w");
3886 if (!fp) {
3887 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3888 return REDIS_ERR;
3889 }
f78fd11b 3890 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3891 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3892 redisDb *db = server.db+j;
3893 dict *d = db->dict;
3305306f 3894 if (dictSize(d) == 0) continue;
ed9b544e 3895 di = dictGetIterator(d);
3896 if (!di) {
3897 fclose(fp);
3898 return REDIS_ERR;
3899 }
3900
3901 /* Write the SELECT DB opcode */
f78fd11b 3902 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3903 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3904
3905 /* Iterate this DB writing every entry */
3906 while((de = dictNext(di)) != NULL) {
09241813 3907 sds keystr = dictGetEntryKey(de);
3908 robj key, *o = dictGetEntryVal(de);
3909 time_t expiretime;
3910
3911 initStaticStringObject(key,keystr);
3912 expiretime = getExpire(db,&key);
bb32ede5 3913
3914 /* Save the expire time */
3915 if (expiretime != -1) {
3916 /* If this key is already expired skip it */
3917 if (expiretime < now) continue;
3918 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3919 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3920 }
7e69548d 3921 /* Save the key and associated value. This requires special
3922 * handling if the value is swapped out. */
560db612 3923 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
3924 o->storage == REDIS_VM_SWAPPING) {
7e69548d 3925 /* Save type, key, value */
3926 if (rdbSaveType(fp,o->type) == -1) goto werr;
09241813 3927 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
7e69548d 3928 if (rdbSaveObject(fp,o) == -1) goto werr;
3929 } else {
996cb5f7 3930 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3931 robj *po;
7e69548d 3932 /* Get a preview of the object in memory */
560db612 3933 po = vmPreviewObject(o);
7e69548d 3934 /* Save type, key, value */
560db612 3935 if (rdbSaveType(fp,po->type) == -1) goto werr;
09241813 3936 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
7e69548d 3937 if (rdbSaveObject(fp,po) == -1) goto werr;
3938 /* Remove the loaded object from memory */
3939 decrRefCount(po);
7e69548d 3940 }
ed9b544e 3941 }
3942 dictReleaseIterator(di);
3943 }
3944 /* EOF opcode */
f78fd11b 3945 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3946
3947 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3948 fflush(fp);
3949 fsync(fileno(fp));
3950 fclose(fp);
e0a62c7f 3951
ed9b544e 3952 /* Use RENAME to make sure the DB file is changed atomically only
3953 * if the generate DB file is ok. */
3954 if (rename(tmpfile,filename) == -1) {
325d1eb4 3955 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3956 unlink(tmpfile);
3957 return REDIS_ERR;
3958 }
3959 redisLog(REDIS_NOTICE,"DB saved on disk");
3960 server.dirty = 0;
3961 server.lastsave = time(NULL);
3962 return REDIS_OK;
3963
3964werr:
3965 fclose(fp);
3966 unlink(tmpfile);
3967 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3968 if (di) dictReleaseIterator(di);
3969 return REDIS_ERR;
3970}
3971
f78fd11b 3972static int rdbSaveBackground(char *filename) {
ed9b544e 3973 pid_t childpid;
3974
9d65a1bb 3975 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3976 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3977 if ((childpid = fork()) == 0) {
3978 /* Child */
054e426d 3979 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3980 close(server.fd);
f78fd11b 3981 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3982 _exit(0);
ed9b544e 3983 } else {
478c2c6f 3984 _exit(1);
ed9b544e 3985 }
3986 } else {
3987 /* Parent */
5a7c647e 3988 if (childpid == -1) {
3989 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3990 strerror(errno));
3991 return REDIS_ERR;
3992 }
ed9b544e 3993 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3994 server.bgsavechildpid = childpid;
884d4b39 3995 updateDictResizePolicy();
ed9b544e 3996 return REDIS_OK;
3997 }
3998 return REDIS_OK; /* unreached */
3999}
4000
/* Remove the temporary dump file "temp-<pid>.rdb" belonging to the process
 * 'childpid' from the current directory. Errors are ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
4007
/* Read a single type byte from 'fp'.
 * Returns the byte as a value in the 0-255 range, or -1 if it can't be
 * read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    return fread(&type,1,1,fp) == 0 ? -1 : type;
}
4013
/* Read a time saved by rdbSaveTime(): a 32 bit signed integer in the byte
 * order of the host. Returns the time, or -1 if it can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t secs;
    size_t got = fread(&secs,4,1,fp);

    if (got == 0) return -1;
    return (time_t) secs;
}
4019
e3566d4b 4020/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
4021 * of this file for a description of how this are stored on disk.
4022 *
4023 * isencoded is set to 1 if the readed length is not actually a length but
4024 * an "encoding type", check the above comments for more info */
c78a8ccc 4025static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 4026 unsigned char buf[2];
4027 uint32_t len;
c78a8ccc 4028 int type;
f78fd11b 4029
e3566d4b 4030 if (isencoded) *isencoded = 0;
c78a8ccc 4031 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
4032 type = (buf[0]&0xC0)>>6;
4033 if (type == REDIS_RDB_6BITLEN) {
4034 /* Read a 6 bit len */
4035 return buf[0]&0x3F;
4036 } else if (type == REDIS_RDB_ENCVAL) {
4037 /* Read a 6 bit len encoding type */
4038 if (isencoded) *isencoded = 1;
4039 return buf[0]&0x3F;
4040 } else if (type == REDIS_RDB_14BITLEN) {
4041 /* Read a 14 bit len */
4042 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
4043 return ((buf[0]&0x3F)<<8)|buf[1];
4044 } else {
4045 /* Read a 32 bit len */
f78fd11b 4046 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
4047 return ntohl(len);
f78fd11b 4048 }
f78fd11b 4049}
4050
ad30aa60 4051/* Load an integer-encoded object from file 'fp', with the specified
4052 * encoding type 'enctype'. If encode is true the function may return
4053 * an integer-encoded object as reply, otherwise the returned object
4054 * will always be encoded as a raw string. */
4055static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
e3566d4b 4056 unsigned char enc[4];
4057 long long val;
4058
4059 if (enctype == REDIS_RDB_ENC_INT8) {
4060 if (fread(enc,1,1,fp) == 0) return NULL;
4061 val = (signed char)enc[0];
4062 } else if (enctype == REDIS_RDB_ENC_INT16) {
4063 uint16_t v;
4064 if (fread(enc,2,1,fp) == 0) return NULL;
4065 v = enc[0]|(enc[1]<<8);
4066 val = (int16_t)v;
4067 } else if (enctype == REDIS_RDB_ENC_INT32) {
4068 uint32_t v;
4069 if (fread(enc,4,1,fp) == 0) return NULL;
4070 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
4071 val = (int32_t)v;
4072 } else {
4073 val = 0; /* anti-warning */
f83c6cb5 4074 redisPanic("Unknown RDB integer encoding type");
e3566d4b 4075 }
ad30aa60 4076 if (encode)
4077 return createStringObjectFromLongLong(val);
4078 else
4079 return createObject(REDIS_STRING,sdsfromlonglong(val));
e3566d4b 4080}
4081
c78a8ccc 4082static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 4083 unsigned int len, clen;
4084 unsigned char *c = NULL;
4085 sds val = NULL;
4086
c78a8ccc 4087 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4088 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 4089 if ((c = zmalloc(clen)) == NULL) goto err;
4090 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
4091 if (fread(c,clen,1,fp) == 0) goto err;
4092 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 4093 zfree(c);
88e85998 4094 return createObject(REDIS_STRING,val);
4095err:
4096 zfree(c);
4097 sdsfree(val);
4098 return NULL;
4099}
4100
ad30aa60 4101static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
e3566d4b 4102 int isencoded;
4103 uint32_t len;
f78fd11b 4104 sds val;
4105
c78a8ccc 4106 len = rdbLoadLen(fp,&isencoded);
e3566d4b 4107 if (isencoded) {
4108 switch(len) {
4109 case REDIS_RDB_ENC_INT8:
4110 case REDIS_RDB_ENC_INT16:
4111 case REDIS_RDB_ENC_INT32:
ad30aa60 4112 return rdbLoadIntegerObject(fp,len,encode);
88e85998 4113 case REDIS_RDB_ENC_LZF:
bdcb92f2 4114 return rdbLoadLzfStringObject(fp);
e3566d4b 4115 default:
f83c6cb5 4116 redisPanic("Unknown RDB encoding type");
e3566d4b 4117 }
4118 }
4119
f78fd11b 4120 if (len == REDIS_RDB_LENERR) return NULL;
4121 val = sdsnewlen(NULL,len);
4122 if (len && fread(val,len,1,fp) == 0) {
4123 sdsfree(val);
4124 return NULL;
4125 }
bdcb92f2 4126 return createObject(REDIS_STRING,val);
f78fd11b 4127}
4128
ad30aa60 4129static robj *rdbLoadStringObject(FILE *fp) {
4130 return rdbGenericLoadStringObject(fp,0);
4131}
4132
4133static robj *rdbLoadEncodedStringObject(FILE *fp) {
4134 return rdbGenericLoadStringObject(fp,1);
4135}
4136
a7866db6 4137/* For information about double serialization check rdbSaveDoubleValue() */
4138static int rdbLoadDoubleValue(FILE *fp, double *val) {
4139 char buf[128];
4140 unsigned char len;
4141
4142 if (fread(&len,1,1,fp) == 0) return -1;
4143 switch(len) {
4144 case 255: *val = R_NegInf; return 0;
4145 case 254: *val = R_PosInf; return 0;
4146 case 253: *val = R_Nan; return 0;
4147 default:
4148 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 4149 buf[len] = '\0';
a7866db6 4150 sscanf(buf, "%lg", val);
4151 return 0;
4152 }
4153}
4154
c78a8ccc 4155/* Load a Redis object of the specified type from the specified file.
4156 * On success a newly allocated object is returned, otherwise NULL. */
4157static robj *rdbLoadObject(int type, FILE *fp) {
23f96494
PN
4158 robj *o, *ele, *dec;
4159 size_t len;
c78a8ccc 4160
bcd11906 4161 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 4162 if (type == REDIS_STRING) {
4163 /* Read string value */
ad30aa60 4164 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4165 o = tryObjectEncoding(o);
23f96494
PN
4166 } else if (type == REDIS_LIST) {
4167 /* Read list value */
4168 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4169
d0686e07
PN
4170 /* Use a real list when there are too many entries */
4171 if (len > server.list_max_ziplist_entries) {
4172 o = createListObject();
4173 } else {
4174 o = createZiplistObject();
4175 }
c78a8ccc 4176
23f96494
PN
4177 /* Load every single element of the list */
4178 while(len--) {
4179 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4180
d0686e07
PN
4181 /* If we are using a ziplist and the value is too big, convert
4182 * the object to a real list. */
4183 if (o->encoding == REDIS_ENCODING_ZIPLIST &&
4184 ele->encoding == REDIS_ENCODING_RAW &&
4185 sdslen(ele->ptr) > server.list_max_ziplist_value)
003f0840 4186 listTypeConvert(o,REDIS_ENCODING_LIST);
d0686e07 4187
23f96494
PN
4188 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
4189 dec = getDecodedObject(ele);
4190 o->ptr = ziplistPush(o->ptr,dec->ptr,sdslen(dec->ptr),REDIS_TAIL);
4191 decrRefCount(dec);
4192 decrRefCount(ele);
4193 } else {
4194 ele = tryObjectEncoding(ele);
4195 listAddNodeTail(o->ptr,ele);
23f96494
PN
4196 }
4197 }
4198 } else if (type == REDIS_SET) {
4199 /* Read list/set value */
4200 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4201 o = createSetObject();
3c68de9b 4202 /* It's faster to expand the dict to the right size asap in order
4203 * to avoid rehashing */
23f96494
PN
4204 if (len > DICT_HT_INITIAL_SIZE)
4205 dictExpand(o->ptr,len);
c78a8ccc 4206 /* Load every single element of the list/set */
23f96494 4207 while(len--) {
ad30aa60 4208 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4209 ele = tryObjectEncoding(ele);
23f96494 4210 dictAdd((dict*)o->ptr,ele,NULL);
c78a8ccc 4211 }
4212 } else if (type == REDIS_ZSET) {
4213 /* Read list/set value */
ada386b2 4214 size_t zsetlen;
c78a8ccc 4215 zset *zs;
4216
4217 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4218 o = createZsetObject();
4219 zs = o->ptr;
4220 /* Load every single element of the list/set */
4221 while(zsetlen--) {
4222 robj *ele;
4223 double *score = zmalloc(sizeof(double));
4224
ad30aa60 4225 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4226 ele = tryObjectEncoding(ele);
c78a8ccc 4227 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4228 dictAdd(zs->dict,ele,score);
4229 zslInsert(zs->zsl,*score,ele);
4230 incrRefCount(ele); /* added to skiplist */
4231 }
ada386b2 4232 } else if (type == REDIS_HASH) {
4233 size_t hashlen;
4234
4235 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4236 o = createHashObject();
4237 /* Too many entries? Use an hash table. */
4238 if (hashlen > server.hash_max_zipmap_entries)
4239 convertToRealHash(o);
4240 /* Load every key/value, then set it into the zipmap or hash
4241 * table, as needed. */
4242 while(hashlen--) {
4243 robj *key, *val;
4244
b785b2bf 4245 if ((key = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4246 if ((val = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
ada386b2 4247 /* If we are using a zipmap and there are too big values
4248 * the object is converted to real hash table encoding. */
4249 if (o->encoding != REDIS_ENCODING_HT &&
4250 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4251 sdslen(val->ptr) > server.hash_max_zipmap_value))
4252 {
4253 convertToRealHash(o);
4254 }
4255
4256 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4257 unsigned char *zm = o->ptr;
4258
4259 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4260 val->ptr,sdslen(val->ptr),NULL);
4261 o->ptr = zm;
4262 decrRefCount(key);
4263 decrRefCount(val);
4264 } else {
05df7621 4265 key = tryObjectEncoding(key);
4266 val = tryObjectEncoding(val);
ada386b2 4267 dictAdd((dict*)o->ptr,key,val);
ada386b2 4268 }
4269 }
c78a8ccc 4270 } else {
f83c6cb5 4271 redisPanic("Unknown object type");
c78a8ccc 4272 }
4273 return o;
4274}
4275
f78fd11b 4276static int rdbLoad(char *filename) {
ed9b544e 4277 FILE *fp;
f78fd11b 4278 uint32_t dbid;
bb32ede5 4279 int type, retval, rdbver;
585af7e2 4280 int swap_all_values = 0;
bb32ede5 4281 redisDb *db = server.db+0;
f78fd11b 4282 char buf[1024];
242a64f3 4283 time_t expiretime, now = time(NULL);
bb32ede5 4284
ed9b544e 4285 fp = fopen(filename,"r");
4286 if (!fp) return REDIS_ERR;
4287 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 4288 buf[9] = '\0';
4289 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 4290 fclose(fp);
4291 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4292 return REDIS_ERR;
4293 }
f78fd11b 4294 rdbver = atoi(buf+5);
c78a8ccc 4295 if (rdbver != 1) {
f78fd11b 4296 fclose(fp);
4297 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4298 return REDIS_ERR;
4299 }
ed9b544e 4300 while(1) {
585af7e2 4301 robj *key, *val;
7e02fe32 4302 int force_swapout;
ed9b544e 4303
585af7e2 4304 expiretime = -1;
ed9b544e 4305 /* Read type. */
f78fd11b 4306 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 4307 if (type == REDIS_EXPIRETIME) {
4308 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4309 /* We read the time so we need to read the object type again */
4310 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4311 }
ed9b544e 4312 if (type == REDIS_EOF) break;
4313 /* Handle SELECT DB opcode as a special case */
4314 if (type == REDIS_SELECTDB) {
c78a8ccc 4315 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 4316 goto eoferr;
ed9b544e 4317 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 4318 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 4319 exit(1);
4320 }
bb32ede5 4321 db = server.db+dbid;
ed9b544e 4322 continue;
4323 }
4324 /* Read key */
585af7e2 4325 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
c78a8ccc 4326 /* Read value */
585af7e2 4327 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
89e689c5 4328 /* Check if the key already expired */
4329 if (expiretime != -1 && expiretime < now) {
4330 decrRefCount(key);
4331 decrRefCount(val);
4332 continue;
4333 }
ed9b544e 4334 /* Add the new object in the hash table */
09241813 4335 retval = dbAdd(db,key,val);
4336 if (retval == REDIS_ERR) {
585af7e2 4337 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
ed9b544e 4338 exit(1);
4339 }
bb32ede5 4340 /* Set the expire time if needed */
89e689c5 4341 if (expiretime != -1) setExpire(db,key,expiretime);
242a64f3 4342
b492cf00 4343 /* Handle swapping while loading big datasets when VM is on */
242a64f3 4344
4345 /* If we detecter we are hopeless about fitting something in memory
4346 * we just swap every new key on disk. Directly...
4347 * Note that's important to check for this condition before resorting
4348 * to random sampling, otherwise we may try to swap already
4349 * swapped keys. */
585af7e2 4350 if (swap_all_values) {
09241813 4351 dictEntry *de = dictFind(db->dict,key->ptr);
242a64f3 4352
4353 /* de may be NULL since the key already expired */
4354 if (de) {
560db612 4355 vmpointer *vp;
585af7e2 4356 val = dictGetEntryVal(de);
242a64f3 4357
560db612 4358 if (val->refcount == 1 &&
4359 (vp = vmSwapObjectBlocking(val)) != NULL)
4360 dictGetEntryVal(de) = vp;
242a64f3 4361 }
09241813 4362 decrRefCount(key);
242a64f3 4363 continue;
4364 }
09241813 4365 decrRefCount(key);
242a64f3 4366
a89b7013 4367 /* Flush data on disk once 32 MB of additional RAM are used... */
7e02fe32 4368 force_swapout = 0;
4369 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
4370 force_swapout = 1;
242a64f3 4371
4372 /* If we have still some hope of having some value fitting memory
4373 * then we try random sampling. */
7e02fe32 4374 if (!swap_all_values && server.vm_enabled && force_swapout) {
b492cf00 4375 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 4376 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 4377 }
242a64f3 4378 if (zmalloc_used_memory() > server.vm_max_memory)
585af7e2 4379 swap_all_values = 1; /* We are already using too much mem */
b492cf00 4380 }
ed9b544e 4381 }
4382 fclose(fp);
4383 return REDIS_OK;
4384
4385eoferr: /* unexpected end of file is handled here with a fatal exit */
f80dff62 4386 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 4387 exit(1);
4388 return REDIS_ERR; /* Just to avoid warning */
4389}
4390
b58ba105 4391/*================================== Shutdown =============================== */
fab43727 4392static int prepareForShutdown() {
b58ba105
AM
4393 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4394 /* Kill the saving child if there is a background saving in progress.
4395 We want to avoid race conditions, for instance our saving child may
4396 overwrite the synchronous saving did by SHUTDOWN. */
4397 if (server.bgsavechildpid != -1) {
4398 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4399 kill(server.bgsavechildpid,SIGKILL);
4400 rdbRemoveTempFile(server.bgsavechildpid);
4401 }
4402 if (server.appendonly) {
4403 /* Append only file: fsync() the AOF and exit */
b0bd87f6 4404 aof_fsync(server.appendfd);
b58ba105 4405 if (server.vm_enabled) unlink(server.vm_swap_file);
b58ba105
AM
4406 } else {
4407 /* Snapshotting. Perform a SYNC SAVE and exit */
4408 if (rdbSave(server.dbfilename) == REDIS_OK) {
4409 if (server.daemonize)
4410 unlink(server.pidfile);
4411 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
b58ba105
AM
4412 } else {
4413 /* Ooops.. error saving! The best we can do is to continue
4414 * operating. Note that if there was a background saving process,
4415 * in the next cron() Redis will be notified that the background
4416 * saving aborted, handling special stuff like slaves pending for
4417 * synchronization... */
4418 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
fab43727 4419 return REDIS_ERR;
b58ba105
AM
4420 }
4421 }
8513a757 4422 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
fab43727 4423 return REDIS_OK;
b58ba105
AM
4424}
4425
ed9b544e 4426/*================================== Commands =============================== */
4427
abcb223e 4428static void authCommand(redisClient *c) {
2e77c2ee 4429 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
4430 c->authenticated = 1;
4431 addReply(c,shared.ok);
4432 } else {
4433 c->authenticated = 0;
fa4c0aba 4434 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
4435 }
4436}
4437
ed9b544e 4438static void pingCommand(redisClient *c) {
4439 addReply(c,shared.pong);
4440}
4441
4442static void echoCommand(redisClient *c) {
dd88747b 4443 addReplyBulk(c,c->argv[1]);
ed9b544e 4444}
4445
4446/*=================================== Strings =============================== */
4447
526d00a5 4448static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
ed9b544e 4449 int retval;
10ce1276 4450 long seconds = 0; /* initialized to avoid an harmness warning */
ed9b544e 4451
526d00a5 4452 if (expire) {
4453 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4454 return;
4455 if (seconds <= 0) {
4456 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4457 return;
4458 }
4459 }
4460
37ab76c9 4461 touchWatchedKey(c->db,key);
526d00a5 4462 if (nx) deleteIfVolatile(c->db,key);
09241813 4463 retval = dbAdd(c->db,key,val);
4464 if (retval == REDIS_ERR) {
ed9b544e 4465 if (!nx) {
09241813 4466 dbReplace(c->db,key,val);
526d00a5 4467 incrRefCount(val);
ed9b544e 4468 } else {
c937aa89 4469 addReply(c,shared.czero);
ed9b544e 4470 return;
4471 }
4472 } else {
526d00a5 4473 incrRefCount(val);
ed9b544e 4474 }
4475 server.dirty++;
526d00a5 4476 removeExpire(c->db,key);
4477 if (expire) setExpire(c->db,key,time(NULL)+seconds);
c937aa89 4478 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 4479}
4480
4481static void setCommand(redisClient *c) {
526d00a5 4482 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
ed9b544e 4483}
4484
4485static void setnxCommand(redisClient *c) {
526d00a5 4486 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4487}
4488
4489static void setexCommand(redisClient *c) {
4490 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
ed9b544e 4491}
4492
322fc7d8 4493static int getGenericCommand(redisClient *c) {
dd88747b 4494 robj *o;
e0a62c7f 4495
dd88747b 4496 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 4497 return REDIS_OK;
dd88747b 4498
4499 if (o->type != REDIS_STRING) {
4500 addReply(c,shared.wrongtypeerr);
4501 return REDIS_ERR;
ed9b544e 4502 } else {
dd88747b 4503 addReplyBulk(c,o);
4504 return REDIS_OK;
ed9b544e 4505 }
4506}
4507
322fc7d8 4508static void getCommand(redisClient *c) {
4509 getGenericCommand(c);
4510}
4511
f6b141c5 4512static void getsetCommand(redisClient *c) {
322fc7d8 4513 if (getGenericCommand(c) == REDIS_ERR) return;
09241813 4514 dbReplace(c->db,c->argv[1],c->argv[2]);
a431eb74 4515 incrRefCount(c->argv[2]);
4516 server.dirty++;
4517 removeExpire(c->db,c->argv[1]);
4518}
4519
70003d28 4520static void mgetCommand(redisClient *c) {
70003d28 4521 int j;
e0a62c7f 4522
c937aa89 4523 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 4524 for (j = 1; j < c->argc; j++) {
3305306f 4525 robj *o = lookupKeyRead(c->db,c->argv[j]);
4526 if (o == NULL) {
c937aa89 4527 addReply(c,shared.nullbulk);
70003d28 4528 } else {
70003d28 4529 if (o->type != REDIS_STRING) {
c937aa89 4530 addReply(c,shared.nullbulk);
70003d28 4531 } else {
dd88747b 4532 addReplyBulk(c,o);
70003d28 4533 }
4534 }
4535 }
4536}
4537
6c446631 4538static void msetGenericCommand(redisClient *c, int nx) {
906573e7 4539 int j, busykeys = 0;
6c446631 4540
4541 if ((c->argc % 2) == 0) {
454d4e43 4542 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 4543 return;
4544 }
4545 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4546 * set nothing at all if at least one already key exists. */
4547 if (nx) {
4548 for (j = 1; j < c->argc; j += 2) {
906573e7 4549 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4550 busykeys++;
6c446631 4551 }
4552 }
4553 }
906573e7 4554 if (busykeys) {
4555 addReply(c, shared.czero);
4556 return;
4557 }
6c446631 4558
4559 for (j = 1; j < c->argc; j += 2) {
05df7621 4560 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
09241813 4561 dbReplace(c->db,c->argv[j],c->argv[j+1]);
4562 incrRefCount(c->argv[j+1]);
6c446631 4563 removeExpire(c->db,c->argv[j]);
4564 }
4565 server.dirty += (c->argc-1)/2;
4566 addReply(c, nx ? shared.cone : shared.ok);
4567}
4568
4569static void msetCommand(redisClient *c) {
4570 msetGenericCommand(c,0);
4571}
4572
4573static void msetnxCommand(redisClient *c) {
4574 msetGenericCommand(c,1);
4575}
4576
d68ed120 4577static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4578 long long value;
ed9b544e 4579 robj *o;
e0a62c7f 4580
3305306f 4581 o = lookupKeyWrite(c->db,c->argv[1]);
6485f293
PN
4582 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4583 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
ed9b544e 4584
4585 value += incr;
d6f4c262 4586 o = createStringObjectFromLongLong(value);
09241813 4587 dbReplace(c->db,c->argv[1],o);
ed9b544e 4588 server.dirty++;
c937aa89 4589 addReply(c,shared.colon);
ed9b544e 4590 addReply(c,o);
4591 addReply(c,shared.crlf);
4592}
4593
4594static void incrCommand(redisClient *c) {
a4d1ba9a 4595 incrDecrCommand(c,1);
ed9b544e 4596}
4597
4598static void decrCommand(redisClient *c) {
a4d1ba9a 4599 incrDecrCommand(c,-1);
ed9b544e 4600}
4601
4602static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4603 long long incr;
4604
bd79a6bd 4605 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4606 incrDecrCommand(c,incr);
ed9b544e 4607}
4608
4609static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4610 long long incr;
4611
bd79a6bd 4612 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4613 incrDecrCommand(c,-incr);
ed9b544e 4614}
4615
4b00bebd 4616static void appendCommand(redisClient *c) {
4617 int retval;
4618 size_t totlen;
4619 robj *o;
4620
4621 o = lookupKeyWrite(c->db,c->argv[1]);
4622 if (o == NULL) {
4623 /* Create the key */
09241813 4624 retval = dbAdd(c->db,c->argv[1],c->argv[2]);
4b00bebd 4625 incrRefCount(c->argv[2]);
4626 totlen = stringObjectLen(c->argv[2]);
4627 } else {
4b00bebd 4628 if (o->type != REDIS_STRING) {
4629 addReply(c,shared.wrongtypeerr);
4630 return;
4631 }
4632 /* If the object is specially encoded or shared we have to make
4633 * a copy */
4634 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4635 robj *decoded = getDecodedObject(o);
4636
4637 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4638 decrRefCount(decoded);
09241813 4639 dbReplace(c->db,c->argv[1],o);
4b00bebd 4640 }
4641 /* APPEND! */
4642 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4643 o->ptr = sdscatlen(o->ptr,
4644 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4645 } else {
4646 o->ptr = sdscatprintf(o->ptr, "%ld",
4647 (unsigned long) c->argv[2]->ptr);
4648 }
4649 totlen = sdslen(o->ptr);
4650 }
4651 server.dirty++;
4652 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4653}
4654
39191553 4655static void substrCommand(redisClient *c) {
4656 robj *o;
4657 long start = atoi(c->argv[2]->ptr);
4658 long end = atoi(c->argv[3]->ptr);
dd88747b 4659 size_t rangelen, strlen;
4660 sds range;
39191553 4661
dd88747b 4662 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4663 checkType(c,o,REDIS_STRING)) return;
39191553 4664
dd88747b 4665 o = getDecodedObject(o);
4666 strlen = sdslen(o->ptr);
8fe7fad7 4667
dd88747b 4668 /* convert negative indexes */
4669 if (start < 0) start = strlen+start;
4670 if (end < 0) end = strlen+end;
4671 if (start < 0) start = 0;
4672 if (end < 0) end = 0;
39191553 4673
dd88747b 4674 /* indexes sanity checks */
4675 if (start > end || (size_t)start >= strlen) {
4676 /* Out of range start or start > end result in null reply */
4677 addReply(c,shared.nullbulk);
4678 decrRefCount(o);
4679 return;
39191553 4680 }
dd88747b 4681 if ((size_t)end >= strlen) end = strlen-1;
4682 rangelen = (end-start)+1;
4683
4684 /* Return the result */
4685 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4686 range = sdsnewlen((char*)o->ptr+start,rangelen);
4687 addReplySds(c,range);
4688 addReply(c,shared.crlf);
4689 decrRefCount(o);
39191553 4690}
4691
ed9b544e 4692/* ========================= Type agnostic commands ========================= */
4693
4694static void delCommand(redisClient *c) {
5109cdff 4695 int deleted = 0, j;
4696
4697 for (j = 1; j < c->argc; j++) {
09241813 4698 if (dbDelete(c->db,c->argv[j])) {
37ab76c9 4699 touchWatchedKey(c->db,c->argv[j]);
5109cdff 4700 server.dirty++;
4701 deleted++;
4702 }
4703 }
482b672d 4704 addReplyLongLong(c,deleted);
ed9b544e 4705}
4706
4707static void existsCommand(redisClient *c) {
f4f06efc 4708 expireIfNeeded(c->db,c->argv[1]);
09241813 4709 if (dbExists(c->db,c->argv[1])) {
f4f06efc
PN
4710 addReply(c, shared.cone);
4711 } else {
4712 addReply(c, shared.czero);
4713 }
ed9b544e 4714}
4715
4716static void selectCommand(redisClient *c) {
4717 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4718
ed9b544e 4719 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4720 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4721 } else {
4722 addReply(c,shared.ok);
4723 }
4724}
4725
4726static void randomkeyCommand(redisClient *c) {
dc4be23e 4727 robj *key;
e0a62c7f 4728
09241813 4729 if ((key = dbRandomKey(c->db)) == NULL) {
dc4be23e 4730 addReply(c,shared.nullbulk);
4731 return;
4732 }
4733
09241813 4734 addReplyBulk(c,key);
4735 decrRefCount(key);
ed9b544e 4736}
4737
4738static void keysCommand(redisClient *c) {
4739 dictIterator *di;
4740 dictEntry *de;
4741 sds pattern = c->argv[1]->ptr;
4742 int plen = sdslen(pattern);
a3f9eec2 4743 unsigned long numkeys = 0;
ed9b544e 4744 robj *lenobj = createObject(REDIS_STRING,NULL);
4745
3305306f 4746 di = dictGetIterator(c->db->dict);
ed9b544e 4747 addReply(c,lenobj);
4748 decrRefCount(lenobj);
4749 while((de = dictNext(di)) != NULL) {
09241813 4750 sds key = dictGetEntryKey(de);
4751 robj *keyobj;
3305306f 4752
ed9b544e 4753 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4754 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
09241813 4755 keyobj = createStringObject(key,sdslen(key));
3305306f 4756 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4757 addReplyBulk(c,keyobj);
3305306f 4758 numkeys++;
3305306f 4759 }
09241813 4760 decrRefCount(keyobj);
ed9b544e 4761 }
4762 }
4763 dictReleaseIterator(di);
a3f9eec2 4764 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4765}
4766
4767static void dbsizeCommand(redisClient *c) {
4768 addReplySds(c,
3305306f 4769 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4770}
4771
4772static void lastsaveCommand(redisClient *c) {
4773 addReplySds(c,
c937aa89 4774 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4775}
4776
4777static void typeCommand(redisClient *c) {
3305306f 4778 robj *o;
ed9b544e 4779 char *type;
3305306f 4780
4781 o = lookupKeyRead(c->db,c->argv[1]);
4782 if (o == NULL) {
c937aa89 4783 type = "+none";
ed9b544e 4784 } else {
ed9b544e 4785 switch(o->type) {
c937aa89 4786 case REDIS_STRING: type = "+string"; break;
4787 case REDIS_LIST: type = "+list"; break;
4788 case REDIS_SET: type = "+set"; break;
412a8bce 4789 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4790 case REDIS_HASH: type = "+hash"; break;
4791 default: type = "+unknown"; break;
ed9b544e 4792 }
4793 }
4794 addReplySds(c,sdsnew(type));
4795 addReply(c,shared.crlf);
4796}
4797
4798static void saveCommand(redisClient *c) {
9d65a1bb 4799 if (server.bgsavechildpid != -1) {
05557f6d 4800 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4801 return;
4802 }
f78fd11b 4803 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4804 addReply(c,shared.ok);
4805 } else {
4806 addReply(c,shared.err);
4807 }
4808}
4809
4810static void bgsaveCommand(redisClient *c) {
9d65a1bb 4811 if (server.bgsavechildpid != -1) {
ed9b544e 4812 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4813 return;
4814 }
f78fd11b 4815 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4816 char *status = "+Background saving started\r\n";
4817 addReplySds(c,sdsnew(status));
ed9b544e 4818 } else {
4819 addReply(c,shared.err);
4820 }
4821}
4822
4823static void shutdownCommand(redisClient *c) {
fab43727 4824 if (prepareForShutdown() == REDIS_OK)
4825 exit(0);
4826 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
ed9b544e 4827}
4828
4829static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4830 robj *o;
4831
4832 /* To use the same key as src and dst is probably an error */
4833 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4834 addReply(c,shared.sameobjecterr);
ed9b544e 4835 return;
4836 }
4837
dd88747b 4838 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4839 return;
dd88747b 4840
ed9b544e 4841 incrRefCount(o);
3305306f 4842 deleteIfVolatile(c->db,c->argv[2]);
09241813 4843 if (dbAdd(c->db,c->argv[2],o) == REDIS_ERR) {
ed9b544e 4844 if (nx) {
4845 decrRefCount(o);
c937aa89 4846 addReply(c,shared.czero);
ed9b544e 4847 return;
4848 }
09241813 4849 dbReplace(c->db,c->argv[2],o);
ed9b544e 4850 }
09241813 4851 dbDelete(c->db,c->argv[1]);
b167f877 4852 touchWatchedKey(c->db,c->argv[2]);
ed9b544e 4853 server.dirty++;
c937aa89 4854 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4855}
4856
4857static void renameCommand(redisClient *c) {
4858 renameGenericCommand(c,0);
4859}
4860
4861static void renamenxCommand(redisClient *c) {
4862 renameGenericCommand(c,1);
4863}
4864
4865static void moveCommand(redisClient *c) {
3305306f 4866 robj *o;
4867 redisDb *src, *dst;
ed9b544e 4868 int srcid;
4869
4870 /* Obtain source and target DB pointers */
3305306f 4871 src = c->db;
4872 srcid = c->db->id;
ed9b544e 4873 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4874 addReply(c,shared.outofrangeerr);
ed9b544e 4875 return;
4876 }
3305306f 4877 dst = c->db;
4878 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4879
4880 /* If the user is moving using as target the same
4881 * DB as the source DB it is probably an error. */
4882 if (src == dst) {
c937aa89 4883 addReply(c,shared.sameobjecterr);
ed9b544e 4884 return;
4885 }
4886
4887 /* Check if the element exists and get a reference */
3305306f 4888 o = lookupKeyWrite(c->db,c->argv[1]);
4889 if (!o) {
c937aa89 4890 addReply(c,shared.czero);
ed9b544e 4891 return;
4892 }
4893
4894 /* Try to add the element to the target DB */
3305306f 4895 deleteIfVolatile(dst,c->argv[1]);
09241813 4896 if (dbAdd(dst,c->argv[1],o) == REDIS_ERR) {
c937aa89 4897 addReply(c,shared.czero);
ed9b544e 4898 return;
4899 }
ed9b544e 4900 incrRefCount(o);
4901
4902 /* OK! key moved, free the entry in the source DB */
09241813 4903 dbDelete(src,c->argv[1]);
ed9b544e 4904 server.dirty++;
c937aa89 4905 addReply(c,shared.cone);
ed9b544e 4906}
4907
4908/* =================================== Lists ================================ */
d0686e07
PN
4909
4910
4911/* Check the argument length to see if it requires us to convert the ziplist
4912 * to a real list. Only check raw-encoded objects because integer encoded
4913 * objects are never too long. */
003f0840 4914static void listTypeTryConversion(robj *subject, robj *value) {
d0686e07
PN
4915 if (subject->encoding != REDIS_ENCODING_ZIPLIST) return;
4916 if (value->encoding == REDIS_ENCODING_RAW &&
4917 sdslen(value->ptr) > server.list_max_ziplist_value)
003f0840 4918 listTypeConvert(subject,REDIS_ENCODING_LIST);
d0686e07
PN
4919}
4920
003f0840 4921static void listTypePush(robj *subject, robj *value, int where) {
d0686e07 4922 /* Check if we need to convert the ziplist */
003f0840 4923 listTypeTryConversion(subject,value);
d0686e07
PN
4924 if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
4925 ziplistLen(subject->ptr) > server.list_max_ziplist_entries)
003f0840 4926 listTypeConvert(subject,REDIS_ENCODING_LIST);
d0686e07 4927
c7d9d662
PN
4928 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4929 int pos = (where == REDIS_HEAD) ? ZIPLIST_HEAD : ZIPLIST_TAIL;
4930 value = getDecodedObject(value);
4931 subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),pos);
4932 decrRefCount(value);
4933 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4934 if (where == REDIS_HEAD) {
4935 listAddNodeHead(subject->ptr,value);
4936 } else {
4937 listAddNodeTail(subject->ptr,value);
4938 }
4939 incrRefCount(value);
4940 } else {
4941 redisPanic("Unknown list encoding");
4942 }
4943}
4944
003f0840 4945static robj *listTypePop(robj *subject, int where) {
d72562f7
PN
4946 robj *value = NULL;
4947 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4948 unsigned char *p;
b6eb9703 4949 unsigned char *vstr;
d72562f7 4950 unsigned int vlen;
b6eb9703 4951 long long vlong;
d72562f7
PN
4952 int pos = (where == REDIS_HEAD) ? 0 : -1;
4953 p = ziplistIndex(subject->ptr,pos);
b6eb9703
PN
4954 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
4955 if (vstr) {
4956 value = createStringObject((char*)vstr,vlen);
d72562f7 4957 } else {
b6eb9703 4958 value = createStringObjectFromLongLong(vlong);
d72562f7 4959 }
0f62e177
PN
4960 /* We only need to delete an element when it exists */
4961 subject->ptr = ziplistDelete(subject->ptr,&p);
d72562f7 4962 }
d72562f7
PN
4963 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4964 list *list = subject->ptr;
4965 listNode *ln;
4966 if (where == REDIS_HEAD) {
4967 ln = listFirst(list);
4968 } else {
4969 ln = listLast(list);
4970 }
4971 if (ln != NULL) {
4972 value = listNodeValue(ln);
4973 incrRefCount(value);
4974 listDelNode(list,ln);
4975 }
4976 } else {
4977 redisPanic("Unknown list encoding");
4978 }
4979 return value;
4980}
4981
003f0840 4982static unsigned long listTypeLength(robj *subject) {
d72562f7
PN
4983 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4984 return ziplistLen(subject->ptr);
4985 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4986 return listLength((list*)subject->ptr);
4987 } else {
4988 redisPanic("Unknown list encoding");
4989 }
4990}
4991
a6dd455b
PN
4992/* Structure to hold set iteration abstraction. */
4993typedef struct {
4994 robj *subject;
4995 unsigned char encoding;
be02a7c0 4996 unsigned char direction; /* Iteration direction */
a6dd455b
PN
4997 unsigned char *zi;
4998 listNode *ln;
003f0840 4999} listTypeIterator;
a6dd455b 5000
be02a7c0
PN
5001/* Structure for an entry while iterating over a list. */
5002typedef struct {
003f0840 5003 listTypeIterator *li;
be02a7c0
PN
5004 unsigned char *zi; /* Entry in ziplist */
5005 listNode *ln; /* Entry in linked list */
003f0840 5006} listTypeEntry;
be02a7c0 5007
a6dd455b 5008/* Initialize an iterator at the specified index. */
003f0840
PN
5009static listTypeIterator *listTypeInitIterator(robj *subject, int index, unsigned char direction) {
5010 listTypeIterator *li = zmalloc(sizeof(listTypeIterator));
a6dd455b
PN
5011 li->subject = subject;
5012 li->encoding = subject->encoding;
be02a7c0 5013 li->direction = direction;
a6dd455b
PN
5014 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5015 li->zi = ziplistIndex(subject->ptr,index);
5016 } else if (li->encoding == REDIS_ENCODING_LIST) {
5017 li->ln = listIndex(subject->ptr,index);
5018 } else {
5019 redisPanic("Unknown list encoding");
5020 }
5021 return li;
5022}
5023
5024/* Clean up the iterator. */
003f0840 5025static void listTypeReleaseIterator(listTypeIterator *li) {
a6dd455b
PN
5026 zfree(li);
5027}
5028
be02a7c0
PN
5029/* Stores pointer to current the entry in the provided entry structure
5030 * and advances the position of the iterator. Returns 1 when the current
5031 * entry is in fact an entry, 0 otherwise. */
003f0840 5032static int listTypeNext(listTypeIterator *li, listTypeEntry *entry) {
dda20542
PN
5033 /* Protect from converting when iterating */
5034 redisAssert(li->subject->encoding == li->encoding);
5035
be02a7c0 5036 entry->li = li;
d2ee16ab 5037 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
be02a7c0
PN
5038 entry->zi = li->zi;
5039 if (entry->zi != NULL) {
5040 if (li->direction == REDIS_TAIL)
5041 li->zi = ziplistNext(li->subject->ptr,li->zi);
5042 else
5043 li->zi = ziplistPrev(li->subject->ptr,li->zi);
5044 return 1;
5045 }
d2ee16ab 5046 } else if (li->encoding == REDIS_ENCODING_LIST) {
be02a7c0
PN
5047 entry->ln = li->ln;
5048 if (entry->ln != NULL) {
5049 if (li->direction == REDIS_TAIL)
5050 li->ln = li->ln->next;
5051 else
5052 li->ln = li->ln->prev;
5053 return 1;
5054 }
d2ee16ab
PN
5055 } else {
5056 redisPanic("Unknown list encoding");
5057 }
be02a7c0 5058 return 0;
d2ee16ab
PN
5059}
5060
a6dd455b 5061/* Return entry or NULL at the current position of the iterator. */
003f0840
PN
5062static robj *listTypeGet(listTypeEntry *entry) {
5063 listTypeIterator *li = entry->li;
a6dd455b
PN
5064 robj *value = NULL;
5065 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
b6eb9703 5066 unsigned char *vstr;
a6dd455b 5067 unsigned int vlen;
b6eb9703 5068 long long vlong;
be02a7c0 5069 redisAssert(entry->zi != NULL);
b6eb9703
PN
5070 if (ziplistGet(entry->zi,&vstr,&vlen,&vlong)) {
5071 if (vstr) {
5072 value = createStringObject((char*)vstr,vlen);
a6dd455b 5073 } else {
b6eb9703 5074 value = createStringObjectFromLongLong(vlong);
a6dd455b
PN
5075 }
5076 }
5077 } else if (li->encoding == REDIS_ENCODING_LIST) {
be02a7c0
PN
5078 redisAssert(entry->ln != NULL);
5079 value = listNodeValue(entry->ln);
a6dd455b
PN
5080 incrRefCount(value);
5081 } else {
5082 redisPanic("Unknown list encoding");
5083 }
5084 return value;
5085}
5086
d2ee16ab 5087/* Compare the given object with the entry at the current position. */
003f0840
PN
5088static int listTypeEqual(listTypeEntry *entry, robj *o) {
5089 listTypeIterator *li = entry->li;
d2ee16ab
PN
5090 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5091 redisAssert(o->encoding == REDIS_ENCODING_RAW);
be02a7c0 5092 return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr));
d2ee16ab 5093 } else if (li->encoding == REDIS_ENCODING_LIST) {
be02a7c0 5094 return equalStringObjects(o,listNodeValue(entry->ln));
d2ee16ab
PN
5095 } else {
5096 redisPanic("Unknown list encoding");
5097 }
5098}
5099
be02a7c0 5100/* Delete the element pointed to. */
003f0840
PN
5101static void listTypeDelete(listTypeEntry *entry) {
5102 listTypeIterator *li = entry->li;
a6dd455b 5103 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
be02a7c0
PN
5104 unsigned char *p = entry->zi;
5105 li->subject->ptr = ziplistDelete(li->subject->ptr,&p);
5106
5107 /* Update position of the iterator depending on the direction */
5108 if (li->direction == REDIS_TAIL)
5109 li->zi = p;
a6dd455b 5110 else
be02a7c0
PN
5111 li->zi = ziplistPrev(li->subject->ptr,p);
5112 } else if (entry->li->encoding == REDIS_ENCODING_LIST) {
5113 listNode *next;
5114 if (li->direction == REDIS_TAIL)
5115 next = entry->ln->next;
a6dd455b 5116 else
be02a7c0
PN
5117 next = entry->ln->prev;
5118 listDelNode(li->subject->ptr,entry->ln);
5119 li->ln = next;
a6dd455b
PN
5120 } else {
5121 redisPanic("Unknown list encoding");
5122 }
5123}
3305306f 5124
003f0840
PN
5125static void listTypeConvert(robj *subject, int enc) {
5126 listTypeIterator *li;
5127 listTypeEntry entry;
d0686e07
PN
5128 redisAssert(subject->type == REDIS_LIST);
5129
5130 if (enc == REDIS_ENCODING_LIST) {
5131 list *l = listCreate();
cd627d4e 5132 listSetFreeMethod(l,decrRefCount);
d0686e07 5133
003f0840
PN
5134 /* listTypeGet returns a robj with incremented refcount */
5135 li = listTypeInitIterator(subject,0,REDIS_TAIL);
5136 while (listTypeNext(li,&entry)) listAddNodeTail(l,listTypeGet(&entry));
5137 listTypeReleaseIterator(li);
d0686e07
PN
5138
5139 subject->encoding = REDIS_ENCODING_LIST;
5140 zfree(subject->ptr);
5141 subject->ptr = l;
5142 } else {
5143 redisPanic("Unsupported list conversion");
5144 }
5145}
5146
c7d9d662
PN
5147static void pushGenericCommand(redisClient *c, int where) {
5148 robj *lobj = lookupKeyWrite(c->db,c->argv[1]);
3305306f 5149 if (lobj == NULL) {
95242ab5 5150 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 5151 addReply(c,shared.cone);
95242ab5 5152 return;
5153 }
1cd92e7f 5154 lobj = createZiplistObject();
09241813 5155 dbAdd(c->db,c->argv[1],lobj);
ed9b544e 5156 } else {
ed9b544e 5157 if (lobj->type != REDIS_LIST) {
5158 addReply(c,shared.wrongtypeerr);
5159 return;
5160 }
95242ab5 5161 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 5162 addReply(c,shared.cone);
95242ab5 5163 return;
5164 }
ed9b544e 5165 }
003f0840
PN
5166 listTypePush(lobj,c->argv[2],where);
5167 addReplyLongLong(c,listTypeLength(lobj));
ed9b544e 5168 server.dirty++;
ed9b544e 5169}
5170
5171static void lpushCommand(redisClient *c) {
5172 pushGenericCommand(c,REDIS_HEAD);
5173}
5174
5175static void rpushCommand(redisClient *c) {
5176 pushGenericCommand(c,REDIS_TAIL);
5177}
5178
5179static void llenCommand(redisClient *c) {
d72562f7
PN
5180 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
5181 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
003f0840 5182 addReplyUlong(c,listTypeLength(o));
ed9b544e 5183}
5184
5185static void lindexCommand(redisClient *c) {
697bd567
PN
5186 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
5187 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
ed9b544e 5188 int index = atoi(c->argv[2]->ptr);
bd8db0ad 5189 robj *value = NULL;
dd88747b 5190
697bd567
PN
5191 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5192 unsigned char *p;
b6eb9703 5193 unsigned char *vstr;
697bd567 5194 unsigned int vlen;
b6eb9703 5195 long long vlong;
697bd567 5196 p = ziplistIndex(o->ptr,index);
b6eb9703
PN
5197 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
5198 if (vstr) {
5199 value = createStringObject((char*)vstr,vlen);
697bd567 5200 } else {
b6eb9703 5201 value = createStringObjectFromLongLong(vlong);
697bd567 5202 }
bd8db0ad
PN
5203 addReplyBulk(c,value);
5204 decrRefCount(value);
697bd567
PN
5205 } else {
5206 addReply(c,shared.nullbulk);
5207 }
5208 } else if (o->encoding == REDIS_ENCODING_LIST) {
5209 listNode *ln = listIndex(o->ptr,index);
5210 if (ln != NULL) {
bd8db0ad
PN
5211 value = listNodeValue(ln);
5212 addReplyBulk(c,value);
697bd567
PN
5213 } else {
5214 addReply(c,shared.nullbulk);
5215 }
ed9b544e 5216 } else {
697bd567 5217 redisPanic("Unknown list encoding");
ed9b544e 5218 }
5219}
5220
5221static void lsetCommand(redisClient *c) {
697bd567
PN
5222 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr);
5223 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
ed9b544e 5224 int index = atoi(c->argv[2]->ptr);
697bd567 5225 robj *value = c->argv[3];
dd88747b 5226
003f0840 5227 listTypeTryConversion(o,value);
697bd567
PN
5228 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5229 unsigned char *p, *zl = o->ptr;
5230 p = ziplistIndex(zl,index);
5231 if (p == NULL) {
5232 addReply(c,shared.outofrangeerr);
5233 } else {
be02a7c0 5234 o->ptr = ziplistDelete(o->ptr,&p);
697bd567
PN
5235 value = getDecodedObject(value);
5236 o->ptr = ziplistInsert(o->ptr,p,value->ptr,sdslen(value->ptr));
5237 decrRefCount(value);
5238 addReply(c,shared.ok);
5239 server.dirty++;
5240 }
5241 } else if (o->encoding == REDIS_ENCODING_LIST) {
5242 listNode *ln = listIndex(o->ptr,index);
5243 if (ln == NULL) {
5244 addReply(c,shared.outofrangeerr);
5245 } else {
5246 decrRefCount((robj*)listNodeValue(ln));
5247 listNodeValue(ln) = value;
5248 incrRefCount(value);
5249 addReply(c,shared.ok);
5250 server.dirty++;
5251 }
ed9b544e 5252 } else {
697bd567 5253 redisPanic("Unknown list encoding");
ed9b544e 5254 }
5255}
5256
5257static void popGenericCommand(redisClient *c, int where) {
d72562f7
PN
5258 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
5259 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
3305306f 5260
003f0840 5261 robj *value = listTypePop(o,where);
d72562f7 5262 if (value == NULL) {
dd88747b 5263 addReply(c,shared.nullbulk);
5264 } else {
d72562f7
PN
5265 addReplyBulk(c,value);
5266 decrRefCount(value);
003f0840 5267 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5268 server.dirty++;
ed9b544e 5269 }
5270}
5271
5272static void lpopCommand(redisClient *c) {
5273 popGenericCommand(c,REDIS_HEAD);
5274}
5275
5276static void rpopCommand(redisClient *c) {
5277 popGenericCommand(c,REDIS_TAIL);
5278}
5279
5280static void lrangeCommand(redisClient *c) {
a6dd455b 5281 robj *o, *value;
ed9b544e 5282 int start = atoi(c->argv[2]->ptr);
5283 int end = atoi(c->argv[3]->ptr);
dd88747b 5284 int llen;
5285 int rangelen, j;
003f0840 5286 listTypeEntry entry;
dd88747b 5287
4e27f268 5288 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5289 || checkType(c,o,REDIS_LIST)) return;
003f0840 5290 llen = listTypeLength(o);
dd88747b 5291
5292 /* convert negative indexes */
5293 if (start < 0) start = llen+start;
5294 if (end < 0) end = llen+end;
5295 if (start < 0) start = 0;
5296 if (end < 0) end = 0;
5297
5298 /* indexes sanity checks */
5299 if (start > end || start >= llen) {
5300 /* Out of range start or start > end result in empty list */
5301 addReply(c,shared.emptymultibulk);
5302 return;
5303 }
5304 if (end >= llen) end = llen-1;
5305 rangelen = (end-start)+1;
3305306f 5306
dd88747b 5307 /* Return the result in form of a multi-bulk reply */
dd88747b 5308 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
003f0840 5309 listTypeIterator *li = listTypeInitIterator(o,start,REDIS_TAIL);
dd88747b 5310 for (j = 0; j < rangelen; j++) {
003f0840
PN
5311 redisAssert(listTypeNext(li,&entry));
5312 value = listTypeGet(&entry);
a6dd455b 5313 addReplyBulk(c,value);
be02a7c0 5314 decrRefCount(value);
ed9b544e 5315 }
003f0840 5316 listTypeReleaseIterator(li);
ed9b544e 5317}
5318
5319static void ltrimCommand(redisClient *c) {
3305306f 5320 robj *o;
ed9b544e 5321 int start = atoi(c->argv[2]->ptr);
5322 int end = atoi(c->argv[3]->ptr);
dd88747b 5323 int llen;
5324 int j, ltrim, rtrim;
5325 list *list;
5326 listNode *ln;
5327
5328 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
5329 checkType(c,o,REDIS_LIST)) return;
003f0840 5330 llen = listTypeLength(o);
dd88747b 5331
5332 /* convert negative indexes */
5333 if (start < 0) start = llen+start;
5334 if (end < 0) end = llen+end;
5335 if (start < 0) start = 0;
5336 if (end < 0) end = 0;
5337
5338 /* indexes sanity checks */
5339 if (start > end || start >= llen) {
5340 /* Out of range start or start > end result in empty list */
5341 ltrim = llen;
5342 rtrim = 0;
ed9b544e 5343 } else {
dd88747b 5344 if (end >= llen) end = llen-1;
5345 ltrim = start;
5346 rtrim = llen-end-1;
5347 }
ed9b544e 5348
dd88747b 5349 /* Remove list elements to perform the trim */
9ae6b0be
PN
5350 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5351 o->ptr = ziplistDeleteRange(o->ptr,0,ltrim);
5352 o->ptr = ziplistDeleteRange(o->ptr,-rtrim,rtrim);
5353 } else if (o->encoding == REDIS_ENCODING_LIST) {
5354 list = o->ptr;
5355 for (j = 0; j < ltrim; j++) {
5356 ln = listFirst(list);
5357 listDelNode(list,ln);
5358 }
5359 for (j = 0; j < rtrim; j++) {
5360 ln = listLast(list);
5361 listDelNode(list,ln);
5362 }
5363 } else {
5364 redisPanic("Unknown list encoding");
ed9b544e 5365 }
003f0840 5366 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5367 server.dirty++;
5368 addReply(c,shared.ok);
ed9b544e 5369}
5370
5371static void lremCommand(redisClient *c) {
d2ee16ab 5372 robj *subject, *obj = c->argv[3];
dd88747b 5373 int toremove = atoi(c->argv[2]->ptr);
5374 int removed = 0;
003f0840 5375 listTypeEntry entry;
a4d1ba9a 5376
d2ee16ab
PN
5377 subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
5378 if (subject == NULL || checkType(c,subject,REDIS_LIST)) return;
dd88747b 5379
d2ee16ab
PN
5380 /* Make sure obj is raw when we're dealing with a ziplist */
5381 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5382 obj = getDecodedObject(obj);
5383
003f0840 5384 listTypeIterator *li;
dd88747b 5385 if (toremove < 0) {
5386 toremove = -toremove;
003f0840 5387 li = listTypeInitIterator(subject,-1,REDIS_HEAD);
d2ee16ab 5388 } else {
003f0840 5389 li = listTypeInitIterator(subject,0,REDIS_TAIL);
dd88747b 5390 }
dd88747b 5391
003f0840
PN
5392 while (listTypeNext(li,&entry)) {
5393 if (listTypeEqual(&entry,obj)) {
5394 listTypeDelete(&entry);
dd88747b 5395 server.dirty++;
5396 removed++;
3fbf9001 5397 if (toremove && removed == toremove) break;
ed9b544e 5398 }
5399 }
003f0840 5400 listTypeReleaseIterator(li);
d2ee16ab
PN
5401
5402 /* Clean up raw encoded object */
5403 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5404 decrRefCount(obj);
5405
003f0840 5406 if (listTypeLength(subject) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5407 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 5408}
5409
12f9d551 5410/* This is the semantic of this command:
0f5f7e9a 5411 * RPOPLPUSH srclist dstlist:
12f9d551 5412 * IF LLEN(srclist) > 0
5413 * element = RPOP srclist
5414 * LPUSH dstlist element
5415 * RETURN element
5416 * ELSE
5417 * RETURN nil
5418 * END
5419 * END
5420 *
5421 * The idea is to be able to get an element from a list in a reliable way
5422 * since the element is not just returned but pushed against another list
5423 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5424 */
0f5f7e9a 5425static void rpoplpushcommand(redisClient *c) {
0f62e177 5426 robj *sobj, *value;
dd88747b 5427 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5428 checkType(c,sobj,REDIS_LIST)) return;
12f9d551 5429
003f0840 5430 if (listTypeLength(sobj) == 0) {
12f9d551 5431 addReply(c,shared.nullbulk);
5432 } else {
dd88747b 5433 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
0f62e177 5434 if (dobj && checkType(c,dobj,REDIS_LIST)) return;
003f0840 5435 value = listTypePop(sobj,REDIS_TAIL);
12f9d551 5436
dd88747b 5437 /* Add the element to the target list (unless it's directly
5438 * passed to some BLPOP-ing client */
0f62e177
PN
5439 if (!handleClientsWaitingListPush(c,c->argv[2],value)) {
5440 /* Create the list if the key does not exist */
5441 if (!dobj) {
1cd92e7f 5442 dobj = createZiplistObject();
09241813 5443 dbAdd(c->db,c->argv[2],dobj);
12f9d551 5444 }
003f0840 5445 listTypePush(dobj,value,REDIS_HEAD);
12f9d551 5446 }
dd88747b 5447
5448 /* Send the element to the client as reply as well */
0f62e177
PN
5449 addReplyBulk(c,value);
5450
003f0840 5451 /* listTypePop returns an object with its refcount incremented */
0f62e177 5452 decrRefCount(value);
dd88747b 5453
0f62e177 5454 /* Delete the source list when it is empty */
003f0840 5455 if (listTypeLength(sobj) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5456 server.dirty++;
12f9d551 5457 }
5458}
5459
ed9b544e 5460/* ==================================== Sets ================================ */
5461
35cabcb5
PN
5462static int setTypeAdd(robj *subject, robj *value) {
5463 if (subject->encoding == REDIS_ENCODING_HT) {
5464 if (dictAdd(subject->ptr,value,NULL) == DICT_OK) {
5465 incrRefCount(value);
5466 return 1;
5467 }
5468 } else {
5469 redisPanic("Unknown set encoding");
5470 }
5471 return 0;
5472}
5473
5474static int setTypeRemove(robj *subject, robj *value) {
5475 if (subject->encoding == REDIS_ENCODING_HT) {
5476 if (dictDelete(subject->ptr,value) == DICT_OK) {
5477 if (htNeedsResize(subject->ptr)) dictResize(subject->ptr);
5478 return 1;
5479 }
5480 } else {
5481 redisPanic("Unknown set encoding");
5482 }
5483 return 0;
5484}
5485
5486static int setTypeIsMember(robj *subject, robj *value) {
5487 if (subject->encoding == REDIS_ENCODING_HT) {
5488 return dictFind((dict*)subject->ptr,value) != NULL;
5489 } else {
5490 redisPanic("Unknown set encoding");
5491 }
5492}
5493
5494/* Structure to hold set iteration abstraction. */
5495typedef struct {
5496 int encoding;
5497 dictIterator *di;
5498} setIterator;
5499
5500static setIterator *setTypeInitIterator(robj *subject) {
5501 setIterator *si = zmalloc(sizeof(setIterator));
5502 si->encoding = subject->encoding;
5503 if (si->encoding == REDIS_ENCODING_HT) {
5504 si->di = dictGetIterator(subject->ptr);
5505 } else {
5506 redisPanic("Unknown set encoding");
5507 }
5508 return si;
5509}
5510
5511static void setTypeReleaseIterator(setIterator *si) {
5512 if (si->encoding == REDIS_ENCODING_HT)
5513 dictReleaseIterator(si->di);
5514 zfree(si);
5515}
5516
5517/* Move to the next entry in the set. Returns the object at the current
5518 * position, or NULL when the end is reached. This object will have its
5519 * refcount incremented, so the caller needs to take care of this. */
5520static robj *setTypeNext(setIterator *si) {
5521 robj *ret = NULL;
5522 if (si->encoding == REDIS_ENCODING_HT) {
5523 dictEntry *de = dictNext(si->di);
5524 if (de != NULL) {
5525 ret = dictGetEntryKey(de);
5526 incrRefCount(ret);
5527 }
5528 }
5529 return ret;
5530}
5531
5532
5533/* Return random element from set. The returned object will always have
5534 * an incremented refcount. */
5535robj *setTypeRandomElement(robj *subject) {
5536 robj *ret = NULL;
5537 if (subject->encoding == REDIS_ENCODING_HT) {
5538 dictEntry *de = dictGetRandomKey(subject->ptr);
5539 ret = dictGetEntryKey(de);
5540 incrRefCount(ret);
5541 } else {
5542 redisPanic("Unknown set encoding");
5543 }
5544 return ret;
5545}
5546
5547static unsigned long setTypeSize(robj *subject) {
5548 if (subject->encoding == REDIS_ENCODING_HT) {
5549 return dictSize((dict*)subject->ptr);
5550 } else {
5551 redisPanic("Unknown set encoding");
5552 }
5553}
5554
ed9b544e 5555static void saddCommand(redisClient *c) {
ed9b544e 5556 robj *set;
5557
3305306f 5558 set = lookupKeyWrite(c->db,c->argv[1]);
5559 if (set == NULL) {
ed9b544e 5560 set = createSetObject();
09241813 5561 dbAdd(c->db,c->argv[1],set);
ed9b544e 5562 } else {
ed9b544e 5563 if (set->type != REDIS_SET) {
c937aa89 5564 addReply(c,shared.wrongtypeerr);
ed9b544e 5565 return;
5566 }
5567 }
35cabcb5 5568 if (setTypeAdd(set,c->argv[2])) {
ed9b544e 5569 server.dirty++;
c937aa89 5570 addReply(c,shared.cone);
ed9b544e 5571 } else {
c937aa89 5572 addReply(c,shared.czero);
ed9b544e 5573 }
5574}
5575
5576static void sremCommand(redisClient *c) {
3305306f 5577 robj *set;
ed9b544e 5578
dd88747b 5579 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5580 checkType(c,set,REDIS_SET)) return;
5581
35cabcb5
PN
5582 if (setTypeRemove(set,c->argv[2])) {
5583 if (setTypeSize(set) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5584 server.dirty++;
dd88747b 5585 addReply(c,shared.cone);
ed9b544e 5586 } else {
dd88747b 5587 addReply(c,shared.czero);
ed9b544e 5588 }
5589}
5590
a4460ef4 5591static void smoveCommand(redisClient *c) {
5592 robj *srcset, *dstset;
5593
5594 srcset = lookupKeyWrite(c->db,c->argv[1]);
5595 dstset = lookupKeyWrite(c->db,c->argv[2]);
5596
5597 /* If the source key does not exist return 0, if it's of the wrong type
5598 * raise an error */
5599 if (srcset == NULL || srcset->type != REDIS_SET) {
5600 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5601 return;
5602 }
5603 /* Error if the destination key is not a set as well */
5604 if (dstset && dstset->type != REDIS_SET) {
5605 addReply(c,shared.wrongtypeerr);
5606 return;
5607 }
5608 /* Remove the element from the source set */
35cabcb5 5609 if (!setTypeRemove(srcset,c->argv[3])) {
a4460ef4 5610 /* Key not found in the src set! return zero */
5611 addReply(c,shared.czero);
5612 return;
5613 }
35cabcb5 5614 if (setTypeSize(srcset) == 0 && srcset != dstset)
09241813 5615 dbDelete(c->db,c->argv[1]);
a4460ef4 5616 server.dirty++;
5617 /* Add the element to the destination set */
5618 if (!dstset) {
5619 dstset = createSetObject();
09241813 5620 dbAdd(c->db,c->argv[2],dstset);
a4460ef4 5621 }
35cabcb5 5622 setTypeAdd(dstset,c->argv[3]);
a4460ef4 5623 addReply(c,shared.cone);
5624}
5625
ed9b544e 5626static void sismemberCommand(redisClient *c) {
3305306f 5627 robj *set;
ed9b544e 5628
dd88747b 5629 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5630 checkType(c,set,REDIS_SET)) return;
5631
35cabcb5 5632 if (setTypeIsMember(set,c->argv[2]))
dd88747b 5633 addReply(c,shared.cone);
5634 else
c937aa89 5635 addReply(c,shared.czero);
ed9b544e 5636}
5637
5638static void scardCommand(redisClient *c) {
3305306f 5639 robj *o;
dd88747b 5640
5641 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5642 checkType(c,o,REDIS_SET)) return;
e0a62c7f 5643
35cabcb5 5644 addReplyUlong(c,setTypeSize(o));
ed9b544e 5645}
5646
12fea928 5647static void spopCommand(redisClient *c) {
35cabcb5 5648 robj *set, *ele;
12fea928 5649
dd88747b 5650 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5651 checkType(c,set,REDIS_SET)) return;
5652
35cabcb5
PN
5653 ele = setTypeRandomElement(set);
5654 if (ele == NULL) {
12fea928 5655 addReply(c,shared.nullbulk);
5656 } else {
35cabcb5 5657 setTypeRemove(set,ele);
dd88747b 5658 addReplyBulk(c,ele);
35cabcb5
PN
5659 decrRefCount(ele);
5660 if (setTypeSize(set) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5661 server.dirty++;
12fea928 5662 }
5663}
5664
2abb95a9 5665static void srandmemberCommand(redisClient *c) {
35cabcb5 5666 robj *set, *ele;
2abb95a9 5667
dd88747b 5668 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5669 checkType(c,set,REDIS_SET)) return;
5670
35cabcb5
PN
5671 ele = setTypeRandomElement(set);
5672 if (ele == NULL) {
2abb95a9 5673 addReply(c,shared.nullbulk);
5674 } else {
dd88747b 5675 addReplyBulk(c,ele);
35cabcb5 5676 decrRefCount(ele);
2abb95a9 5677 }
5678}
5679
ed9b544e 5680static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
35cabcb5 5681 return setTypeSize(*(robj**)s1)-setTypeSize(*(robj**)s2);
ed9b544e 5682}
5683
35cabcb5
PN
5684static void sinterGenericCommand(redisClient *c, robj **setkeys, unsigned long setnum, robj *dstkey) {
5685 robj **sets = zmalloc(sizeof(robj*)*setnum);
5686 setIterator *si;
5687 robj *ele, *lenobj = NULL, *dstset = NULL;
682ac724 5688 unsigned long j, cardinality = 0;
ed9b544e 5689
35cabcb5
PN
5690 for (j = 0; j < setnum; j++) {
5691 robj *setobj = dstkey ?
5692 lookupKeyWrite(c->db,setkeys[j]) :
5693 lookupKeyRead(c->db,setkeys[j]);
3305306f 5694 if (!setobj) {
35cabcb5 5695 zfree(sets);
5faa6025 5696 if (dstkey) {
09241813 5697 if (dbDelete(c->db,dstkey))
fdcaae84 5698 server.dirty++;
0d36ded0 5699 addReply(c,shared.czero);
5faa6025 5700 } else {
4e27f268 5701 addReply(c,shared.emptymultibulk);
5faa6025 5702 }
ed9b544e 5703 return;
5704 }
35cabcb5
PN
5705 if (checkType(c,setobj,REDIS_SET)) {
5706 zfree(sets);
ed9b544e 5707 return;
5708 }
35cabcb5 5709 sets[j] = setobj;
ed9b544e 5710 }
5711 /* Sort sets from the smallest to largest, this will improve our
5712 * algorithm's performace */
35cabcb5 5713 qsort(sets,setnum,sizeof(robj*),qsortCompareSetsByCardinality);
ed9b544e 5714
5715 /* The first thing we should output is the total number of elements...
5716 * since this is a multi-bulk write, but at this stage we don't know
5717 * the intersection set size, so we use a trick, append an empty object
5718 * to the output list and save the pointer to later modify it with the
5719 * right length */
5720 if (!dstkey) {
5721 lenobj = createObject(REDIS_STRING,NULL);
5722 addReply(c,lenobj);
5723 decrRefCount(lenobj);
5724 } else {
5725 /* If we have a target key where to store the resulting set
5726 * create this key with an empty set inside */
5727 dstset = createSetObject();
ed9b544e 5728 }
5729
5730 /* Iterate all the elements of the first (smallest) set, and test
5731 * the element against all the other sets, if at least one set does
5732 * not include the element it is discarded */
35cabcb5
PN
5733 si = setTypeInitIterator(sets[0]);
5734 while((ele = setTypeNext(si)) != NULL) {
5735 for (j = 1; j < setnum; j++)
5736 if (!setTypeIsMember(sets[j],ele)) break;
5737
5738 /* Only take action when all sets contain the member */
5739 if (j == setnum) {
5740 if (!dstkey) {
5741 addReplyBulk(c,ele);
5742 cardinality++;
5743 } else {
5744 setTypeAdd(dstset,ele);
5745 }
ed9b544e 5746 }
35cabcb5 5747 decrRefCount(ele);
ed9b544e 5748 }
35cabcb5 5749 setTypeReleaseIterator(si);
ed9b544e 5750
83cdfe18 5751 if (dstkey) {
3ea27d37 5752 /* Store the resulting set into the target, if the intersection
5753 * is not an empty set. */
09241813 5754 dbDelete(c->db,dstkey);
35cabcb5 5755 if (setTypeSize(dstset) > 0) {
09241813 5756 dbAdd(c->db,dstkey,dstset);
35cabcb5 5757 addReplyLongLong(c,setTypeSize(dstset));
3ea27d37 5758 } else {
5759 decrRefCount(dstset);
d36c4e97 5760 addReply(c,shared.czero);
3ea27d37 5761 }
40d224a9 5762 server.dirty++;
d36c4e97 5763 } else {
5764 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 5765 }
35cabcb5 5766 zfree(sets);
ed9b544e 5767}
5768
5769static void sinterCommand(redisClient *c) {
5770 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5771}
5772
5773static void sinterstoreCommand(redisClient *c) {
5774 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5775}
5776
f4f56e1d 5777#define REDIS_OP_UNION 0
5778#define REDIS_OP_DIFF 1
2830ca53 5779#define REDIS_OP_INTER 2
f4f56e1d 5780
35cabcb5
PN
5781static void sunionDiffGenericCommand(redisClient *c, robj **setkeys, int setnum, robj *dstkey, int op) {
5782 robj **sets = zmalloc(sizeof(robj*)*setnum);
5783 setIterator *si;
5784 robj *ele, *dstset = NULL;
40d224a9 5785 int j, cardinality = 0;
5786
35cabcb5
PN
5787 for (j = 0; j < setnum; j++) {
5788 robj *setobj = dstkey ?
5789 lookupKeyWrite(c->db,setkeys[j]) :
5790 lookupKeyRead(c->db,setkeys[j]);
40d224a9 5791 if (!setobj) {
35cabcb5 5792 sets[j] = NULL;
40d224a9 5793 continue;
5794 }
35cabcb5
PN
5795 if (checkType(c,setobj,REDIS_SET)) {
5796 zfree(sets);
40d224a9 5797 return;
5798 }
35cabcb5 5799 sets[j] = setobj;
40d224a9 5800 }
5801
5802 /* We need a temp set object to store our union. If the dstkey
5803 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5804 * this set object will be the resulting object to set into the target key*/
5805 dstset = createSetObject();
5806
40d224a9 5807 /* Iterate all the elements of all the sets, add every element a single
5808 * time to the result set */
35cabcb5
PN
5809 for (j = 0; j < setnum; j++) {
5810 if (op == REDIS_OP_DIFF && j == 0 && !sets[j]) break; /* result set is empty */
5811 if (!sets[j]) continue; /* non existing keys are like empty sets */
40d224a9 5812
35cabcb5
PN
5813 si = setTypeInitIterator(sets[j]);
5814 while((ele = setTypeNext(si)) != NULL) {
f4f56e1d 5815 if (op == REDIS_OP_UNION || j == 0) {
35cabcb5 5816 if (setTypeAdd(dstset,ele)) {
40d224a9 5817 cardinality++;
5818 }
f4f56e1d 5819 } else if (op == REDIS_OP_DIFF) {
35cabcb5 5820 if (setTypeRemove(dstset,ele)) {
f4f56e1d 5821 cardinality--;
5822 }
40d224a9 5823 }
35cabcb5 5824 decrRefCount(ele);
40d224a9 5825 }
35cabcb5 5826 setTypeReleaseIterator(si);
51829ed3 5827
35cabcb5 5828 /* Exit when result set is empty. */
d36c4e97 5829 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5830 }
5831
f4f56e1d 5832 /* Output the content of the resulting set, if not in STORE mode */
5833 if (!dstkey) {
5834 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
35cabcb5
PN
5835 si = setTypeInitIterator(dstset);
5836 while((ele = setTypeNext(si)) != NULL) {
dd88747b 5837 addReplyBulk(c,ele);
35cabcb5 5838 decrRefCount(ele);
f4f56e1d 5839 }
35cabcb5 5840 setTypeReleaseIterator(si);
d36c4e97 5841 decrRefCount(dstset);
83cdfe18
AG
5842 } else {
5843 /* If we have a target key where to store the resulting set
5844 * create this key with the result set inside */
09241813 5845 dbDelete(c->db,dstkey);
35cabcb5 5846 if (setTypeSize(dstset) > 0) {
09241813 5847 dbAdd(c->db,dstkey,dstset);
35cabcb5 5848 addReplyLongLong(c,setTypeSize(dstset));
3ea27d37 5849 } else {
5850 decrRefCount(dstset);
d36c4e97 5851 addReply(c,shared.czero);
3ea27d37 5852 }
40d224a9 5853 server.dirty++;
5854 }
35cabcb5 5855 zfree(sets);
40d224a9 5856}
5857
5858static void sunionCommand(redisClient *c) {
f4f56e1d 5859 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 5860}
5861
5862static void sunionstoreCommand(redisClient *c) {
f4f56e1d 5863 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5864}
5865
5866static void sdiffCommand(redisClient *c) {
5867 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5868}
5869
5870static void sdiffstoreCommand(redisClient *c) {
5871 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 5872}
5873
6b47e12e 5874/* ==================================== ZSets =============================== */
5875
5876/* ZSETs are ordered sets using two data structures to hold the same elements
5877 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5878 * data structure.
5879 *
5880 * The elements are added to an hash table mapping Redis objects to scores.
5881 * At the same time the elements are added to a skip list mapping scores
5882 * to Redis objects (so objects are sorted by scores in this "view"). */
5883
5884/* This skiplist implementation is almost a C translation of the original
5885 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5886 * Alternative to Balanced Trees", modified in three ways:
5887 * a) this implementation allows for repeated values.
5888 * b) the comparison is not just by key (our 'score') but by satellite data.
5889 * c) there is a back pointer, so it's a doubly linked list with the back
5890 * pointers being only at "level 1". This allows to traverse the list
5891 * from tail to head, useful for ZREVRANGE. */
5892
5893static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5894 zskiplistNode *zn = zmalloc(sizeof(*zn));
5895
5896 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2f4dd7e0 5897 if (level > 1)
2b37892e 5898 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
2f4dd7e0 5899 else
5900 zn->span = NULL;
6b47e12e 5901 zn->score = score;
5902 zn->obj = obj;
5903 return zn;
5904}
5905
5906static zskiplist *zslCreate(void) {
5907 int j;
5908 zskiplist *zsl;
e0a62c7f 5909
6b47e12e 5910 zsl = zmalloc(sizeof(*zsl));
5911 zsl->level = 1;
cc812361 5912 zsl->length = 0;
6b47e12e 5913 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5914 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5915 zsl->header->forward[j] = NULL;
94e543b5 5916
5917 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5918 if (j < ZSKIPLIST_MAXLEVEL-1)
5919 zsl->header->span[j] = 0;
69d95c3e 5920 }
e3870fab 5921 zsl->header->backward = NULL;
5922 zsl->tail = NULL;
6b47e12e 5923 return zsl;
5924}
5925
fd8ccf44 5926static void zslFreeNode(zskiplistNode *node) {
5927 decrRefCount(node->obj);
ad807e6f 5928 zfree(node->forward);
69d95c3e 5929 zfree(node->span);
fd8ccf44 5930 zfree(node);
5931}
5932
5933static void zslFree(zskiplist *zsl) {
ad807e6f 5934 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5935
ad807e6f 5936 zfree(zsl->header->forward);
69d95c3e 5937 zfree(zsl->header->span);
ad807e6f 5938 zfree(zsl->header);
fd8ccf44 5939 while(node) {
599379dd 5940 next = node->forward[0];
fd8ccf44 5941 zslFreeNode(node);
5942 node = next;
5943 }
ad807e6f 5944 zfree(zsl);
fd8ccf44 5945}
5946
6b47e12e 5947static int zslRandomLevel(void) {
5948 int level = 1;
5949 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5950 level += 1;
10c2baa5 5951 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5952}
5953
5954static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5955 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5956 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5957 int i, level;
5958
5959 x = zsl->header;
5960 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5961 /* store rank that is crossed to reach the insert position */
5962 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5963
9d60e6e4 5964 while (x->forward[i] &&
5965 (x->forward[i]->score < score ||
5966 (x->forward[i]->score == score &&
69d95c3e 5967 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5968 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5969 x = x->forward[i];
69d95c3e 5970 }
6b47e12e 5971 update[i] = x;
5972 }
6b47e12e 5973 /* we assume the key is not already inside, since we allow duplicated
5974 * scores, and the re-insertion of score and redis object should never
5975 * happpen since the caller of zslInsert() should test in the hash table
5976 * if the element is already inside or not. */
5977 level = zslRandomLevel();
5978 if (level > zsl->level) {
69d95c3e 5979 for (i = zsl->level; i < level; i++) {
2b37892e 5980 rank[i] = 0;
6b47e12e 5981 update[i] = zsl->header;
2b37892e 5982 update[i]->span[i-1] = zsl->length;
69d95c3e 5983 }
6b47e12e 5984 zsl->level = level;
5985 }
5986 x = zslCreateNode(level,score,obj);
5987 for (i = 0; i < level; i++) {
5988 x->forward[i] = update[i]->forward[i];
5989 update[i]->forward[i] = x;
69d95c3e
PN
5990
5991 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5992 if (i > 0) {
5993 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5994 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5995 }
6b47e12e 5996 }
69d95c3e
PN
5997
5998 /* increment span for untouched levels */
5999 for (i = level; i < zsl->level; i++) {
2b37892e 6000 update[i]->span[i-1]++;
69d95c3e
PN
6001 }
6002
bb975144 6003 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 6004 if (x->forward[0])
6005 x->forward[0]->backward = x;
6006 else
6007 zsl->tail = x;
cc812361 6008 zsl->length++;
6b47e12e 6009}
6010
84105336
PN
6011/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
6012void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
6013 int i;
6014 for (i = 0; i < zsl->level; i++) {
6015 if (update[i]->forward[i] == x) {
6016 if (i > 0) {
6017 update[i]->span[i-1] += x->span[i-1] - 1;
6018 }
6019 update[i]->forward[i] = x->forward[i];
6020 } else {
6021 /* invariant: i > 0, because update[0]->forward[0]
6022 * is always equal to x */
6023 update[i]->span[i-1] -= 1;
6024 }
6025 }
6026 if (x->forward[0]) {
6027 x->forward[0]->backward = x->backward;
6028 } else {
6029 zsl->tail = x->backward;
6030 }
6031 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
6032 zsl->level--;
6033 zsl->length--;
6034}
6035
50c55df5 6036/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 6037static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 6038 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6039 int i;
6040
6041 x = zsl->header;
6042 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 6043 while (x->forward[i] &&
6044 (x->forward[i]->score < score ||
6045 (x->forward[i]->score == score &&
6046 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 6047 x = x->forward[i];
6048 update[i] = x;
6049 }
6050 /* We may have multiple elements with the same score, what we need
6051 * is to find the element with both the right score and object. */
6052 x = x->forward[0];
bf028098 6053 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
84105336 6054 zslDeleteNode(zsl, x, update);
9d60e6e4 6055 zslFreeNode(x);
9d60e6e4 6056 return 1;
6057 } else {
6058 return 0; /* not found */
e197b441 6059 }
6060 return 0; /* not found */
fd8ccf44 6061}
6062
1807985b 6063/* Delete all the elements with score between min and max from the skiplist.
6064 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
6065 * Note that this function takes the reference to the hash table view of the
6066 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 6067static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 6068 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6069 unsigned long removed = 0;
6070 int i;
6071
6072 x = zsl->header;
6073 for (i = zsl->level-1; i >= 0; i--) {
6074 while (x->forward[i] && x->forward[i]->score < min)
6075 x = x->forward[i];
6076 update[i] = x;
6077 }
6078 /* We may have multiple elements with the same score, what we need
6079 * is to find the element with both the right score and object. */
6080 x = x->forward[0];
6081 while (x && x->score <= max) {
84105336
PN
6082 zskiplistNode *next = x->forward[0];
6083 zslDeleteNode(zsl, x, update);
1807985b 6084 dictDelete(dict,x->obj);
6085 zslFreeNode(x);
1807985b 6086 removed++;
6087 x = next;
6088 }
6089 return removed; /* not found */
6090}
1807985b 6091
9212eafd 6092/* Delete all the elements with rank between start and end from the skiplist.
2424490f 6093 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
6094static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
6095 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6096 unsigned long traversed = 0, removed = 0;
6097 int i;
6098
9212eafd
PN
6099 x = zsl->header;
6100 for (i = zsl->level-1; i >= 0; i--) {
6101 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
6102 traversed += i > 0 ? x->span[i-1] : 1;
6103 x = x->forward[i];
1807985b 6104 }
9212eafd
PN
6105 update[i] = x;
6106 }
6107
6108 traversed++;
6109 x = x->forward[0];
6110 while (x && traversed <= end) {
84105336
PN
6111 zskiplistNode *next = x->forward[0];
6112 zslDeleteNode(zsl, x, update);
1807985b 6113 dictDelete(dict,x->obj);
6114 zslFreeNode(x);
1807985b 6115 removed++;
9212eafd 6116 traversed++;
1807985b 6117 x = next;
6118 }
9212eafd 6119 return removed;
1807985b 6120}
6121
50c55df5 6122/* Find the first node having a score equal or greater than the specified one.
6123 * Returns NULL if there is no match. */
6124static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
6125 zskiplistNode *x;
6126 int i;
6127
6128 x = zsl->header;
6129 for (i = zsl->level-1; i >= 0; i--) {
6130 while (x->forward[i] && x->forward[i]->score < score)
6131 x = x->forward[i];
6132 }
6133 /* We may have multiple elements with the same score, what we need
6134 * is to find the element with both the right score and object. */
6135 return x->forward[0];
6136}
6137
27b0ccca
PN
6138/* Find the rank for an element by both score and key.
6139 * Returns 0 when the element cannot be found, rank otherwise.
6140 * Note that the rank is 1-based due to the span of zsl->header to the
6141 * first element. */
003f0840 6142static unsigned long zslistTypeGetRank(zskiplist *zsl, double score, robj *o) {
27b0ccca
PN
6143 zskiplistNode *x;
6144 unsigned long rank = 0;
6145 int i;
6146
6147 x = zsl->header;
6148 for (i = zsl->level-1; i >= 0; i--) {
6149 while (x->forward[i] &&
6150 (x->forward[i]->score < score ||
6151 (x->forward[i]->score == score &&
6152 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 6153 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
6154 x = x->forward[i];
6155 }
6156
6157 /* x might be equal to zsl->header, so test if obj is non-NULL */
bf028098 6158 if (x->obj && equalStringObjects(x->obj,o)) {
27b0ccca
PN
6159 return rank;
6160 }
6161 }
6162 return 0;
6163}
6164
e74825c2 6165/* Finds an element by its rank. The rank argument needs to be 1-based. */
003f0840 6166zskiplistNode* zslistTypeGetElementByRank(zskiplist *zsl, unsigned long rank) {
e74825c2
PN
6167 zskiplistNode *x;
6168 unsigned long traversed = 0;
6169 int i;
6170
6171 x = zsl->header;
6172 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 6173 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
6174 {
a50ea45c 6175 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
6176 x = x->forward[i];
6177 }
e74825c2
PN
6178 if (traversed == rank) {
6179 return x;
6180 }
6181 }
6182 return NULL;
6183}
6184
fd8ccf44 6185/* The actual Z-commands implementations */
6186
7db723ad 6187/* This generic command implements both ZADD and ZINCRBY.
e2665397 6188 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 6189 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 6190static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 6191 robj *zsetobj;
6192 zset *zs;
6193 double *score;
6194
5fc9229c 6195 if (isnan(scoreval)) {
6196 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
6197 return;
6198 }
6199
e2665397 6200 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 6201 if (zsetobj == NULL) {
6202 zsetobj = createZsetObject();
09241813 6203 dbAdd(c->db,key,zsetobj);
fd8ccf44 6204 } else {
6205 if (zsetobj->type != REDIS_ZSET) {
6206 addReply(c,shared.wrongtypeerr);
6207 return;
6208 }
6209 }
fd8ccf44 6210 zs = zsetobj->ptr;
e2665397 6211
7db723ad 6212 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 6213 * needs to handle the two different conditions. It's all about setting
6214 * '*score', that is, the new score to set, to the right value. */
6215 score = zmalloc(sizeof(double));
6216 if (doincrement) {
6217 dictEntry *de;
6218
6219 /* Read the old score. If the element was not present starts from 0 */
6220 de = dictFind(zs->dict,ele);
6221 if (de) {
6222 double *oldscore = dictGetEntryVal(de);
6223 *score = *oldscore + scoreval;
6224 } else {
6225 *score = scoreval;
6226 }
5fc9229c 6227 if (isnan(*score)) {
6228 addReplySds(c,
6229 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
6230 zfree(score);
6231 /* Note that we don't need to check if the zset may be empty and
6232 * should be removed here, as we can only obtain Nan as score if
6233 * there was already an element in the sorted set. */
6234 return;
6235 }
e2665397 6236 } else {
6237 *score = scoreval;
6238 }
6239
6240 /* What follows is a simple remove and re-insert operation that is common
7db723ad 6241 * to both ZADD and ZINCRBY... */
e2665397 6242 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 6243 /* case 1: New element */
e2665397 6244 incrRefCount(ele); /* added to hash */
6245 zslInsert(zs->zsl,*score,ele);
6246 incrRefCount(ele); /* added to skiplist */
fd8ccf44 6247 server.dirty++;
e2665397 6248 if (doincrement)
e2665397 6249 addReplyDouble(c,*score);
91d71bfc 6250 else
6251 addReply(c,shared.cone);
fd8ccf44 6252 } else {
6253 dictEntry *de;
6254 double *oldscore;
e0a62c7f 6255
fd8ccf44 6256 /* case 2: Score update operation */
e2665397 6257 de = dictFind(zs->dict,ele);
dfc5e96c 6258 redisAssert(de != NULL);
fd8ccf44 6259 oldscore = dictGetEntryVal(de);
6260 if (*score != *oldscore) {
6261 int deleted;
6262
e2665397 6263 /* Remove and insert the element in the skip list with new score */
6264 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 6265 redisAssert(deleted != 0);
e2665397 6266 zslInsert(zs->zsl,*score,ele);
6267 incrRefCount(ele);
6268 /* Update the score in the hash table */
6269 dictReplace(zs->dict,ele,score);
fd8ccf44 6270 server.dirty++;
2161a965 6271 } else {
6272 zfree(score);
fd8ccf44 6273 }
e2665397 6274 if (doincrement)
6275 addReplyDouble(c,*score);
6276 else
6277 addReply(c,shared.czero);
fd8ccf44 6278 }
6279}
6280
e2665397 6281static void zaddCommand(redisClient *c) {
6282 double scoreval;
6283
bd79a6bd 6284 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 6285 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
6286}
6287
7db723ad 6288static void zincrbyCommand(redisClient *c) {
e2665397 6289 double scoreval;
6290
bd79a6bd 6291 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 6292 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
6293}
6294
1b7106e7 6295static void zremCommand(redisClient *c) {
6296 robj *zsetobj;
6297 zset *zs;
dd88747b 6298 dictEntry *de;
6299 double *oldscore;
6300 int deleted;
1b7106e7 6301
dd88747b 6302 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6303 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 6304
dd88747b 6305 zs = zsetobj->ptr;
6306 de = dictFind(zs->dict,c->argv[2]);
6307 if (de == NULL) {
6308 addReply(c,shared.czero);
6309 return;
1b7106e7 6310 }
dd88747b 6311 /* Delete from the skiplist */
6312 oldscore = dictGetEntryVal(de);
6313 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
6314 redisAssert(deleted != 0);
6315
6316 /* Delete from the hash table */
6317 dictDelete(zs->dict,c->argv[2]);
6318 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
09241813 6319 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 6320 server.dirty++;
6321 addReply(c,shared.cone);
1b7106e7 6322}
6323
1807985b 6324static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
6325 double min;
6326 double max;
dd88747b 6327 long deleted;
1807985b 6328 robj *zsetobj;
6329 zset *zs;
6330
bd79a6bd
PN
6331 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
6332 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
bbe025e0 6333
dd88747b 6334 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6335 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 6336
dd88747b 6337 zs = zsetobj->ptr;
6338 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
6339 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
09241813 6340 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 6341 server.dirty += deleted;
482b672d 6342 addReplyLongLong(c,deleted);
1807985b 6343}
6344
9212eafd 6345static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
6346 long start;
6347 long end;
dd88747b 6348 int llen;
6349 long deleted;
9212eafd
PN
6350 robj *zsetobj;
6351 zset *zs;
6352
bd79a6bd
PN
6353 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6354 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6355
dd88747b 6356 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6357 checkType(c,zsetobj,REDIS_ZSET)) return;
6358 zs = zsetobj->ptr;
6359 llen = zs->zsl->length;
9212eafd 6360
dd88747b 6361 /* convert negative indexes */
6362 if (start < 0) start = llen+start;
6363 if (end < 0) end = llen+end;
6364 if (start < 0) start = 0;
6365 if (end < 0) end = 0;
9212eafd 6366
dd88747b 6367 /* indexes sanity checks */
6368 if (start > end || start >= llen) {
6369 addReply(c,shared.czero);
6370 return;
9212eafd 6371 }
dd88747b 6372 if (end >= llen) end = llen-1;
6373
6374 /* increment start and end because zsl*Rank functions
6375 * use 1-based rank */
6376 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
6377 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
09241813 6378 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 6379 server.dirty += deleted;
482b672d 6380 addReplyLongLong(c, deleted);
9212eafd
PN
6381}
6382
8f92e768
PN
6383typedef struct {
6384 dict *dict;
6385 double weight;
6386} zsetopsrc;
6387
6388static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
6389 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
6390 unsigned long size1, size2;
6391 size1 = d1->dict ? dictSize(d1->dict) : 0;
6392 size2 = d2->dict ? dictSize(d2->dict) : 0;
6393 return size1 - size2;
6394}
6395
d2764cd6
PN
/* Aggregation modes accepted by the AGGREGATE option. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score of a dict entry: entries with a NULL value count as 1.0. */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))

/* Fold 'val' into '*target' according to 'aggregate' (sum, min or max).
 * Any other aggregate value is a programming error and panics. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
    }
}
6413
2830ca53 6414static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
bc000c1d 6415 int i, j, setnum;
d2764cd6 6416 int aggregate = REDIS_AGGR_SUM;
8f92e768 6417 zsetopsrc *src;
2830ca53
PN
6418 robj *dstobj;
6419 zset *dstzset;
b287c9bb
PN
6420 dictIterator *di;
6421 dictEntry *de;
6422
bc000c1d
JC
6423 /* expect setnum input keys to be given */
6424 setnum = atoi(c->argv[2]->ptr);
6425 if (setnum < 1) {
5d373da9 6426 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
2830ca53 6427 return;
b287c9bb 6428 }
2830ca53
PN
6429
6430 /* test if the expected number of keys would overflow */
bc000c1d 6431 if (3+setnum > c->argc) {
b287c9bb
PN
6432 addReply(c,shared.syntaxerr);
6433 return;
6434 }
6435
2830ca53 6436 /* read keys to be used for input */
bc000c1d
JC
6437 src = zmalloc(sizeof(zsetopsrc) * setnum);
6438 for (i = 0, j = 3; i < setnum; i++, j++) {
6439 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6440 if (!obj) {
8f92e768 6441 src[i].dict = NULL;
b287c9bb 6442 } else {
bc000c1d
JC
6443 if (obj->type == REDIS_ZSET) {
6444 src[i].dict = ((zset*)obj->ptr)->dict;
6445 } else if (obj->type == REDIS_SET) {
6446 src[i].dict = (obj->ptr);
6447 } else {
8f92e768 6448 zfree(src);
b287c9bb
PN
6449 addReply(c,shared.wrongtypeerr);
6450 return;
6451 }
b287c9bb 6452 }
2830ca53
PN
6453
6454 /* default all weights to 1 */
8f92e768 6455 src[i].weight = 1.0;
b287c9bb
PN
6456 }
6457
2830ca53
PN
6458 /* parse optional extra arguments */
6459 if (j < c->argc) {
d2764cd6 6460 int remaining = c->argc - j;
b287c9bb 6461
2830ca53 6462 while (remaining) {
bc000c1d 6463 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 6464 j++; remaining--;
bc000c1d 6465 for (i = 0; i < setnum; i++, j++, remaining--) {
bd79a6bd 6466 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
bbe025e0 6467 return;
2830ca53 6468 }
d2764cd6
PN
6469 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6470 j++; remaining--;
6471 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6472 aggregate = REDIS_AGGR_SUM;
6473 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6474 aggregate = REDIS_AGGR_MIN;
6475 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6476 aggregate = REDIS_AGGR_MAX;
6477 } else {
6478 zfree(src);
6479 addReply(c,shared.syntaxerr);
6480 return;
6481 }
6482 j++; remaining--;
2830ca53 6483 } else {
8f92e768 6484 zfree(src);
2830ca53
PN
6485 addReply(c,shared.syntaxerr);
6486 return;
6487 }
6488 }
6489 }
b287c9bb 6490
d2764cd6
PN
6491 /* sort sets from the smallest to largest, this will improve our
6492 * algorithm's performance */
bc000c1d 6493 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
d2764cd6 6494
2830ca53
PN
6495 dstobj = createZsetObject();
6496 dstzset = dstobj->ptr;
6497
6498 if (op == REDIS_OP_INTER) {
8f92e768
PN
6499 /* skip going over all entries if the smallest zset is NULL or empty */
6500 if (src[0].dict && dictSize(src[0].dict) > 0) {
6501 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6502 * from small to large, all src[i > 0].dict are non-empty too */
6503 di = dictGetIterator(src[0].dict);
2830ca53 6504 while((de = dictNext(di)) != NULL) {
d2764cd6 6505 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6506 *score = src[0].weight * zunionInterDictValue(de);
2830ca53 6507
bc000c1d 6508 for (j = 1; j < setnum; j++) {
d2764cd6 6509 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6510 if (other) {
bc000c1d 6511 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6512 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6513 } else {
6514 break;
6515 }
6516 }
b287c9bb 6517
2830ca53 6518 /* skip entry when not present in every source dict */
bc000c1d 6519 if (j != setnum) {
2830ca53
PN
6520 zfree(score);
6521 } else {
6522 robj *o = dictGetEntryKey(de);
6523 dictAdd(dstzset->dict,o,score);
6524 incrRefCount(o); /* added to dictionary */
6525 zslInsert(dstzset->zsl,*score,o);
6526 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
6527 }
6528 }
2830ca53
PN
6529 dictReleaseIterator(di);
6530 }
6531 } else if (op == REDIS_OP_UNION) {
bc000c1d 6532 for (i = 0; i < setnum; i++) {
8f92e768 6533 if (!src[i].dict) continue;
2830ca53 6534
8f92e768 6535 di = dictGetIterator(src[i].dict);
2830ca53
PN
6536 while((de = dictNext(di)) != NULL) {
6537 /* skip key when already processed */
6538 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6539
d2764cd6 6540 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6541 *score = src[i].weight * zunionInterDictValue(de);
2830ca53 6542
d2764cd6
PN
6543 /* because the zsets are sorted by size, its only possible
6544 * for sets at larger indices to hold this entry */
bc000c1d 6545 for (j = (i+1); j < setnum; j++) {
d2764cd6 6546 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6547 if (other) {
bc000c1d 6548 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6549 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6550 }
6551 }
b287c9bb 6552
2830ca53
PN
6553 robj *o = dictGetEntryKey(de);
6554 dictAdd(dstzset->dict,o,score);
6555 incrRefCount(o); /* added to dictionary */
6556 zslInsert(dstzset->zsl,*score,o);
6557 incrRefCount(o); /* added to skiplist */
6558 }
6559 dictReleaseIterator(di);
b287c9bb 6560 }
2830ca53
PN
6561 } else {
6562 /* unknown operator */
6563 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
6564 }
6565
09241813 6566 dbDelete(c->db,dstkey);
3ea27d37 6567 if (dstzset->zsl->length) {
09241813 6568 dbAdd(c->db,dstkey,dstobj);
482b672d 6569 addReplyLongLong(c, dstzset->zsl->length);
3ea27d37 6570 server.dirty++;
6571 } else {
8bca8773 6572 decrRefCount(dstobj);
3ea27d37 6573 addReply(c, shared.czero);
6574 }
8f92e768 6575 zfree(src);
b287c9bb
PN
6576}
6577
5d373da9 6578static void zunionstoreCommand(redisClient *c) {
2830ca53 6579 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
6580}
6581
5d373da9 6582static void zinterstoreCommand(redisClient *c) {
2830ca53 6583 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
6584}
6585
e3870fab 6586static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 6587 robj *o;
bbe025e0
AM
6588 long start;
6589 long end;
752da584 6590 int withscores = 0;
dd88747b 6591 int llen;
6592 int rangelen, j;
6593 zset *zsetobj;
6594 zskiplist *zsl;
6595 zskiplistNode *ln;
6596 robj *ele;
752da584 6597
bd79a6bd
PN
6598 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6599 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6600
752da584 6601 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6602 withscores = 1;
6603 } else if (c->argc >= 5) {
6604 addReply(c,shared.syntaxerr);
6605 return;
6606 }
cc812361 6607
4e27f268 6608 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6609 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 6610 zsetobj = o->ptr;
6611 zsl = zsetobj->zsl;
6612 llen = zsl->length;
cc812361 6613
dd88747b 6614 /* convert negative indexes */
6615 if (start < 0) start = llen+start;
6616 if (end < 0) end = llen+end;
6617 if (start < 0) start = 0;
6618 if (end < 0) end = 0;
cc812361 6619
dd88747b 6620 /* indexes sanity checks */
6621 if (start > end || start >= llen) {
6622 /* Out of range start or start > end result in empty list */
6623 addReply(c,shared.emptymultibulk);
6624 return;
6625 }
6626 if (end >= llen) end = llen-1;
6627 rangelen = (end-start)+1;
cc812361 6628
dd88747b 6629 /* check if starting point is trivial, before searching
6630 * the element in log(N) time */
6631 if (reverse) {
003f0840 6632 ln = start == 0 ? zsl->tail : zslistTypeGetElementByRank(zsl, llen-start);
dd88747b 6633 } else {
6634 ln = start == 0 ?
003f0840 6635 zsl->header->forward[0] : zslistTypeGetElementByRank(zsl, start+1);
dd88747b 6636 }
cc812361 6637
dd88747b 6638 /* Return the result in form of a multi-bulk reply */
6639 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6640 withscores ? (rangelen*2) : rangelen));
6641 for (j = 0; j < rangelen; j++) {
6642 ele = ln->obj;
6643 addReplyBulk(c,ele);
6644 if (withscores)
6645 addReplyDouble(c,ln->score);
6646 ln = reverse ? ln->backward : ln->forward[0];
cc812361 6647 }
6648}
6649
e3870fab 6650static void zrangeCommand(redisClient *c) {
6651 zrangeGenericCommand(c,0);
6652}
6653
6654static void zrevrangeCommand(redisClient *c) {
6655 zrangeGenericCommand(c,1);
6656}
6657
f44dd428 6658/* This command implements both ZRANGEBYSCORE and ZCOUNT.
6659 * If justcount is non-zero, just the count is returned. */
6660static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 6661 robj *o;
f44dd428 6662 double min, max;
6663 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 6664 int offset = 0, limit = -1;
0500ef27
SH
6665 int withscores = 0;
6666 int badsyntax = 0;
6667
f44dd428 6668 /* Parse the min-max interval. If one of the values is prefixed
6669 * by the "(" character, it's considered "open". For instance
6670 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6671 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6672 if (((char*)c->argv[2]->ptr)[0] == '(') {
6673 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6674 minex = 1;
6675 } else {
6676 min = strtod(c->argv[2]->ptr,NULL);
6677 }
6678 if (((char*)c->argv[3]->ptr)[0] == '(') {
6679 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6680 maxex = 1;
6681 } else {
6682 max = strtod(c->argv[3]->ptr,NULL);
6683 }
6684
6685 /* Parse "WITHSCORES": note that if the command was called with
6686 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6687 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 6688 if (c->argc == 5 || c->argc == 8) {
3a3978b1 6689 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6690 withscores = 1;
6691 else
6692 badsyntax = 1;
0500ef27 6693 }
3a3978b1 6694 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 6695 badsyntax = 1;
0500ef27 6696 if (badsyntax) {
454d4e43 6697 addReplySds(c,
6698 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 6699 return;
0500ef27
SH
6700 }
6701
f44dd428 6702 /* Parse "LIMIT" */
0500ef27 6703 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 6704 addReply(c,shared.syntaxerr);
6705 return;
0500ef27 6706 } else if (c->argc == (7 + withscores)) {
80181f78 6707 offset = atoi(c->argv[5]->ptr);
6708 limit = atoi(c->argv[6]->ptr);
0b13687c 6709 if (offset < 0) offset = 0;
80181f78 6710 }
50c55df5 6711
f44dd428 6712 /* Ok, lookup the key and get the range */
50c55df5 6713 o = lookupKeyRead(c->db,c->argv[1]);
6714 if (o == NULL) {
4e27f268 6715 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6716 } else {
6717 if (o->type != REDIS_ZSET) {
6718 addReply(c,shared.wrongtypeerr);
6719 } else {
6720 zset *zsetobj = o->ptr;
6721 zskiplist *zsl = zsetobj->zsl;
6722 zskiplistNode *ln;
f44dd428 6723 robj *ele, *lenobj = NULL;
6724 unsigned long rangelen = 0;
50c55df5 6725
f44dd428 6726 /* Get the first node with the score >= min, or with
6727 * score > min if 'minex' is true. */
50c55df5 6728 ln = zslFirstWithScore(zsl,min);
f44dd428 6729 while (minex && ln && ln->score == min) ln = ln->forward[0];
6730
50c55df5 6731 if (ln == NULL) {
6732 /* No element matching the speciifed interval */
f44dd428 6733 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6734 return;
6735 }
6736
6737 /* We don't know in advance how many matching elements there
6738 * are in the list, so we push this object that will represent
6739 * the multi-bulk length in the output buffer, and will "fix"
6740 * it later */
f44dd428 6741 if (!justcount) {
6742 lenobj = createObject(REDIS_STRING,NULL);
6743 addReply(c,lenobj);
6744 decrRefCount(lenobj);
6745 }
50c55df5 6746
f44dd428 6747 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 6748 if (offset) {
6749 offset--;
6750 ln = ln->forward[0];
6751 continue;
6752 }
6753 if (limit == 0) break;
f44dd428 6754 if (!justcount) {
6755 ele = ln->obj;
dd88747b 6756 addReplyBulk(c,ele);
f44dd428 6757 if (withscores)
6758 addReplyDouble(c,ln->score);
6759 }
50c55df5 6760 ln = ln->forward[0];
6761 rangelen++;
80181f78 6762 if (limit > 0) limit--;
50c55df5 6763 }
f44dd428 6764 if (justcount) {
482b672d 6765 addReplyLongLong(c,(long)rangelen);
f44dd428 6766 } else {
6767 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6768 withscores ? (rangelen*2) : rangelen);
6769 }
50c55df5 6770 }
6771 }
6772}
6773
f44dd428 6774static void zrangebyscoreCommand(redisClient *c) {
6775 genericZrangebyscoreCommand(c,0);
6776}
6777
6778static void zcountCommand(redisClient *c) {
6779 genericZrangebyscoreCommand(c,1);
6780}
6781
3c41331e 6782static void zcardCommand(redisClient *c) {
e197b441 6783 robj *o;
6784 zset *zs;
dd88747b 6785
6786 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6787 checkType(c,o,REDIS_ZSET)) return;
6788
6789 zs = o->ptr;
6790 addReplyUlong(c,zs->zsl->length);
e197b441 6791}
6792
6e333bbe 6793static void zscoreCommand(redisClient *c) {
6794 robj *o;
6795 zset *zs;
dd88747b 6796 dictEntry *de;
6797
6798 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6799 checkType(c,o,REDIS_ZSET)) return;
6800
6801 zs = o->ptr;
6802 de = dictFind(zs->dict,c->argv[2]);
6803 if (!de) {
96d8b4ee 6804 addReply(c,shared.nullbulk);
6e333bbe 6805 } else {
dd88747b 6806 double *score = dictGetEntryVal(de);
6e333bbe 6807
dd88747b 6808 addReplyDouble(c,*score);
6e333bbe 6809 }
6810}
6811
798d9e55 6812static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 6813 robj *o;
dd88747b 6814 zset *zs;
6815 zskiplist *zsl;
6816 dictEntry *de;
6817 unsigned long rank;
6818 double *score;
6819
6820 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6821 checkType(c,o,REDIS_ZSET)) return;
6822
6823 zs = o->ptr;
6824 zsl = zs->zsl;
6825 de = dictFind(zs->dict,c->argv[2]);
6826 if (!de) {
69d95c3e
PN
6827 addReply(c,shared.nullbulk);
6828 return;
6829 }
69d95c3e 6830
dd88747b 6831 score = dictGetEntryVal(de);
003f0840 6832 rank = zslistTypeGetRank(zsl, *score, c->argv[2]);
dd88747b 6833 if (rank) {
6834 if (reverse) {
482b672d 6835 addReplyLongLong(c, zsl->length - rank);
27b0ccca 6836 } else {
482b672d 6837 addReplyLongLong(c, rank-1);
69d95c3e 6838 }
dd88747b 6839 } else {
6840 addReply(c,shared.nullbulk);
978c2c94 6841 }
6842}
6843
798d9e55
PN
6844static void zrankCommand(redisClient *c) {
6845 zrankGenericCommand(c, 0);
6846}
6847
6848static void zrevrankCommand(redisClient *c) {
6849 zrankGenericCommand(c, 1);
6850}
6851
7fb16bac
PN
6852/* ========================= Hashes utility functions ======================= */
6853#define REDIS_HASH_KEY 1
6854#define REDIS_HASH_VALUE 2
978c2c94 6855
7fb16bac
PN
6856/* Check the length of a number of objects to see if we need to convert a
6857 * zipmap to a real hash. Note that we only check string encoded objects
6858 * as their string length can be queried in constant time. */
d1578a33 6859static void hashTypeTryConversion(robj *subject, robj **argv, int start, int end) {
7fb16bac
PN
6860 int i;
6861 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
978c2c94 6862
7fb16bac
PN
6863 for (i = start; i <= end; i++) {
6864 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6865 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6866 {
6867 convertToRealHash(subject);
978c2c94 6868 return;
6869 }
6870 }
7fb16bac 6871}
bae2c7ec 6872
97224de7 6873/* Encode given objects in-place when the hash uses a dict. */
d1578a33 6874static void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
97224de7 6875 if (subject->encoding == REDIS_ENCODING_HT) {
3f973463
PN
6876 if (o1) *o1 = tryObjectEncoding(*o1);
6877 if (o2) *o2 = tryObjectEncoding(*o2);
97224de7
PN
6878 }
6879}
6880
7fb16bac 6881/* Get the value from a hash identified by key. Returns either a string
a3f3af86
PN
6882 * object or NULL if the value cannot be found. The refcount of the object
6883 * is always increased by 1 when the value was found. */
d1578a33 6884static robj *hashTypeGet(robj *o, robj *key) {
7fb16bac 6885 robj *value = NULL;
978c2c94 6886 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7fb16bac
PN
6887 unsigned char *v;
6888 unsigned int vlen;
6889 key = getDecodedObject(key);
6890 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6891 value = createStringObject((char*)v,vlen);
6892 }
6893 decrRefCount(key);
6894 } else {
6895 dictEntry *de = dictFind(o->ptr,key);
6896 if (de != NULL) {
6897 value = dictGetEntryVal(de);
a3f3af86 6898 incrRefCount(value);
7fb16bac
PN
6899 }
6900 }
6901 return value;
6902}
978c2c94 6903
7fb16bac
PN
6904/* Test if the key exists in the given hash. Returns 1 if the key
6905 * exists and 0 when it doesn't. */
d1578a33 6906static int hashTypeExists(robj *o, robj *key) {
7fb16bac
PN
6907 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6908 key = getDecodedObject(key);
6909 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6910 decrRefCount(key);
6911 return 1;
6912 }
6913 decrRefCount(key);
6914 } else {
6915 if (dictFind(o->ptr,key) != NULL) {
6916 return 1;
6917 }
6918 }
6919 return 0;
6920}
bae2c7ec 6921
7fb16bac
PN
6922/* Add an element, discard the old if the key already exists.
6923 * Return 0 on insert and 1 on update. */
d1578a33 6924static int hashTypeSet(robj *o, robj *key, robj *value) {
7fb16bac
PN
6925 int update = 0;
6926 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6927 key = getDecodedObject(key);
6928 value = getDecodedObject(value);
6929 o->ptr = zipmapSet(o->ptr,
6930 key->ptr,sdslen(key->ptr),
6931 value->ptr,sdslen(value->ptr), &update);
6932 decrRefCount(key);
6933 decrRefCount(value);
6934
6935 /* Check if the zipmap needs to be upgraded to a real hash table */
6936 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
bae2c7ec 6937 convertToRealHash(o);
978c2c94 6938 } else {
7fb16bac
PN
6939 if (dictReplace(o->ptr,key,value)) {
6940 /* Insert */
6941 incrRefCount(key);
978c2c94 6942 } else {
7fb16bac 6943 /* Update */
978c2c94 6944 update = 1;
6945 }
7fb16bac 6946 incrRefCount(value);
978c2c94 6947 }
7fb16bac 6948 return update;
978c2c94 6949}
6950
7fb16bac
PN
6951/* Delete an element from a hash.
6952 * Return 1 on deleted and 0 on not found. */
d1578a33 6953static int hashTypeDelete(robj *o, robj *key) {
7fb16bac
PN
6954 int deleted = 0;
6955 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6956 key = getDecodedObject(key);
6957 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6958 decrRefCount(key);
6959 } else {
6960 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6961 /* Always check if the dictionary needs a resize after a delete. */
6962 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
d33278d1 6963 }
7fb16bac
PN
6964 return deleted;
6965}
d33278d1 6966
7fb16bac 6967/* Return the number of elements in a hash. */
d1578a33 6968static unsigned long hashTypeLength(robj *o) {
7fb16bac
PN
6969 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6970 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6971}
6972
6973/* Structure to hold hash iteration abstration. Note that iteration over
6974 * hashes involves both fields and values. Because it is possible that
6975 * not both are required, store pointers in the iterator to avoid
6976 * unnecessary memory allocation for fields/values. */
6977typedef struct {
6978 int encoding;
6979 unsigned char *zi;
6980 unsigned char *zk, *zv;
6981 unsigned int zklen, zvlen;
6982
6983 dictIterator *di;
6984 dictEntry *de;
d1578a33 6985} hashTypeIterator;
7fb16bac 6986
d1578a33
PN
6987static hashTypeIterator *hashTypeInitIterator(robj *subject) {
6988 hashTypeIterator *hi = zmalloc(sizeof(hashTypeIterator));
7fb16bac
PN
6989 hi->encoding = subject->encoding;
6990 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6991 hi->zi = zipmapRewind(subject->ptr);
6992 } else if (hi->encoding == REDIS_ENCODING_HT) {
6993 hi->di = dictGetIterator(subject->ptr);
d33278d1 6994 } else {
7fb16bac 6995 redisAssert(NULL);
d33278d1 6996 }
c44d3b56 6997 return hi;
7fb16bac 6998}
d33278d1 6999
d1578a33 7000static void hashTypeReleaseIterator(hashTypeIterator *hi) {
7fb16bac
PN
7001 if (hi->encoding == REDIS_ENCODING_HT) {
7002 dictReleaseIterator(hi->di);
d33278d1 7003 }
c44d3b56 7004 zfree(hi);
7fb16bac 7005}
d33278d1 7006
7fb16bac
PN
7007/* Move to the next entry in the hash. Return REDIS_OK when the next entry
7008 * could be found and REDIS_ERR when the iterator reaches the end. */
d1578a33 7009static int hashTypeNext(hashTypeIterator *hi) {
7fb16bac
PN
7010 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7011 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
7012 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
7013 } else {
7014 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
7015 }
7016 return REDIS_OK;
7017}
d33278d1 7018
0c390abc 7019/* Get key or value object at current iteration position.
a3f3af86 7020 * This increases the refcount of the field object by 1. */
d1578a33 7021static robj *hashTypeCurrent(hashTypeIterator *hi, int what) {
7fb16bac
PN
7022 robj *o;
7023 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7024 if (what & REDIS_HASH_KEY) {
7025 o = createStringObject((char*)hi->zk,hi->zklen);
7026 } else {
7027 o = createStringObject((char*)hi->zv,hi->zvlen);
d33278d1 7028 }
d33278d1 7029 } else {
7fb16bac
PN
7030 if (what & REDIS_HASH_KEY) {
7031 o = dictGetEntryKey(hi->de);
7032 } else {
7033 o = dictGetEntryVal(hi->de);
d33278d1 7034 }
a3f3af86 7035 incrRefCount(o);
d33278d1 7036 }
7fb16bac 7037 return o;
d33278d1
PN
7038}
7039
d1578a33 7040static robj *hashTypeLookupWriteOrCreate(redisClient *c, robj *key) {
7fb16bac 7041 robj *o = lookupKeyWrite(c->db,key);
01426b05
PN
7042 if (o == NULL) {
7043 o = createHashObject();
09241813 7044 dbAdd(c->db,key,o);
01426b05
PN
7045 } else {
7046 if (o->type != REDIS_HASH) {
7047 addReply(c,shared.wrongtypeerr);
7fb16bac 7048 return NULL;
01426b05
PN
7049 }
7050 }
7fb16bac
PN
7051 return o;
7052}
01426b05 7053
7fb16bac
PN
7054/* ============================= Hash commands ============================== */
7055static void hsetCommand(redisClient *c) {
6e9e463f 7056 int update;
7fb16bac 7057 robj *o;
bbe025e0 7058
d1578a33
PN
7059 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7060 hashTypeTryConversion(o,c->argv,2,3);
7061 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7062 update = hashTypeSet(o,c->argv[2],c->argv[3]);
6e9e463f 7063 addReply(c, update ? shared.czero : shared.cone);
7fb16bac
PN
7064 server.dirty++;
7065}
01426b05 7066
1f1c7695
PN
7067static void hsetnxCommand(redisClient *c) {
7068 robj *o;
d1578a33
PN
7069 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7070 hashTypeTryConversion(o,c->argv,2,3);
1f1c7695 7071
d1578a33 7072 if (hashTypeExists(o, c->argv[2])) {
1f1c7695 7073 addReply(c, shared.czero);
01426b05 7074 } else {
d1578a33
PN
7075 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7076 hashTypeSet(o,c->argv[2],c->argv[3]);
1f1c7695
PN
7077 addReply(c, shared.cone);
7078 server.dirty++;
7079 }
7080}
01426b05 7081
7fb16bac
PN
7082static void hmsetCommand(redisClient *c) {
7083 int i;
7084 robj *o;
01426b05 7085
7fb16bac
PN
7086 if ((c->argc % 2) == 1) {
7087 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
7088 return;
7089 }
01426b05 7090
d1578a33
PN
7091 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7092 hashTypeTryConversion(o,c->argv,2,c->argc-1);
7fb16bac 7093 for (i = 2; i < c->argc; i += 2) {
d1578a33
PN
7094 hashTypeTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
7095 hashTypeSet(o,c->argv[i],c->argv[i+1]);
7fb16bac
PN
7096 }
7097 addReply(c, shared.ok);
edc2f63a 7098 server.dirty++;
7fb16bac
PN
7099}
7100
7101static void hincrbyCommand(redisClient *c) {
7102 long long value, incr;
7103 robj *o, *current, *new;
7104
bd79a6bd 7105 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
d1578a33
PN
7106 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7107 if ((current = hashTypeGet(o,c->argv[2])) != NULL) {
946342c1
PN
7108 if (getLongLongFromObjectOrReply(c,current,&value,
7109 "hash value is not an integer") != REDIS_OK) {
7110 decrRefCount(current);
7111 return;
7112 }
a3f3af86 7113 decrRefCount(current);
7fb16bac
PN
7114 } else {
7115 value = 0;
01426b05
PN
7116 }
7117
7fb16bac 7118 value += incr;
3f973463 7119 new = createStringObjectFromLongLong(value);
d1578a33
PN
7120 hashTypeTryObjectEncoding(o,&c->argv[2],NULL);
7121 hashTypeSet(o,c->argv[2],new);
7fb16bac
PN
7122 decrRefCount(new);
7123 addReplyLongLong(c,value);
01426b05 7124 server.dirty++;
01426b05
PN
7125}
7126
978c2c94 7127static void hgetCommand(redisClient *c) {
7fb16bac 7128 robj *o, *value;
dd88747b 7129 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
7130 checkType(c,o,REDIS_HASH)) return;
7131
d1578a33 7132 if ((value = hashTypeGet(o,c->argv[2])) != NULL) {
7fb16bac 7133 addReplyBulk(c,value);
a3f3af86 7134 decrRefCount(value);
dd88747b 7135 } else {
7fb16bac 7136 addReply(c,shared.nullbulk);
69d95c3e 7137 }
69d95c3e
PN
7138}
7139
09aeb579
PN
7140static void hmgetCommand(redisClient *c) {
7141 int i;
7fb16bac
PN
7142 robj *o, *value;
7143 o = lookupKeyRead(c->db,c->argv[1]);
7144 if (o != NULL && o->type != REDIS_HASH) {
7145 addReply(c,shared.wrongtypeerr);
09aeb579
PN
7146 }
7147
7fb16bac
PN
7148 /* Note the check for o != NULL happens inside the loop. This is
7149 * done because objects that cannot be found are considered to be
7150 * an empty hash. The reply should then be a series of NULLs. */
09aeb579 7151 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7fb16bac 7152 for (i = 2; i < c->argc; i++) {
d1578a33 7153 if (o != NULL && (value = hashTypeGet(o,c->argv[i])) != NULL) {
7fb16bac 7154 addReplyBulk(c,value);
a3f3af86 7155 decrRefCount(value);
7fb16bac
PN
7156 } else {
7157 addReply(c,shared.nullbulk);
09aeb579
PN
7158 }
7159 }
7160}
7161
07efaf74 7162static void hdelCommand(redisClient *c) {
dd88747b 7163 robj *o;
dd88747b 7164 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
7165 checkType(c,o,REDIS_HASH)) return;
07efaf74 7166
d1578a33
PN
7167 if (hashTypeDelete(o,c->argv[2])) {
7168 if (hashTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
7fb16bac
PN
7169 addReply(c,shared.cone);
7170 server.dirty++;
dd88747b 7171 } else {
7fb16bac 7172 addReply(c,shared.czero);
07efaf74 7173 }
7174}
7175
92b27fe9 7176static void hlenCommand(redisClient *c) {
7177 robj *o;
dd88747b 7178 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 7179 checkType(c,o,REDIS_HASH)) return;
7180
d1578a33 7181 addReplyUlong(c,hashTypeLength(o));
92b27fe9 7182}
7183
78409a0f 7184static void genericHgetallCommand(redisClient *c, int flags) {
7fb16bac 7185 robj *o, *lenobj, *obj;
78409a0f 7186 unsigned long count = 0;
d1578a33 7187 hashTypeIterator *hi;
78409a0f 7188
4e27f268 7189 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 7190 || checkType(c,o,REDIS_HASH)) return;
7191
7192 lenobj = createObject(REDIS_STRING,NULL);
7193 addReply(c,lenobj);
7194 decrRefCount(lenobj);
7195
d1578a33
PN
7196 hi = hashTypeInitIterator(o);
7197 while (hashTypeNext(hi) != REDIS_ERR) {
7fb16bac 7198 if (flags & REDIS_HASH_KEY) {
d1578a33 7199 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
7fb16bac 7200 addReplyBulk(c,obj);
a3f3af86 7201 decrRefCount(obj);
7fb16bac 7202 count++;
78409a0f 7203 }
7fb16bac 7204 if (flags & REDIS_HASH_VALUE) {
d1578a33 7205 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
7fb16bac 7206 addReplyBulk(c,obj);
a3f3af86 7207 decrRefCount(obj);
7fb16bac 7208 count++;
78409a0f 7209 }
78409a0f 7210 }
d1578a33 7211 hashTypeReleaseIterator(hi);
7fb16bac 7212
78409a0f 7213 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
7214}
7215
7216static void hkeysCommand(redisClient *c) {
7fb16bac 7217 genericHgetallCommand(c,REDIS_HASH_KEY);
78409a0f 7218}
7219
7220static void hvalsCommand(redisClient *c) {
7fb16bac 7221 genericHgetallCommand(c,REDIS_HASH_VALUE);
78409a0f 7222}
7223
7224static void hgetallCommand(redisClient *c) {
7fb16bac 7225 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
78409a0f 7226}
7227
a86f14b1 7228static void hexistsCommand(redisClient *c) {
7229 robj *o;
a86f14b1 7230 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7231 checkType(c,o,REDIS_HASH)) return;
7232
d1578a33 7233 addReply(c, hashTypeExists(o,c->argv[2]) ? shared.cone : shared.czero);
a86f14b1 7234}
7235
ada386b2 7236static void convertToRealHash(robj *o) {
7237 unsigned char *key, *val, *p, *zm = o->ptr;
7238 unsigned int klen, vlen;
7239 dict *dict = dictCreate(&hashDictType,NULL);
7240
7241 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
7242 p = zipmapRewind(zm);
7243 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
7244 robj *keyobj, *valobj;
7245
7246 keyobj = createStringObject((char*)key,klen);
7247 valobj = createStringObject((char*)val,vlen);
05df7621 7248 keyobj = tryObjectEncoding(keyobj);
7249 valobj = tryObjectEncoding(valobj);
ada386b2 7250 dictAdd(dict,keyobj,valobj);
7251 }
7252 o->encoding = REDIS_ENCODING_HT;
7253 o->ptr = dict;
7254 zfree(zm);
7255}
7256
6b47e12e 7257/* ========================= Non type-specific commands ==================== */
7258
ed9b544e 7259static void flushdbCommand(redisClient *c) {
ca37e9cd 7260 server.dirty += dictSize(c->db->dict);
9b30e1a2 7261 touchWatchedKeysOnFlush(c->db->id);
3305306f 7262 dictEmpty(c->db->dict);
7263 dictEmpty(c->db->expires);
ed9b544e 7264 addReply(c,shared.ok);
ed9b544e 7265}
7266
7267static void flushallCommand(redisClient *c) {
9b30e1a2 7268 touchWatchedKeysOnFlush(-1);
ca37e9cd 7269 server.dirty += emptyDb();
ed9b544e 7270 addReply(c,shared.ok);
500ece7c 7271 if (server.bgsavechildpid != -1) {
7272 kill(server.bgsavechildpid,SIGKILL);
7273 rdbRemoveTempFile(server.bgsavechildpid);
7274 }
f78fd11b 7275 rdbSave(server.dbfilename);
ca37e9cd 7276 server.dirty++;
ed9b544e 7277}
7278
56906eef 7279static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 7280 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 7281 so->type = type;
7282 so->pattern = pattern;
7283 return so;
7284}
7285
7286/* Return the value associated to the key with a name obtained
55017f9d
PN
7287 * substituting the first occurence of '*' in 'pattern' with 'subst'.
7288 * The returned object will always have its refcount increased by 1
7289 * when it is non-NULL. */
56906eef 7290static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6d7d1370 7291 char *p, *f;
ed9b544e 7292 sds spat, ssub;
6d7d1370
PN
7293 robj keyobj, fieldobj, *o;
7294 int prefixlen, sublen, postfixlen, fieldlen;
ed9b544e 7295 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
7296 struct {
f1017b3f 7297 long len;
7298 long free;
ed9b544e 7299 char buf[REDIS_SORTKEY_MAX+1];
6d7d1370 7300 } keyname, fieldname;
ed9b544e 7301
28173a49 7302 /* If the pattern is "#" return the substitution object itself in order
7303 * to implement the "SORT ... GET #" feature. */
7304 spat = pattern->ptr;
7305 if (spat[0] == '#' && spat[1] == '\0') {
55017f9d 7306 incrRefCount(subst);
28173a49 7307 return subst;
7308 }
7309
7310 /* The substitution object may be specially encoded. If so we create
9d65a1bb 7311 * a decoded object on the fly. Otherwise getDecodedObject will just
7312 * increment the ref count, that we'll decrement later. */
7313 subst = getDecodedObject(subst);
942a3961 7314
ed9b544e 7315 ssub = subst->ptr;
7316 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
7317 p = strchr(spat,'*');
ed5a857a 7318 if (!p) {
7319 decrRefCount(subst);
7320 return NULL;
7321 }
ed9b544e 7322
6d7d1370
PN
7323 /* Find out if we're dealing with a hash dereference. */
7324 if ((f = strstr(p+1, "->")) != NULL) {
7325 fieldlen = sdslen(spat)-(f-spat);
7326 /* this also copies \0 character */
7327 memcpy(fieldname.buf,f+2,fieldlen-1);
7328 fieldname.len = fieldlen-2;
7329 } else {
7330 fieldlen = 0;
7331 }
7332
ed9b544e 7333 prefixlen = p-spat;
7334 sublen = sdslen(ssub);
6d7d1370 7335 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
ed9b544e 7336 memcpy(keyname.buf,spat,prefixlen);
7337 memcpy(keyname.buf+prefixlen,ssub,sublen);
7338 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
7339 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
7340 keyname.len = prefixlen+sublen+postfixlen;
942a3961 7341 decrRefCount(subst);
7342
6d7d1370
PN
7343 /* Lookup substituted key */
7344 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
7345 o = lookupKeyRead(db,&keyobj);
55017f9d
PN
7346 if (o == NULL) return NULL;
7347
7348 if (fieldlen > 0) {
7349 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6d7d1370 7350
705dad38
PN
7351 /* Retrieve value from hash by the field name. This operation
7352 * already increases the refcount of the returned object. */
6d7d1370 7353 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
d1578a33 7354 o = hashTypeGet(o, &fieldobj);
705dad38 7355 } else {
55017f9d 7356 if (o->type != REDIS_STRING) return NULL;
b6f07345 7357
705dad38
PN
7358 /* Every object that this function returns needs to have its refcount
7359 * increased. sortCommand decreases it again. */
7360 incrRefCount(o);
6d7d1370
PN
7361 }
7362
7363 return o;
ed9b544e 7364}
7365
7366/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
7367 * the additional parameter is not standard but a BSD-specific we have to
7368 * pass sorting parameters via the global 'server' structure */
7369static int sortCompare(const void *s1, const void *s2) {
7370 const redisSortObject *so1 = s1, *so2 = s2;
7371 int cmp;
7372
7373 if (!server.sort_alpha) {
7374 /* Numeric sorting. Here it's trivial as we precomputed scores */
7375 if (so1->u.score > so2->u.score) {
7376 cmp = 1;
7377 } else if (so1->u.score < so2->u.score) {
7378 cmp = -1;
7379 } else {
7380 cmp = 0;
7381 }
7382 } else {
7383 /* Alphanumeric sorting */
7384 if (server.sort_bypattern) {
7385 if (!so1->u.cmpobj || !so2->u.cmpobj) {
7386 /* At least one compare object is NULL */
7387 if (so1->u.cmpobj == so2->u.cmpobj)
7388 cmp = 0;
7389 else if (so1->u.cmpobj == NULL)
7390 cmp = -1;
7391 else
7392 cmp = 1;
7393 } else {
7394 /* We have both the objects, use strcoll */
7395 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
7396 }
7397 } else {
08ee9b57 7398 /* Compare elements directly. */
7399 cmp = compareStringObjects(so1->obj,so2->obj);
ed9b544e 7400 }
7401 }
7402 return server.sort_desc ? -cmp : cmp;
7403}
7404
7405/* The SORT command is the most complex command in Redis. Warning: this code
7406 * is optimized for speed and a bit less for readability */
7407static void sortCommand(redisClient *c) {
ed9b544e 7408 list *operations;
a03611e1 7409 unsigned int outputlen = 0;
ed9b544e 7410 int desc = 0, alpha = 0;
7411 int limit_start = 0, limit_count = -1, start, end;
7412 int j, dontsort = 0, vectorlen;
7413 int getop = 0; /* GET operation counter */
443c6409 7414 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 7415 redisSortObject *vector; /* Resulting vector to sort */
7416
7417 /* Lookup the key to sort. It must be of the right types */
3305306f 7418 sortval = lookupKeyRead(c->db,c->argv[1]);
7419 if (sortval == NULL) {
4e27f268 7420 addReply(c,shared.emptymultibulk);
ed9b544e 7421 return;
7422 }
a5eb649b 7423 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
7424 sortval->type != REDIS_ZSET)
7425 {
c937aa89 7426 addReply(c,shared.wrongtypeerr);
ed9b544e 7427 return;
7428 }
7429
7430 /* Create a list of operations to perform for every sorted element.
7431 * Operations can be GET/DEL/INCR/DECR */
7432 operations = listCreate();
092dac2a 7433 listSetFreeMethod(operations,zfree);
ed9b544e 7434 j = 2;
7435
7436 /* Now we need to protect sortval incrementing its count, in the future
7437 * SORT may have options able to overwrite/delete keys during the sorting
7438 * and the sorted key itself may get destroied */
7439 incrRefCount(sortval);
7440
7441 /* The SORT command has an SQL-alike syntax, parse it */
7442 while(j < c->argc) {
7443 int leftargs = c->argc-j-1;
7444 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7445 desc = 0;
7446 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7447 desc = 1;
7448 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7449 alpha = 1;
7450 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7451 limit_start = atoi(c->argv[j+1]->ptr);
7452 limit_count = atoi(c->argv[j+2]->ptr);
7453 j+=2;
443c6409 7454 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7455 storekey = c->argv[j+1];
7456 j++;
ed9b544e 7457 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7458 sortby = c->argv[j+1];
7459 /* If the BY pattern does not contain '*', i.e. it is constant,
7460 * we don't need to sort nor to lookup the weight keys. */
7461 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7462 j++;
7463 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7464 listAddNodeTail(operations,createSortOperation(
7465 REDIS_SORT_GET,c->argv[j+1]));
7466 getop++;
7467 j++;
ed9b544e 7468 } else {
7469 decrRefCount(sortval);
7470 listRelease(operations);
c937aa89 7471 addReply(c,shared.syntaxerr);
ed9b544e 7472 return;
7473 }
7474 j++;
7475 }
7476
7477 /* Load the sorting vector with all the objects to sort */
a5eb649b 7478 switch(sortval->type) {
003f0840 7479 case REDIS_LIST: vectorlen = listTypeLength(sortval); break;
a5eb649b 7480 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7481 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
f83c6cb5 7482 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
a5eb649b 7483 }
ed9b544e 7484 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 7485 j = 0;
a5eb649b 7486
ed9b544e 7487 if (sortval->type == REDIS_LIST) {
003f0840
PN
7488 listTypeIterator *li = listTypeInitIterator(sortval,0,REDIS_TAIL);
7489 listTypeEntry entry;
7490 while(listTypeNext(li,&entry)) {
7491 vector[j].obj = listTypeGet(&entry);
ed9b544e 7492 vector[j].u.score = 0;
7493 vector[j].u.cmpobj = NULL;
ed9b544e 7494 j++;
7495 }
003f0840 7496 listTypeReleaseIterator(li);
ed9b544e 7497 } else {
a5eb649b 7498 dict *set;
ed9b544e 7499 dictIterator *di;
7500 dictEntry *setele;
7501
a5eb649b 7502 if (sortval->type == REDIS_SET) {
7503 set = sortval->ptr;
7504 } else {
7505 zset *zs = sortval->ptr;
7506 set = zs->dict;
7507 }
7508
ed9b544e 7509 di = dictGetIterator(set);
ed9b544e 7510 while((setele = dictNext(di)) != NULL) {
7511 vector[j].obj = dictGetEntryKey(setele);
7512 vector[j].u.score = 0;
7513 vector[j].u.cmpobj = NULL;
7514 j++;
7515 }
7516 dictReleaseIterator(di);
7517 }
dfc5e96c 7518 redisAssert(j == vectorlen);
ed9b544e 7519
7520 /* Now it's time to load the right scores in the sorting vector */
7521 if (dontsort == 0) {
7522 for (j = 0; j < vectorlen; j++) {
6d7d1370 7523 robj *byval;
ed9b544e 7524 if (sortby) {
6d7d1370 7525 /* lookup value to sort by */
3305306f 7526 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
705dad38 7527 if (!byval) continue;
ed9b544e 7528 } else {
6d7d1370
PN
7529 /* use object itself to sort by */
7530 byval = vector[j].obj;
7531 }
7532
7533 if (alpha) {
08ee9b57 7534 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6d7d1370
PN
7535 } else {
7536 if (byval->encoding == REDIS_ENCODING_RAW) {
7537 vector[j].u.score = strtod(byval->ptr,NULL);
16fa22f1 7538 } else if (byval->encoding == REDIS_ENCODING_INT) {
6d7d1370
PN
7539 /* Don't need to decode the object if it's
7540 * integer-encoded (the only encoding supported) so
7541 * far. We can just cast it */
16fa22f1
PN
7542 vector[j].u.score = (long)byval->ptr;
7543 } else {
7544 redisAssert(1 != 1);
942a3961 7545 }
ed9b544e 7546 }
6d7d1370 7547
705dad38
PN
7548 /* when the object was retrieved using lookupKeyByPattern,
7549 * its refcount needs to be decreased. */
7550 if (sortby) {
7551 decrRefCount(byval);
ed9b544e 7552 }
7553 }
7554 }
7555
7556 /* We are ready to sort the vector... perform a bit of sanity check
7557 * on the LIMIT option too. We'll use a partial version of quicksort. */
7558 start = (limit_start < 0) ? 0 : limit_start;
7559 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7560 if (start >= vectorlen) {
7561 start = vectorlen-1;
7562 end = vectorlen-2;
7563 }
7564 if (end >= vectorlen) end = vectorlen-1;
7565
7566 if (dontsort == 0) {
7567 server.sort_desc = desc;
7568 server.sort_alpha = alpha;
7569 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 7570 if (sortby && (start != 0 || end != vectorlen-1))
7571 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7572 else
7573 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 7574 }
7575
7576 /* Send command output to the output buffer, performing the specified
7577 * GET/DEL/INCR/DECR operations if any. */
7578 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 7579 if (storekey == NULL) {
7580 /* STORE option not specified, sent the sorting result to client */
7581 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7582 for (j = start; j <= end; j++) {
7583 listNode *ln;
c7df85a4 7584 listIter li;
7585
dd88747b 7586 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 7587 listRewind(operations,&li);
7588 while((ln = listNext(&li))) {
443c6409 7589 redisSortOperation *sop = ln->value;
7590 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7591 vector[j].obj);
7592
7593 if (sop->type == REDIS_SORT_GET) {
55017f9d 7594 if (!val) {
443c6409 7595 addReply(c,shared.nullbulk);
7596 } else {
dd88747b 7597 addReplyBulk(c,val);
55017f9d 7598 decrRefCount(val);
443c6409 7599 }
7600 } else {
dfc5e96c 7601 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 7602 }
7603 }
ed9b544e 7604 }
443c6409 7605 } else {
74e0f445 7606 robj *sobj = createZiplistObject();
443c6409 7607
7608 /* STORE option specified, set the sorting result as a List object */
7609 for (j = start; j <= end; j++) {
7610 listNode *ln;
c7df85a4 7611 listIter li;
7612
443c6409 7613 if (!getop) {
003f0840 7614 listTypePush(sobj,vector[j].obj,REDIS_TAIL);
a03611e1
PN
7615 } else {
7616 listRewind(operations,&li);
7617 while((ln = listNext(&li))) {
7618 redisSortOperation *sop = ln->value;
7619 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7620 vector[j].obj);
7621
7622 if (sop->type == REDIS_SORT_GET) {
7623 if (!val) val = createStringObject("",0);
7624
003f0840 7625 /* listTypePush does an incrRefCount, so we should take care
a03611e1
PN
7626 * care of the incremented refcount caused by either
7627 * lookupKeyByPattern or createStringObject("",0) */
003f0840 7628 listTypePush(sobj,val,REDIS_TAIL);
a03611e1 7629 decrRefCount(val);
443c6409 7630 } else {
a03611e1
PN
7631 /* always fails */
7632 redisAssert(sop->type == REDIS_SORT_GET);
443c6409 7633 }
ed9b544e 7634 }
ed9b544e 7635 }
ed9b544e 7636 }
846d8b3e 7637 dbReplace(c->db,storekey,sobj);
443c6409 7638 /* Note: we add 1 because the DB is dirty anyway since even if the
7639 * SORT result is empty a new key is set and maybe the old content
7640 * replaced. */
7641 server.dirty += 1+outputlen;
7642 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 7643 }
7644
7645 /* Cleanup */
a03611e1
PN
7646 if (sortval->type == REDIS_LIST)
7647 for (j = 0; j < vectorlen; j++)
7648 decrRefCount(vector[j].obj);
ed9b544e 7649 decrRefCount(sortval);
7650 listRelease(operations);
7651 for (j = 0; j < vectorlen; j++) {
16fa22f1 7652 if (alpha && vector[j].u.cmpobj)
ed9b544e 7653 decrRefCount(vector[j].u.cmpobj);
7654 }
7655 zfree(vector);
7656}
7657
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, and so forth.
 *
 * 's' receives the NUL terminated result and must be large enough to hold
 * it; 32 bytes are sufficient for every value of 'n'. Units are powers of
 * 1024 with two decimals, up to P. Values below 1024, and values too large
 * for the P unit, are printed as a plain byte count so that 's' is always
 * written. */
static void bytesToHuman(char *s, unsigned long long n) {
    const unsigned long long kib = 1024ULL;
    const unsigned long long mib = kib*1024;
    const unsigned long long gib = mib*1024;
    const unsigned long long tib = gib*1024;
    const unsigned long long pib = tib*1024;
    double d;

    if (n < kib) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < mib) {
        d = (double)n/kib;
        sprintf(s,"%.2fK",d);
    } else if (n < gib) {
        d = (double)n/mib;
        sprintf(s,"%.2fM",d);
    } else if (n < tib) {
        d = (double)n/gib;
        sprintf(s,"%.2fG",d);
    } else if (n < pib) {
        d = (double)n/tib;
        sprintf(s,"%.2fT",d);
    } else if (n < pib*1024) {
        d = (double)n/pib;
        sprintf(s,"%.2fP",d);
    } else {
        /* Too large for the units above: fall back to the raw count
         * rather than leaving the buffer untouched. */
        sprintf(s,"%lluB",n);
    }
}
7678
1c85b79f 7679/* Create the string returned by the INFO command. This is decoupled
7680 * by the INFO command itself as we need to report the same information
7681 * on memory corruption problems. */
7682static sds genRedisInfoString(void) {
ed9b544e 7683 sds info;
7684 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 7685 int j;
ec6c7a1d 7686 char hmem[64];
55a8298f 7687
b72f6a4b 7688 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 7689 info = sdscatprintf(sdsempty(),
7690 "redis_version:%s\r\n"
5436146c
PN
7691 "redis_git_sha1:%s\r\n"
7692 "redis_git_dirty:%d\r\n"
f1017b3f 7693 "arch_bits:%s\r\n"
7a932b74 7694 "multiplexing_api:%s\r\n"
0d7170a4 7695 "process_id:%ld\r\n"
682ac724 7696 "uptime_in_seconds:%ld\r\n"
7697 "uptime_in_days:%ld\r\n"
ed9b544e 7698 "connected_clients:%d\r\n"
7699 "connected_slaves:%d\r\n"
f86a74e9 7700 "blocked_clients:%d\r\n"
5fba9f71 7701 "used_memory:%zu\r\n"
ec6c7a1d 7702 "used_memory_human:%s\r\n"
ed9b544e 7703 "changes_since_last_save:%lld\r\n"
be2bb6b0 7704 "bgsave_in_progress:%d\r\n"
682ac724 7705 "last_save_time:%ld\r\n"
b3fad521 7706 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 7707 "total_connections_received:%lld\r\n"
7708 "total_commands_processed:%lld\r\n"
2a6a2ed1 7709 "expired_keys:%lld\r\n"
3be2c9d7 7710 "hash_max_zipmap_entries:%zu\r\n"
7711 "hash_max_zipmap_value:%zu\r\n"
ffc6b7f8 7712 "pubsub_channels:%ld\r\n"
7713 "pubsub_patterns:%u\r\n"
7d98e08c 7714 "vm_enabled:%d\r\n"
a0f643ea 7715 "role:%s\r\n"
ed9b544e 7716 ,REDIS_VERSION,
5436146c 7717 REDIS_GIT_SHA1,
274e45e3 7718 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
f1017b3f 7719 (sizeof(long) == 8) ? "64" : "32",
7a932b74 7720 aeGetApiName(),
0d7170a4 7721 (long) getpid(),
a0f643ea 7722 uptime,
7723 uptime/(3600*24),
ed9b544e 7724 listLength(server.clients)-listLength(server.slaves),
7725 listLength(server.slaves),
d5d55fc3 7726 server.blpop_blocked_clients,
b72f6a4b 7727 zmalloc_used_memory(),
ec6c7a1d 7728 hmem,
ed9b544e 7729 server.dirty,
9d65a1bb 7730 server.bgsavechildpid != -1,
ed9b544e 7731 server.lastsave,
b3fad521 7732 server.bgrewritechildpid != -1,
ed9b544e 7733 server.stat_numconnections,
7734 server.stat_numcommands,
2a6a2ed1 7735 server.stat_expiredkeys,
55a8298f 7736 server.hash_max_zipmap_entries,
7737 server.hash_max_zipmap_value,
ffc6b7f8 7738 dictSize(server.pubsub_channels),
7739 listLength(server.pubsub_patterns),
7d98e08c 7740 server.vm_enabled != 0,
a0f643ea 7741 server.masterhost == NULL ? "master" : "slave"
ed9b544e 7742 );
a0f643ea 7743 if (server.masterhost) {
7744 info = sdscatprintf(info,
7745 "master_host:%s\r\n"
7746 "master_port:%d\r\n"
7747 "master_link_status:%s\r\n"
7748 "master_last_io_seconds_ago:%d\r\n"
7749 ,server.masterhost,
7750 server.masterport,
7751 (server.replstate == REDIS_REPL_CONNECTED) ?
7752 "up" : "down",
f72b934d 7753 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 7754 );
7755 }
7d98e08c 7756 if (server.vm_enabled) {
1064ef87 7757 lockThreadedIO();
7d98e08c 7758 info = sdscatprintf(info,
7759 "vm_conf_max_memory:%llu\r\n"
7760 "vm_conf_page_size:%llu\r\n"
7761 "vm_conf_pages:%llu\r\n"
7762 "vm_stats_used_pages:%llu\r\n"
7763 "vm_stats_swapped_objects:%llu\r\n"
7764 "vm_stats_swappin_count:%llu\r\n"
7765 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 7766 "vm_stats_io_newjobs_len:%lu\r\n"
7767 "vm_stats_io_processing_len:%lu\r\n"
7768 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 7769 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 7770 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 7771 ,(unsigned long long) server.vm_max_memory,
7772 (unsigned long long) server.vm_page_size,
7773 (unsigned long long) server.vm_pages,
7774 (unsigned long long) server.vm_stats_used_pages,
7775 (unsigned long long) server.vm_stats_swapped_objects,
7776 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 7777 (unsigned long long) server.vm_stats_swapouts,
7778 (unsigned long) listLength(server.io_newjobs),
7779 (unsigned long) listLength(server.io_processing),
7780 (unsigned long) listLength(server.io_processed),
d5d55fc3 7781 (unsigned long) server.io_active_threads,
7782 (unsigned long) server.vm_blocked_clients
7d98e08c 7783 );
1064ef87 7784 unlockThreadedIO();
7d98e08c 7785 }
c3cb078d 7786 for (j = 0; j < server.dbnum; j++) {
7787 long long keys, vkeys;
7788
7789 keys = dictSize(server.db[j].dict);
7790 vkeys = dictSize(server.db[j].expires);
7791 if (keys || vkeys) {
9d65a1bb 7792 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 7793 j, keys, vkeys);
7794 }
7795 }
1c85b79f 7796 return info;
7797}
7798
7799static void infoCommand(redisClient *c) {
7800 sds info = genRedisInfoString();
83c6a618 7801 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7802 (unsigned long)sdslen(info)));
ed9b544e 7803 addReplySds(c,info);
70003d28 7804 addReply(c,shared.crlf);
ed9b544e 7805}
7806
3305306f 7807static void monitorCommand(redisClient *c) {
7808 /* ignore MONITOR if aleady slave or in monitor mode */
7809 if (c->flags & REDIS_SLAVE) return;
7810
7811 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7812 c->slaveseldb = 0;
6b47e12e 7813 listAddNodeTail(server.monitors,c);
3305306f 7814 addReply(c,shared.ok);
7815}
7816
7817/* ================================= Expire ================================= */
7818static int removeExpire(redisDb *db, robj *key) {
09241813 7819 if (dictDelete(db->expires,key->ptr) == DICT_OK) {
3305306f 7820 return 1;
7821 } else {
7822 return 0;
7823 }
7824}
7825
7826static int setExpire(redisDb *db, robj *key, time_t when) {
09241813 7827 sds copy = sdsdup(key->ptr);
7828 if (dictAdd(db->expires,copy,(void*)when) == DICT_ERR) {
7829 sdsfree(copy);
3305306f 7830 return 0;
7831 } else {
3305306f 7832 return 1;
7833 }
7834}
7835
bb32ede5 7836/* Return the expire time of the specified key, or -1 if no expire
7837 * is associated with this key (i.e. the key is non volatile) */
7838static time_t getExpire(redisDb *db, robj *key) {
7839 dictEntry *de;
7840
7841 /* No expire? return ASAP */
7842 if (dictSize(db->expires) == 0 ||
09241813 7843 (de = dictFind(db->expires,key->ptr)) == NULL) return -1;
bb32ede5 7844
7845 return (time_t) dictGetEntryVal(de);
7846}
7847
3305306f 7848static int expireIfNeeded(redisDb *db, robj *key) {
7849 time_t when;
7850 dictEntry *de;
7851
7852 /* No expire? return ASAP */
7853 if (dictSize(db->expires) == 0 ||
09241813 7854 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
3305306f 7855
7856 /* Lookup the expire */
7857 when = (time_t) dictGetEntryVal(de);
7858 if (time(NULL) <= when) return 0;
7859
7860 /* Delete the key */
09241813 7861 dbDelete(db,key);
2a6a2ed1 7862 server.stat_expiredkeys++;
09241813 7863 return 1;
3305306f 7864}
7865
7866static int deleteIfVolatile(redisDb *db, robj *key) {
7867 dictEntry *de;
7868
7869 /* No expire? return ASAP */
7870 if (dictSize(db->expires) == 0 ||
09241813 7871 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
3305306f 7872
7873 /* Delete the key */
0c66a471 7874 server.dirty++;
2a6a2ed1 7875 server.stat_expiredkeys++;
09241813 7876 dictDelete(db->expires,key->ptr);
7877 return dictDelete(db->dict,key->ptr) == DICT_OK;
3305306f 7878}
7879
bbe025e0 7880static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 7881 dictEntry *de;
bbe025e0
AM
7882 time_t seconds;
7883
bd79a6bd 7884 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
bbe025e0
AM
7885
7886 seconds -= offset;
3305306f 7887
09241813 7888 de = dictFind(c->db->dict,key->ptr);
3305306f 7889 if (de == NULL) {
7890 addReply(c,shared.czero);
7891 return;
7892 }
d4dd6556 7893 if (seconds <= 0) {
09241813 7894 if (dbDelete(c->db,key)) server.dirty++;
43e5ccdf 7895 addReply(c, shared.cone);
3305306f 7896 return;
7897 } else {
7898 time_t when = time(NULL)+seconds;
802e8373 7899 if (setExpire(c->db,key,when)) {
3305306f 7900 addReply(c,shared.cone);
77423026 7901 server.dirty++;
7902 } else {
3305306f 7903 addReply(c,shared.czero);
77423026 7904 }
3305306f 7905 return;
7906 }
7907}
7908
802e8373 7909static void expireCommand(redisClient *c) {
bbe025e0 7910 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 7911}
7912
7913static void expireatCommand(redisClient *c) {
bbe025e0 7914 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 7915}
7916
fd88489a 7917static void ttlCommand(redisClient *c) {
7918 time_t expire;
7919 int ttl = -1;
7920
7921 expire = getExpire(c->db,c->argv[1]);
7922 if (expire != -1) {
7923 ttl = (int) (expire-time(NULL));
7924 if (ttl < 0) ttl = -1;
7925 }
7926 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7927}
7928
6e469882 7929/* ================================ MULTI/EXEC ============================== */
7930
7931/* Client state initialization for MULTI/EXEC */
7932static void initClientMultiState(redisClient *c) {
7933 c->mstate.commands = NULL;
7934 c->mstate.count = 0;
7935}
7936
7937/* Release all the resources associated with MULTI/EXEC state */
7938static void freeClientMultiState(redisClient *c) {
7939 int j;
7940
7941 for (j = 0; j < c->mstate.count; j++) {
7942 int i;
7943 multiCmd *mc = c->mstate.commands+j;
7944
7945 for (i = 0; i < mc->argc; i++)
7946 decrRefCount(mc->argv[i]);
7947 zfree(mc->argv);
7948 }
7949 zfree(c->mstate.commands);
7950}
7951
7952/* Add a new command into the MULTI commands queue */
7953static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7954 multiCmd *mc;
7955 int j;
7956
7957 c->mstate.commands = zrealloc(c->mstate.commands,
7958 sizeof(multiCmd)*(c->mstate.count+1));
7959 mc = c->mstate.commands+c->mstate.count;
7960 mc->cmd = cmd;
7961 mc->argc = c->argc;
7962 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7963 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7964 for (j = 0; j < c->argc; j++)
7965 incrRefCount(mc->argv[j]);
7966 c->mstate.count++;
7967}
7968
7969static void multiCommand(redisClient *c) {
6531c94d 7970 if (c->flags & REDIS_MULTI) {
7971 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7972 return;
7973 }
6e469882 7974 c->flags |= REDIS_MULTI;
36c548f0 7975 addReply(c,shared.ok);
6e469882 7976}
7977
18b6cb76
DJ
7978static void discardCommand(redisClient *c) {
7979 if (!(c->flags & REDIS_MULTI)) {
7980 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7981 return;
7982 }
7983
7984 freeClientMultiState(c);
7985 initClientMultiState(c);
7986 c->flags &= (~REDIS_MULTI);
a2645226 7987 unwatchAllKeys(c);
18b6cb76
DJ
7988 addReply(c,shared.ok);
7989}
7990
66c8853f 7991/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7992 * implememntation for more information. */
7993static void execCommandReplicateMulti(redisClient *c) {
7994 struct redisCommand *cmd;
7995 robj *multistring = createStringObject("MULTI",5);
7996
7997 cmd = lookupCommand("multi");
7998 if (server.appendonly)
7999 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
8000 if (listLength(server.slaves))
8001 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
8002 decrRefCount(multistring);
8003}
8004
6e469882 8005static void execCommand(redisClient *c) {
8006 int j;
8007 robj **orig_argv;
8008 int orig_argc;
8009
8010 if (!(c->flags & REDIS_MULTI)) {
8011 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
8012 return;
8013 }
8014
37ab76c9 8015 /* Check if we need to abort the EXEC if some WATCHed key was touched.
8016 * A failed EXEC will return a multi bulk nil object. */
8017 if (c->flags & REDIS_DIRTY_CAS) {
8018 freeClientMultiState(c);
8019 initClientMultiState(c);
8020 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
8021 unwatchAllKeys(c);
8022 addReply(c,shared.nullmultibulk);
8023 return;
8024 }
8025
66c8853f 8026 /* Replicate a MULTI request now that we are sure the block is executed.
8027 * This way we'll deliver the MULTI/..../EXEC block as a whole and
8028 * both the AOF and the replication link will have the same consistency
8029 * and atomicity guarantees. */
8030 execCommandReplicateMulti(c);
8031
8032 /* Exec all the queued commands */
1ad4d316 8033 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
6e469882 8034 orig_argv = c->argv;
8035 orig_argc = c->argc;
8036 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
8037 for (j = 0; j < c->mstate.count; j++) {
8038 c->argc = c->mstate.commands[j].argc;
8039 c->argv = c->mstate.commands[j].argv;
8040 call(c,c->mstate.commands[j].cmd);
8041 }
8042 c->argv = orig_argv;
8043 c->argc = orig_argc;
8044 freeClientMultiState(c);
8045 initClientMultiState(c);
1ad4d316 8046 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
66c8853f 8047 /* Make sure the EXEC command is always replicated / AOF, since we
8048 * always send the MULTI command (we can't know beforehand if the
8049 * next operations will contain at least a modification to the DB). */
8050 server.dirty++;
6e469882 8051}
8052
4409877e 8053/* =========================== Blocking Operations ========================= */
8054
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, using BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non-empty list,
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty, we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blocking_keys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
8083
8084/* Set a client in blocking mode for the specified key, with the specified
8085 * timeout */
b177fd30 8086static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 8087 dictEntry *de;
8088 list *l;
b177fd30 8089 int j;
4409877e 8090
37ab76c9 8091 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
8092 c->blocking_keys_num = numkeys;
4409877e 8093 c->blockingto = timeout;
b177fd30 8094 for (j = 0; j < numkeys; j++) {
8095 /* Add the key in the client structure, to map clients -> keys */
37ab76c9 8096 c->blocking_keys[j] = keys[j];
b177fd30 8097 incrRefCount(keys[j]);
4409877e 8098
b177fd30 8099 /* And in the other "side", to map keys -> clients */
37ab76c9 8100 de = dictFind(c->db->blocking_keys,keys[j]);
b177fd30 8101 if (de == NULL) {
8102 int retval;
8103
8104 /* For every key we take a list of clients blocked for it */
8105 l = listCreate();
37ab76c9 8106 retval = dictAdd(c->db->blocking_keys,keys[j],l);
b177fd30 8107 incrRefCount(keys[j]);
8108 assert(retval == DICT_OK);
8109 } else {
8110 l = dictGetEntryVal(de);
8111 }
8112 listAddNodeTail(l,c);
4409877e 8113 }
b177fd30 8114 /* Mark the client as a blocked client */
4409877e 8115 c->flags |= REDIS_BLOCKED;
d5d55fc3 8116 server.blpop_blocked_clients++;
4409877e 8117}
8118
8119/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 8120static void unblockClientWaitingData(redisClient *c) {
4409877e 8121 dictEntry *de;
8122 list *l;
b177fd30 8123 int j;
4409877e 8124
37ab76c9 8125 assert(c->blocking_keys != NULL);
b177fd30 8126 /* The client may wait for multiple keys, so unblock it for every key. */
37ab76c9 8127 for (j = 0; j < c->blocking_keys_num; j++) {
b177fd30 8128 /* Remove this client from the list of clients waiting for this key. */
37ab76c9 8129 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
b177fd30 8130 assert(de != NULL);
8131 l = dictGetEntryVal(de);
8132 listDelNode(l,listSearchKey(l,c));
8133 /* If the list is empty we need to remove it to avoid wasting memory */
8134 if (listLength(l) == 0)
37ab76c9 8135 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
8136 decrRefCount(c->blocking_keys[j]);
b177fd30 8137 }
8138 /* Cleanup the client structure */
37ab76c9 8139 zfree(c->blocking_keys);
8140 c->blocking_keys = NULL;
4409877e 8141 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 8142 server.blpop_blocked_clients--;
5921aa36 8143 /* We want to process data if there is some command waiting
b0d8747d 8144 * in the input buffer. Note that this is safe even if
8145 * unblockClientWaitingData() gets called from freeClient() because
8146 * freeClient() will be smart enough to call this function
8147 * *after* c->querybuf was set to NULL. */
4409877e 8148 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
8149}
8150
8151/* This should be called from any function PUSHing into lists.
8152 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
8153 * 'ele' is the element pushed.
8154 *
8155 * If the function returns 0 there was no client waiting for a list push
8156 * against this key.
8157 *
8158 * If the function returns 1 there was a client waiting for a list push
8159 * against this key, the element was passed to this client thus it's not
8160 * needed to actually add it to the list and the caller should return asap. */
8161static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
8162 struct dictEntry *de;
8163 redisClient *receiver;
8164 list *l;
8165 listNode *ln;
8166
37ab76c9 8167 de = dictFind(c->db->blocking_keys,key);
4409877e 8168 if (de == NULL) return 0;
8169 l = dictGetEntryVal(de);
8170 ln = listFirst(l);
8171 assert(ln != NULL);
8172 receiver = ln->value;
4409877e 8173
b177fd30 8174 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 8175 addReplyBulk(receiver,key);
8176 addReplyBulk(receiver,ele);
b0d8747d 8177 unblockClientWaitingData(receiver);
4409877e 8178 return 1;
8179}
8180
8181/* Blocking RPOP/LPOP */
8182static void blockingPopGenericCommand(redisClient *c, int where) {
8183 robj *o;
8184 time_t timeout;
b177fd30 8185 int j;
4409877e 8186
b177fd30 8187 for (j = 1; j < c->argc-1; j++) {
8188 o = lookupKeyWrite(c->db,c->argv[j]);
8189 if (o != NULL) {
8190 if (o->type != REDIS_LIST) {
8191 addReply(c,shared.wrongtypeerr);
4409877e 8192 return;
b177fd30 8193 } else {
8194 list *list = o->ptr;
8195 if (listLength(list) != 0) {
8196 /* If the list contains elements fall back to the usual
8197 * non-blocking POP operation */
8198 robj *argv[2], **orig_argv;
8199 int orig_argc;
e0a62c7f 8200
b177fd30 8201 /* We need to alter the command arguments before to call
8202 * popGenericCommand() as the command takes a single key. */
8203 orig_argv = c->argv;
8204 orig_argc = c->argc;
8205 argv[1] = c->argv[j];
8206 c->argv = argv;
8207 c->argc = 2;
8208
8209 /* Also the return value is different, we need to output
8210 * the multi bulk reply header and the key name. The
8211 * "real" command will add the last element (the value)
8212 * for us. If this souds like an hack to you it's just
8213 * because it is... */
8214 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 8215 addReplyBulk(c,argv[1]);
b177fd30 8216 popGenericCommand(c,where);
8217
8218 /* Fix the client structure with the original stuff */
8219 c->argv = orig_argv;
8220 c->argc = orig_argc;
8221 return;
8222 }
4409877e 8223 }
8224 }
8225 }
8226 /* If the list is empty or the key does not exists we must block */
b177fd30 8227 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 8228 if (timeout > 0) timeout += time(NULL);
b177fd30 8229 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 8230}
8231
8232static void blpopCommand(redisClient *c) {
8233 blockingPopGenericCommand(c,REDIS_HEAD);
8234}
8235
8236static void brpopCommand(redisClient *c) {
8237 blockingPopGenericCommand(c,REDIS_TAIL);
8238}
8239
ed9b544e 8240/* =============================== Replication ============================= */
8241
a4d1ba9a 8242static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 8243 ssize_t nwritten, ret = size;
8244 time_t start = time(NULL);
8245
8246 timeout++;
8247 while(size) {
8248 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
8249 nwritten = write(fd,ptr,size);
8250 if (nwritten == -1) return -1;
8251 ptr += nwritten;
8252 size -= nwritten;
8253 }
8254 if ((time(NULL)-start) > timeout) {
8255 errno = ETIMEDOUT;
8256 return -1;
8257 }
8258 }
8259 return ret;
8260}
8261
a4d1ba9a 8262static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 8263 ssize_t nread, totread = 0;
8264 time_t start = time(NULL);
8265
8266 timeout++;
8267 while(size) {
8268 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
8269 nread = read(fd,ptr,size);
8270 if (nread == -1) return -1;
8271 ptr += nread;
8272 size -= nread;
8273 totread += nread;
8274 }
8275 if ((time(NULL)-start) > timeout) {
8276 errno = ETIMEDOUT;
8277 return -1;
8278 }
8279 }
8280 return totread;
8281}
8282
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one character
 * at a time through syncRead(); 'timeout' is applied to every single
 * character read.
 *
 * Reading stops at the first '\n' or when the buffer is full (at most size-1
 * characters are stored, so the result is always nul terminated once at least
 * one character has been processed). The '\n' is not stored and a '\r' right
 * before it is replaced by the terminator.
 *
 * Returns the number of characters stored before the newline (a stripped
 * '\r' is still counted), or -1 if syncRead() fails. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    size--; /* Reserve room for the nul terminator. */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* Account for the stored byte: without this a line longer than
             * the buffer would be written past its end. */
            size--;
        }
    }
    return nread;
}
8303
8304static void syncCommand(redisClient *c) {
40d224a9 8305 /* ignore SYNC if aleady slave or in monitor mode */
8306 if (c->flags & REDIS_SLAVE) return;
8307
8308 /* SYNC can't be issued when the server has pending data to send to
8309 * the client about already issued commands. We need a fresh reply
8310 * buffer registering the differences between the BGSAVE and the current
8311 * dataset, so that we can copy to other slaves if needed. */
8312 if (listLength(c->reply) != 0) {
8313 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
8314 return;
8315 }
8316
8317 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
8318 /* Here we need to check if there is a background saving operation
8319 * in progress, or if it is required to start one */
9d65a1bb 8320 if (server.bgsavechildpid != -1) {
40d224a9 8321 /* Ok a background save is in progress. Let's check if it is a good
8322 * one for replication, i.e. if there is another slave that is
8323 * registering differences since the server forked to save */
8324 redisClient *slave;
8325 listNode *ln;
c7df85a4 8326 listIter li;
40d224a9 8327
c7df85a4 8328 listRewind(server.slaves,&li);
8329 while((ln = listNext(&li))) {
40d224a9 8330 slave = ln->value;
8331 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 8332 }
8333 if (ln) {
8334 /* Perfect, the server is already registering differences for
8335 * another slave. Set the right state, and copy the buffer. */
8336 listRelease(c->reply);
8337 c->reply = listDup(slave->reply);
40d224a9 8338 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8339 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
8340 } else {
8341 /* No way, we need to wait for the next BGSAVE in order to
8342 * register differences */
8343 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8344 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
8345 }
8346 } else {
8347 /* Ok we don't have a BGSAVE in progress, let's start one */
8348 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
8349 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8350 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
8351 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
8352 return;
8353 }
8354 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8355 }
6208b3a7 8356 c->repldbfd = -1;
40d224a9 8357 c->flags |= REDIS_SLAVE;
8358 c->slaveseldb = 0;
6b47e12e 8359 listAddNodeTail(server.slaves,c);
40d224a9 8360 return;
8361}
8362
6208b3a7 8363static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
8364 redisClient *slave = privdata;
8365 REDIS_NOTUSED(el);
8366 REDIS_NOTUSED(mask);
8367 char buf[REDIS_IOBUF_LEN];
8368 ssize_t nwritten, buflen;
8369
8370 if (slave->repldboff == 0) {
8371 /* Write the bulk write count before to transfer the DB. In theory here
8372 * we don't know how much room there is in the output buffer of the
8373 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
8374 * operations) will never be smaller than the few bytes we need. */
8375 sds bulkcount;
8376
8377 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
8378 slave->repldbsize);
8379 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
8380 {
8381 sdsfree(bulkcount);
8382 freeClient(slave);
8383 return;
8384 }
8385 sdsfree(bulkcount);
8386 }
8387 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
8388 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
8389 if (buflen <= 0) {
8390 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
8391 (buflen == 0) ? "premature EOF" : strerror(errno));
8392 freeClient(slave);
8393 return;
8394 }
8395 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 8396 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 8397 strerror(errno));
8398 freeClient(slave);
8399 return;
8400 }
8401 slave->repldboff += nwritten;
8402 if (slave->repldboff == slave->repldbsize) {
8403 close(slave->repldbfd);
8404 slave->repldbfd = -1;
8405 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8406 slave->replstate = REDIS_REPL_ONLINE;
8407 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 8408 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 8409 freeClient(slave);
8410 return;
8411 }
8412 addReplySds(slave,sdsempty());
8413 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
8414 }
8415}
ed9b544e 8416
a3b21203 8417/* This function is called at the end of every backgrond saving.
8418 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
8419 * otherwise REDIS_ERR is passed to the function.
8420 *
8421 * The goal of this function is to handle slaves waiting for a successful
8422 * background saving in order to perform non-blocking synchronization. */
8423static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 8424 listNode *ln;
8425 int startbgsave = 0;
c7df85a4 8426 listIter li;
ed9b544e 8427
c7df85a4 8428 listRewind(server.slaves,&li);
8429 while((ln = listNext(&li))) {
6208b3a7 8430 redisClient *slave = ln->value;
ed9b544e 8431
6208b3a7 8432 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8433 startbgsave = 1;
8434 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8435 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 8436 struct redis_stat buf;
e0a62c7f 8437
6208b3a7 8438 if (bgsaveerr != REDIS_OK) {
8439 freeClient(slave);
8440 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8441 continue;
8442 }
8443 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 8444 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 8445 freeClient(slave);
8446 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8447 continue;
8448 }
8449 slave->repldboff = 0;
8450 slave->repldbsize = buf.st_size;
8451 slave->replstate = REDIS_REPL_SEND_BULK;
8452 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 8453 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 8454 freeClient(slave);
8455 continue;
8456 }
8457 }
ed9b544e 8458 }
6208b3a7 8459 if (startbgsave) {
8460 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 8461 listIter li;
8462
8463 listRewind(server.slaves,&li);
6208b3a7 8464 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 8465 while((ln = listNext(&li))) {
6208b3a7 8466 redisClient *slave = ln->value;
ed9b544e 8467
6208b3a7 8468 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8469 freeClient(slave);
8470 }
8471 }
8472 }
ed9b544e 8473}
8474
8475static int syncWithMaster(void) {
d0ccebcf 8476 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 8477 long dumpsize;
ed9b544e 8478 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 8479 int dfd, maxtries = 5;
ed9b544e 8480
8481 if (fd == -1) {
8482 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8483 strerror(errno));
8484 return REDIS_ERR;
8485 }
d0ccebcf 8486
8487 /* AUTH with the master if required. */
8488 if(server.masterauth) {
8489 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8490 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8491 close(fd);
8492 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8493 strerror(errno));
8494 return REDIS_ERR;
8495 }
8496 /* Read the AUTH result. */
8497 if (syncReadLine(fd,buf,1024,3600) == -1) {
8498 close(fd);
8499 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8500 strerror(errno));
8501 return REDIS_ERR;
8502 }
8503 if (buf[0] != '+') {
8504 close(fd);
8505 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8506 return REDIS_ERR;
8507 }
8508 }
8509
ed9b544e 8510 /* Issue the SYNC command */
8511 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8512 close(fd);
8513 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8514 strerror(errno));
8515 return REDIS_ERR;
8516 }
8517 /* Read the bulk write count */
8c4d91fc 8518 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 8519 close(fd);
8520 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8521 strerror(errno));
8522 return REDIS_ERR;
8523 }
4aa701c1 8524 if (buf[0] != '$') {
8525 close(fd);
8526 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8527 return REDIS_ERR;
8528 }
18e61fa2 8529 dumpsize = strtol(buf+1,NULL,10);
8530 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 8531 /* Read the bulk write data on a temp file */
8c5abee8 8532 while(maxtries--) {
8533 snprintf(tmpfile,256,
8534 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8535 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8536 if (dfd != -1) break;
5de9ad7c 8537 sleep(1);
8c5abee8 8538 }
ed9b544e 8539 if (dfd == -1) {
8540 close(fd);
8541 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8542 return REDIS_ERR;
8543 }
8544 while(dumpsize) {
8545 int nread, nwritten;
8546
8547 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8548 if (nread == -1) {
8549 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8550 strerror(errno));
8551 close(fd);
8552 close(dfd);
8553 return REDIS_ERR;
8554 }
8555 nwritten = write(dfd,buf,nread);
8556 if (nwritten == -1) {
8557 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8558 close(fd);
8559 close(dfd);
8560 return REDIS_ERR;
8561 }
8562 dumpsize -= nread;
8563 }
8564 close(dfd);
8565 if (rename(tmpfile,server.dbfilename) == -1) {
8566 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8567 unlink(tmpfile);
8568 close(fd);
8569 return REDIS_ERR;
8570 }
8571 emptyDb();
f78fd11b 8572 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 8573 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8574 close(fd);
8575 return REDIS_ERR;
8576 }
8577 server.master = createClient(fd);
8578 server.master->flags |= REDIS_MASTER;
179b3952 8579 server.master->authenticated = 1;
ed9b544e 8580 server.replstate = REDIS_REPL_CONNECTED;
8581 return REDIS_OK;
8582}
8583
321b0e13 8584static void slaveofCommand(redisClient *c) {
8585 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8586 !strcasecmp(c->argv[2]->ptr,"one")) {
8587 if (server.masterhost) {
8588 sdsfree(server.masterhost);
8589 server.masterhost = NULL;
8590 if (server.master) freeClient(server.master);
8591 server.replstate = REDIS_REPL_NONE;
8592 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8593 }
8594 } else {
8595 sdsfree(server.masterhost);
8596 server.masterhost = sdsdup(c->argv[1]->ptr);
8597 server.masterport = atoi(c->argv[2]->ptr);
8598 if (server.master) freeClient(server.master);
8599 server.replstate = REDIS_REPL_CONNECT;
8600 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8601 server.masterhost, server.masterport);
8602 }
8603 addReply(c,shared.ok);
8604}
8605
3fd78bcd 8606/* ============================ Maxmemory directive ======================== */
8607
a5819310 8608/* Try to free one object form the pre-allocated objects free list.
8609 * This is useful under low mem conditions as by default we take 1 million
8610 * free objects allocated. On success REDIS_OK is returned, otherwise
8611 * REDIS_ERR. */
8612static int tryFreeOneObjectFromFreelist(void) {
f870935d 8613 robj *o;
8614
a5819310 8615 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8616 if (listLength(server.objfreelist)) {
8617 listNode *head = listFirst(server.objfreelist);
8618 o = listNodeValue(head);
8619 listDelNode(server.objfreelist,head);
8620 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8621 zfree(o);
8622 return REDIS_OK;
8623 } else {
8624 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8625 return REDIS_ERR;
8626 }
f870935d 8627}
8628
3fd78bcd 8629/* This function gets called when 'maxmemory' is set on the config file to limit
8630 * the max memory used by the server, and we are out of memory.
8631 * This function will try to, in order:
8632 *
8633 * - Free objects from the free list
8634 * - Try to remove keys with an EXPIRE set
8635 *
8636 * It is not possible to free enough memory to reach used-memory < maxmemory
8637 * the server will start refusing commands that will enlarge even more the
8638 * memory usage.
8639 */
8640static void freeMemoryIfNeeded(void) {
8641 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 8642 int j, k, freed = 0;
8643
8644 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8645 for (j = 0; j < server.dbnum; j++) {
8646 int minttl = -1;
8647 robj *minkey = NULL;
8648 struct dictEntry *de;
8649
8650 if (dictSize(server.db[j].expires)) {
8651 freed = 1;
8652 /* From a sample of three keys drop the one nearest to
8653 * the natural expire */
8654 for (k = 0; k < 3; k++) {
8655 time_t t;
8656
8657 de = dictGetRandomKey(server.db[j].expires);
8658 t = (time_t) dictGetEntryVal(de);
8659 if (minttl == -1 || t < minttl) {
8660 minkey = dictGetEntryKey(de);
8661 minttl = t;
3fd78bcd 8662 }
3fd78bcd 8663 }
09241813 8664 dbDelete(server.db+j,minkey);
3fd78bcd 8665 }
3fd78bcd 8666 }
a5819310 8667 if (!freed) return; /* nothing to free... */
3fd78bcd 8668 }
8669}
8670
f80dff62 8671/* ============================== Append Only file ========================== */
8672
560db612 8673/* Called when the user switches from "appendonly yes" to "appendonly no"
8674 * at runtime using the CONFIG command. */
8675static void stopAppendOnly(void) {
8676 flushAppendOnlyFile();
8677 aof_fsync(server.appendfd);
8678 close(server.appendfd);
8679
8680 server.appendfd = -1;
8681 server.appendseldb = -1;
8682 server.appendonly = 0;
8683 /* rewrite operation in progress? kill it, wait child exit */
8684 if (server.bgsavechildpid != -1) {
8685 int statloc;
8686
8687 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8688 wait3(&statloc,0,NULL);
8689 /* reset the buffer accumulating changes while the child saves */
8690 sdsfree(server.bgrewritebuf);
8691 server.bgrewritebuf = sdsempty();
8692 server.bgsavechildpid = -1;
8693 }
8694}
8695
8696/* Called when the user switches from "appendonly no" to "appendonly yes"
8697 * at runtime using the CONFIG command. */
8698static int startAppendOnly(void) {
8699 server.appendonly = 1;
8700 server.lastfsync = time(NULL);
8701 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8702 if (server.appendfd == -1) {
8703 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8704 return REDIS_ERR;
8705 }
8706 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8707 server.appendonly = 0;
8708 close(server.appendfd);
8709 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8710 return REDIS_ERR;
8711 }
8712 return REDIS_OK;
8713}
8714
28ed1f33 8715/* Write the append only file buffer on disk.
8716 *
8717 * Since we are required to write the AOF before replying to the client,
8718 * and the only way the client socket can get a write is entering when the
8719 * the event loop, we accumulate all the AOF writes in a memory
8720 * buffer and write it on disk using this function just before entering
8721 * the event loop again. */
8722static void flushAppendOnlyFile(void) {
8723 time_t now;
8724 ssize_t nwritten;
8725
8726 if (sdslen(server.aofbuf) == 0) return;
8727
8728 /* We want to perform a single write. This should be guaranteed atomic
8729 * at least if the filesystem we are writing is a real physical one.
8730 * While this will save us against the server being killed I don't think
8731 * there is much to do about the whole server stopping for power problems
8732 * or alike */
8733 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8734 if (nwritten != (signed)sdslen(server.aofbuf)) {
8735 /* Ooops, we are in troubles. The best thing to do for now is
8736 * aborting instead of giving the illusion that everything is
8737 * working as expected. */
8738 if (nwritten == -1) {
8739 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8740 } else {
8741 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8742 }
8743 exit(1);
8744 }
8745 sdsfree(server.aofbuf);
8746 server.aofbuf = sdsempty();
8747
38db9171 8748 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8749 * childs performing heavy I/O on disk. */
8750 if (server.no_appendfsync_on_rewrite &&
8751 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8752 return;
28ed1f33 8753 /* Fsync if needed */
8754 now = time(NULL);
8755 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8756 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8757 now-server.lastfsync > 1))
8758 {
8759 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8760 * flushing metadata. */
8761 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8762 server.lastfsync = now;
8763 }
8764}
8765
9376e434
PN
8766static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8767 int j;
8768 buf = sdscatprintf(buf,"*%d\r\n",argc);
8769 for (j = 0; j < argc; j++) {
8770 robj *o = getDecodedObject(argv[j]);
8771 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8772 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8773 buf = sdscatlen(buf,"\r\n",2);
8774 decrRefCount(o);
8775 }
8776 return buf;
8777}
8778
8779static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8780 int argc = 3;
8781 long when;
8782 robj *argv[3];
8783
8784 /* Make sure we can use strtol */
8785 seconds = getDecodedObject(seconds);
8786 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8787 decrRefCount(seconds);
8788
8789 argv[0] = createStringObject("EXPIREAT",8);
8790 argv[1] = key;
8791 argv[2] = createObject(REDIS_STRING,
8792 sdscatprintf(sdsempty(),"%ld",when));
8793 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8794 decrRefCount(argv[0]);
8795 decrRefCount(argv[2]);
8796 return buf;
8797}
8798
f80dff62 8799static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8800 sds buf = sdsempty();
f80dff62 8801 robj *tmpargv[3];
8802
8803 /* The DB this command was targetting is not the same as the last command
8804 * we appendend. To issue a SELECT command is needed. */
8805 if (dictid != server.appendseldb) {
8806 char seldb[64];
8807
8808 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 8809 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 8810 (unsigned long)strlen(seldb),seldb);
f80dff62 8811 server.appendseldb = dictid;
8812 }
8813
f80dff62 8814 if (cmd->proc == expireCommand) {
9376e434
PN
8815 /* Translate EXPIRE into EXPIREAT */
8816 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8817 } else if (cmd->proc == setexCommand) {
8818 /* Translate SETEX to SET and EXPIREAT */
8819 tmpargv[0] = createStringObject("SET",3);
f80dff62 8820 tmpargv[1] = argv[1];
9376e434
PN
8821 tmpargv[2] = argv[3];
8822 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8823 decrRefCount(tmpargv[0]);
8824 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8825 } else {
8826 buf = catAppendOnlyGenericCommand(buf,argc,argv);
f80dff62 8827 }
8828
28ed1f33 8829 /* Append to the AOF buffer. This will be flushed on disk just before
8830 * of re-entering the event loop, so before the client will get a
8831 * positive reply about the operation performed. */
8832 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8833
85a83172 8834 /* If a background append only file rewriting is in progress we want to
8835 * accumulate the differences between the child DB and the current one
8836 * in a buffer, so that when the child process will do its work we
8837 * can append the differences to the new append only file. */
8838 if (server.bgrewritechildpid != -1)
8839 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8840
8841 sdsfree(buf);
f80dff62 8842}
8843
8844/* In Redis commands are always executed in the context of a client, so in
8845 * order to load the append only file we need to create a fake client. */
8846static struct redisClient *createFakeClient(void) {
8847 struct redisClient *c = zmalloc(sizeof(*c));
8848
8849 selectDb(c,0);
8850 c->fd = -1;
8851 c->querybuf = sdsempty();
8852 c->argc = 0;
8853 c->argv = NULL;
8854 c->flags = 0;
9387d17d 8855 /* We set the fake client as a slave waiting for the synchronization
8856 * so that Redis will not try to send replies to this client. */
8857 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 8858 c->reply = listCreate();
8859 listSetFreeMethod(c->reply,decrRefCount);
8860 listSetDupMethod(c->reply,dupClientReplyValue);
4132ad8d 8861 initClientMultiState(c);
f80dff62 8862 return c;
8863}
8864
8865static void freeFakeClient(struct redisClient *c) {
8866 sdsfree(c->querybuf);
8867 listRelease(c->reply);
4132ad8d 8868 freeClientMultiState(c);
f80dff62 8869 zfree(c);
8870}
8871
8872/* Replay the append log file. On error REDIS_OK is returned. On non fatal
8873 * error (the append only file is zero-length) REDIS_ERR is returned. On
8874 * fatal error an error message is logged and the program exists. */
8875int loadAppendOnlyFile(char *filename) {
8876 struct redisClient *fakeClient;
8877 FILE *fp = fopen(filename,"r");
8878 struct redis_stat sb;
4132ad8d 8879 int appendonly = server.appendonly;
f80dff62 8880
8881 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8882 return REDIS_ERR;
8883
8884 if (fp == NULL) {
8885 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8886 exit(1);
8887 }
8888
4132ad8d
PN
8889 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8890 * to the same file we're about to read. */
8891 server.appendonly = 0;
8892
f80dff62 8893 fakeClient = createFakeClient();
8894 while(1) {
8895 int argc, j;
8896 unsigned long len;
8897 robj **argv;
8898 char buf[128];
8899 sds argsds;
8900 struct redisCommand *cmd;
a89b7013 8901 int force_swapout;
f80dff62 8902
8903 if (fgets(buf,sizeof(buf),fp) == NULL) {
8904 if (feof(fp))
8905 break;
8906 else
8907 goto readerr;
8908 }
8909 if (buf[0] != '*') goto fmterr;
8910 argc = atoi(buf+1);
8911 argv = zmalloc(sizeof(robj*)*argc);
8912 for (j = 0; j < argc; j++) {
8913 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8914 if (buf[0] != '$') goto fmterr;
8915 len = strtol(buf+1,NULL,10);
8916 argsds = sdsnewlen(NULL,len);
0f151ef1 8917 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 8918 argv[j] = createObject(REDIS_STRING,argsds);
8919 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8920 }
8921
8922 /* Command lookup */
8923 cmd = lookupCommand(argv[0]->ptr);
8924 if (!cmd) {
8925 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8926 exit(1);
8927 }
bdcb92f2 8928 /* Try object encoding */
f80dff62 8929 if (cmd->flags & REDIS_CMD_BULK)
05df7621 8930 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 8931 /* Run the command in the context of a fake client */
8932 fakeClient->argc = argc;
8933 fakeClient->argv = argv;
8934 cmd->proc(fakeClient);
8935 /* Discard the reply objects list from the fake client */
8936 while(listLength(fakeClient->reply))
8937 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8938 /* Clean up, ready for the next command */
8939 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8940 zfree(argv);
b492cf00 8941 /* Handle swapping while loading big datasets when VM is on */
a89b7013 8942 force_swapout = 0;
8943 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
8944 force_swapout = 1;
8945
8946 if (server.vm_enabled && force_swapout) {
b492cf00 8947 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 8948 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 8949 }
8950 }
f80dff62 8951 }
4132ad8d
PN
8952
8953 /* This point can only be reached when EOF is reached without errors.
8954 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8955 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8956
f80dff62 8957 fclose(fp);
8958 freeFakeClient(fakeClient);
4132ad8d 8959 server.appendonly = appendonly;
f80dff62 8960 return REDIS_OK;
8961
8962readerr:
8963 if (feof(fp)) {
8964 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8965 } else {
8966 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8967 }
8968 exit(1);
8969fmterr:
8970 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8971 exit(1);
8972}
8973
/* Write the 'len' bytes at 's' to 'fp' as a binary-safe bulk string:
 * $<count>\r\n<payload>\r\n
 * Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char header[128];
    int hlen;

    /* Build the "$<count>\r\n" prefix by hand. */
    header[0] = '$';
    hlen = 1+ll2string(header+1,sizeof(header)-1,len);
    header[hlen++] = '\r';
    header[hlen++] = '\n';

    if (!fwrite(header,hlen,1,fp)) return 0;
    /* An empty payload is not written at all. */
    if (len != 0 && !fwrite(s,len,1,fp)) return 0;
    return fwrite("\r\n",2,1,fp) != 0;
}
8988
/* Write the double 'd', formatted with "%.17g", to 'fp' in bulk format:
 * $<count>\r\n<payload>\r\n
 * Returns 1 on success, 0 if a write failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    unsigned long digits;

    /* The payload buffer already carries the trailing CRLF, that must not
     * be counted in the announced length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    digits = (unsigned long)strlen(payload)-2;
    snprintf(header,sizeof(header),"$%lu\r\n",digits);

    if (!fwrite(header,strlen(header),1,fp)) return 0;
    return fwrite(payload,strlen(payload),1,fp) != 0;
}
8999
/* Write the integer 'l' to 'fp' in bulk format:
 * $<count>\r\n<payload>\r\n
 * Header and payload are formatted in a single buffer and emitted with one
 * fwrite(). Returns 1 on success, 0 if the write failed. */
static int fwriteBulkLongLong(FILE *fp, long long l) {
    char digits[128], out[128];
    unsigned int ndigits, outlen;

    ndigits = ll2string(digits,32,l);
    outlen = snprintf(out,sizeof(out),"$%u\r\n%s\r\n",ndigits,digits);
    return fwrite(out,outlen,1,fp) != 0;
}
9009
9eaef89f
PN
9010/* Delegate writing an object to writing a bulk string or bulk long long. */
9011static int fwriteBulkObject(FILE *fp, robj *obj) {
9012 /* Avoid using getDecodedObject to help copy-on-write (we are often
9013 * in a child process when this function is called). */
9014 if (obj->encoding == REDIS_ENCODING_INT) {
9015 return fwriteBulkLongLong(fp,(long)obj->ptr);
9016 } else if (obj->encoding == REDIS_ENCODING_RAW) {
9017 return fwriteBulkString(fp,obj->ptr,sdslen(obj->ptr));
9018 } else {
9019 redisPanic("Unknown string encoding");
9020 }
9021}
9022
9d65a1bb 9023/* Write a sequence of commands able to fully rebuild the dataset into
9024 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
9025static int rewriteAppendOnlyFile(char *filename) {
9026 dictIterator *di = NULL;
9027 dictEntry *de;
9028 FILE *fp;
9029 char tmpfile[256];
9030 int j;
9031 time_t now = time(NULL);
9032
9033 /* Note that we have to use a different temp name here compared to the
9034 * one used by rewriteAppendOnlyFileBackground() function. */
9035 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
9036 fp = fopen(tmpfile,"w");
9037 if (!fp) {
9038 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
9039 return REDIS_ERR;
9040 }
9041 for (j = 0; j < server.dbnum; j++) {
9042 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
9043 redisDb *db = server.db+j;
9044 dict *d = db->dict;
9045 if (dictSize(d) == 0) continue;
9046 di = dictGetIterator(d);
9047 if (!di) {
9048 fclose(fp);
9049 return REDIS_ERR;
9050 }
9051
9052 /* SELECT the new DB */
9053 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
9eaef89f 9054 if (fwriteBulkLongLong(fp,j) == 0) goto werr;
9d65a1bb 9055
9056 /* Iterate this DB writing every entry */
9057 while((de = dictNext(di)) != NULL) {
09241813 9058 sds keystr = dictGetEntryKey(de);
9059 robj key, *o;
e7546c63 9060 time_t expiretime;
9061 int swapped;
9062
09241813 9063 keystr = dictGetEntryKey(de);
560db612 9064 o = dictGetEntryVal(de);
09241813 9065 initStaticStringObject(key,keystr);
b9bc0eef 9066 /* If the value for this key is swapped, load a preview in memory.
9067 * We use a "swapped" flag to remember if we need to free the
9068 * value object instead to just increment the ref count anyway
9069 * in order to avoid copy-on-write of pages if we are forked() */
560db612 9070 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
9071 o->storage == REDIS_VM_SWAPPING) {
e7546c63 9072 swapped = 0;
9073 } else {
560db612 9074 o = vmPreviewObject(o);
e7546c63 9075 swapped = 1;
9076 }
09241813 9077 expiretime = getExpire(db,&key);
9d65a1bb 9078
9079 /* Save the key and associated value */
9d65a1bb 9080 if (o->type == REDIS_STRING) {
9081 /* Emit a SET command */
9082 char cmd[]="*3\r\n$3\r\nSET\r\n";
9083 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9084 /* Key and value */
09241813 9085 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9c8e3cee 9086 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 9087 } else if (o->type == REDIS_LIST) {
9088 /* Emit the RPUSHes needed to rebuild the list */
6ddc908a
PN
9089 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
9090 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
9091 unsigned char *zl = o->ptr;
9092 unsigned char *p = ziplistIndex(zl,0);
9093 unsigned char *vstr;
9094 unsigned int vlen;
9095 long long vlong;
9096
9097 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
9098 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
846d8b3e 9099 if (fwriteBulkObject(fp,&key) == 0) goto werr;
6ddc908a
PN
9100 if (vstr) {
9101 if (fwriteBulkString(fp,(char*)vstr,vlen) == 0)
9102 goto werr;
9103 } else {
9104 if (fwriteBulkLongLong(fp,vlong) == 0)
9105 goto werr;
9106 }
9107 p = ziplistNext(zl,p);
9108 }
9109 } else if (o->encoding == REDIS_ENCODING_LIST) {
9110 list *list = o->ptr;
9111 listNode *ln;
9112 listIter li;
9113
9114 listRewind(list,&li);
9115 while((ln = listNext(&li))) {
9116 robj *eleobj = listNodeValue(ln);
9117
9118 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
846d8b3e 9119 if (fwriteBulkObject(fp,&key) == 0) goto werr;
6ddc908a
PN
9120 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9121 }
9122 } else {
9123 redisPanic("Unknown list encoding");
9d65a1bb 9124 }
9125 } else if (o->type == REDIS_SET) {
9126 /* Emit the SADDs needed to rebuild the set */
9127 dict *set = o->ptr;
9128 dictIterator *di = dictGetIterator(set);
9129 dictEntry *de;
9130
9131 while((de = dictNext(di)) != NULL) {
9132 char cmd[]="*3\r\n$4\r\nSADD\r\n";
9133 robj *eleobj = dictGetEntryKey(de);
9134
9135 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9136 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9c8e3cee 9137 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 9138 }
9139 dictReleaseIterator(di);
9140 } else if (o->type == REDIS_ZSET) {
9141 /* Emit the ZADDs needed to rebuild the sorted set */
9142 zset *zs = o->ptr;
9143 dictIterator *di = dictGetIterator(zs->dict);
9144 dictEntry *de;
9145
9146 while((de = dictNext(di)) != NULL) {
9147 char cmd[]="*4\r\n$4\r\nZADD\r\n";
9148 robj *eleobj = dictGetEntryKey(de);
9149 double *score = dictGetEntryVal(de);
9150
9151 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9152 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9d65a1bb 9153 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 9154 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 9155 }
9156 dictReleaseIterator(di);
9c8e3cee 9157 } else if (o->type == REDIS_HASH) {
9158 char cmd[]="*4\r\n$4\r\nHSET\r\n";
9159
9160 /* Emit the HSETs needed to rebuild the hash */
9161 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9162 unsigned char *p = zipmapRewind(o->ptr);
9163 unsigned char *field, *val;
9164 unsigned int flen, vlen;
9165
9166 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
9167 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9168 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9c8e3cee 9169 if (fwriteBulkString(fp,(char*)field,flen) == -1)
9170 return -1;
9171 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
9172 return -1;
9173 }
9174 } else {
9175 dictIterator *di = dictGetIterator(o->ptr);
9176 dictEntry *de;
9177
9178 while((de = dictNext(di)) != NULL) {
9179 robj *field = dictGetEntryKey(de);
9180 robj *val = dictGetEntryVal(de);
9181
9182 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9183 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9c8e3cee 9184 if (fwriteBulkObject(fp,field) == -1) return -1;
9185 if (fwriteBulkObject(fp,val) == -1) return -1;
9186 }
9187 dictReleaseIterator(di);
9188 }
9d65a1bb 9189 } else {
f83c6cb5 9190 redisPanic("Unknown object type");
9d65a1bb 9191 }
9192 /* Save the expire time */
9193 if (expiretime != -1) {
e96e4fbf 9194 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 9195 /* If this key is already expired skip it */
9196 if (expiretime < now) continue;
9197 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9198 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9eaef89f 9199 if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr;
9d65a1bb 9200 }
b9bc0eef 9201 if (swapped) decrRefCount(o);
9d65a1bb 9202 }
9203 dictReleaseIterator(di);
9204 }
9205
9206 /* Make sure data will not remain on the OS's output buffers */
9207 fflush(fp);
b0bd87f6 9208 aof_fsync(fileno(fp));
9d65a1bb 9209 fclose(fp);
e0a62c7f 9210
9d65a1bb 9211 /* Use RENAME to make sure the DB file is changed atomically only
9212 * if the generate DB file is ok. */
9213 if (rename(tmpfile,filename) == -1) {
9214 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
9215 unlink(tmpfile);
9216 return REDIS_ERR;
9217 }
9218 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
9219 return REDIS_OK;
9220
9221werr:
9222 fclose(fp);
9223 unlink(tmpfile);
e96e4fbf 9224 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 9225 if (di) dictReleaseIterator(di);
9226 return REDIS_ERR;
9227}
9228
9229/* This is how rewriting of the append only file in background works:
9230 *
9231 * 1) The user calls BGREWRITEAOF
9232 * 2) Redis calls this function, that forks():
9233 * 2a) the child rewrite the append only file in a temp file.
9234 * 2b) the parent accumulates differences in server.bgrewritebuf.
9235 * 3) When the child finished '2a' exists.
9236 * 4) The parent will trap the exit code, if it's OK, will append the
9237 * data accumulated into server.bgrewritebuf into the temp file, and
9238 * finally will rename(2) the temp file in the actual file name.
9239 * The the new file is reopened as the new append only file. Profit!
9240 */
9241static int rewriteAppendOnlyFileBackground(void) {
9242 pid_t childpid;
9243
9244 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 9245 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 9246 if ((childpid = fork()) == 0) {
9247 /* Child */
9248 char tmpfile[256];
9d65a1bb 9249
054e426d 9250 if (server.vm_enabled) vmReopenSwapFile();
9251 close(server.fd);
9d65a1bb 9252 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
9253 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 9254 _exit(0);
9d65a1bb 9255 } else {
478c2c6f 9256 _exit(1);
9d65a1bb 9257 }
9258 } else {
9259 /* Parent */
9260 if (childpid == -1) {
9261 redisLog(REDIS_WARNING,
9262 "Can't rewrite append only file in background: fork: %s",
9263 strerror(errno));
9264 return REDIS_ERR;
9265 }
9266 redisLog(REDIS_NOTICE,
9267 "Background append only file rewriting started by pid %d",childpid);
9268 server.bgrewritechildpid = childpid;
884d4b39 9269 updateDictResizePolicy();
85a83172 9270 /* We set appendseldb to -1 in order to force the next call to the
9271 * feedAppendOnlyFile() to issue a SELECT command, so the differences
9272 * accumulated by the parent into server.bgrewritebuf will start
9273 * with a SELECT statement and it will be safe to merge. */
9274 server.appendseldb = -1;
9d65a1bb 9275 return REDIS_OK;
9276 }
9277 return REDIS_OK; /* unreached */
9278}
9279
9280static void bgrewriteaofCommand(redisClient *c) {
9281 if (server.bgrewritechildpid != -1) {
9282 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
9283 return;
9284 }
9285 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 9286 char *status = "+Background append only file rewriting started\r\n";
9287 addReplySds(c,sdsnew(status));
9d65a1bb 9288 } else {
9289 addReply(c,shared.err);
9290 }
9291}
9292
/* Remove the temp file named "temp-rewriteaof-bg-<pid>.aof" for the given
 * 'childpid'. The result of unlink() is not checked. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
9299
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make clearer which functionality belongs to the blocking VM and which to
 * the threaded (non-blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
9320
560db612 9321/* =================== Virtual Memory - Blocking Side ====================== */
2e5eb04e 9322
560db612 9323/* Create a VM pointer object. This kind of objects are used in place of
9324 * values in the key -> value hash table, for swapped out objects. */
9325static vmpointer *createVmPointer(int vtype) {
9326 vmpointer *vp = zmalloc(sizeof(vmpointer));
2e5eb04e 9327
560db612 9328 vp->type = REDIS_VMPOINTER;
9329 vp->storage = REDIS_VM_SWAPPED;
9330 vp->vtype = vtype;
9331 return vp;
2e5eb04e 9332}
9333
75680a3c 9334static void vmInit(void) {
9335 off_t totsize;
996cb5f7 9336 int pipefds[2];
bcaa7a4f 9337 size_t stacksize;
8b5bb414 9338 struct flock fl;
75680a3c 9339
4ad37480 9340 if (server.vm_max_threads != 0)
9341 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
9342
054e426d 9343 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8b5bb414 9344 /* Try to open the old swap file, otherwise create it */
6fa987e3 9345 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
9346 server.vm_fp = fopen(server.vm_swap_file,"w+b");
9347 }
75680a3c 9348 if (server.vm_fp == NULL) {
6fa987e3 9349 redisLog(REDIS_WARNING,
8b5bb414 9350 "Can't open the swap file: %s. Exiting.",
6fa987e3 9351 strerror(errno));
75680a3c 9352 exit(1);
9353 }
9354 server.vm_fd = fileno(server.vm_fp);
8b5bb414 9355 /* Lock the swap file for writing, this is useful in order to avoid
9356 * another instance to use the same swap file for a config error. */
9357 fl.l_type = F_WRLCK;
9358 fl.l_whence = SEEK_SET;
9359 fl.l_start = fl.l_len = 0;
9360 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
9361 redisLog(REDIS_WARNING,
9362 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
9363 exit(1);
9364 }
9365 /* Initialize */
75680a3c 9366 server.vm_next_page = 0;
9367 server.vm_near_pages = 0;
7d98e08c 9368 server.vm_stats_used_pages = 0;
9369 server.vm_stats_swapped_objects = 0;
9370 server.vm_stats_swapouts = 0;
9371 server.vm_stats_swapins = 0;
75680a3c 9372 totsize = server.vm_pages*server.vm_page_size;
9373 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
9374 if (ftruncate(server.vm_fd,totsize) == -1) {
9375 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
9376 strerror(errno));
9377 exit(1);
9378 } else {
9379 redisLog(REDIS_NOTICE,"Swap file allocated with success");
9380 }
7d30035d 9381 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 9382 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 9383 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 9384 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 9385
996cb5f7 9386 /* Initialize threaded I/O (used by Virtual Memory) */
9387 server.io_newjobs = listCreate();
9388 server.io_processing = listCreate();
9389 server.io_processed = listCreate();
d5d55fc3 9390 server.io_ready_clients = listCreate();
92f8e882 9391 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 9392 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
9393 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 9394 server.io_active_threads = 0;
996cb5f7 9395 if (pipe(pipefds) == -1) {
9396 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
9397 ,strerror(errno));
9398 exit(1);
9399 }
9400 server.io_ready_pipe_read = pipefds[0];
9401 server.io_ready_pipe_write = pipefds[1];
9402 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 9403 /* LZF requires a lot of stack */
9404 pthread_attr_init(&server.io_threads_attr);
9405 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
9406 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
9407 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 9408 /* Listen for events in the threaded I/O pipe */
9409 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
9410 vmThreadedIOCompletedJob, NULL) == AE_ERR)
9411 oom("creating file event");
75680a3c 9412}
9413
06224fec 9414/* Mark the page as used */
9415static void vmMarkPageUsed(off_t page) {
9416 off_t byte = page/8;
9417 int bit = page&7;
970e10bb 9418 redisAssert(vmFreePage(page) == 1);
06224fec 9419 server.vm_bitmap[byte] |= 1<<bit;
9420}
9421
9422/* Mark N contiguous pages as used, with 'page' being the first. */
9423static void vmMarkPagesUsed(off_t page, off_t count) {
9424 off_t j;
9425
9426 for (j = 0; j < count; j++)
7d30035d 9427 vmMarkPageUsed(page+j);
7d98e08c 9428 server.vm_stats_used_pages += count;
7c775e09 9429 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
9430 (long long)count, (long long)page);
06224fec 9431}
9432
9433/* Mark the page as free */
9434static void vmMarkPageFree(off_t page) {
9435 off_t byte = page/8;
9436 int bit = page&7;
970e10bb 9437 redisAssert(vmFreePage(page) == 0);
06224fec 9438 server.vm_bitmap[byte] &= ~(1<<bit);
9439}
9440
9441/* Mark N contiguous pages as free, with 'page' being the first. */
9442static void vmMarkPagesFree(off_t page, off_t count) {
9443 off_t j;
9444
9445 for (j = 0; j < count; j++)
7d30035d 9446 vmMarkPageFree(page+j);
7d98e08c 9447 server.vm_stats_used_pages -= count;
7c775e09 9448 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
9449 (long long)count, (long long)page);
06224fec 9450}
9451
9452/* Test if the page is free */
9453static int vmFreePage(off_t page) {
9454 off_t byte = page/8;
9455 int bit = page&7;
7d30035d 9456 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 9457}
9458
9459/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 9460 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 9461 * REDIS_ERR is returned.
06224fec 9462 *
9463 * This function uses a simple algorithm: we try to allocate
9464 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9465 * again from the start of the swap file searching for free spaces.
9466 *
9467 * If it looks pretty clear that there are no free pages near our offset
9468 * we try to find less populated places doing a forward jump of
9469 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9470 * without hurry, and then we jump again and so forth...
e0a62c7f 9471 *
06224fec 9472 * This function can be improved using a free list to avoid to guess
9473 * too much, since we could collect data about freed pages.
9474 *
9475 * note: I implemented this function just after watching an episode of
9476 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9477 */
c7df85a4 9478static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 9479 off_t base, offset = 0, since_jump = 0, numfree = 0;
9480
9481 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9482 server.vm_near_pages = 0;
9483 server.vm_next_page = 0;
9484 }
9485 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9486 base = server.vm_next_page;
9487
9488 while(offset < server.vm_pages) {
9489 off_t this = base+offset;
9490
9491 /* If we overflow, restart from page zero */
9492 if (this >= server.vm_pages) {
9493 this -= server.vm_pages;
9494 if (this == 0) {
9495 /* Just overflowed, what we found on tail is no longer
9496 * interesting, as it's no longer contiguous. */
9497 numfree = 0;
9498 }
9499 }
9500 if (vmFreePage(this)) {
9501 /* This is a free page */
9502 numfree++;
9503 /* Already got N free pages? Return to the caller, with success */
9504 if (numfree == n) {
7d30035d 9505 *first = this-(n-1);
9506 server.vm_next_page = this+1;
7c775e09 9507 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 9508 return REDIS_OK;
06224fec 9509 }
9510 } else {
9511 /* The current one is not a free page */
9512 numfree = 0;
9513 }
9514
9515 /* Fast-forward if the current page is not free and we already
9516 * searched enough near this place. */
9517 since_jump++;
9518 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9519 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9520 since_jump = 0;
9521 /* Note that even if we rewind after the jump, we are don't need
9522 * to make sure numfree is set to zero as we only jump *if* it
9523 * is set to zero. */
9524 } else {
9525 /* Otherwise just check the next page */
9526 offset++;
9527 }
9528 }
3a66edc7 9529 return REDIS_ERR;
9530}
9531
a5819310 9532/* Write the specified object at the specified page of the swap file */
9533static int vmWriteObjectOnSwap(robj *o, off_t page) {
9534 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9535 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9536 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9537 redisLog(REDIS_WARNING,
9ebed7cf 9538 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 9539 strerror(errno));
9540 return REDIS_ERR;
9541 }
9542 rdbSaveObject(server.vm_fp,o);
ba76a8f9 9543 fflush(server.vm_fp);
a5819310 9544 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9545 return REDIS_OK;
9546}
9547
a4798f73 9548/* Transfers the 'val' object to disk. Store all the information
9549 * a 'vmpointer' object containing all the information needed to load the
9550 * object back later is returned.
9551 *
3a66edc7 9552 * If we can't find enough contiguous empty pages to swap the object on disk
a4798f73 9553 * NULL is returned. */
560db612 9554static vmpointer *vmSwapObjectBlocking(robj *val) {
b9bc0eef 9555 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 9556 off_t page;
560db612 9557 vmpointer *vp;
3a66edc7 9558
560db612 9559 assert(val->storage == REDIS_VM_MEMORY);
9560 assert(val->refcount == 1);
9561 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return NULL;
9562 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return NULL;
9563
9564 vp = createVmPointer(val->type);
9565 vp->page = page;
9566 vp->usedpages = pages;
3a66edc7 9567 decrRefCount(val); /* Deallocate the object from memory. */
9568 vmMarkPagesUsed(page,pages);
560db612 9569 redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)",
9570 (void*) val,
7d30035d 9571 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 9572 server.vm_stats_swapped_objects++;
9573 server.vm_stats_swapouts++;
560db612 9574 return vp;
3a66edc7 9575}
9576
a5819310 9577static robj *vmReadObjectFromSwap(off_t page, int type) {
9578 robj *o;
3a66edc7 9579
a5819310 9580 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9581 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 9582 redisLog(REDIS_WARNING,
d5d55fc3 9583 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 9584 strerror(errno));
478c2c6f 9585 _exit(1);
3a66edc7 9586 }
a5819310 9587 o = rdbLoadObject(type,server.vm_fp);
9588 if (o == NULL) {
d5d55fc3 9589 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 9590 _exit(1);
3a66edc7 9591 }
a5819310 9592 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9593 return o;
9594}
9595
560db612 9596/* Load the specified object from swap to memory.
a5819310 9597 * The newly allocated object is returned.
9598 *
9599 * If preview is true the unserialized object is returned to the caller but
560db612 9600 * the pages are not marked as freed, nor the vp object is freed. */
9601static robj *vmGenericLoadObject(vmpointer *vp, int preview) {
a5819310 9602 robj *val;
9603
560db612 9604 redisAssert(vp->type == REDIS_VMPOINTER &&
9605 (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING));
9606 val = vmReadObjectFromSwap(vp->page,vp->vtype);
7e69548d 9607 if (!preview) {
560db612 9608 redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp);
9609 vmMarkPagesFree(vp->page,vp->usedpages);
9610 zfree(vp);
7d98e08c 9611 server.vm_stats_swapped_objects--;
38aba9a1 9612 } else {
560db612 9613 redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp);
7e69548d 9614 }
7d98e08c 9615 server.vm_stats_swapins++;
3a66edc7 9616 return val;
06224fec 9617}
9618
560db612 9619/* Plain object loading, from swap to memory.
9620 *
9621 * 'o' is actually a redisVmPointer structure that will be freed by the call.
9622 * The return value is the loaded object. */
9623static robj *vmLoadObject(robj *o) {
996cb5f7 9624 /* If we are loading the object in background, stop it, we
9625 * need to load this object synchronously ASAP. */
560db612 9626 if (o->storage == REDIS_VM_LOADING)
9627 vmCancelThreadedIOJob(o);
9628 return vmGenericLoadObject((vmpointer*)o,0);
7e69548d 9629}
9630
9631/* Just load the value on disk, without to modify the key.
9632 * This is useful when we want to perform some operation on the value
9633 * without to really bring it from swap to memory, like while saving the
9634 * dataset or rewriting the append only log. */
560db612 9635static robj *vmPreviewObject(robj *o) {
9636 return vmGenericLoadObject((vmpointer*)o,1);
7e69548d 9637}
9638
4ef8de8a 9639/* How a good candidate is this object for swapping?
9640 * The better candidate it is, the greater the returned value.
9641 *
9642 * Currently we try to perform a fast estimation of the object size in
9643 * memory, and combine it with aging informations.
9644 *
9645 * Basically swappability = idle-time * log(estimated size)
9646 *
9647 * Bigger objects are preferred over smaller objects, but not
9648 * proportionally, this is why we use the logarithm. This algorithm is
9649 * just a first try and will probably be tuned later. */
9650static double computeObjectSwappability(robj *o) {
560db612 9651 /* actual age can be >= minage, but not < minage. As we use wrapping
9652 * 21 bit clocks with minutes resolution for the LRU. */
9653 time_t minage = abs(server.lruclock - o->lru);
4ef8de8a 9654 long asize = 0;
9655 list *l;
9656 dict *d;
9657 struct dictEntry *de;
9658 int z;
9659
560db612 9660 if (minage <= 0) return 0;
4ef8de8a 9661 switch(o->type) {
9662 case REDIS_STRING:
9663 if (o->encoding != REDIS_ENCODING_RAW) {
9664 asize = sizeof(*o);
9665 } else {
9666 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9667 }
9668 break;
9669 case REDIS_LIST:
9670 l = o->ptr;
9671 listNode *ln = listFirst(l);
9672
9673 asize = sizeof(list);
9674 if (ln) {
9675 robj *ele = ln->value;
9676 long elesize;
9677
9678 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9679 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
4ef8de8a 9680 asize += (sizeof(listNode)+elesize)*listLength(l);
9681 }
9682 break;
9683 case REDIS_SET:
9684 case REDIS_ZSET:
9685 z = (o->type == REDIS_ZSET);
9686 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9687
9688 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9689 if (z) asize += sizeof(zset)-sizeof(dict);
9690 if (dictSize(d)) {
9691 long elesize;
9692 robj *ele;
9693
9694 de = dictGetRandomKey(d);
9695 ele = dictGetEntryKey(de);
9696 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9697 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
4ef8de8a 9698 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9699 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9700 }
9701 break;
a97b9060 9702 case REDIS_HASH:
9703 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9704 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9705 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9706 unsigned int klen, vlen;
9707 unsigned char *key, *val;
9708
9709 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9710 klen = 0;
9711 vlen = 0;
9712 }
9713 asize = len*(klen+vlen+3);
9714 } else if (o->encoding == REDIS_ENCODING_HT) {
9715 d = o->ptr;
9716 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9717 if (dictSize(d)) {
9718 long elesize;
9719 robj *ele;
9720
9721 de = dictGetRandomKey(d);
9722 ele = dictGetEntryKey(de);
9723 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9724 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
a97b9060 9725 ele = dictGetEntryVal(de);
9726 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9727 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
a97b9060 9728 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9729 }
9730 }
9731 break;
4ef8de8a 9732 }
560db612 9733 return (double)minage*log(1+asize);
4ef8de8a 9734}
9735
9736/* Try to swap an object that's a good candidate for swapping.
9737 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 9738 * to swap any object at all.
9739 *
9740 * If 'usethreaded' is true, Redis will try to swap the object in background
9741 * using I/O threads. */
9742static int vmSwapOneObject(int usethreads) {
4ef8de8a 9743 int j, i;
9744 struct dictEntry *best = NULL;
9745 double best_swappability = 0;
b9bc0eef 9746 redisDb *best_db = NULL;
44262c58 9747 robj *val;
9748 sds key;
4ef8de8a 9749
9750 for (j = 0; j < server.dbnum; j++) {
9751 redisDb *db = server.db+j;
b72f6a4b 9752 /* Why maxtries is set to 100?
9753 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9754 * are swappable objects */
b0d8747d 9755 int maxtries = 100;
4ef8de8a 9756
9757 if (dictSize(db->dict) == 0) continue;
9758 for (i = 0; i < 5; i++) {
9759 dictEntry *de;
9760 double swappability;
9761
e3cadb8a 9762 if (maxtries) maxtries--;
4ef8de8a 9763 de = dictGetRandomKey(db->dict);
4ef8de8a 9764 val = dictGetEntryVal(de);
1064ef87 9765 /* Only swap objects that are currently in memory.
9766 *
560db612 9767 * Also don't swap shared objects: not a good idea in general and
9768 * we need to ensure that the main thread does not touch the
1064ef87 9769 * object while the I/O thread is using it, but we can't
9770 * control other keys without adding additional mutex. */
560db612 9771 if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) {
e3cadb8a 9772 if (maxtries) i--; /* don't count this try */
9773 continue;
9774 }
4ef8de8a 9775 swappability = computeObjectSwappability(val);
9776 if (!best || swappability > best_swappability) {
9777 best = de;
9778 best_swappability = swappability;
b9bc0eef 9779 best_db = db;
4ef8de8a 9780 }
9781 }
9782 }
7c775e09 9783 if (best == NULL) return REDIS_ERR;
4ef8de8a 9784 key = dictGetEntryKey(best);
9785 val = dictGetEntryVal(best);
9786
e3cadb8a 9787 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
44262c58 9788 key, best_swappability);
4ef8de8a 9789
4ef8de8a 9790 /* Swap it */
a69a0c9c 9791 if (usethreads) {
4c8f2370 9792 robj *keyobj = createStringObject(key,sdslen(key));
9793 vmSwapObjectThreaded(keyobj,val,best_db);
9794 decrRefCount(keyobj);
4ef8de8a 9795 return REDIS_OK;
9796 } else {
560db612 9797 vmpointer *vp;
9798
9799 if ((vp = vmSwapObjectBlocking(val)) != NULL) {
9800 dictGetEntryVal(best) = vp;
a69a0c9c 9801 return REDIS_OK;
9802 } else {
9803 return REDIS_ERR;
9804 }
4ef8de8a 9805 }
9806}
9807
/* Swap one object out synchronously. See vmSwapOneObject(). */
static int vmSwapOneObjectBlocking() {
    const int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
9811
/* Swap one object out using the I/O threads. See vmSwapOneObject(). */
static int vmSwapOneObjectThreaded() {
    const int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
9815
7e69548d 9816/* Return true if it's safe to swap out objects in a given moment.
9817 * Basically we don't want to swap objects out while there is a BGSAVE
9818 * or a BGAEOREWRITE running in backgroud. */
9819static int vmCanSwapOut(void) {
9820 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9821}
9822
996cb5f7 9823/* =================== Virtual Memory - Threaded I/O ======================= */
9824
b9bc0eef 9825static void freeIOJob(iojob *j) {
d5d55fc3 9826 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9827 j->type == REDIS_IOJOB_DO_SWAP ||
9828 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
560db612 9829 {
e4ed181d 9830 /* we fix the storage type, otherwise decrRefCount() will try to
9831 * kill the I/O thread Job (that does no longer exists). */
9832 if (j->val->storage == REDIS_VM_SWAPPING)
560db612 9833 j->val->storage = REDIS_VM_MEMORY;
b9bc0eef 9834 decrRefCount(j->val);
560db612 9835 }
9836 decrRefCount(j->key);
b9bc0eef 9837 zfree(j);
9838}
9839
996cb5f7 9840/* Every time a thread finished a Job, it writes a byte into the write side
9841 * of an unix pipe in order to "awake" the main thread, and this function
9842 * is called. */
/* Details:
 *
 * For every byte read from 'fd' the oldest job in server.io_processed is
 * removed (with the threaded I/O lock held) and post-processed here
 * according to its type:
 *
 * - REDIS_IOJOB_LOAD: the swap pages are released and the loaded value
 *   replaces the vmpointer in the dictionary entry of the key; clients
 *   blocked on the key are then handled.
 * - REDIS_IOJOB_PREPARE_SWAP: now that the number of pages is known, space
 *   is reserved in the swap file and the job is queued again as
 *   REDIS_IOJOB_DO_SWAP; if swapping is not possible the operation is undone.
 * - REDIS_IOJOB_DO_SWAP: the value in the dictionary entry is replaced by a
 *   vmpointer and the in-memory value is released; more swap requests may
 *   be queued if memory usage is still above server.vm_max_memory.
 *
 * The function returns after a bounded number of non-canceled jobs (see
 * 'toprocess' below) or when read() stops returning exactly one byte.
 *
 * 'el', 'privdata' and 'mask' are unused. */
9843static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9844 int mask)
9845{
9846 char buf[1];
b0d8747d 9847 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 9848 REDIS_NOTUSED(el);
9849 REDIS_NOTUSED(mask);
9850 REDIS_NOTUSED(privdata);
9851
9852 /* For every byte we read in the read side of the pipe, there is one
9853 * I/O job completed to process. */
9854 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 9855 iojob *j;
9856 listNode *ln;
b9bc0eef 9857 struct dictEntry *de;
9858
996cb5f7 9859 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 9860
9861 /* Get the processed element (the oldest one) */
9862 lockThreadedIO();
1064ef87 9863 assert(listLength(server.io_processed) != 0);
/* On the first iteration only: compute how many jobs this call is allowed
 * to process, as a percentage (REDIS_MAX_COMPLETED_JOBS_PROCESSED) of the
 * jobs currently waiting in the processed queue, with a minimum of one. */
f6c0bba8 9864 if (toprocess == -1) {
9865 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9866 if (toprocess <= 0) toprocess = 1;
9867 }
b9bc0eef 9868 ln = listFirst(server.io_processed);
9869 j = ln->value;
9870 listDelNode(server.io_processed,ln);
9871 unlockThreadedIO();
9872 /* If this job is marked as canceled, just ignore it */
/* Note: a canceled job is freed without incrementing 'processed', so it
 * does not count against the 'toprocess' limit. */
9873 if (j->canceled) {
9874 freeIOJob(j);
9875 continue;
9876 }
9877 /* Post process it in the main thread, as there are things we
9878 * can do just here to avoid race conditions and/or invasive locks */
560db612 9879 redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr);
44262c58 9880 de = dictFind(j->db->dict,j->key->ptr);
e4ed181d 9881 redisAssert(de != NULL);
b9bc0eef 9882 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9883 redisDb *db;
560db612 9884 vmpointer *vp = dictGetEntryVal(de);
d5d55fc3 9885
b9bc0eef 9886 /* Key loaded, bring it at home */
560db612 9887 vmMarkPagesFree(vp->page,vp->usedpages);
b9bc0eef 9888 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
560db612 9889 (unsigned char*) j->key->ptr);
b9bc0eef 9890 server.vm_stats_swapped_objects--;
9891 server.vm_stats_swapins++;
d5d55fc3 9892 dictGetEntryVal(de) = j->val;
/* The dictionary entry now references the value too: take a reference,
 * since freeIOJob() below drops the one held by the job. */
9893 incrRefCount(j->val);
9894 db = j->db;
d5d55fc3 9895 /* Handle clients waiting for this key to be loaded. */
560db612 9896 handleClientsBlockedOnSwappedKey(db,j->key);
9897 freeIOJob(j);
9898 zfree(vp);
b9bc0eef 9899 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9900 /* Now we know the amount of pages required to swap this object.
9901 * Let's find some space for it, and queue this task again
9902 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 9903 if (!vmCanSwapOut() ||
9904 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9905 {
9906 /* Ooops... no space or we can't swap as there is
9907 * a fork()ed Redis trying to save stuff on disk. */
560db612 9908 j->val->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 9909 freeIOJob(j);
9910 } else {
c7df85a4 9911 /* Note that we need to mark this pages as used now,
9912 * if the job will be canceled, we'll mark them as freed
9913 * again. */
9914 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 9915 j->type = REDIS_IOJOB_DO_SWAP;
9916 lockThreadedIO();
9917 queueIOJob(j);
9918 unlockThreadedIO();
9919 }
9920 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
560db612 9921 vmpointer *vp;
b9bc0eef 9922
9923 /* Key swapped. We can finally free some memory. */
/* Diagnostic output printed on stdout just before the assertion below
 * fails. NOTE(review): 'j->id' is read through a vmpointer pointer and
 * 'j->val->ptr' is printed as a C string whatever the value type is;
 * confirm this is safe for every kind of value. */
560db612 9924 if (j->val->storage != REDIS_VM_SWAPPING) {
9925 vmpointer *vp = (vmpointer*) j->id;
9926 printf("storage: %d\n",vp->storage);
9927 printf("key->name: %s\n",(char*)j->key->ptr);
6c96ba7d 9928 printf("val: %p\n",(void*)j->val);
9929 printf("val->type: %d\n",j->val->type);
9930 printf("val->ptr: %s\n",(char*)j->val->ptr);
9931 }
560db612 9932 redisAssert(j->val->storage == REDIS_VM_SWAPPING);
9933 vp = createVmPointer(j->val->type);
9934 vp->page = j->page;
9935 vp->usedpages = j->pages;
9936 dictGetEntryVal(de) = vp;
e4ed181d 9937 /* Fix the storage otherwise decrRefCount will attempt to
9938 * remove the associated I/O job */
9939 j->val->storage = REDIS_VM_MEMORY;
560db612 9940 decrRefCount(j->val);
b9bc0eef 9941 redisLog(REDIS_DEBUG,
9942 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
560db612 9943 (unsigned char*) j->key->ptr,
b9bc0eef 9944 (unsigned long long) j->page, (unsigned long long) j->pages);
9945 server.vm_stats_swapped_objects++;
9946 server.vm_stats_swapouts++;
9947 freeIOJob(j);
f11b8647 9948 /* Put a few more swap requests in queue if we are still
9949 * out of memory */
b0d8747d 9950 if (trytoswap && vmCanSwapOut() &&
9951 zmalloc_used_memory() > server.vm_max_memory)
9952 {
f11b8647 9953 int more = 1;
/* Keep queueing swap jobs while the new jobs queue holds fewer jobs than
 * server.vm_max_threads. 'more' is sampled before each attempt, so one
 * extra job is queued after the limit is reached. */
9954 while(more) {
9955 lockThreadedIO();
9956 more = listLength(server.io_newjobs) <
9957 (unsigned) server.vm_max_threads;
9958 unlockThreadedIO();
9959 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 9960 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9961 trytoswap = 0;
9962 break;
9963 }
f11b8647 9964 }
9965 }
b9bc0eef 9966 }
c953f24b 9967 processed++;
f6c0bba8 9968 if (processed == toprocess) return;
996cb5f7 9969 }
/* The loop ended because read() did not return one byte. EAGAIN is not
 * reported (presumably the pipe is non-blocking and simply empty -- TODO
 * confirm where the pipe is created); any other error is logged. */
9970 if (retval < 0 && errno != EAGAIN) {
9971 redisLog(REDIS_WARNING,
9972 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9973 strerror(errno));
9974 }
9975}
9976
9977static void lockThreadedIO(void) {
9978 pthread_mutex_lock(&server.io_mutex);
9979}
9980
9981static void unlockThreadedIO(void) {
9982 pthread_mutex_unlock(&server.io_mutex);
9983}
9984
9985/* Remove the specified object from the threaded I/O queue if still not
9986 * processed, otherwise make sure to flag it as canceled. */
/* Details:
 *
 * 'o' must be in the REDIS_VM_LOADING or REDIS_VM_SWAPPING state. The three
 * job queues are scanned in order (new jobs, processing, processed) looking
 * for a job, not already canceled, whose 'id' is 'o':
 *
 * - found in io_newjobs: the job is freed and removed from the queue.
 * - found in io_processing: an I/O thread is working on it, so the lock is
 *   released, we sleep briefly and the whole search starts again.
 * - found in io_processed: the job is flagged as canceled and left in the
 *   queue.
 *
 * Pages reserved by a REDIS_IOJOB_DO_SWAP job are marked free again unless
 * the job is in the processing queue. Finally the storage field of 'o' is
 * reverted (LOADING -> SWAPPED, SWAPPING -> MEMORY).
 *
 * Not finding any matching job is considered a bug: the function prints a
 * message and fails an assertion. */
9987static void vmCancelThreadedIOJob(robj *o) {
9988 list *lists[3] = {
6c96ba7d 9989 server.io_newjobs, /* 0 */
9990 server.io_processing, /* 1 */
9991 server.io_processed /* 2 */
996cb5f7 9992 };
9993 int i;
9994
9995 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
/* Restart point used when the job is found in the processing queue: the
 * search is repeated from scratch with the lock acquired again. */
2e111efe 9996again:
996cb5f7 9997 lockThreadedIO();
560db612 9998 /* Search for a matching object in one of the queues */
996cb5f7 9999 for (i = 0; i < 3; i++) {
10000 listNode *ln;
c7df85a4 10001 listIter li;
996cb5f7 10002
c7df85a4 10003 listRewind(lists[i],&li);
10004 while ((ln = listNext(&li)) != NULL) {
996cb5f7 10005 iojob *job = ln->value;
10006
6c96ba7d 10007 if (job->canceled) continue; /* Skip this, already canceled. */
560db612 10008 if (job->id == o) {
dbc289ae 10009 redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n",
10010 (void*)job, (char*)job->key->ptr, job->type, i);
427a2153 10011 /* Mark the pages as free since the swap didn't happened
10012 * or happened but is now discarded. */
970e10bb 10013 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 10014 vmMarkPagesFree(job->page,job->pages);
10015 /* Cancel the job. It depends on the list the job is
10016 * living in. */
996cb5f7 10017 switch(i) {
10018 case 0: /* io_newjobs */
6c96ba7d 10019 /* If the job was yet not processed the best thing to do
996cb5f7 10020 * is to remove it from the queue at all */
6c96ba7d 10021 freeIOJob(job);
996cb5f7 10022 listDelNode(lists[i],ln);
10023 break;
10024 case 1: /* io_processing */
d5d55fc3 10025 /* Oh Shi- the thread is messing with the Job:
10026 *
10027 * Probably it's accessing the object if this is a
10028 * PREPARE_SWAP or DO_SWAP job.
10029 * If it's a LOAD job it may be reading from disk and
10030 * if we don't wait for the job to terminate before to
10031 * cancel it, maybe in a few microseconds data can be
10032 * corrupted in this pages. So the short story is:
10033 *
10034 * Better to wait for the job to move into the
10035 * next queue (processed)... */
10036
10037 /* We try again and again until the job is completed. */
10038 unlockThreadedIO();
10039 /* But let's wait some time for the I/O thread
10040 * to finish with this job. After all this condition
10041 * should be very rare. */
10042 usleep(1);
10043 goto again;
996cb5f7 10044 case 2: /* io_processed */
2e111efe 10045 /* The job was already processed, that's easy...
10046 * just mark it as canceled so that we'll ignore it
10047 * when processing completed jobs. */
996cb5f7 10048 job->canceled = 1;
10049 break;
10050 }
c7df85a4 10051 /* Finally we have to adjust the storage type of the object
10052 * in order to "UNDO" the operation. */
996cb5f7 10053 if (o->storage == REDIS_VM_LOADING)
10054 o->storage = REDIS_VM_SWAPPED;
10055 else if (o->storage == REDIS_VM_SWAPPING)
10056 o->storage = REDIS_VM_MEMORY;
10057 unlockThreadedIO();
e4ed181d 10058 redisLog(REDIS_DEBUG,"*** DONE");
996cb5f7 10059 return;
10060 }
10061 }
10062 }
10063 unlockThreadedIO();
/* No queue contains a live job for 'o' even though its storage state says
 * one should exist: report it and stop on the always-false assertion. */
560db612 10064 printf("Not found: %p\n", (void*)o);
10065 redisAssert(1 != 1); /* We should never reach this */
996cb5f7 10066}
10067
b9bc0eef 10068static void *IOThreadEntryPoint(void *arg) {
10069 iojob *j;
10070 listNode *ln;
10071 REDIS_NOTUSED(arg);
10072
10073 pthread_detach(pthread_self());
10074 while(1) {
10075 /* Get a new job to process */
10076 lockThreadedIO();
10077 if (listLength(server.io_newjobs) == 0) {
10078 /* No new jobs in queue, exit. */
9ebed7cf 10079 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
10080 (long) pthread_self());
b9bc0eef 10081 server.io_active_threads--;
10082 unlockThreadedIO();
10083 return NULL;
10084 }
10085 ln = listFirst(server.io_newjobs);
10086 j = ln->value;
10087 listDelNode(server.io_newjobs,ln);
10088 /* Add the job in the processing queue */
10089 j->thread = pthread_self();
10090 listAddNodeTail(server.io_processing,j);
10091 ln = listLast(server.io_processing); /* We use ln later to remove it */
10092 unlockThreadedIO();
9ebed7cf 10093 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
10094 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 10095
10096 /* Process the Job */
10097 if (j->type == REDIS_IOJOB_LOAD) {
560db612 10098 vmpointer *vp = (vmpointer*)j->id;
10099 j->val = vmReadObjectFromSwap(j->page,vp->vtype);
b9bc0eef 10100 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
10101 FILE *fp = fopen("/dev/null","w+");
10102 j->pages = rdbSavedObjectPages(j->val,fp);
10103 fclose(fp);
10104 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 10105 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
10106 j->canceled = 1;
b9bc0eef 10107 }
10108
10109 /* Done: insert the job into the processed queue */
9ebed7cf 10110 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
10111 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 10112 lockThreadedIO();
10113 listDelNode(server.io_processing,ln);
10114 listAddNodeTail(server.io_processed,j);
10115 unlockThreadedIO();
e0a62c7f 10116
b9bc0eef 10117 /* Signal the main thread there is new stuff to process */
10118 assert(write(server.io_ready_pipe_write,"x",1) == 1);
10119 }
10120 return NULL; /* never reached */
10121}
10122
10123static void spawnIOThread(void) {
10124 pthread_t thread;
478c2c6f 10125 sigset_t mask, omask;
a97b9060 10126 int err;
b9bc0eef 10127
478c2c6f 10128 sigemptyset(&mask);
10129 sigaddset(&mask,SIGCHLD);
10130 sigaddset(&mask,SIGHUP);
10131 sigaddset(&mask,SIGPIPE);
10132 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 10133 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
10134 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
10135 strerror(err));
10136 usleep(1000000);
10137 }
478c2c6f 10138 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 10139 server.io_active_threads++;
10140}
10141
4ee9488d 10142/* We need to wait for the last thread to exit before we are able to
10143 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 10144static void waitEmptyIOJobsQueue(void) {
4ee9488d 10145 while(1) {
76b7233a 10146 int io_processed_len;
10147
4ee9488d 10148 lockThreadedIO();
054e426d 10149 if (listLength(server.io_newjobs) == 0 &&
10150 listLength(server.io_processing) == 0 &&
10151 server.io_active_threads == 0)
10152 {
4ee9488d 10153 unlockThreadedIO();
10154 return;
10155 }
76b7233a 10156 /* While waiting for empty jobs queue condition we post-process some
10157 * finshed job, as I/O threads may be hanging trying to write against
10158 * the io_ready_pipe_write FD but there are so much pending jobs that
10159 * it's blocking. */
10160 io_processed_len = listLength(server.io_processed);
4ee9488d 10161 unlockThreadedIO();
76b7233a 10162 if (io_processed_len) {
10163 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
10164 usleep(1000); /* 1 millisecond */
10165 } else {
10166 usleep(10000); /* 10 milliseconds */
10167 }
4ee9488d 10168 }
10169}
10170
054e426d 10171static void vmReopenSwapFile(void) {
478c2c6f 10172 /* Note: we don't close the old one as we are in the child process
10173 * and don't want to mess at all with the original file object. */
054e426d 10174 server.vm_fp = fopen(server.vm_swap_file,"r+b");
10175 if (server.vm_fp == NULL) {
10176 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
10177 server.vm_swap_file);
478c2c6f 10178 _exit(1);
054e426d 10179 }
10180 server.vm_fd = fileno(server.vm_fp);
10181}
10182
b9bc0eef 10183/* This function must be called while with threaded IO locked */
10184static void queueIOJob(iojob *j) {
6c96ba7d 10185 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
10186 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 10187 listAddNodeTail(server.io_newjobs,j);
10188 if (server.io_active_threads < server.vm_max_threads)
10189 spawnIOThread();
10190}
10191
10192static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
10193 iojob *j;
e0a62c7f 10194
b9bc0eef 10195 j = zmalloc(sizeof(*j));
10196 j->type = REDIS_IOJOB_PREPARE_SWAP;
10197 j->db = db;
78ebe4c8 10198 j->key = key;
7dd8e7cf 10199 incrRefCount(key);
560db612 10200 j->id = j->val = val;
b9bc0eef 10201 incrRefCount(val);
10202 j->canceled = 0;
10203 j->thread = (pthread_t) -1;
560db612 10204 val->storage = REDIS_VM_SWAPPING;
b9bc0eef 10205
10206 lockThreadedIO();
10207 queueIOJob(j);
10208 unlockThreadedIO();
10209 return REDIS_OK;
10210}
10211
b0d8747d 10212/* ============ Virtual Memory - Blocking clients on missing keys =========== */
10213
d5d55fc3 10214/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
10215 * If there is not already a job loading the key, it is craeted.
10216 * The key is added to the io_keys list in the client structure, and also
10217 * in the hash table mapping swapped keys to waiting clients, that is,
10218 * server.io_waited_keys. */
10219static int waitForSwappedKey(redisClient *c, robj *key) {
10220 struct dictEntry *de;
10221 robj *o;
10222 list *l;
10223
10224 /* If the key does not exist or is already in RAM we don't need to
10225 * block the client at all. */
09241813 10226 de = dictFind(c->db->dict,key->ptr);
d5d55fc3 10227 if (de == NULL) return 0;
560db612 10228 o = dictGetEntryVal(de);
d5d55fc3 10229 if (o->storage == REDIS_VM_MEMORY) {
10230 return 0;
10231 } else if (o->storage == REDIS_VM_SWAPPING) {
10232 /* We were swapping the key, undo it! */
10233 vmCancelThreadedIOJob(o);
10234 return 0;
10235 }
e0a62c7f 10236
d5d55fc3 10237 /* OK: the key is either swapped, or being loaded just now. */
10238
10239 /* Add the key to the list of keys this client is waiting for.
10240 * This maps clients to keys they are waiting for. */
10241 listAddNodeTail(c->io_keys,key);
10242 incrRefCount(key);
10243
10244 /* Add the client to the swapped keys => clients waiting map. */
10245 de = dictFind(c->db->io_keys,key);
10246 if (de == NULL) {
10247 int retval;
10248
10249 /* For every key we take a list of clients blocked for it */
10250 l = listCreate();
10251 retval = dictAdd(c->db->io_keys,key,l);
10252 incrRefCount(key);
10253 assert(retval == DICT_OK);
10254 } else {
10255 l = dictGetEntryVal(de);
10256 }
10257 listAddNodeTail(l,c);
10258
10259 /* Are we already loading the key from disk? If not create a job */
10260 if (o->storage == REDIS_VM_SWAPPED) {
10261 iojob *j;
560db612 10262 vmpointer *vp = (vmpointer*)o;
d5d55fc3 10263
10264 o->storage = REDIS_VM_LOADING;
10265 j = zmalloc(sizeof(*j));
10266 j->type = REDIS_IOJOB_LOAD;
10267 j->db = c->db;
560db612 10268 j->id = (robj*)vp;
10269 j->key = key;
10270 incrRefCount(key);
10271 j->page = vp->page;
d5d55fc3 10272 j->val = NULL;
10273 j->canceled = 0;
10274 j->thread = (pthread_t) -1;
10275 lockThreadedIO();
10276 queueIOJob(j);
10277 unlockThreadedIO();
10278 }
10279 return 1;
10280}
10281
6f078746
PN
10282/* Preload keys for any command with first, last and step values for
10283 * the command keys prototype, as defined in the command table. */
10284static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10285 int j, last;
10286 if (cmd->vm_firstkey == 0) return;
10287 last = cmd->vm_lastkey;
10288 if (last < 0) last = argc+last;
10289 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
10290 redisAssert(j < argc);
10291 waitForSwappedKey(c,argv[j]);
10292 }
10293}
10294
5d373da9 10295/* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
739ba0d2
PN
10296 * Note that the number of keys to preload is user-defined, so we need to
10297 * apply a sanity check against argc. */
ca1788b5 10298static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
76583ea4 10299 int i, num;
ca1788b5 10300 REDIS_NOTUSED(cmd);
ca1788b5
PN
10301
10302 num = atoi(argv[2]->ptr);
739ba0d2 10303 if (num > (argc-3)) return;
76583ea4 10304 for (i = 0; i < num; i++) {
ca1788b5 10305 waitForSwappedKey(c,argv[3+i]);
76583ea4
PN
10306 }
10307}
10308
3805e04f
PN
10309/* Preload keys needed to execute the entire MULTI/EXEC block.
10310 *
10311 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
10312 * and will block the client when any command requires a swapped out value. */
10313static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10314 int i, margc;
10315 struct redisCommand *mcmd;
10316 robj **margv;
10317 REDIS_NOTUSED(cmd);
10318 REDIS_NOTUSED(argc);
10319 REDIS_NOTUSED(argv);
10320
10321 if (!(c->flags & REDIS_MULTI)) return;
10322 for (i = 0; i < c->mstate.count; i++) {
10323 mcmd = c->mstate.commands[i].cmd;
10324 margc = c->mstate.commands[i].argc;
10325 margv = c->mstate.commands[i].argv;
10326
10327 if (mcmd->vm_preload_proc != NULL) {
10328 mcmd->vm_preload_proc(c,mcmd,margc,margv);
10329 } else {
10330 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
10331 }
76583ea4
PN
10332 }
10333}
10334
b0d8747d 10335/* Is this client attempting to run a command against swapped keys?
d5d55fc3 10336 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 10337 *
d5d55fc3 10338 * The important idea about this function is that it can fail! If keys will
10339 * still be swapped when the client is resumed, this key lookups will
10340 * just block loading keys from disk. In practical terms this should only
10341 * happen with SORT BY command or if there is a bug in this function.
10342 *
10343 * Return 1 if the client is marked as blocked, 0 if the client can
10344 * continue as the keys it is going to access appear to be in memory. */
0a6f3f0f 10345static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
76583ea4 10346 if (cmd->vm_preload_proc != NULL) {
ca1788b5 10347 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
76583ea4 10348 } else {
6f078746 10349 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
76583ea4
PN
10350 }
10351
d5d55fc3 10352 /* If the client was blocked for at least one key, mark it as blocked. */
10353 if (listLength(c->io_keys)) {
10354 c->flags |= REDIS_IO_WAIT;
10355 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
10356 server.vm_blocked_clients++;
10357 return 1;
10358 } else {
10359 return 0;
10360 }
10361}
10362
10363/* Remove the 'key' from the list of blocked keys for a given client.
10364 *
10365 * The function returns 1 when there are no longer blocking keys after
10366 * the current one was removed (and the client can be unblocked). */
10367static int dontWaitForSwappedKey(redisClient *c, robj *key) {
10368 list *l;
10369 listNode *ln;
10370 listIter li;
10371 struct dictEntry *de;
10372
10373 /* Remove the key from the list of keys this client is waiting for. */
10374 listRewind(c->io_keys,&li);
10375 while ((ln = listNext(&li)) != NULL) {
bf028098 10376 if (equalStringObjects(ln->value,key)) {
d5d55fc3 10377 listDelNode(c->io_keys,ln);
10378 break;
10379 }
10380 }
10381 assert(ln != NULL);
10382
10383 /* Remove the client form the key => waiting clients map. */
10384 de = dictFind(c->db->io_keys,key);
10385 assert(de != NULL);
10386 l = dictGetEntryVal(de);
10387 ln = listSearchKey(l,c);
10388 assert(ln != NULL);
10389 listDelNode(l,ln);
10390 if (listLength(l) == 0)
10391 dictDelete(c->db->io_keys,key);
10392
10393 return listLength(c->io_keys) == 0;
10394}
10395
560db612 10396/* Every time we now a key was loaded back in memory, we handle clients
10397 * waiting for this key if any. */
d5d55fc3 10398static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
10399 struct dictEntry *de;
10400 list *l;
10401 listNode *ln;
10402 int len;
10403
10404 de = dictFind(db->io_keys,key);
10405 if (!de) return;
10406
10407 l = dictGetEntryVal(de);
10408 len = listLength(l);
10409 /* Note: we can't use something like while(listLength(l)) as the list
10410 * can be freed by the calling function when we remove the last element. */
10411 while (len--) {
10412 ln = listFirst(l);
10413 redisClient *c = ln->value;
10414
10415 if (dontWaitForSwappedKey(c,key)) {
10416 /* Put the client in the list of clients ready to go as we
10417 * loaded all the keys about it. */
10418 listAddNodeTail(server.io_ready_clients,c);
10419 }
10420 }
b0d8747d 10421}
b0d8747d 10422
500ece7c 10423/* =========================== Remote Configuration ========================= */
10424
10425static void configSetCommand(redisClient *c) {
10426 robj *o = getDecodedObject(c->argv[3]);
2e5eb04e 10427 long long ll;
10428
500ece7c 10429 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
10430 zfree(server.dbfilename);
10431 server.dbfilename = zstrdup(o->ptr);
10432 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
10433 zfree(server.requirepass);
10434 server.requirepass = zstrdup(o->ptr);
10435 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
10436 zfree(server.masterauth);
10437 server.masterauth = zstrdup(o->ptr);
10438 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
2e5eb04e 10439 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10440 ll < 0) goto badfmt;
10441 server.maxmemory = ll;
10442 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
10443 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10444 ll < 0 || ll > LONG_MAX) goto badfmt;
10445 server.maxidletime = ll;
1b677732 10446 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
10447 if (!strcasecmp(o->ptr,"no")) {
10448 server.appendfsync = APPENDFSYNC_NO;
10449 } else if (!strcasecmp(o->ptr,"everysec")) {
10450 server.appendfsync = APPENDFSYNC_EVERYSEC;
10451 } else if (!strcasecmp(o->ptr,"always")) {
10452 server.appendfsync = APPENDFSYNC_ALWAYS;
10453 } else {
10454 goto badfmt;
10455 }
38db9171 10456 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
10457 int yn = yesnotoi(o->ptr);
10458
10459 if (yn == -1) goto badfmt;
10460 server.no_appendfsync_on_rewrite = yn;
2e5eb04e 10461 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10462 int old = server.appendonly;
10463 int new = yesnotoi(o->ptr);
10464
10465 if (new == -1) goto badfmt;
10466 if (old != new) {
10467 if (new == 0) {
10468 stopAppendOnly();
10469 } else {
10470 if (startAppendOnly() == REDIS_ERR) {
10471 addReplySds(c,sdscatprintf(sdsempty(),
10472 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10473 decrRefCount(o);
10474 return;
10475 }
10476 }
10477 }
a34e0a25 10478 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10479 int vlen, j;
10480 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10481
10482 /* Perform sanity check before setting the new config:
10483 * - Even number of args
10484 * - Seconds >= 1, changes >= 0 */
10485 if (vlen & 1) {
10486 sdsfreesplitres(v,vlen);
10487 goto badfmt;
10488 }
10489 for (j = 0; j < vlen; j++) {
10490 char *eptr;
10491 long val;
10492
10493 val = strtoll(v[j], &eptr, 10);
10494 if (eptr[0] != '\0' ||
10495 ((j & 1) == 0 && val < 1) ||
10496 ((j & 1) == 1 && val < 0)) {
10497 sdsfreesplitres(v,vlen);
10498 goto badfmt;
10499 }
10500 }
10501 /* Finally set the new config */
10502 resetServerSaveParams();
10503 for (j = 0; j < vlen; j += 2) {
10504 time_t seconds;
10505 int changes;
10506
10507 seconds = strtoll(v[j],NULL,10);
10508 changes = strtoll(v[j+1],NULL,10);
10509 appendServerSaveParams(seconds, changes);
10510 }
10511 sdsfreesplitres(v,vlen);
500ece7c 10512 } else {
10513 addReplySds(c,sdscatprintf(sdsempty(),
10514 "-ERR not supported CONFIG parameter %s\r\n",
10515 (char*)c->argv[2]->ptr));
10516 decrRefCount(o);
10517 return;
10518 }
10519 decrRefCount(o);
10520 addReply(c,shared.ok);
a34e0a25 10521 return;
10522
10523badfmt: /* Bad format errors */
10524 addReplySds(c,sdscatprintf(sdsempty(),
10525 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10526 (char*)o->ptr,
10527 (char*)c->argv[2]->ptr));
10528 decrRefCount(o);
500ece7c 10529}
10530
10531static void configGetCommand(redisClient *c) {
10532 robj *o = getDecodedObject(c->argv[2]);
10533 robj *lenobj = createObject(REDIS_STRING,NULL);
10534 char *pattern = o->ptr;
10535 int matches = 0;
10536
10537 addReply(c,lenobj);
10538 decrRefCount(lenobj);
10539
10540 if (stringmatch(pattern,"dbfilename",0)) {
10541 addReplyBulkCString(c,"dbfilename");
10542 addReplyBulkCString(c,server.dbfilename);
10543 matches++;
10544 }
10545 if (stringmatch(pattern,"requirepass",0)) {
10546 addReplyBulkCString(c,"requirepass");
10547 addReplyBulkCString(c,server.requirepass);
10548 matches++;
10549 }
10550 if (stringmatch(pattern,"masterauth",0)) {
10551 addReplyBulkCString(c,"masterauth");
10552 addReplyBulkCString(c,server.masterauth);
10553 matches++;
10554 }
10555 if (stringmatch(pattern,"maxmemory",0)) {
10556 char buf[128];
10557
2e5eb04e 10558 ll2string(buf,128,server.maxmemory);
500ece7c 10559 addReplyBulkCString(c,"maxmemory");
10560 addReplyBulkCString(c,buf);
10561 matches++;
10562 }
2e5eb04e 10563 if (stringmatch(pattern,"timeout",0)) {
10564 char buf[128];
10565
10566 ll2string(buf,128,server.maxidletime);
10567 addReplyBulkCString(c,"timeout");
10568 addReplyBulkCString(c,buf);
10569 matches++;
10570 }
10571 if (stringmatch(pattern,"appendonly",0)) {
10572 addReplyBulkCString(c,"appendonly");
10573 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10574 matches++;
10575 }
38db9171 10576 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10577 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10578 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10579 matches++;
10580 }
1b677732 10581 if (stringmatch(pattern,"appendfsync",0)) {
10582 char *policy;
10583
10584 switch(server.appendfsync) {
10585 case APPENDFSYNC_NO: policy = "no"; break;
10586 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10587 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10588 default: policy = "unknown"; break; /* too harmless to panic */
10589 }
10590 addReplyBulkCString(c,"appendfsync");
10591 addReplyBulkCString(c,policy);
10592 matches++;
10593 }
a34e0a25 10594 if (stringmatch(pattern,"save",0)) {
10595 sds buf = sdsempty();
10596 int j;
10597
10598 for (j = 0; j < server.saveparamslen; j++) {
10599 buf = sdscatprintf(buf,"%ld %d",
10600 server.saveparams[j].seconds,
10601 server.saveparams[j].changes);
10602 if (j != server.saveparamslen-1)
10603 buf = sdscatlen(buf," ",1);
10604 }
10605 addReplyBulkCString(c,"save");
10606 addReplyBulkCString(c,buf);
10607 sdsfree(buf);
10608 matches++;
10609 }
500ece7c 10610 decrRefCount(o);
10611 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10612}
10613
10614static void configCommand(redisClient *c) {
10615 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10616 if (c->argc != 4) goto badarity;
10617 configSetCommand(c);
10618 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10619 if (c->argc != 3) goto badarity;
10620 configGetCommand(c);
10621 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10622 if (c->argc != 2) goto badarity;
10623 server.stat_numcommands = 0;
10624 server.stat_numconnections = 0;
10625 server.stat_expiredkeys = 0;
10626 server.stat_starttime = time(NULL);
10627 addReply(c,shared.ok);
10628 } else {
10629 addReplySds(c,sdscatprintf(sdsempty(),
10630 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10631 }
10632 return;
10633
10634badarity:
10635 addReplySds(c,sdscatprintf(sdsempty(),
10636 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10637 (char*) c->argv[1]->ptr));
10638}
10639
befec3cd 10640/* =========================== Pubsub implementation ======================== */
10641
ffc6b7f8 10642static void freePubsubPattern(void *p) {
10643 pubsubPattern *pat = p;
10644
10645 decrRefCount(pat->pattern);
10646 zfree(pat);
10647}
10648
10649static int listMatchPubsubPattern(void *a, void *b) {
10650 pubsubPattern *pa = a, *pb = b;
10651
10652 return (pa->client == pb->client) &&
bf028098 10653 (equalStringObjects(pa->pattern,pb->pattern));
ffc6b7f8 10654}
10655
10656/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10657 * 0 if the client was already subscribed to that channel. */
10658static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 10659 struct dictEntry *de;
10660 list *clients = NULL;
10661 int retval = 0;
10662
ffc6b7f8 10663 /* Add the channel to the client -> channels hash table */
10664 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 10665 retval = 1;
ffc6b7f8 10666 incrRefCount(channel);
10667 /* Add the client to the channel -> list of clients hash table */
10668 de = dictFind(server.pubsub_channels,channel);
befec3cd 10669 if (de == NULL) {
10670 clients = listCreate();
ffc6b7f8 10671 dictAdd(server.pubsub_channels,channel,clients);
10672 incrRefCount(channel);
befec3cd 10673 } else {
10674 clients = dictGetEntryVal(de);
10675 }
10676 listAddNodeTail(clients,c);
10677 }
10678 /* Notify the client */
10679 addReply(c,shared.mbulk3);
10680 addReply(c,shared.subscribebulk);
ffc6b7f8 10681 addReplyBulk(c,channel);
482b672d 10682 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 10683 return retval;
10684}
10685
ffc6b7f8 10686/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10687 * 0 if the client was not subscribed to the specified channel. */
10688static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 10689 struct dictEntry *de;
10690 list *clients;
10691 listNode *ln;
10692 int retval = 0;
10693
ffc6b7f8 10694 /* Remove the channel from the client -> channels hash table */
10695 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 10696 we have in the hash tables. Protect it... */
ffc6b7f8 10697 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 10698 retval = 1;
ffc6b7f8 10699 /* Remove the client from the channel -> clients list hash table */
10700 de = dictFind(server.pubsub_channels,channel);
befec3cd 10701 assert(de != NULL);
10702 clients = dictGetEntryVal(de);
10703 ln = listSearchKey(clients,c);
10704 assert(ln != NULL);
10705 listDelNode(clients,ln);
ff767a75 10706 if (listLength(clients) == 0) {
10707 /* Free the list and associated hash entry at all if this was
10708 * the latest client, so that it will be possible to abuse
ffc6b7f8 10709 * Redis PUBSUB creating millions of channels. */
10710 dictDelete(server.pubsub_channels,channel);
ff767a75 10711 }
befec3cd 10712 }
10713 /* Notify the client */
10714 if (notify) {
10715 addReply(c,shared.mbulk3);
10716 addReply(c,shared.unsubscribebulk);
ffc6b7f8 10717 addReplyBulk(c,channel);
482b672d 10718 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10719 listLength(c->pubsub_patterns));
10720
10721 }
10722 decrRefCount(channel); /* it is finally safe to release it */
10723 return retval;
10724}
10725
10726/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10727static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10728 int retval = 0;
10729
10730 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10731 retval = 1;
10732 pubsubPattern *pat;
10733 listAddNodeTail(c->pubsub_patterns,pattern);
10734 incrRefCount(pattern);
10735 pat = zmalloc(sizeof(*pat));
10736 pat->pattern = getDecodedObject(pattern);
10737 pat->client = c;
10738 listAddNodeTail(server.pubsub_patterns,pat);
10739 }
10740 /* Notify the client */
10741 addReply(c,shared.mbulk3);
10742 addReply(c,shared.psubscribebulk);
10743 addReplyBulk(c,pattern);
482b672d 10744 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
ffc6b7f8 10745 return retval;
10746}
10747
10748/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10749 * 0 if the client was not subscribed to the specified channel. */
10750static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10751 listNode *ln;
10752 pubsubPattern pat;
10753 int retval = 0;
10754
10755 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10756 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10757 retval = 1;
10758 listDelNode(c->pubsub_patterns,ln);
10759 pat.client = c;
10760 pat.pattern = pattern;
10761 ln = listSearchKey(server.pubsub_patterns,&pat);
10762 listDelNode(server.pubsub_patterns,ln);
10763 }
10764 /* Notify the client */
10765 if (notify) {
10766 addReply(c,shared.mbulk3);
10767 addReply(c,shared.punsubscribebulk);
10768 addReplyBulk(c,pattern);
482b672d 10769 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10770 listLength(c->pubsub_patterns));
befec3cd 10771 }
ffc6b7f8 10772 decrRefCount(pattern);
befec3cd 10773 return retval;
10774}
10775
ffc6b7f8 10776/* Unsubscribe from all the channels. Return the number of channels the
10777 * client was subscribed from. */
10778static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10779 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 10780 dictEntry *de;
10781 int count = 0;
10782
10783 while((de = dictNext(di)) != NULL) {
ffc6b7f8 10784 robj *channel = dictGetEntryKey(de);
befec3cd 10785
ffc6b7f8 10786 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 10787 }
10788 dictReleaseIterator(di);
10789 return count;
10790}
10791
ffc6b7f8 10792/* Unsubscribe from all the patterns. Return the number of patterns the
10793 * client was subscribed from. */
10794static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10795 listNode *ln;
10796 listIter li;
10797 int count = 0;
10798
10799 listRewind(c->pubsub_patterns,&li);
10800 while ((ln = listNext(&li)) != NULL) {
10801 robj *pattern = ln->value;
10802
10803 count += pubsubUnsubscribePattern(c,pattern,notify);
10804 }
10805 return count;
10806}
10807
befec3cd 10808/* Publish a message */
ffc6b7f8 10809static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 10810 int receivers = 0;
10811 struct dictEntry *de;
ffc6b7f8 10812 listNode *ln;
10813 listIter li;
befec3cd 10814
ffc6b7f8 10815 /* Send to clients listening for that channel */
10816 de = dictFind(server.pubsub_channels,channel);
befec3cd 10817 if (de) {
10818 list *list = dictGetEntryVal(de);
10819 listNode *ln;
10820 listIter li;
10821
10822 listRewind(list,&li);
10823 while ((ln = listNext(&li)) != NULL) {
10824 redisClient *c = ln->value;
10825
10826 addReply(c,shared.mbulk3);
10827 addReply(c,shared.messagebulk);
ffc6b7f8 10828 addReplyBulk(c,channel);
befec3cd 10829 addReplyBulk(c,message);
10830 receivers++;
10831 }
10832 }
ffc6b7f8 10833 /* Send to clients listening to matching channels */
10834 if (listLength(server.pubsub_patterns)) {
10835 listRewind(server.pubsub_patterns,&li);
10836 channel = getDecodedObject(channel);
10837 while ((ln = listNext(&li)) != NULL) {
10838 pubsubPattern *pat = ln->value;
10839
10840 if (stringmatchlen((char*)pat->pattern->ptr,
10841 sdslen(pat->pattern->ptr),
10842 (char*)channel->ptr,
10843 sdslen(channel->ptr),0)) {
c8d0ea0e 10844 addReply(pat->client,shared.mbulk4);
10845 addReply(pat->client,shared.pmessagebulk);
10846 addReplyBulk(pat->client,pat->pattern);
ffc6b7f8 10847 addReplyBulk(pat->client,channel);
10848 addReplyBulk(pat->client,message);
10849 receivers++;
10850 }
10851 }
10852 decrRefCount(channel);
10853 }
befec3cd 10854 return receivers;
10855}
10856
10857static void subscribeCommand(redisClient *c) {
10858 int j;
10859
10860 for (j = 1; j < c->argc; j++)
ffc6b7f8 10861 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 10862}
10863
10864static void unsubscribeCommand(redisClient *c) {
10865 if (c->argc == 1) {
ffc6b7f8 10866 pubsubUnsubscribeAllChannels(c,1);
10867 return;
10868 } else {
10869 int j;
10870
10871 for (j = 1; j < c->argc; j++)
10872 pubsubUnsubscribeChannel(c,c->argv[j],1);
10873 }
10874}
10875
10876static void psubscribeCommand(redisClient *c) {
10877 int j;
10878
10879 for (j = 1; j < c->argc; j++)
10880 pubsubSubscribePattern(c,c->argv[j]);
10881}
10882
10883static void punsubscribeCommand(redisClient *c) {
10884 if (c->argc == 1) {
10885 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 10886 return;
10887 } else {
10888 int j;
10889
10890 for (j = 1; j < c->argc; j++)
ffc6b7f8 10891 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 10892 }
10893}
10894
10895static void publishCommand(redisClient *c) {
10896 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
482b672d 10897 addReplyLongLong(c,receivers);
befec3cd 10898}
10899
37ab76c9 10900/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10901 *
10902 * The implementation uses a per-DB hash table mapping keys to list of clients
10903 * WATCHing those keys, so that given a key that is going to be modified
10904 * we can mark all the associated clients as dirty.
10905 *
10906 * Also every client contains a list of WATCHed keys so that's possible to
10907 * un-watch such keys when the client is freed or when UNWATCH is called. */
10908
/* Entry of the client->watched_keys list. A key name alone is not enough
 * to identify a key in Redis: the DB is needed as well, so both are stored
 * together. */
typedef struct watchedKey {
    robj *key;      /* Name of the watched key. */
    redisDb *db;    /* DB the client had selected when it issued WATCH. */
} watchedKey;
10916
10917/* Watch for the specified key */
10918static void watchForKey(redisClient *c, robj *key) {
10919 list *clients = NULL;
10920 listIter li;
10921 listNode *ln;
10922 watchedKey *wk;
10923
10924 /* Check if we are already watching for this key */
10925 listRewind(c->watched_keys,&li);
10926 while((ln = listNext(&li))) {
10927 wk = listNodeValue(ln);
10928 if (wk->db == c->db && equalStringObjects(key,wk->key))
10929 return; /* Key already watched */
10930 }
10931 /* This key is not already watched in this DB. Let's add it */
10932 clients = dictFetchValue(c->db->watched_keys,key);
10933 if (!clients) {
10934 clients = listCreate();
10935 dictAdd(c->db->watched_keys,key,clients);
10936 incrRefCount(key);
10937 }
10938 listAddNodeTail(clients,c);
10939 /* Add the new key to the lits of keys watched by this client */
10940 wk = zmalloc(sizeof(*wk));
10941 wk->key = key;
10942 wk->db = c->db;
10943 incrRefCount(key);
10944 listAddNodeTail(c->watched_keys,wk);
10945}
10946
10947/* Unwatch all the keys watched by this client. To clean the EXEC dirty
10948 * flag is up to the caller. */
10949static void unwatchAllKeys(redisClient *c) {
10950 listIter li;
10951 listNode *ln;
10952
10953 if (listLength(c->watched_keys) == 0) return;
10954 listRewind(c->watched_keys,&li);
10955 while((ln = listNext(&li))) {
10956 list *clients;
10957 watchedKey *wk;
10958
10959 /* Lookup the watched key -> clients list and remove the client
10960 * from the list */
10961 wk = listNodeValue(ln);
10962 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10963 assert(clients != NULL);
10964 listDelNode(clients,listSearchKey(clients,c));
10965 /* Kill the entry at all if this was the only client */
10966 if (listLength(clients) == 0)
10967 dictDelete(wk->db->watched_keys, wk->key);
10968 /* Remove this watched key from the client->watched list */
10969 listDelNode(c->watched_keys,ln);
10970 decrRefCount(wk->key);
10971 zfree(wk);
10972 }
10973}
10974
ca3f830b 10975/* "Touch" a key, so that if this key is being WATCHed by some client the
37ab76c9 10976 * next EXEC will fail. */
10977static void touchWatchedKey(redisDb *db, robj *key) {
10978 list *clients;
10979 listIter li;
10980 listNode *ln;
10981
10982 if (dictSize(db->watched_keys) == 0) return;
10983 clients = dictFetchValue(db->watched_keys, key);
10984 if (!clients) return;
10985
10986 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10987 /* Check if we are already watching for this key */
10988 listRewind(clients,&li);
10989 while((ln = listNext(&li))) {
10990 redisClient *c = listNodeValue(ln);
10991
10992 c->flags |= REDIS_DIRTY_CAS;
10993 }
10994}
10995
9b30e1a2 10996/* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10997 * flush but will be deleted as effect of the flushing operation should
10998 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10999 * a FLUSHALL operation (all the DBs flushed). */
11000static void touchWatchedKeysOnFlush(int dbid) {
11001 listIter li1, li2;
11002 listNode *ln;
11003
11004 /* For every client, check all the waited keys */
11005 listRewind(server.clients,&li1);
11006 while((ln = listNext(&li1))) {
11007 redisClient *c = listNodeValue(ln);
11008 listRewind(c->watched_keys,&li2);
11009 while((ln = listNext(&li2))) {
11010 watchedKey *wk = listNodeValue(ln);
11011
11012 /* For every watched key matching the specified DB, if the
11013 * key exists, mark the client as dirty, as the key will be
11014 * removed. */
11015 if (dbid == -1 || wk->db->id == dbid) {
09241813 11016 if (dictFind(wk->db->dict, wk->key->ptr) != NULL)
9b30e1a2 11017 c->flags |= REDIS_DIRTY_CAS;
11018 }
11019 }
11020 }
11021}
11022
37ab76c9 11023static void watchCommand(redisClient *c) {
11024 int j;
11025
6531c94d 11026 if (c->flags & REDIS_MULTI) {
11027 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
11028 return;
11029 }
37ab76c9 11030 for (j = 1; j < c->argc; j++)
11031 watchForKey(c,c->argv[j]);
11032 addReply(c,shared.ok);
11033}
11034
11035static void unwatchCommand(redisClient *c) {
11036 unwatchAllKeys(c);
11037 c->flags &= (~REDIS_DIRTY_CAS);
11038 addReply(c,shared.ok);
11039}
11040
7f957c92 11041/* ================================= Debugging ============================== */
11042
ba798261 11043/* Compute the sha1 of string at 's' with 'len' bytes long.
11044 * The SHA1 is then xored againt the string pointed by digest.
11045 * Since xor is commutative, this operation is used in order to
11046 * "add" digests relative to unordered elements.
11047 *
11048 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
11049static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
11050 SHA1_CTX ctx;
11051 unsigned char hash[20], *s = ptr;
11052 int j;
11053
11054 SHA1Init(&ctx);
11055 SHA1Update(&ctx,s,len);
11056 SHA1Final(hash,&ctx);
11057
11058 for (j = 0; j < 20; j++)
11059 digest[j] ^= hash[j];
11060}
11061
11062static void xorObjectDigest(unsigned char *digest, robj *o) {
11063 o = getDecodedObject(o);
11064 xorDigest(digest,o->ptr,sdslen(o->ptr));
11065 decrRefCount(o);
11066}
11067
11068/* This function instead of just computing the SHA1 and xoring it
11069 * against diget, also perform the digest of "digest" itself and
11070 * replace the old value with the new one.
11071 *
11072 * So the final digest will be:
11073 *
11074 * digest = SHA1(digest xor SHA1(data))
11075 *
11076 * This function is used every time we want to preserve the order so
11077 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
11078 *
11079 * Also note that mixdigest("foo") followed by mixdigest("bar")
11080 * will lead to a different digest compared to "fo", "obar".
11081 */
11082static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
11083 SHA1_CTX ctx;
11084 char *s = ptr;
11085
11086 xorDigest(digest,s,len);
11087 SHA1Init(&ctx);
11088 SHA1Update(&ctx,digest,20);
11089 SHA1Final(digest,&ctx);
11090}
11091
11092static void mixObjectDigest(unsigned char *digest, robj *o) {
11093 o = getDecodedObject(o);
11094 mixDigest(digest,o->ptr,sdslen(o->ptr));
11095 decrRefCount(o);
11096}
11097
11098/* Compute the dataset digest. Since keys, sets elements, hashes elements
11099 * are not ordered, we use a trick: every aggregate digest is the xor
11100 * of the digests of their elements. This way the order will not change
11101 * the result. For list instead we use a feedback entering the output digest
11102 * as input in order to ensure that a different ordered list will result in
11103 * a different digest. */
11104static void computeDatasetDigest(unsigned char *final) {
11105 unsigned char digest[20];
11106 char buf[128];
11107 dictIterator *di = NULL;
11108 dictEntry *de;
11109 int j;
11110 uint32_t aux;
11111
11112 memset(final,0,20); /* Start with a clean result */
11113
11114 for (j = 0; j < server.dbnum; j++) {
11115 redisDb *db = server.db+j;
11116
11117 if (dictSize(db->dict) == 0) continue;
11118 di = dictGetIterator(db->dict);
11119
11120 /* hash the DB id, so the same dataset moved in a different
11121 * DB will lead to a different digest */
11122 aux = htonl(j);
11123 mixDigest(final,&aux,sizeof(aux));
11124
11125 /* Iterate this DB writing every entry */
11126 while((de = dictNext(di)) != NULL) {
09241813 11127 sds key;
11128 robj *keyobj, *o;
ba798261 11129 time_t expiretime;
11130
11131 memset(digest,0,20); /* This key-val digest */
11132 key = dictGetEntryKey(de);
09241813 11133 keyobj = createStringObject(key,sdslen(key));
11134
11135 mixDigest(digest,key,sdslen(key));
11136
11137 /* Make sure the key is loaded if VM is active */
11138 o = lookupKeyRead(db,keyobj);
cbae1d34 11139
ba798261 11140 aux = htonl(o->type);
11141 mixDigest(digest,&aux,sizeof(aux));
09241813 11142 expiretime = getExpire(db,keyobj);
ba798261 11143
11144 /* Save the key and associated value */
11145 if (o->type == REDIS_STRING) {
11146 mixObjectDigest(digest,o);
11147 } else if (o->type == REDIS_LIST) {
003f0840
PN
11148 listTypeIterator *li = listTypeInitIterator(o,0,REDIS_TAIL);
11149 listTypeEntry entry;
11150 while(listTypeNext(li,&entry)) {
11151 robj *eleobj = listTypeGet(&entry);
ba798261 11152 mixObjectDigest(digest,eleobj);
dc845730 11153 decrRefCount(eleobj);
ba798261 11154 }
003f0840 11155 listTypeReleaseIterator(li);
ba798261 11156 } else if (o->type == REDIS_SET) {
11157 dict *set = o->ptr;
11158 dictIterator *di = dictGetIterator(set);
11159 dictEntry *de;
11160
11161 while((de = dictNext(di)) != NULL) {
11162 robj *eleobj = dictGetEntryKey(de);
11163
11164 xorObjectDigest(digest,eleobj);
11165 }
11166 dictReleaseIterator(di);
11167 } else if (o->type == REDIS_ZSET) {
11168 zset *zs = o->ptr;
11169 dictIterator *di = dictGetIterator(zs->dict);
11170 dictEntry *de;
11171
11172 while((de = dictNext(di)) != NULL) {
11173 robj *eleobj = dictGetEntryKey(de);
11174 double *score = dictGetEntryVal(de);
11175 unsigned char eledigest[20];
11176
11177 snprintf(buf,sizeof(buf),"%.17g",*score);
11178 memset(eledigest,0,20);
11179 mixObjectDigest(eledigest,eleobj);
11180 mixDigest(eledigest,buf,strlen(buf));
11181 xorDigest(digest,eledigest,20);
11182 }
11183 dictReleaseIterator(di);
11184 } else if (o->type == REDIS_HASH) {
d1578a33 11185 hashTypeIterator *hi;
ba798261 11186 robj *obj;
11187
d1578a33
PN
11188 hi = hashTypeInitIterator(o);
11189 while (hashTypeNext(hi) != REDIS_ERR) {
ba798261 11190 unsigned char eledigest[20];
11191
11192 memset(eledigest,0,20);
d1578a33 11193 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
ba798261 11194 mixObjectDigest(eledigest,obj);
11195 decrRefCount(obj);
d1578a33 11196 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
ba798261 11197 mixObjectDigest(eledigest,obj);
11198 decrRefCount(obj);
11199 xorDigest(digest,eledigest,20);
11200 }
d1578a33 11201 hashTypeReleaseIterator(hi);
ba798261 11202 } else {
11203 redisPanic("Unknown object type");
11204 }
ba798261 11205 /* If the key has an expire, add it to the mix */
11206 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
11207 /* We can finally xor the key-val digest to the final digest */
11208 xorDigest(final,digest,20);
09241813 11209 decrRefCount(keyobj);
ba798261 11210 }
11211 dictReleaseIterator(di);
11212 }
11213}
11214
7f957c92 11215static void debugCommand(redisClient *c) {
11216 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
11217 *((char*)-1) = 'x';
210e29f7 11218 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
11219 if (rdbSave(server.dbfilename) != REDIS_OK) {
11220 addReply(c,shared.err);
11221 return;
11222 }
11223 emptyDb();
11224 if (rdbLoad(server.dbfilename) != REDIS_OK) {
11225 addReply(c,shared.err);
11226 return;
11227 }
11228 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
11229 addReply(c,shared.ok);
71c2b467 11230 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
11231 emptyDb();
11232 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
11233 addReply(c,shared.err);
11234 return;
11235 }
11236 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
11237 addReply(c,shared.ok);
333298da 11238 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
09241813 11239 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11240 robj *val;
333298da 11241
11242 if (!de) {
11243 addReply(c,shared.nokeyerr);
11244 return;
11245 }
333298da 11246 val = dictGetEntryVal(de);
560db612 11247 if (!server.vm_enabled || (val->storage == REDIS_VM_MEMORY ||
11248 val->storage == REDIS_VM_SWAPPING)) {
07efaf74 11249 char *strenc;
11250 char buf[128];
11251
11252 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
11253 strenc = strencoding[val->encoding];
11254 } else {
11255 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
11256 strenc = buf;
11257 }
ace06542 11258 addReplySds(c,sdscatprintf(sdsempty(),
09241813 11259 "+Value at:%p refcount:%d "
07efaf74 11260 "encoding:%s serializedlength:%lld\r\n",
09241813 11261 (void*)val, val->refcount,
07efaf74 11262 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 11263 } else {
560db612 11264 vmpointer *vp = (vmpointer*) val;
ace06542 11265 addReplySds(c,sdscatprintf(sdsempty(),
09241813 11266 "+Value swapped at: page %llu "
ace06542 11267 "using %llu pages\r\n",
09241813 11268 (unsigned long long) vp->page,
560db612 11269 (unsigned long long) vp->usedpages));
ace06542 11270 }
78ebe4c8 11271 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
11272 lookupKeyRead(c->db,c->argv[2]);
11273 addReply(c,shared.ok);
7d30035d 11274 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
09241813 11275 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11276 robj *val;
560db612 11277 vmpointer *vp;
7d30035d 11278
11279 if (!server.vm_enabled) {
11280 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
11281 return;
11282 }
11283 if (!de) {
11284 addReply(c,shared.nokeyerr);
11285 return;
11286 }
7d30035d 11287 val = dictGetEntryVal(de);
4ef8de8a 11288 /* Swap it */
560db612 11289 if (val->storage != REDIS_VM_MEMORY) {
7d30035d 11290 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
560db612 11291 } else if (val->refcount != 1) {
11292 addReplySds(c,sdsnew("-ERR Object is shared\r\n"));
11293 } else if ((vp = vmSwapObjectBlocking(val)) != NULL) {
11294 dictGetEntryVal(de) = vp;
7d30035d 11295 addReply(c,shared.ok);
11296 } else {
11297 addReply(c,shared.err);
11298 }
59305dc7 11299 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
11300 long keys, j;
11301 robj *key, *val;
11302 char buf[128];
11303
11304 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
11305 return;
11306 for (j = 0; j < keys; j++) {
11307 snprintf(buf,sizeof(buf),"key:%lu",j);
11308 key = createStringObject(buf,strlen(buf));
11309 if (lookupKeyRead(c->db,key) != NULL) {
11310 decrRefCount(key);
11311 continue;
11312 }
11313 snprintf(buf,sizeof(buf),"value:%lu",j);
11314 val = createStringObject(buf,strlen(buf));
09241813 11315 dbAdd(c->db,key,val);
11316 decrRefCount(key);
59305dc7 11317 }
11318 addReply(c,shared.ok);
ba798261 11319 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
11320 unsigned char digest[20];
11321 sds d = sdsnew("+");
11322 int j;
11323
11324 computeDatasetDigest(digest);
11325 for (j = 0; j < 20; j++)
11326 d = sdscatprintf(d, "%02x",digest[j]);
11327
11328 d = sdscatlen(d,"\r\n",2);
11329 addReplySds(c,d);
7f957c92 11330 } else {
333298da 11331 addReplySds(c,sdsnew(
bdcb92f2 11332 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 11333 }
11334}
56906eef 11335
6c96ba7d 11336static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 11337 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
fdfb02e7 11338 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
dfc5e96c 11339#ifdef HAVE_BACKTRACE
11340 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11341 *((char*)-1) = 'x';
11342#endif
11343}
11344
c651fd9e 11345static void _redisPanic(char *msg, char *file, int line) {
11346 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
17772754 11347 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
c651fd9e 11348#ifdef HAVE_BACKTRACE
11349 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11350 *((char*)-1) = 'x';
11351#endif
11352}
11353
bcfc686d 11354/* =================================== Main! ================================ */
56906eef 11355
bcfc686d 11356#ifdef __linux__
/* Read the current value of /proc/sys/vm/overcommit_memory.
 *
 * Returns the integer found in the file, or -1 when the file can't be
 * opened, can't be read, or does not start with a number. The last case
 * matters because a parse failure must not be reported as 0: that is the
 * value the caller warns about. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *end;
    long val;

    if (!fp) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    val = strtol(buf,&end,10);
    if (end == buf) return -1; /* no digits at all */
    return (int)val;
}
11370
11371void linuxOvercommitMemoryWarning(void) {
11372 if (linuxOvercommitMemoryValue() == 0) {
7ccd2d0a 11373 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
bcfc686d 11374 }
11375}
11376#endif /* __linux__ */
11377
11378static void daemonize(void) {
11379 int fd;
11380 FILE *fp;
11381
11382 if (fork() != 0) exit(0); /* parent exits */
11383 setsid(); /* create a new session */
11384
11385 /* Every output goes to /dev/null. If Redis is daemonized but
11386 * the 'logfile' is set to 'stdout' in the configuration file
11387 * it will not log at all. */
11388 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
11389 dup2(fd, STDIN_FILENO);
11390 dup2(fd, STDOUT_FILENO);
11391 dup2(fd, STDERR_FILENO);
11392 if (fd > STDERR_FILENO) close(fd);
11393 }
11394 /* Try to write the pid file */
11395 fp = fopen(server.pidfile,"w");
11396 if (fp) {
11397 fprintf(fp,"%d\n",getpid());
11398 fclose(fp);
56906eef 11399 }
56906eef 11400}
11401
42ab0172 11402static void version() {
8a3b0d2d 11403 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
11404 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
42ab0172
AO
11405 exit(0);
11406}
11407
723fb69b
AO
/* Print the command line synopsis on stderr and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
11413
bcfc686d 11414int main(int argc, char **argv) {
9651a787 11415 time_t start;
11416
bcfc686d 11417 initServerConfig();
1a132bbc 11418 sortCommandTable();
bcfc686d 11419 if (argc == 2) {
44efe66e 11420 if (strcmp(argv[1], "-v") == 0 ||
11421 strcmp(argv[1], "--version") == 0) version();
11422 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 11423 resetServerSaveParams();
11424 loadServerConfig(argv[1]);
723fb69b
AO
11425 } else if ((argc > 2)) {
11426 usage();
bcfc686d 11427 } else {
11428 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
11429 }
bcfc686d 11430 if (server.daemonize) daemonize();
71c54b21 11431 initServer();
bcfc686d 11432 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
11433#ifdef __linux__
11434 linuxOvercommitMemoryWarning();
11435#endif
9651a787 11436 start = time(NULL);
bcfc686d 11437 if (server.appendonly) {
11438 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 11439 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 11440 } else {
11441 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 11442 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 11443 }
bcfc686d 11444 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 11445 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 11446 aeMain(server.el);
11447 aeDeleteEventLoop(server.el);
11448 return 0;
11449}
11450
11451/* ============================= Backtrace support ========================= */
11452
11453#ifdef HAVE_BACKTRACE
11454static char *findFuncName(void *pointer, unsigned long *offset);
11455
56906eef 11456static void *getMcontextEip(ucontext_t *uc) {
11457#if defined(__FreeBSD__)
11458 return (void*) uc->uc_mcontext.mc_eip;
11459#elif defined(__dietlibc__)
11460 return (void*) uc->uc_mcontext.eip;
06db1f50 11461#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 11462 #if __x86_64__
11463 return (void*) uc->uc_mcontext->__ss.__rip;
11464 #else
56906eef 11465 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 11466 #endif
06db1f50 11467#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 11468 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 11469 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 11470 #else
11471 return (void*) uc->uc_mcontext->__ss.__eip;
e0a62c7f 11472 #endif
54bac49d 11473#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 11474 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 11475#elif defined(__ia64__) /* Linux IA64 */
11476 return (void*) uc->uc_mcontext.sc_ip;
11477#else
11478 return NULL;
56906eef 11479#endif
11480}
11481
11482static void segvHandler(int sig, siginfo_t *info, void *secret) {
11483 void *trace[100];
11484 char **messages = NULL;
11485 int i, trace_size = 0;
11486 unsigned long offset=0;
56906eef 11487 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 11488 sds infostring;
56906eef 11489 REDIS_NOTUSED(info);
11490
11491 redisLog(REDIS_WARNING,
11492 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 11493 infostring = genRedisInfoString();
11494 redisLog(REDIS_WARNING, "%s",infostring);
11495 /* It's not safe to sdsfree() the returned string under memory
11496 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 11497
56906eef 11498 trace_size = backtrace(trace, 100);
de96dbfe 11499 /* overwrite sigaction with caller's address */
b91cf5ef 11500 if (getMcontextEip(uc) != NULL) {
11501 trace[1] = getMcontextEip(uc);
11502 }
56906eef 11503 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 11504
d76412d1 11505 for (i=1; i<trace_size; ++i) {
56906eef 11506 char *fn = findFuncName(trace[i], &offset), *p;
11507
11508 p = strchr(messages[i],'+');
11509 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11510 redisLog(REDIS_WARNING,"%s", messages[i]);
11511 } else {
11512 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11513 }
11514 }
b177fd30 11515 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 11516 _exit(0);
fe3bbfbe 11517}
56906eef 11518
fab43727 11519static void sigtermHandler(int sig) {
11520 REDIS_NOTUSED(sig);
b58ba105 11521
fab43727 11522 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11523 server.shutdown_asap = 1;
b58ba105
AM
11524}
11525
56906eef 11526static void setupSigSegvAction(void) {
11527 struct sigaction act;
11528
11529 sigemptyset (&act.sa_mask);
11530 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11531 * is used. Otherwise, sa_handler is used */
11532 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11533 act.sa_sigaction = segvHandler;
11534 sigaction (SIGSEGV, &act, NULL);
11535 sigaction (SIGBUS, &act, NULL);
12fea928 11536 sigaction (SIGFPE, &act, NULL);
11537 sigaction (SIGILL, &act, NULL);
11538 sigaction (SIGBUS, &act, NULL);
b58ba105
AM
11539
11540 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
fab43727 11541 act.sa_handler = sigtermHandler;
b58ba105 11542 sigaction (SIGTERM, &act, NULL);
e65fdc78 11543 return;
56906eef 11544}
e65fdc78 11545
bcfc686d 11546#include "staticsymbols.h"
11547/* This function try to convert a pointer into a function name. It's used in
11548 * oreder to provide a backtrace under segmentation fault that's able to
11549 * display functions declared as static (otherwise the backtrace is useless). */
11550static char *findFuncName(void *pointer, unsigned long *offset){
11551 int i, ret = -1;
11552 unsigned long off, minoff = 0;
ed9b544e 11553
bcfc686d 11554 /* Try to match against the Symbol with the smallest offset */
11555 for (i=0; symsTable[i].pointer; i++) {
11556 unsigned long lp = (unsigned long) pointer;
0bc03378 11557
bcfc686d 11558 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11559 off=lp-symsTable[i].pointer;
11560 if (ret < 0 || off < minoff) {
11561 minoff=off;
11562 ret=i;
11563 }
11564 }
0bc03378 11565 }
bcfc686d 11566 if (ret == -1) return NULL;
11567 *offset = minoff;
11568 return symsTable[ret].name;
0bc03378 11569}
bcfc686d 11570#else /* HAVE_BACKTRACE */
/* Without HAVE_BACKTRACE there is no crash handler to install, so this
 * version intentionally does nothing. */
static void setupSigSegvAction(void) {
    /* Nothing to do. */
}
bcfc686d 11573#endif /* HAVE_BACKTRACE */
0bc03378 11574
ed9b544e 11575
ed9b544e 11576
bcfc686d 11577/* The End */
11578
11579
ed9b544e 11580