]> git.saurik.com Git - redis.git/blame - redis.c
Debug message was printing stuff that are sometimes not initialized/valid
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
9005896c 30#define REDIS_VERSION "2.1.1"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
40#include <signal.h>
fbf9bcdb 41
42#ifdef HAVE_BACKTRACE
c9468bcf 43#include <execinfo.h>
44#include <ucontext.h>
fbf9bcdb 45#endif /* HAVE_BACKTRACE */
46
ed9b544e 47#include <sys/wait.h>
48#include <errno.h>
49#include <assert.h>
50#include <ctype.h>
51#include <stdarg.h>
52#include <inttypes.h>
53#include <arpa/inet.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#include <sys/time.h>
57#include <sys/resource.h>
2895e862 58#include <sys/uio.h>
f78fd11b 59#include <limits.h>
fb82e75c 60#include <float.h>
a7866db6 61#include <math.h>
92f8e882 62#include <pthread.h>
0bc1b2f6 63
64#if defined(__sun)
5043dff3 65#include "solarisfixes.h"
66#endif
ed9b544e 67
c9468bcf 68#include "redis.h"
ed9b544e 69#include "ae.h" /* Event driven programming library */
70#include "sds.h" /* Dynamic safe strings */
71#include "anet.h" /* Networking the easy way */
72#include "dict.h" /* Hash tables */
73#include "adlist.h" /* Linked lists */
74#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 75#include "lzf.h" /* LZF compression library */
76#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
ba798261 77#include "zipmap.h" /* Compact dictionary-alike data structure */
78#include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
5436146c 79#include "release.h" /* Release and/or git repository information */
ed9b544e 80
/* Error codes */
#define REDIS_OK 0
#define REDIS_ERR -1

/* Static server configuration */
#define REDIS_SERVERPORT 6379 /* TCP port */
#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
#define REDIS_IOBUF_LEN 1024
#define REDIS_LOADBUF_LEN 1024
#define REDIS_STATIC_ARGS 8
#define REDIS_DEFAULT_DBNUM 16
#define REDIS_CONFIGLINE_MAX 1024
#define REDIS_OBJFREELIST_MAX 0 /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
#define REDIS_WRITEV_THRESHOLD 3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT 256

/* Hash table parameters */
#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */

/* Command flags */
#define REDIS_CMD_BULK 1 /* Bulk write command */
#define REDIS_CMD_INLINE 2 /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
 * this flag will return an error when the 'maxmemory' option is set in the
 * config file and the server is using more than maxmemory bytes of memory.
 * In short these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM 4
#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 116
/* Object types */
#define REDIS_STRING 0
#define REDIS_LIST 1
#define REDIS_SET 2
#define REDIS_ZSET 3
#define REDIS_HASH 4
#define REDIS_VMPOINTER 8

/* Objects encoding. Some kind of objects like Strings and Hashes can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of these values for this object. */
#define REDIS_ENCODING_RAW 0 /* Raw representation */
#define REDIS_ENCODING_INT 1 /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
#define REDIS_ENCODING_HT 3 /* Encoded as a hash table */

/* Printable encoding names. The entries are listed in the same order as the
 * REDIS_ENCODING_* values above, so the table can be indexed by them. */
static char* strencoding[] = {
    "raw", "int", "zipmap", "hashtable"
};
136
/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME 253
#define REDIS_SELECTDB 254
#define REDIS_EOF 255

/* Defines related to the dump file format. To store 32 bits lengths for short
 * keys requires a lot of space, so we check the most significant 2 bits of
 * the first byte to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: specially encoded object will follow. The six bits
 *           number specify the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte, most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN 0
#define REDIS_RDB_14BITLEN 1
#define REDIS_RDB_32BITLEN 2
#define REDIS_RDB_ENCVAL 3
#define REDIS_RDB_LENERR UINT_MAX

/* When a length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * accordingly to the following defines: */
#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF */
a4d1ba9a 168
/* Virtual memory object->where field. */
#define REDIS_VM_MEMORY 0 /* The object is on memory */
#define REDIS_VM_SWAPPED 1 /* The object is on disk */
#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */

/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES 65536
#define REDIS_VM_MAX_RANDOM_JUMP 4096
#define REDIS_VM_MAX_THREADS 32
#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 186
/* Client flags */
#define REDIS_SLAVE 1 /* This client is a slave server */
#define REDIS_MASTER 2 /* This client is a master server */
#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI 8 /* This client is in a MULTI context */
#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
#define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */

/* Slave replication state - slave side */
#define REDIS_REPL_NONE 0 /* No active replication */
#define REDIS_REPL_CONNECT 1 /* Must connect to master */
#define REDIS_REPL_CONNECTED 2 /* Connected to master */

/* Slave replication state - from the point of view of master
 * Note that in SEND_BULK and ONLINE state the slave receives new updates
 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
 * to start the next background saving in order to send updates to it. */
#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */

/* List related stuff */
#define REDIS_HEAD 0
#define REDIS_TAIL 1

/* Sort operations */
#define REDIS_SORT_GET 0
#define REDIS_SORT_ASC 1
#define REDIS_SORT_DESC 2
#define REDIS_SORTKEY_MAX 1024

/* Log levels */
#define REDIS_DEBUG 0
#define REDIS_VERBOSE 1
#define REDIS_NOTICE 2
#define REDIS_WARNING 3

/* Anti-warning macro... */
#define REDIS_NOTUSED(V) ((void) V)

#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */

/* Append only defines */
#define APPENDFSYNC_NO 0
#define APPENDFSYNC_ALWAYS 1
#define APPENDFSYNC_EVERYSEC 2

/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
240
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): when _e is false, calls _redisAssert() with the
 * stringified expression, file and line, then terminates with _exit(1).
 * redisPanic(_e): unconditionally calls _redisPanic() with the stringified
 * argument, file and line, then terminates with _exit(1).
 *
 * Both expansions are fully parenthesized so that they behave as a single
 * expression wherever they are used (e.g. as an operand of ?: or ||). */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
dfc5e96c 246
/*================================= Data types ============================== */

/* A redis object, that is a type able to hold a string / list / set */

/* The actual Redis Object.
 *
 * The four bit-fields add up to 32 bits (4 + 2 + 4 + 22).
 *
 * NOTE(review): an older comment at the end of this struct described extra
 * "VM fields" allocated only when VM is active; no such fields are declared
 * here any more, so that description no longer applies. */
typedef struct redisObject {
    unsigned type:4;
    unsigned storage:2;     /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
    unsigned encoding:4;
    unsigned lru:22;        /* lru time (relative to server.lruclock) */
    int refcount;
    void *ptr;
} robj;
264
/* The VM pointer structure - identifies an object in the swap file.
 *
 * This object is stored in place of the value
 * object in the main key->value hash table representing a database.
 * Note that the first fields (type, storage) are the same as the redisObject
 * structure so that vmPointer structures can be accessed even when cast
 * as redisObject structures.
 *
 * This is useful as we don't know if a value object is or not on disk, but we
 * are always free of accessing obj->storage to check this. For vmPointer
 * structures "type" is set to REDIS_VMPOINTER (even if without this field
 * it is still possible to check the kind of object from the value of
 * 'storage'). */
typedef struct vmPointer {
    unsigned type:4;
    unsigned storage:2;     /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
    unsigned notused:26;
    unsigned int vtype;     /* type of the object stored in the swap file */
    off_t page;             /* the page at which the object is stored on disk */
    off_t usedpages;        /* number of pages used on disk */
} vmpointer;
285
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is taken near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the object itself (not a pointer to it), _ptr is stored in its
 * 'ptr' field. The object becomes a raw-encoded in-memory string with a
 * reference count of 1.
 *
 * The expansion deliberately does NOT end with a semicolon: the caller's own
 * ';' completes the statement, which keeps the macro safe inside an
 * un-braced if/else. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    (_var).storage = REDIS_VM_MEMORY; \
} while(0)
297
3305306f 298typedef struct redisDb {
4409877e 299 dict *dict; /* The keyspace for this DB */
300 dict *expires; /* Timeout of keys with a timeout set */
37ab76c9 301 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 302 dict *io_keys; /* Keys with clients waiting for VM I/O */
37ab76c9 303 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
3305306f 304 int id;
305} redisDb;
306
6e469882 307/* Client MULTI/EXEC state */
308typedef struct multiCmd {
309 robj **argv;
310 int argc;
311 struct redisCommand *cmd;
312} multiCmd;
313
314typedef struct multiState {
315 multiCmd *commands; /* Array of MULTI commands */
316 int count; /* Total number of MULTI commands */
317} multiState;
318
ed9b544e 319/* With multiplexing we need to take per-clinet state.
320 * Clients are taken in a liked list. */
321typedef struct redisClient {
322 int fd;
3305306f 323 redisDb *db;
ed9b544e 324 int dictid;
325 sds querybuf;
e8a74421 326 robj **argv, **mbargv;
327 int argc, mbargc;
40d224a9 328 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 329 int multibulk; /* multi bulk command format active */
ed9b544e 330 list *reply;
331 int sentlen;
332 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 333 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 334 int slaveseldb; /* slave selected db, if this client is a slave */
335 int authenticated; /* when requirepass is non-NULL */
336 int replstate; /* replication state if this is a slave */
337 int repldbfd; /* replication DB file descriptor */
6e469882 338 long repldboff; /* replication DB file offset */
40d224a9 339 off_t repldbsize; /* replication DB file size */
6e469882 340 multiState mstate; /* MULTI/EXEC state */
37ab76c9 341 robj **blocking_keys; /* The key we are waiting to terminate a blocking
4409877e 342 * operation such as BLPOP. Otherwise NULL. */
37ab76c9 343 int blocking_keys_num; /* Number of blocking keys */
4409877e 344 time_t blockingto; /* Blocking operation timeout. If UNIX current time
345 * is >= blockingto then the operation timed out. */
92f8e882 346 list *io_keys; /* Keys this client is waiting to be loaded from the
347 * swap file in order to continue. */
37ab76c9 348 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
ffc6b7f8 349 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
350 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 351} redisClient;
352
/* One save point: a (seconds, changes) pair. */
struct saveparam {
    time_t seconds;
    int changes;
};
357
358/* Global server state structure */
359struct redisServer {
360 int port;
361 int fd;
3305306f 362 redisDb *db;
ed9b544e 363 long long dirty; /* changes to DB from the last save */
364 list *clients;
87eca727 365 list *slaves, *monitors;
ed9b544e 366 char neterr[ANET_ERR_LEN];
367 aeEventLoop *el;
368 int cronloops; /* number of times the cron function run */
369 list *objfreelist; /* A list of freed objects to avoid malloc() */
370 time_t lastsave; /* Unix time of last save succeeede */
ed9b544e 371 /* Fields used only for stats */
372 time_t stat_starttime; /* server start time */
373 long long stat_numcommands; /* number of processed commands */
374 long long stat_numconnections; /* number of connections received */
2a6a2ed1 375 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 376 /* Configuration */
377 int verbosity;
378 int glueoutputbuf;
379 int maxidletime;
380 int dbnum;
381 int daemonize;
44b38ef4 382 int appendonly;
48f0308a 383 int appendfsync;
38db9171 384 int no_appendfsync_on_rewrite;
fab43727 385 int shutdown_asap;
48f0308a 386 time_t lastfsync;
44b38ef4 387 int appendfd;
388 int appendseldb;
ed329fcf 389 char *pidfile;
9f3c422c 390 pid_t bgsavechildpid;
9d65a1bb 391 pid_t bgrewritechildpid;
392 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
28ed1f33 393 sds aofbuf; /* AOF buffer, written before entering the event loop */
ed9b544e 394 struct saveparam *saveparams;
395 int saveparamslen;
396 char *logfile;
397 char *bindaddr;
398 char *dbfilename;
44b38ef4 399 char *appendfilename;
abcb223e 400 char *requirepass;
121f70cf 401 int rdbcompression;
8ca3e9d1 402 int activerehashing;
ed9b544e 403 /* Replication related */
404 int isslave;
d0ccebcf 405 char *masterauth;
ed9b544e 406 char *masterhost;
407 int masterport;
40d224a9 408 redisClient *master; /* client that is master for this slave */
ed9b544e 409 int replstate;
285add55 410 unsigned int maxclients;
4ef8de8a 411 unsigned long long maxmemory;
d5d55fc3 412 unsigned int blpop_blocked_clients;
413 unsigned int vm_blocked_clients;
ed9b544e 414 /* Sort parameters - qsort_r() is only available under BSD so we
415 * have to take this state global, in order to pass it to sortCompare() */
416 int sort_desc;
417 int sort_alpha;
418 int sort_bypattern;
75680a3c 419 /* Virtual memory configuration */
420 int vm_enabled;
054e426d 421 char *vm_swap_file;
75680a3c 422 off_t vm_page_size;
423 off_t vm_pages;
4ef8de8a 424 unsigned long long vm_max_memory;
cbba7dd7 425 /* Hashes config */
426 size_t hash_max_zipmap_entries;
427 size_t hash_max_zipmap_value;
75680a3c 428 /* Virtual memory state */
429 FILE *vm_fp;
430 int vm_fd;
431 off_t vm_next_page; /* Next probably empty page */
432 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 433 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 434 time_t unixtime; /* Unix time sampled every second. */
92f8e882 435 /* Virtual memory I/O threads stuff */
92f8e882 436 /* An I/O thread process an element taken from the io_jobs queue and
996cb5f7 437 * put the result of the operation in the io_done list. While the
438 * job is being processed, it's put on io_processing queue. */
439 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
440 list *io_processing; /* List of VM I/O jobs being processed */
441 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 442 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 443 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 444 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
445 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 446 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 447 int io_active_threads; /* Number of running I/O threads */
448 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 449 /* Our main thread is blocked on the event loop, locking for sockets ready
450 * to be read or written, so when a threaded I/O operation is ready to be
451 * processed by the main thread, the I/O thread will use a unix pipe to
452 * awake the main thread. The followings are the two pipe FDs. */
453 int io_ready_pipe_read;
454 int io_ready_pipe_write;
7d98e08c 455 /* Virtual memory stats */
456 unsigned long long vm_stats_used_pages;
457 unsigned long long vm_stats_swapped_objects;
458 unsigned long long vm_stats_swapouts;
459 unsigned long long vm_stats_swapins;
befec3cd 460 /* Pubsub */
ffc6b7f8 461 dict *pubsub_channels; /* Map channels to list of subscribed clients */
462 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 463 /* Misc */
b9bc0eef 464 FILE *devnull;
560db612 465 unsigned lruclock:22; /* clock incrementing every minute, for LRU */
466 unsigned lruclock_padding:10;
ed9b544e 467};
468
ffc6b7f8 469typedef struct pubsubPattern {
470 redisClient *client;
471 robj *pattern;
472} pubsubPattern;
473
ed9b544e 474typedef void redisCommandProc(redisClient *c);
ca1788b5 475typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
ed9b544e 476struct redisCommand {
477 char *name;
478 redisCommandProc *proc;
479 int arity;
480 int flags;
76583ea4
PN
481 /* Use a function to determine which keys need to be loaded
482 * in the background prior to executing this command. Takes precedence
483 * over vm_firstkey and others, ignored when NULL */
ca1788b5 484 redisVmPreloadProc *vm_preload_proc;
7c775e09 485 /* What keys should be loaded in background when calling this command? */
486 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
487 int vm_lastkey; /* THe last argument that's a key */
488 int vm_keystep; /* The step between first and last key */
ed9b544e 489};
490
/* A (name, address) pair; the address is kept as an unsigned long. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
495
ed9b544e 496typedef struct _redisSortObject {
497 robj *obj;
498 union {
499 double score;
500 robj *cmpobj;
501 } u;
502} redisSortObject;
503
504typedef struct _redisSortOperation {
505 int type;
506 robj *pattern;
507} redisSortOperation;
508
6b47e12e 509/* ZSETs use a specialized version of Skiplists */
510
511typedef struct zskiplistNode {
512 struct zskiplistNode **forward;
e3870fab 513 struct zskiplistNode *backward;
912b9165 514 unsigned int *span;
6b47e12e 515 double score;
516 robj *obj;
517} zskiplistNode;
518
519typedef struct zskiplist {
e3870fab 520 struct zskiplistNode *header, *tail;
d13f767c 521 unsigned long length;
6b47e12e 522 int level;
523} zskiplist;
524
1812e024 525typedef struct zset {
526 dict *dict;
6b47e12e 527 zskiplist *zsl;
1812e024 528} zset;
529
6b47e12e 530/* Our shared "common" objects */
531
05df7621 532#define REDIS_SHARED_INTEGERS 10000
ed9b544e 533struct sharedObjectsStruct {
c937aa89 534 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 535 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 536 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
537 *outofrangeerr, *plus,
ed9b544e 538 *select0, *select1, *select2, *select3, *select4,
befec3cd 539 *select5, *select6, *select7, *select8, *select9,
c8d0ea0e 540 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
541 *mbulk4, *psubscribebulk, *punsubscribebulk,
542 *integers[REDIS_SHARED_INTEGERS];
ed9b544e 543} shared;
544
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero, R_PosInf, R_NegInf, R_Nan;
550
92f8e882 551/* VM threaded I/O request message */
b9bc0eef 552#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
553#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
554#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 555typedef struct iojob {
996cb5f7 556 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 557 redisDb *db;/* Redis database */
92f8e882 558 robj *key; /* This I/O request is about swapping this key */
560db612 559 robj *id; /* Unique identifier of this job:
560 this is the object to swap for REDIS_IOREQ_*_SWAP, or the
561 vmpointer objct for REDIS_IOREQ_LOAD. */
b9bc0eef 562 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
92f8e882 563 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
564 off_t page; /* Swap page where to read/write the object */
248ea310 565 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 566 int canceled; /* True if this command was canceled by blocking side of VM */
567 pthread_t thread; /* ID of the thread processing this entry */
568} iojob;
92f8e882 569
ed9b544e 570/*================================ Prototypes =============================== */
571
572static void freeStringObject(robj *o);
573static void freeListObject(robj *o);
574static void freeSetObject(robj *o);
575static void decrRefCount(void *o);
576static robj *createObject(int type, void *ptr);
577static void freeClient(redisClient *c);
f78fd11b 578static int rdbLoad(char *filename);
ed9b544e 579static void addReply(redisClient *c, robj *obj);
580static void addReplySds(redisClient *c, sds s);
581static void incrRefCount(robj *o);
f78fd11b 582static int rdbSaveBackground(char *filename);
ed9b544e 583static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 584static robj *dupStringObject(robj *o);
248ea310 585static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
dd142b9c 586static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
28ed1f33 587static void flushAppendOnlyFile(void);
44b38ef4 588static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 589static int syncWithMaster(void);
05df7621 590static robj *tryObjectEncoding(robj *o);
9d65a1bb 591static robj *getDecodedObject(robj *o);
3305306f 592static int removeExpire(redisDb *db, robj *key);
593static int expireIfNeeded(redisDb *db, robj *key);
594static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 595static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 596static int deleteKey(redisDb *db, robj *key);
bb32ede5 597static time_t getExpire(redisDb *db, robj *key);
598static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 599static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 600static void freeMemoryIfNeeded(void);
de96dbfe 601static int processCommand(redisClient *c);
56906eef 602static void setupSigSegvAction(void);
a3b21203 603static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 604static void aofRemoveTempFile(pid_t childpid);
0ea663ea 605static size_t stringObjectLen(robj *o);
638e42ac 606static void processInputBuffer(redisClient *c);
6b47e12e 607static zskiplist *zslCreate(void);
fd8ccf44 608static void zslFree(zskiplist *zsl);
2b59cfdf 609static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 610static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 611static void initClientMultiState(redisClient *c);
612static void freeClientMultiState(redisClient *c);
613static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 614static void unblockClientWaitingData(redisClient *c);
4409877e 615static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 616static void vmInit(void);
a35ddf12 617static void vmMarkPagesFree(off_t page, off_t count);
560db612 618static robj *vmLoadObject(robj *o);
619static robj *vmPreviewObject(robj *o);
a69a0c9c 620static int vmSwapOneObjectBlocking(void);
621static int vmSwapOneObjectThreaded(void);
7e69548d 622static int vmCanSwapOut(void);
a5819310 623static int tryFreeOneObjectFromFreelist(void);
996cb5f7 624static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
625static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
626static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 627static void lockThreadedIO(void);
628static void unlockThreadedIO(void);
629static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
630static void freeIOJob(iojob *j);
631static void queueIOJob(iojob *j);
a5819310 632static int vmWriteObjectOnSwap(robj *o, off_t page);
633static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 634static void waitEmptyIOJobsQueue(void);
635static void vmReopenSwapFile(void);
970e10bb 636static int vmFreePage(off_t page);
ca1788b5 637static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
3805e04f 638static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
0a6f3f0f 639static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
d5d55fc3 640static int dontWaitForSwappedKey(redisClient *c, robj *key);
641static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
642static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
643static struct redisCommand *lookupCommand(char *name);
644static void call(redisClient *c, struct redisCommand *cmd);
645static void resetClient(redisClient *c);
ada386b2 646static void convertToRealHash(robj *o);
ffc6b7f8 647static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
648static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
649static void freePubsubPattern(void *p);
650static int listMatchPubsubPattern(void *a, void *b);
651static int compareStringObjects(robj *a, robj *b);
bf028098 652static int equalStringObjects(robj *a, robj *b);
befec3cd 653static void usage();
8f63ddca 654static int rewriteAppendOnlyFileBackground(void);
560db612 655static vmpointer *vmSwapObjectBlocking(robj *val);
fab43727 656static int prepareForShutdown();
37ab76c9 657static void touchWatchedKey(redisDb *db, robj *key);
9b30e1a2 658static void touchWatchedKeysOnFlush(int dbid);
37ab76c9 659static void unwatchAllKeys(redisClient *c);
ed9b544e 660
abcb223e 661static void authCommand(redisClient *c);
ed9b544e 662static void pingCommand(redisClient *c);
663static void echoCommand(redisClient *c);
664static void setCommand(redisClient *c);
665static void setnxCommand(redisClient *c);
526d00a5 666static void setexCommand(redisClient *c);
ed9b544e 667static void getCommand(redisClient *c);
668static void delCommand(redisClient *c);
669static void existsCommand(redisClient *c);
670static void incrCommand(redisClient *c);
671static void decrCommand(redisClient *c);
672static void incrbyCommand(redisClient *c);
673static void decrbyCommand(redisClient *c);
674static void selectCommand(redisClient *c);
675static void randomkeyCommand(redisClient *c);
676static void keysCommand(redisClient *c);
677static void dbsizeCommand(redisClient *c);
678static void lastsaveCommand(redisClient *c);
679static void saveCommand(redisClient *c);
680static void bgsaveCommand(redisClient *c);
9d65a1bb 681static void bgrewriteaofCommand(redisClient *c);
ed9b544e 682static void shutdownCommand(redisClient *c);
683static void moveCommand(redisClient *c);
684static void renameCommand(redisClient *c);
685static void renamenxCommand(redisClient *c);
686static void lpushCommand(redisClient *c);
687static void rpushCommand(redisClient *c);
688static void lpopCommand(redisClient *c);
689static void rpopCommand(redisClient *c);
690static void llenCommand(redisClient *c);
691static void lindexCommand(redisClient *c);
692static void lrangeCommand(redisClient *c);
693static void ltrimCommand(redisClient *c);
694static void typeCommand(redisClient *c);
695static void lsetCommand(redisClient *c);
696static void saddCommand(redisClient *c);
697static void sremCommand(redisClient *c);
a4460ef4 698static void smoveCommand(redisClient *c);
ed9b544e 699static void sismemberCommand(redisClient *c);
700static void scardCommand(redisClient *c);
12fea928 701static void spopCommand(redisClient *c);
2abb95a9 702static void srandmemberCommand(redisClient *c);
ed9b544e 703static void sinterCommand(redisClient *c);
704static void sinterstoreCommand(redisClient *c);
40d224a9 705static void sunionCommand(redisClient *c);
706static void sunionstoreCommand(redisClient *c);
f4f56e1d 707static void sdiffCommand(redisClient *c);
708static void sdiffstoreCommand(redisClient *c);
ed9b544e 709static void syncCommand(redisClient *c);
710static void flushdbCommand(redisClient *c);
711static void flushallCommand(redisClient *c);
712static void sortCommand(redisClient *c);
713static void lremCommand(redisClient *c);
0f5f7e9a 714static void rpoplpushcommand(redisClient *c);
ed9b544e 715static void infoCommand(redisClient *c);
70003d28 716static void mgetCommand(redisClient *c);
87eca727 717static void monitorCommand(redisClient *c);
3305306f 718static void expireCommand(redisClient *c);
802e8373 719static void expireatCommand(redisClient *c);
f6b141c5 720static void getsetCommand(redisClient *c);
fd88489a 721static void ttlCommand(redisClient *c);
321b0e13 722static void slaveofCommand(redisClient *c);
7f957c92 723static void debugCommand(redisClient *c);
f6b141c5 724static void msetCommand(redisClient *c);
725static void msetnxCommand(redisClient *c);
fd8ccf44 726static void zaddCommand(redisClient *c);
7db723ad 727static void zincrbyCommand(redisClient *c);
cc812361 728static void zrangeCommand(redisClient *c);
50c55df5 729static void zrangebyscoreCommand(redisClient *c);
f44dd428 730static void zcountCommand(redisClient *c);
e3870fab 731static void zrevrangeCommand(redisClient *c);
3c41331e 732static void zcardCommand(redisClient *c);
1b7106e7 733static void zremCommand(redisClient *c);
6e333bbe 734static void zscoreCommand(redisClient *c);
1807985b 735static void zremrangebyscoreCommand(redisClient *c);
6e469882 736static void multiCommand(redisClient *c);
737static void execCommand(redisClient *c);
18b6cb76 738static void discardCommand(redisClient *c);
4409877e 739static void blpopCommand(redisClient *c);
740static void brpopCommand(redisClient *c);
4b00bebd 741static void appendCommand(redisClient *c);
39191553 742static void substrCommand(redisClient *c);
69d95c3e 743static void zrankCommand(redisClient *c);
798d9e55 744static void zrevrankCommand(redisClient *c);
978c2c94 745static void hsetCommand(redisClient *c);
1f1c7695 746static void hsetnxCommand(redisClient *c);
978c2c94 747static void hgetCommand(redisClient *c);
09aeb579
PN
748static void hmsetCommand(redisClient *c);
749static void hmgetCommand(redisClient *c);
07efaf74 750static void hdelCommand(redisClient *c);
92b27fe9 751static void hlenCommand(redisClient *c);
9212eafd 752static void zremrangebyrankCommand(redisClient *c);
5d373da9 753static void zunionstoreCommand(redisClient *c);
754static void zinterstoreCommand(redisClient *c);
78409a0f 755static void hkeysCommand(redisClient *c);
756static void hvalsCommand(redisClient *c);
757static void hgetallCommand(redisClient *c);
a86f14b1 758static void hexistsCommand(redisClient *c);
500ece7c 759static void configCommand(redisClient *c);
01426b05 760static void hincrbyCommand(redisClient *c);
befec3cd 761static void subscribeCommand(redisClient *c);
762static void unsubscribeCommand(redisClient *c);
ffc6b7f8 763static void psubscribeCommand(redisClient *c);
764static void punsubscribeCommand(redisClient *c);
befec3cd 765static void publishCommand(redisClient *c);
37ab76c9 766static void watchCommand(redisClient *c);
767static void unwatchCommand(redisClient *c);
f6b141c5 768
ed9b544e 769/*================================= Globals ================================= */
770
771/* Global vars */
772static struct redisServer server; /* server global state */
1a132bbc 773static struct redisCommand *commandTable;
/* Table of the commands known to the server. Each entry lists, in order:
 * the command name, the handler function, an arity value, a set of
 * REDIS_CMD_* flags, an optional hook function (only zunionstore,
 * zinterstore and exec set one here) and three integers.
 *
 * NOTE(review): struct redisCommand is declared outside this section.
 * Presumably a negative arity means "at least N arguments" and the three
 * trailing integers are the first key, last key and key step used to
 * preload swapped keys -- confirm against the struct definition.
 *
 * Things worth knowing that are visible in the table itself: "smembers" is
 * served by sinterCommand, and the rpoplpush handler is spelled
 * rpoplpushcommand (lowercase 'c'), matching its prototype. */
1a132bbc 774static struct redisCommand readonlyCommandTable[] = {
76583ea4
PN
775 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
777 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
526d00a5 778 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
779 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
780 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
781 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
782 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
783 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
784 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
785 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
786 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
787 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
788 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
790 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
791 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
792 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
793 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
794 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
795 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
796 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
797 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
798 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
799 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
800 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
801 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
802 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
803 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
806 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
807 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
808 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
809 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
810 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
811 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
812 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
813 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
814 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
815 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
816 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
817 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
5d373da9 818 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
819 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
820 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
821 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
822 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
823 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
824 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
825 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
826 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
827 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
828 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
1f1c7695 829 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 830 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
d33278d1 831 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 832 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
01426b05 833 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
834 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
835 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
836 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
837 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
838 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 839 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
840 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
841 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
842 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
843 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
844 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
845 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
846 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
847 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
848 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
849 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
850 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
851 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
852 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
853 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
854 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
855 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
856 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
857 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
858 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
859 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
860 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
861 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
862 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
863 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
3805e04f 864 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
865 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
866 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
867 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
868 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
869 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
870 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
871 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
872 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
873 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
874 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 875 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 876 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
877 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 878 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
879 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 880 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
37ab76c9 881 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
d55d5c5d 882 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
ed9b544e 883};
bcfc686d 884
ed9b544e 885/*============================ Utility functions ============================ */
886
/* Glob-style pattern matching over length-delimited buffers.
 *
 * Supported syntax:
 *   *        matches any sequence of bytes (including the empty one)
 *   ?        matches exactly one byte
 *   [abc]    matches one byte out of the listed ones
 *   [^abc]   matches one byte that is NOT listed
 *   [a-z]    matches one byte in the inclusive range (bounds may be given
 *            in either order)
 *   \x       matches the byte x literally
 *
 * pattern/patternLen and string/stringLen describe the two buffers; neither
 * needs to be nul terminated because no byte at or past the given length is
 * ever read. If nocase is non-zero literals and ranges are compared through
 * tolower(). Bytes are handled as unsigned char, so ranges involving bytes
 * >= 0x80 are ordered by their unsigned value.
 *
 * Returns 1 on match, 0 otherwise. An unterminated class ("[abc") is
 * accepted as if it were closed at the end of the pattern. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of stars, never looking past the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one byte of the string, so it can
             * never match when the string is exhausted. */
            if (stringLen == 0)
                return 0; /* no match */

            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * pattern++ below leaves us exactly at the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    /* Escaped byte inside the class (needs a byte after
                     * the backslash, otherwise it is a literal '\'). */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = (unsigned char)pattern[0];
                    int end = (unsigned char)pattern[2];
                    int ch = (unsigned char)string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        ch = tolower(ch);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (ch >= start && ch <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs a byte to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String consumed: only trailing stars may still match. */
            while(patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}

/* Convenience wrapper around stringmatchlen() for nul terminated C strings. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    return stringmatchlen(pattern,(int)strlen(pattern),
                          string,(int)strlen(string),nocase);
}
1013
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb",&err) will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-', a run of decimal digits and an optional
 * case insensitive unit:
 *
 *   (none) or b -> 1
 *   k  -> 1000            kb -> 1024
 *   m  -> 1000*1000       mb -> 1024*1024
 *   g  -> 1000*1000*1000  gb -> 1024*1024*1024
 *
 * If err is not NULL, *err is set to 1 on parsing error and to 0 otherwise.
 * Errors are:
 *   - unknown unit: the number is still returned, using a multiplier of 1;
 *   - more digits than the internal buffer can hold: LLONG_MAX is returned;
 *   - the result does not fit a long long: LLONG_MAX (or LLONG_MIN for
 *     negative input) is returned instead of overflowing. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. The cast matters: passing a
     * negative char to isdigit() is undefined behavior. */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    errno = 0;
    val = strtoll(buf,NULL,10);
    /* mul is always >= 1, so the divisions below are safe. Saturate instead
     * of performing a signed multiplication that would overflow. */
    if (errno == ERANGE || val > LLONG_MAX/mul || val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return (val < 0) ? LLONG_MIN : LLONG_MAX;
    }
    return val*mul;
}
1060
/* Convert a long long into its decimal string representation.
 *
 * s is the destination buffer and len its total size in bytes. The output
 * is always nul terminated when len > 0; if the buffer is too small only
 * the leading len-1 characters of the number are stored.
 *
 * Returns the number of characters stored in s, not counting the nul term
 * (0 if len is 0, in which case s is not touched). */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Negate in the unsigned domain: -value would overflow for LLONG_MIN. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1084
56906eef 1085static void redisLog(int level, const char *fmt, ...) {
ed9b544e 1086 va_list ap;
1087 FILE *fp;
1088
1089 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1090 if (!fp) return;
1091
1092 va_start(ap, fmt);
1093 if (level >= server.verbosity) {
6766f45e 1094 char *c = ".-*#";
1904ecc1 1095 char buf[64];
1096 time_t now;
1097
1098 now = time(NULL);
6c9385e0 1099 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 1100 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 1101 vfprintf(fp, fmt, ap);
1102 fprintf(fp,"\n");
1103 fflush(fp);
1104 }
1105 va_end(ap);
1106
1107 if (server.logfile) fclose(fp);
1108}
1109
1110/*====================== Hash table type implementation ==================== */
1111
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1115
/* Dict destructor callback that simply releases the entry with zfree().
 * The privdata argument is ignored. */
static void dictVanillaFree(void *privdata, void *val)
{
    zfree(val);
    DICT_NOTUSED(privdata);
}
1121
4409877e 1122static void dictListDestructor(void *privdata, void *val)
1123{
1124 DICT_NOTUSED(privdata);
1125 listRelease((list*)val);
1126}
1127
ed9b544e 1128static int sdsDictKeyCompare(void *privdata, const void *key1,
1129 const void *key2)
1130{
1131 int l1,l2;
1132 DICT_NOTUSED(privdata);
1133
1134 l1 = sdslen((sds)key1);
1135 l2 = sdslen((sds)key2);
1136 if (l1 != l2) return 0;
1137 return memcmp(key1, key2, l1) == 0;
1138}
1139
/* Dict destructor callback for redis objects: drops one reference from the
 * object. The privdata argument is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
1147
942a3961 1148static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1149 const void *key2)
1150{
1151 const robj *o1 = key1, *o2 = key2;
1152 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1153}
1154
942a3961 1155static unsigned int dictObjHash(const void *key) {
ed9b544e 1156 const robj *o = key;
1157 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1158}
1159
942a3961 1160static int dictEncObjKeyCompare(void *privdata, const void *key1,
1161 const void *key2)
1162{
9d65a1bb 1163 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1164 int cmp;
942a3961 1165
2a1198b4 1166 if (o1->encoding == REDIS_ENCODING_INT &&
dc05abde 1167 o2->encoding == REDIS_ENCODING_INT)
1168 return o1->ptr == o2->ptr;
2a1198b4 1169
9d65a1bb 1170 o1 = getDecodedObject(o1);
1171 o2 = getDecodedObject(o2);
1172 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1173 decrRefCount(o1);
1174 decrRefCount(o2);
1175 return cmp;
942a3961 1176}
1177
1178static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1179 robj *o = (robj*) key;
942a3961 1180
ed9e4966 1181 if (o->encoding == REDIS_ENCODING_RAW) {
1182 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1183 } else {
1184 if (o->encoding == REDIS_ENCODING_INT) {
1185 char buf[32];
1186 int len;
1187
ee14da56 1188 len = ll2string(buf,32,(long)o->ptr);
ed9e4966 1189 return dictGenHashFunction((unsigned char*)buf, len);
1190 } else {
1191 unsigned int hash;
1192
1193 o = getDecodedObject(o);
1194 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1195 decrRefCount(o);
1196 return hash;
1197 }
1198 }
942a3961 1199}
1200
f2d9f50f 1201/* Sets type and expires.
 * Keys are redis objects that may be encoded, hence the dictEncObj*
 * callbacks; a key is released through dictRedisObjectDestructor when its
 * entry is removed. No value destructor is installed.
 * NOTE(review): keyptrDictType in this file is the one labelled
 * "Db->expires", so the "and expires" above may be stale -- confirm. */
ed9b544e 1202static dictType setDictType = {
942a3961 1203 dictEncObjHash, /* hash function */
ed9b544e 1204 NULL, /* key dup */
1205 NULL, /* val dup */
942a3961 1206 dictEncObjKeyCompare, /* key compare */
ed9b544e 1207 dictRedisObjectDestructor, /* key destructor */
1208 NULL /* val destructor */
1209};
1210
f2d9f50f 1211/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are redis objects that may be encoded (dictEncObj* callbacks) and
 * are released through dictRedisObjectDestructor. Values are released with
 * dictVanillaFree, that is, a plain zfree() of the stored pointer. */
1812e024 1212static dictType zsetDictType = {
1213 dictEncObjHash, /* hash function */
1214 NULL, /* key dup */
1215 NULL, /* val dup */
1216 dictEncObjKeyCompare, /* key compare */
1217 dictRedisObjectDestructor, /* key destructor */
da0a1620 1218 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1219};
1220
f2d9f50f 1221/* Db->dict: the main dictionary of a database.
 * Keys are hashed and compared as SDS strings (dictObj* callbacks, no
 * decoding step). Both keys and values are redis objects released through
 * dictRedisObjectDestructor, which ignores NULL values. */
5234952b 1222static dictType dbDictType = {
942a3961 1223 dictObjHash, /* hash function */
ed9b544e 1224 NULL, /* key dup */
1225 NULL, /* val dup */
942a3961 1226 dictObjKeyCompare, /* key compare */
ed9b544e 1227 dictRedisObjectDestructor, /* key destructor */
1228 dictRedisObjectDestructor /* val destructor */
1229};
1230
f2d9f50f 1231/* Db->expires.
 * Same key handling as dbDictType (SDS string hash and compare, keys
 * released through dictRedisObjectDestructor) but no value destructor is
 * installed, so values are never freed by the dictionary. */
1232static dictType keyptrDictType = {
1233 dictObjHash, /* hash function */
1234 NULL, /* key dup */
1235 NULL, /* val dup */
1236 dictObjKeyCompare, /* key compare */
1237 dictRedisObjectDestructor, /* key destructor */
1238 NULL /* val destructor */
1239};
1240
5234952b 1241/* Hash type hash table (note that small hashes are represented with zipmaps).
 * Both keys and values are redis objects released through
 * dictRedisObjectDestructor; keys may be encoded, hence the dictEncObj*
 * hash and compare callbacks. */
1242static dictType hashDictType = {
1243 dictEncObjHash, /* hash function */
1244 NULL, /* key dup */
1245 NULL, /* val dup */
1246 dictEncObjKeyCompare, /* key compare */
1247 dictRedisObjectDestructor, /* key destructor */
1248 dictRedisObjectDestructor /* val destructor */
1249};
1250
4409877e 1251/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1252 * lists as values. It's used for blocking operations (BLPOP) and to
1253 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * Removing an entry releases the key through dictRedisObjectDestructor and
 * the whole list through dictListDestructor. */
4409877e 1254static dictType keylistDictType = {
1255 dictObjHash, /* hash function */
1256 NULL, /* key dup */
1257 NULL, /* val dup */
1258 dictObjKeyCompare, /* key compare */
1259 dictRedisObjectDestructor, /* key destructor */
1260 dictListDestructor /* val destructor */
1261};
1262
42ab0172
AO
1263static void version();
1264
ed9b544e 1265/* ========================= Random utility functions ======================= */
1266
1267/* Redis generally does not try to recover from out of memory conditions
1268 * when allocating objects or strings, it is not clear if it will be possible
1269 * to report this condition to the client since the networking layer itself
1270 * is based on heap allocation for send buffers, so we simply abort.
1271 * At least the code will be simpler to read... */
1272static void oom(const char *msg) {
71c54b21 1273 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1274 sleep(1);
1275 abort();
1276}
1277
1278/* ====================== Redis server networking stuff ===================== */
56906eef 1279static void closeTimedoutClients(void) {
ed9b544e 1280 redisClient *c;
ed9b544e 1281 listNode *ln;
1282 time_t now = time(NULL);
c7df85a4 1283 listIter li;
ed9b544e 1284
c7df85a4 1285 listRewind(server.clients,&li);
1286 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1287 c = listNodeValue(ln);
f86a74e9 1288 if (server.maxidletime &&
1289 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1290 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1291 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1292 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1293 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1294 {
f870935d 1295 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1296 freeClient(c);
f86a74e9 1297 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1298 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1299 addReply(c,shared.nullmultibulk);
b0d8747d 1300 unblockClientWaitingData(c);
f86a74e9 1301 }
ed9b544e 1302 }
1303 }
ed9b544e 1304}
1305
12fea928 1306static int htNeedsResize(dict *dict) {
1307 long long size, used;
1308
1309 size = dictSlots(dict);
1310 used = dictSize(dict);
1311 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1312 (used*100/size < REDIS_HT_MINFILL));
1313}
1314
0bc03378 1315/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1316 * we resize the hash table to save memory */
56906eef 1317static void tryResizeHashTables(void) {
0bc03378 1318 int j;
1319
1320 for (j = 0; j < server.dbnum; j++) {
5413c40d 1321 if (htNeedsResize(server.db[j].dict))
0bc03378 1322 dictResize(server.db[j].dict);
12fea928 1323 if (htNeedsResize(server.db[j].expires))
1324 dictResize(server.db[j].expires);
0bc03378 1325 }
1326}
1327
8ca3e9d1 1328/* Our hash table implementation performs rehashing incrementally while
1329 * we write/read from the hash table. Still if the server is idle, the hash
1330 * table will use two tables for a long time. So we try to use 1 millisecond
1331 * of CPU time at every serverCron() loop in order to rehash some key. */
1332static void incrementallyRehash(void) {
1333 int j;
1334
1335 for (j = 0; j < server.dbnum; j++) {
1336 if (dictIsRehashing(server.db[j].dict)) {
1337 dictRehashMilliseconds(server.db[j].dict,1);
1338 break; /* already used our millisecond for this loop... */
1339 }
1340 }
1341}
1342
9d65a1bb 1343/* A background saving child (BGSAVE) terminated its work. Handle this. */
1344void backgroundSaveDoneHandler(int statloc) {
1345 int exitcode = WEXITSTATUS(statloc);
1346 int bysignal = WIFSIGNALED(statloc);
1347
1348 if (!bysignal && exitcode == 0) {
1349 redisLog(REDIS_NOTICE,
1350 "Background saving terminated with success");
1351 server.dirty = 0;
1352 server.lastsave = time(NULL);
1353 } else if (!bysignal && exitcode != 0) {
1354 redisLog(REDIS_WARNING, "Background saving error");
1355 } else {
1356 redisLog(REDIS_WARNING,
454eea7c 1357 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1358 rdbRemoveTempFile(server.bgsavechildpid);
1359 }
1360 server.bgsavechildpid = -1;
1361 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1362 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1363 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1364}
1365
1366/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1367 * Handle this. */
1368void backgroundRewriteDoneHandler(int statloc) {
1369 int exitcode = WEXITSTATUS(statloc);
1370 int bysignal = WIFSIGNALED(statloc);
1371
1372 if (!bysignal && exitcode == 0) {
1373 int fd;
1374 char tmpfile[256];
1375
1376 redisLog(REDIS_NOTICE,
1377 "Background append only file rewriting terminated with success");
1378 /* Now it's time to flush the differences accumulated by the parent */
1379 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1380 fd = open(tmpfile,O_WRONLY|O_APPEND);
1381 if (fd == -1) {
1382 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1383 goto cleanup;
1384 }
1385 /* Flush our data... */
1386 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1387 (signed) sdslen(server.bgrewritebuf)) {
1388 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1389 close(fd);
1390 goto cleanup;
1391 }
b32627cd 1392 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1393 /* Now our work is to rename the temp file into the stable file. And
1394 * switch the file descriptor used by the server for append only. */
1395 if (rename(tmpfile,server.appendfilename) == -1) {
1396 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1397 close(fd);
1398 goto cleanup;
1399 }
1400 /* Mission completed... almost */
1401 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1402 if (server.appendfd != -1) {
1403 /* If append only is actually enabled... */
1404 close(server.appendfd);
1405 server.appendfd = fd;
d5d23dab 1406 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
85a83172 1407 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1408 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1409 } else {
1410 /* If append only is disabled we just generate a dump in this
1411 * format. Why not? */
1412 close(fd);
1413 }
1414 } else if (!bysignal && exitcode != 0) {
1415 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1416 } else {
1417 redisLog(REDIS_WARNING,
454eea7c 1418 "Background append only file rewriting terminated by signal %d",
1419 WTERMSIG(statloc));
9d65a1bb 1420 }
1421cleanup:
1422 sdsfree(server.bgrewritebuf);
1423 server.bgrewritebuf = sdsempty();
1424 aofRemoveTempFile(server.bgrewritechildpid);
1425 server.bgrewritechildpid = -1;
1426}
1427
884d4b39 1428/* This function is called once a background process of some kind terminates,
1429 * as we want to avoid resizing the hash tables when there is a child in order
1430 * to play well with copy-on-write (otherwise when a resize happens lots of
1431 * memory pages are copied). The goal of this function is to update the ability
1432 * for dict.c to resize the hash tables accordingly to the fact we have o not
1433 * running childs. */
1434static void updateDictResizePolicy(void) {
1435 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1436 dictEnableResize();
1437 else
1438 dictDisableResize();
1439}
1440
56906eef 1441static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1442 int j, loops = server.cronloops++;
ed9b544e 1443 REDIS_NOTUSED(eventLoop);
1444 REDIS_NOTUSED(id);
1445 REDIS_NOTUSED(clientData);
1446
3a66edc7 1447 /* We take a cached value of the unix time in the global state because
1448 * with virtual memory and aging there is to store the current time
1449 * in objects at every object access, and accuracy is not needed.
1450 * To access a global var is faster than calling time(NULL) */
1451 server.unixtime = time(NULL);
560db612 1452 /* We have just 21 bits per object for LRU information.
1453 * So we use an (eventually wrapping) LRU clock with minutes resolution.
1454 *
1455 * When we need to select what object to swap, we compute the minimum
1456 * time distance between the current lruclock and the object last access
1457 * lruclock info. Even if clocks will wrap on overflow, there is
1458 * the interesting property that we are sure that at least
1459 * ABS(A-B) minutes passed between current time and timestamp B.
1460 *
1461 * This is not precise but we don't need at all precision, but just
1462 * something statistically reasonable.
1463 */
1464 server.lruclock = (time(NULL)/60)&((1<<21)-1);
3a66edc7 1465
fab43727 1466 /* We received a SIGTERM, shutting down here in a safe way, as it is
1467 * not ok doing so inside the signal handler. */
1468 if (server.shutdown_asap) {
1469 if (prepareForShutdown() == REDIS_OK) exit(0);
1470 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1471 }
1472
0bc03378 1473 /* Show some info about non-empty databases */
ed9b544e 1474 for (j = 0; j < server.dbnum; j++) {
dec423d9 1475 long long size, used, vkeys;
94754ccc 1476
3305306f 1477 size = dictSlots(server.db[j].dict);
1478 used = dictSize(server.db[j].dict);
94754ccc 1479 vkeys = dictSize(server.db[j].expires);
1763929f 1480 if (!(loops % 50) && (used || vkeys)) {
f870935d 1481 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1482 /* dictPrintStats(server.dict); */
ed9b544e 1483 }
ed9b544e 1484 }
1485
0bc03378 1486 /* We don't want to resize the hash tables while a background saving
1487 * is in progress: the saving child is created using fork() that is
1488 * implemented with a copy-on-write semantic in most modern systems, so
1489 * if we resize the HT while there is the saving child at work actually
1490 * a lot of memory movements in the parent will cause a lot of pages
1491 * copied. */
8ca3e9d1 1492 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1493 if (!(loops % 10)) tryResizeHashTables();
1494 if (server.activerehashing) incrementallyRehash();
884d4b39 1495 }
0bc03378 1496
ed9b544e 1497 /* Show information about connected clients */
1763929f 1498 if (!(loops % 50)) {
bdcb92f2 1499 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1500 listLength(server.clients)-listLength(server.slaves),
1501 listLength(server.slaves),
bdcb92f2 1502 zmalloc_used_memory());
ed9b544e 1503 }
1504
1505 /* Close connections of timedout clients */
1763929f 1506 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1507 closeTimedoutClients();
1508
9d65a1bb 1509 /* Check if a background saving or AOF rewrite in progress terminated */
1510 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1511 int statloc;
9d65a1bb 1512 pid_t pid;
1513
1514 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1515 if (pid == server.bgsavechildpid) {
1516 backgroundSaveDoneHandler(statloc);
ed9b544e 1517 } else {
9d65a1bb 1518 backgroundRewriteDoneHandler(statloc);
ed9b544e 1519 }
884d4b39 1520 updateDictResizePolicy();
ed9b544e 1521 }
1522 } else {
1523 /* If there is not a background saving in progress check if
1524 * we have to save now */
1525 time_t now = time(NULL);
1526 for (j = 0; j < server.saveparamslen; j++) {
1527 struct saveparam *sp = server.saveparams+j;
1528
1529 if (server.dirty >= sp->changes &&
1530 now-server.lastsave > sp->seconds) {
1531 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1532 sp->changes, sp->seconds);
f78fd11b 1533 rdbSaveBackground(server.dbfilename);
ed9b544e 1534 break;
1535 }
1536 }
1537 }
94754ccc 1538
f2324293 1539 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1540 * will use few CPU cycles if there are few expiring keys, otherwise
1541 * it will get more aggressive to avoid that too much memory is used by
1542 * keys that can be removed from the keyspace. */
94754ccc 1543 for (j = 0; j < server.dbnum; j++) {
f2324293 1544 int expired;
94754ccc 1545 redisDb *db = server.db+j;
94754ccc 1546
f2324293 1547 /* Continue to expire if at the end of the cycle more than 25%
1548 * of the keys were expired. */
1549 do {
4ef8de8a 1550 long num = dictSize(db->expires);
94754ccc 1551 time_t now = time(NULL);
1552
f2324293 1553 expired = 0;
94754ccc 1554 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1555 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1556 while (num--) {
1557 dictEntry *de;
1558 time_t t;
1559
1560 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1561 t = (time_t) dictGetEntryVal(de);
1562 if (now > t) {
1563 deleteKey(db,dictGetEntryKey(de));
f2324293 1564 expired++;
2a6a2ed1 1565 server.stat_expiredkeys++;
94754ccc 1566 }
1567 }
f2324293 1568 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1569 }
1570
4ef8de8a 1571 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1572 * is enabled. Try to free objects from the free list first. */
7e69548d 1573 if (vmCanSwapOut()) {
1574 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1575 server.vm_max_memory)
1576 {
72e9fd40 1577 int retval;
1578
a5819310 1579 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1580 retval = (server.vm_max_threads == 0) ?
1581 vmSwapOneObjectBlocking() :
1582 vmSwapOneObjectThreaded();
1763929f 1583 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1584 zmalloc_used_memory() >
1585 (server.vm_max_memory+server.vm_max_memory/10))
1586 {
1587 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1588 }
72e9fd40 1589 /* Note that when using threaded I/O we free just one object,
1590 * because anyway when the I/O thread in charge to swap this
1591 * object out will finish, the handler of completed jobs
1592 * will try to swap more objects if we are still out of memory. */
1593 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1594 }
1595 }
1596
ed9b544e 1597 /* Check if we should connect to a MASTER */
1763929f 1598 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1599 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1600 if (syncWithMaster() == REDIS_OK) {
1601 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
8f63ddca 1602 if (server.appendonly) rewriteAppendOnlyFileBackground();
ed9b544e 1603 }
1604 }
1763929f 1605 return 100;
ed9b544e 1606}
1607
d5d55fc3 1608/* This function gets called every time Redis is entering the
1609 * main loop of the event driven library, that is, before to sleep
1610 * for ready file descriptors. */
1611static void beforeSleep(struct aeEventLoop *eventLoop) {
1612 REDIS_NOTUSED(eventLoop);
1613
28ed1f33 1614 /* Awake clients that got all the swapped keys they requested */
d5d55fc3 1615 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1616 listIter li;
1617 listNode *ln;
1618
1619 listRewind(server.io_ready_clients,&li);
1620 while((ln = listNext(&li))) {
1621 redisClient *c = ln->value;
1622 struct redisCommand *cmd;
1623
1624 /* Resume the client. */
1625 listDelNode(server.io_ready_clients,ln);
1626 c->flags &= (~REDIS_IO_WAIT);
1627 server.vm_blocked_clients--;
1628 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1629 readQueryFromClient, c);
1630 cmd = lookupCommand(c->argv[0]->ptr);
1631 assert(cmd != NULL);
1632 call(c,cmd);
1633 resetClient(c);
1634 /* There may be more data to process in the input buffer. */
1635 if (c->querybuf && sdslen(c->querybuf) > 0)
1636 processInputBuffer(c);
1637 }
1638 }
28ed1f33 1639 /* Write the AOF buffer on disk */
1640 flushAppendOnlyFile();
d5d55fc3 1641}
1642
ed9b544e 1643static void createSharedObjects(void) {
05df7621 1644 int j;
1645
ed9b544e 1646 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1647 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1648 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1649 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1650 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1651 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1652 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1653 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1654 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1655 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1656 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1657 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1658 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1659 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1660 "-ERR no such key\r\n"));
ed9b544e 1661 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1662 "-ERR syntax error\r\n"));
c937aa89 1663 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1664 "-ERR source and destination objects are the same\r\n"));
1665 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1666 "-ERR index out of range\r\n"));
ed9b544e 1667 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1668 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1669 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1670 shared.select0 = createStringObject("select 0\r\n",10);
1671 shared.select1 = createStringObject("select 1\r\n",10);
1672 shared.select2 = createStringObject("select 2\r\n",10);
1673 shared.select3 = createStringObject("select 3\r\n",10);
1674 shared.select4 = createStringObject("select 4\r\n",10);
1675 shared.select5 = createStringObject("select 5\r\n",10);
1676 shared.select6 = createStringObject("select 6\r\n",10);
1677 shared.select7 = createStringObject("select 7\r\n",10);
1678 shared.select8 = createStringObject("select 8\r\n",10);
1679 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1680 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
c8d0ea0e 1681 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
befec3cd 1682 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1683 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1684 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1685 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1686 shared.mbulk3 = createStringObject("*3\r\n",4);
c8d0ea0e 1687 shared.mbulk4 = createStringObject("*4\r\n",4);
05df7621 1688 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1689 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1690 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1691 }
ed9b544e 1692}
1693
1694static void appendServerSaveParams(time_t seconds, int changes) {
1695 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1696 server.saveparams[server.saveparamslen].seconds = seconds;
1697 server.saveparams[server.saveparamslen].changes = changes;
1698 server.saveparamslen++;
1699}
1700
bcfc686d 1701static void resetServerSaveParams() {
ed9b544e 1702 zfree(server.saveparams);
1703 server.saveparams = NULL;
1704 server.saveparamslen = 0;
1705}
1706
1707static void initServerConfig() {
1708 server.dbnum = REDIS_DEFAULT_DBNUM;
1709 server.port = REDIS_SERVERPORT;
f870935d 1710 server.verbosity = REDIS_VERBOSE;
ed9b544e 1711 server.maxidletime = REDIS_MAXIDLETIME;
1712 server.saveparams = NULL;
1713 server.logfile = NULL; /* NULL = log on standard output */
1714 server.bindaddr = NULL;
1715 server.glueoutputbuf = 1;
1716 server.daemonize = 0;
44b38ef4 1717 server.appendonly = 0;
1b677732 1718 server.appendfsync = APPENDFSYNC_EVERYSEC;
38db9171 1719 server.no_appendfsync_on_rewrite = 0;
48f0308a 1720 server.lastfsync = time(NULL);
44b38ef4 1721 server.appendfd = -1;
1722 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1723 server.pidfile = zstrdup("/var/run/redis.pid");
1724 server.dbfilename = zstrdup("dump.rdb");
1725 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1726 server.requirepass = NULL;
b0553789 1727 server.rdbcompression = 1;
8ca3e9d1 1728 server.activerehashing = 1;
285add55 1729 server.maxclients = 0;
d5d55fc3 1730 server.blpop_blocked_clients = 0;
3fd78bcd 1731 server.maxmemory = 0;
75680a3c 1732 server.vm_enabled = 0;
054e426d 1733 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1734 server.vm_page_size = 256; /* 256 bytes per page */
1735 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1736 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1737 server.vm_max_threads = 4;
d5d55fc3 1738 server.vm_blocked_clients = 0;
cbba7dd7 1739 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1740 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
fab43727 1741 server.shutdown_asap = 0;
75680a3c 1742
bcfc686d 1743 resetServerSaveParams();
ed9b544e 1744
1745 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1746 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1747 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1748 /* Replication related */
1749 server.isslave = 0;
d0ccebcf 1750 server.masterauth = NULL;
ed9b544e 1751 server.masterhost = NULL;
1752 server.masterport = 6379;
1753 server.master = NULL;
1754 server.replstate = REDIS_REPL_NONE;
a7866db6 1755
1756 /* Double constants initialization */
1757 R_Zero = 0.0;
1758 R_PosInf = 1.0/R_Zero;
1759 R_NegInf = -1.0/R_Zero;
1760 R_Nan = R_Zero/R_Zero;
ed9b544e 1761}
1762
1763static void initServer() {
1764 int j;
1765
1766 signal(SIGHUP, SIG_IGN);
1767 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1768 setupSigSegvAction();
ed9b544e 1769
b9bc0eef 1770 server.devnull = fopen("/dev/null","w");
1771 if (server.devnull == NULL) {
1772 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1773 exit(1);
1774 }
ed9b544e 1775 server.clients = listCreate();
1776 server.slaves = listCreate();
87eca727 1777 server.monitors = listCreate();
ed9b544e 1778 server.objfreelist = listCreate();
1779 createSharedObjects();
1780 server.el = aeCreateEventLoop();
3305306f 1781 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1782 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1783 if (server.fd == -1) {
1784 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1785 exit(1);
1786 }
3305306f 1787 for (j = 0; j < server.dbnum; j++) {
5234952b 1788 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1789 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
37ab76c9 1790 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1791 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1792 if (server.vm_enabled)
1793 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1794 server.db[j].id = j;
1795 }
ffc6b7f8 1796 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1797 server.pubsub_patterns = listCreate();
1798 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1799 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1800 server.cronloops = 0;
9f3c422c 1801 server.bgsavechildpid = -1;
9d65a1bb 1802 server.bgrewritechildpid = -1;
1803 server.bgrewritebuf = sdsempty();
28ed1f33 1804 server.aofbuf = sdsempty();
ed9b544e 1805 server.lastsave = time(NULL);
1806 server.dirty = 0;
ed9b544e 1807 server.stat_numcommands = 0;
1808 server.stat_numconnections = 0;
2a6a2ed1 1809 server.stat_expiredkeys = 0;
ed9b544e 1810 server.stat_starttime = time(NULL);
3a66edc7 1811 server.unixtime = time(NULL);
d8f8b666 1812 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1813 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1814 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1815
1816 if (server.appendonly) {
3bb225d6 1817 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1818 if (server.appendfd == -1) {
1819 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1820 strerror(errno));
1821 exit(1);
1822 }
1823 }
75680a3c 1824
1825 if (server.vm_enabled) vmInit();
ed9b544e 1826}
1827
1828/* Empty the whole database */
ca37e9cd 1829static long long emptyDb() {
ed9b544e 1830 int j;
ca37e9cd 1831 long long removed = 0;
ed9b544e 1832
3305306f 1833 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1834 removed += dictSize(server.db[j].dict);
3305306f 1835 dictEmpty(server.db[j].dict);
1836 dictEmpty(server.db[j].expires);
1837 }
ca37e9cd 1838 return removed;
ed9b544e 1839}
1840
/* Convert a "yes" / "no" string (compared case insensitively) to an integer.
 *
 * Returns 1 for "yes", 0 for "no" and -1 for any other string. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1846
ed9b544e 1847/* I agree, this is a very rudimental way to load a configuration...
1848 will improve later if the config gets more complex */
1849static void loadServerConfig(char *filename) {
c9a111ac 1850 FILE *fp;
ed9b544e 1851 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1852 int linenum = 0;
1853 sds line = NULL;
c9a111ac 1854
1855 if (filename[0] == '-' && filename[1] == '\0')
1856 fp = stdin;
1857 else {
1858 if ((fp = fopen(filename,"r")) == NULL) {
9a22de82 1859 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
c9a111ac 1860 exit(1);
1861 }
ed9b544e 1862 }
c9a111ac 1863
ed9b544e 1864 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1865 sds *argv;
1866 int argc, j;
1867
1868 linenum++;
1869 line = sdsnew(buf);
1870 line = sdstrim(line," \t\r\n");
1871
1872 /* Skip comments and blank lines*/
1873 if (line[0] == '#' || line[0] == '\0') {
1874 sdsfree(line);
1875 continue;
1876 }
1877
1878 /* Split into arguments */
1879 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1880 sdstolower(argv[0]);
1881
1882 /* Execute config directives */
bb0b03a3 1883 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1884 server.maxidletime = atoi(argv[1]);
0150db36 1885 if (server.maxidletime < 0) {
ed9b544e 1886 err = "Invalid timeout value"; goto loaderr;
1887 }
bb0b03a3 1888 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1889 server.port = atoi(argv[1]);
1890 if (server.port < 1 || server.port > 65535) {
1891 err = "Invalid port"; goto loaderr;
1892 }
bb0b03a3 1893 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1894 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1895 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1896 int seconds = atoi(argv[1]);
1897 int changes = atoi(argv[2]);
1898 if (seconds < 1 || changes < 0) {
1899 err = "Invalid save parameters"; goto loaderr;
1900 }
1901 appendServerSaveParams(seconds,changes);
bb0b03a3 1902 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1903 if (chdir(argv[1]) == -1) {
1904 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1905 argv[1], strerror(errno));
1906 exit(1);
1907 }
bb0b03a3 1908 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1909 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1910 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1911 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1912 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1913 else {
1914 err = "Invalid log level. Must be one of debug, notice, warning";
1915 goto loaderr;
1916 }
bb0b03a3 1917 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1918 FILE *logfp;
ed9b544e 1919
1920 server.logfile = zstrdup(argv[1]);
bb0b03a3 1921 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1922 zfree(server.logfile);
1923 server.logfile = NULL;
1924 }
1925 if (server.logfile) {
1926 /* Test if we are able to open the file. The server will not
1927 * be able to abort just for this problem later... */
c9a111ac 1928 logfp = fopen(server.logfile,"a");
1929 if (logfp == NULL) {
ed9b544e 1930 err = sdscatprintf(sdsempty(),
1931 "Can't open the log file: %s", strerror(errno));
1932 goto loaderr;
1933 }
c9a111ac 1934 fclose(logfp);
ed9b544e 1935 }
bb0b03a3 1936 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1937 server.dbnum = atoi(argv[1]);
1938 if (server.dbnum < 1) {
1939 err = "Invalid number of databases"; goto loaderr;
1940 }
b3f83f12
JZ
1941 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1942 loadServerConfig(argv[1]);
285add55 1943 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1944 server.maxclients = atoi(argv[1]);
3fd78bcd 1945 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
2b619329 1946 server.maxmemory = memtoll(argv[1],NULL);
bb0b03a3 1947 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1948 server.masterhost = sdsnew(argv[1]);
1949 server.masterport = atoi(argv[2]);
1950 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1951 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1952 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1953 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1954 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1955 err = "argument must be 'yes' or 'no'"; goto loaderr;
1956 }
121f70cf 1957 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1958 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
8ca3e9d1 1959 err = "argument must be 'yes' or 'no'"; goto loaderr;
1960 }
1961 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1962 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
121f70cf 1963 err = "argument must be 'yes' or 'no'"; goto loaderr;
1964 }
bb0b03a3 1965 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1966 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1967 err = "argument must be 'yes' or 'no'"; goto loaderr;
1968 }
44b38ef4 1969 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1970 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1971 err = "argument must be 'yes' or 'no'"; goto loaderr;
1972 }
f3b52411
PN
1973 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
1974 zfree(server.appendfilename);
1975 server.appendfilename = zstrdup(argv[1]);
38db9171 1976 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
1977 && argc == 2) {
1978 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
1979 err = "argument must be 'yes' or 'no'"; goto loaderr;
1980 }
48f0308a 1981 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1982 if (!strcasecmp(argv[1],"no")) {
48f0308a 1983 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1984 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1985 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1986 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1987 server.appendfsync = APPENDFSYNC_EVERYSEC;
1988 } else {
1989 err = "argument must be 'no', 'always' or 'everysec'";
1990 goto loaderr;
1991 }
bb0b03a3 1992 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1993 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1994 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 1995 zfree(server.pidfile);
054e426d 1996 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1997 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 1998 zfree(server.dbfilename);
054e426d 1999 server.dbfilename = zstrdup(argv[1]);
75680a3c 2000 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
2001 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
2002 err = "argument must be 'yes' or 'no'"; goto loaderr;
2003 }
054e426d 2004 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 2005 zfree(server.vm_swap_file);
054e426d 2006 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 2007 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2b619329 2008 server.vm_max_memory = memtoll(argv[1],NULL);
4ef8de8a 2009 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2b619329 2010 server.vm_page_size = memtoll(argv[1], NULL);
4ef8de8a 2011 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2b619329 2012 server.vm_pages = memtoll(argv[1], NULL);
92f8e882 2013 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
2014 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 2015 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2b619329 2016 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
cbba7dd7 2017 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2b619329 2018 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
ed9b544e 2019 } else {
2020 err = "Bad directive or wrong number of arguments"; goto loaderr;
2021 }
2022 for (j = 0; j < argc; j++)
2023 sdsfree(argv[j]);
2024 zfree(argv);
2025 sdsfree(line);
2026 }
c9a111ac 2027 if (fp != stdin) fclose(fp);
ed9b544e 2028 return;
2029
2030loaderr:
2031 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2032 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2033 fprintf(stderr, ">>> '%s'\n", line);
2034 fprintf(stderr, "%s\n", err);
2035 exit(1);
2036}
2037
2038static void freeClientArgv(redisClient *c) {
2039 int j;
2040
2041 for (j = 0; j < c->argc; j++)
2042 decrRefCount(c->argv[j]);
e8a74421 2043 for (j = 0; j < c->mbargc; j++)
2044 decrRefCount(c->mbargv[j]);
ed9b544e 2045 c->argc = 0;
e8a74421 2046 c->mbargc = 0;
ed9b544e 2047}
2048
2049static void freeClient(redisClient *c) {
2050 listNode *ln;
2051
4409877e 2052 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 2053 * call, we have to set querybuf to NULL *before* to call
2054 * unblockClientWaitingData() to avoid processInputBuffer() will get
2055 * called. Also it is important to remove the file events after
2056 * this, because this call adds the READABLE event. */
4409877e 2057 sdsfree(c->querybuf);
2058 c->querybuf = NULL;
2059 if (c->flags & REDIS_BLOCKED)
b0d8747d 2060 unblockClientWaitingData(c);
4409877e 2061
37ab76c9 2062 /* UNWATCH all the keys */
2063 unwatchAllKeys(c);
2064 listRelease(c->watched_keys);
ffc6b7f8 2065 /* Unsubscribe from all the pubsub channels */
2066 pubsubUnsubscribeAllChannels(c,0);
2067 pubsubUnsubscribeAllPatterns(c,0);
2068 dictRelease(c->pubsub_channels);
2069 listRelease(c->pubsub_patterns);
befec3cd 2070 /* Obvious cleanup */
ed9b544e 2071 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2072 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 2073 listRelease(c->reply);
2074 freeClientArgv(c);
2075 close(c->fd);
92f8e882 2076 /* Remove from the list of clients */
ed9b544e 2077 ln = listSearchKey(server.clients,c);
dfc5e96c 2078 redisAssert(ln != NULL);
ed9b544e 2079 listDelNode(server.clients,ln);
37ab76c9 2080 /* Remove from the list of clients that are now ready to be restarted
2081 * after waiting for swapped keys */
d5d55fc3 2082 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2083 ln = listSearchKey(server.io_ready_clients,c);
2084 if (ln) {
2085 listDelNode(server.io_ready_clients,ln);
2086 server.vm_blocked_clients--;
2087 }
2088 }
37ab76c9 2089 /* Remove from the list of clients waiting for swapped keys */
d5d55fc3 2090 while (server.vm_enabled && listLength(c->io_keys)) {
2091 ln = listFirst(c->io_keys);
2092 dontWaitForSwappedKey(c,ln->value);
92f8e882 2093 }
b3e3d0d7 2094 listRelease(c->io_keys);
befec3cd 2095 /* Master/slave cleanup */
ed9b544e 2096 if (c->flags & REDIS_SLAVE) {
6208b3a7 2097 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2098 close(c->repldbfd);
87eca727 2099 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2100 ln = listSearchKey(l,c);
dfc5e96c 2101 redisAssert(ln != NULL);
87eca727 2102 listDelNode(l,ln);
ed9b544e 2103 }
2104 if (c->flags & REDIS_MASTER) {
2105 server.master = NULL;
2106 server.replstate = REDIS_REPL_CONNECT;
2107 }
befec3cd 2108 /* Release memory */
93ea3759 2109 zfree(c->argv);
e8a74421 2110 zfree(c->mbargv);
6e469882 2111 freeClientMultiState(c);
ed9b544e 2112 zfree(c);
2113}
2114
cc30e368 2115#define GLUEREPLY_UP_TO (1024)
ed9b544e 2116static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 2117 int copylen = 0;
2118 char buf[GLUEREPLY_UP_TO];
6208b3a7 2119 listNode *ln;
c7df85a4 2120 listIter li;
ed9b544e 2121 robj *o;
2122
c7df85a4 2123 listRewind(c->reply,&li);
2124 while((ln = listNext(&li))) {
c28b42ac 2125 int objlen;
2126
ed9b544e 2127 o = ln->value;
c28b42ac 2128 objlen = sdslen(o->ptr);
2129 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2130 memcpy(buf+copylen,o->ptr,objlen);
2131 copylen += objlen;
ed9b544e 2132 listDelNode(c->reply,ln);
c28b42ac 2133 } else {
2134 if (copylen == 0) return;
2135 break;
ed9b544e 2136 }
ed9b544e 2137 }
c28b42ac 2138 /* Now the output buffer is empty, add the new single element */
2139 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2140 listAddNodeHead(c->reply,o);
ed9b544e 2141}
2142
2143static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2144 redisClient *c = privdata;
2145 int nwritten = 0, totwritten = 0, objlen;
2146 robj *o;
2147 REDIS_NOTUSED(el);
2148 REDIS_NOTUSED(mask);
2149
2895e862 2150 /* Use writev() if we have enough buffers to send */
7ea870c0 2151 if (!server.glueoutputbuf &&
e0a62c7f 2152 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 2153 !(c->flags & REDIS_MASTER))
2895e862 2154 {
2155 sendReplyToClientWritev(el, fd, privdata, mask);
2156 return;
2157 }
2895e862 2158
ed9b544e 2159 while(listLength(c->reply)) {
c28b42ac 2160 if (server.glueoutputbuf && listLength(c->reply) > 1)
2161 glueReplyBuffersIfNeeded(c);
2162
ed9b544e 2163 o = listNodeValue(listFirst(c->reply));
2164 objlen = sdslen(o->ptr);
2165
2166 if (objlen == 0) {
2167 listDelNode(c->reply,listFirst(c->reply));
2168 continue;
2169 }
2170
2171 if (c->flags & REDIS_MASTER) {
6f376729 2172 /* Don't reply to a master */
ed9b544e 2173 nwritten = objlen - c->sentlen;
2174 } else {
a4d1ba9a 2175 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2176 if (nwritten <= 0) break;
2177 }
2178 c->sentlen += nwritten;
2179 totwritten += nwritten;
2180 /* If we fully sent the object on head go to the next one */
2181 if (c->sentlen == objlen) {
2182 listDelNode(c->reply,listFirst(c->reply));
2183 c->sentlen = 0;
2184 }
6f376729 2185 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2186 * bytes, in a single threaded server it's a good idea to serve
6f376729 2187 * other clients as well, even if a very large request comes from
2188 * super fast link that is always able to accept data (in real world
12f9d551 2189 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2190 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2191 }
2192 if (nwritten == -1) {
2193 if (errno == EAGAIN) {
2194 nwritten = 0;
2195 } else {
f870935d 2196 redisLog(REDIS_VERBOSE,
ed9b544e 2197 "Error writing to client: %s", strerror(errno));
2198 freeClient(c);
2199 return;
2200 }
2201 }
2202 if (totwritten > 0) c->lastinteraction = time(NULL);
2203 if (listLength(c->reply) == 0) {
2204 c->sentlen = 0;
2205 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2206 }
2207}
2208
2895e862 2209static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2210{
2211 redisClient *c = privdata;
2212 int nwritten = 0, totwritten = 0, objlen, willwrite;
2213 robj *o;
2214 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2215 int offset, ion = 0;
2216 REDIS_NOTUSED(el);
2217 REDIS_NOTUSED(mask);
2218
2219 listNode *node;
2220 while (listLength(c->reply)) {
2221 offset = c->sentlen;
2222 ion = 0;
2223 willwrite = 0;
2224
2225 /* fill-in the iov[] array */
2226 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2227 o = listNodeValue(node);
2228 objlen = sdslen(o->ptr);
2229
e0a62c7f 2230 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2231 break;
2232
2233 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2234 break; /* no more iovecs */
2235
2236 iov[ion].iov_base = ((char*)o->ptr) + offset;
2237 iov[ion].iov_len = objlen - offset;
2238 willwrite += objlen - offset;
2239 offset = 0; /* just for the first item */
2240 ion++;
2241 }
2242
2243 if(willwrite == 0)
2244 break;
2245
2246 /* write all collected blocks at once */
2247 if((nwritten = writev(fd, iov, ion)) < 0) {
2248 if (errno != EAGAIN) {
f870935d 2249 redisLog(REDIS_VERBOSE,
2895e862 2250 "Error writing to client: %s", strerror(errno));
2251 freeClient(c);
2252 return;
2253 }
2254 break;
2255 }
2256
2257 totwritten += nwritten;
2258 offset = c->sentlen;
2259
2260 /* remove written robjs from c->reply */
2261 while (nwritten && listLength(c->reply)) {
2262 o = listNodeValue(listFirst(c->reply));
2263 objlen = sdslen(o->ptr);
2264
2265 if(nwritten >= objlen - offset) {
2266 listDelNode(c->reply, listFirst(c->reply));
2267 nwritten -= objlen - offset;
2268 c->sentlen = 0;
2269 } else {
2270 /* partial write */
2271 c->sentlen += nwritten;
2272 break;
2273 }
2274 offset = 0;
2275 }
2276 }
2277
e0a62c7f 2278 if (totwritten > 0)
2895e862 2279 c->lastinteraction = time(NULL);
2280
2281 if (listLength(c->reply) == 0) {
2282 c->sentlen = 0;
2283 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2284 }
2285}
2286
1a132bbc
PN
2287static int qsortRedisCommands(const void *r1, const void *r2) {
2288 return strcasecmp(
2289 ((struct redisCommand*)r1)->name,
2290 ((struct redisCommand*)r2)->name);
2291}
2292
2293static void sortCommandTable() {
1a132bbc
PN
2294 /* Copy and sort the read-only version of the command table */
2295 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2296 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
d55d5c5d 2297 qsort(commandTable,
2298 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2299 sizeof(struct redisCommand),qsortRedisCommands);
1a132bbc
PN
2300}
2301
ed9b544e 2302static struct redisCommand *lookupCommand(char *name) {
1a132bbc
PN
2303 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2304 return bsearch(
2305 &tmp,
2306 commandTable,
d55d5c5d 2307 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
1a132bbc
PN
2308 sizeof(struct redisCommand),
2309 qsortRedisCommands);
ed9b544e 2310}
2311
2312/* resetClient prepare the client to process the next command */
2313static void resetClient(redisClient *c) {
2314 freeClientArgv(c);
2315 c->bulklen = -1;
e8a74421 2316 c->multibulk = 0;
ed9b544e 2317}
2318
6e469882 2319/* Call() is the core of Redis execution of a command */
2320static void call(redisClient *c, struct redisCommand *cmd) {
2321 long long dirty;
2322
2323 dirty = server.dirty;
2324 cmd->proc(c);
4005fef1 2325 dirty = server.dirty-dirty;
2326
2327 if (server.appendonly && dirty)
6e469882 2328 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2329 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2330 listLength(server.slaves))
248ea310 2331 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2332 if (listLength(server.monitors))
dd142b9c 2333 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2334 server.stat_numcommands++;
2335}
2336
ed9b544e 2337/* If this function gets called we already read a whole
2338 * command, argments are in the client argv/argc fields.
2339 * processCommand() execute the command or prepare the
2340 * server for a bulk read from the client.
2341 *
2342 * If 1 is returned the client is still alive and valid and
2343 * and other operations can be performed by the caller. Otherwise
2344 * if 0 is returned the client was destroied (i.e. after QUIT). */
2345static int processCommand(redisClient *c) {
2346 struct redisCommand *cmd;
ed9b544e 2347
3fd78bcd 2348 /* Free some memory if needed (maxmemory setting) */
2349 if (server.maxmemory) freeMemoryIfNeeded();
2350
e8a74421 2351 /* Handle the multi bulk command type. This is an alternative protocol
2352 * supported by Redis in order to receive commands that are composed of
2353 * multiple binary-safe "bulk" arguments. The latency of processing is
2354 * a bit higher but this allows things like multi-sets, so if this
2355 * protocol is used only for MSET and similar commands this is a big win. */
2356 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2357 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2358 if (c->multibulk <= 0) {
2359 resetClient(c);
2360 return 1;
2361 } else {
2362 decrRefCount(c->argv[c->argc-1]);
2363 c->argc--;
2364 return 1;
2365 }
2366 } else if (c->multibulk) {
2367 if (c->bulklen == -1) {
2368 if (((char*)c->argv[0]->ptr)[0] != '$') {
2369 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2370 resetClient(c);
2371 return 1;
2372 } else {
2373 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2374 decrRefCount(c->argv[0]);
2375 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2376 c->argc--;
2377 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2378 resetClient(c);
2379 return 1;
2380 }
2381 c->argc--;
2382 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2383 return 1;
2384 }
2385 } else {
2386 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2387 c->mbargv[c->mbargc] = c->argv[0];
2388 c->mbargc++;
2389 c->argc--;
2390 c->multibulk--;
2391 if (c->multibulk == 0) {
2392 robj **auxargv;
2393 int auxargc;
2394
2395 /* Here we need to swap the multi-bulk argc/argv with the
2396 * normal argc/argv of the client structure. */
2397 auxargv = c->argv;
2398 c->argv = c->mbargv;
2399 c->mbargv = auxargv;
2400
2401 auxargc = c->argc;
2402 c->argc = c->mbargc;
2403 c->mbargc = auxargc;
2404
2405 /* We need to set bulklen to something different than -1
2406 * in order for the code below to process the command without
2407 * to try to read the last argument of a bulk command as
2408 * a special argument. */
2409 c->bulklen = 0;
2410 /* continue below and process the command */
2411 } else {
2412 c->bulklen = -1;
2413 return 1;
2414 }
2415 }
2416 }
2417 /* -- end of multi bulk commands processing -- */
2418
ed9b544e 2419 /* The QUIT command is handled as a special case. Normal command
2420 * procs are unable to close the client connection safely */
bb0b03a3 2421 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2422 freeClient(c);
2423 return 0;
2424 }
d5d55fc3 2425
2426 /* Now lookup the command and check ASAP about trivial error conditions
2427 * such wrong arity, bad command name and so forth. */
ed9b544e 2428 cmd = lookupCommand(c->argv[0]->ptr);
2429 if (!cmd) {
2c14807b 2430 addReplySds(c,
2431 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2432 (char*)c->argv[0]->ptr));
ed9b544e 2433 resetClient(c);
2434 return 1;
2435 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2436 (c->argc < -cmd->arity)) {
454d4e43 2437 addReplySds(c,
2438 sdscatprintf(sdsempty(),
2439 "-ERR wrong number of arguments for '%s' command\r\n",
2440 cmd->name));
ed9b544e 2441 resetClient(c);
2442 return 1;
2443 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2444 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2445 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2446
2447 decrRefCount(c->argv[c->argc-1]);
2448 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2449 c->argc--;
2450 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2451 resetClient(c);
2452 return 1;
2453 }
2454 c->argc--;
2455 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2456 /* It is possible that the bulk read is already in the
8d0490e7 2457 * buffer. Check this condition and handle it accordingly.
2458 * This is just a fast path, alternative to call processInputBuffer().
2459 * It's a good idea since the code is small and this condition
2460 * happens most of the times. */
ed9b544e 2461 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2462 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2463 c->argc++;
2464 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2465 } else {
d5d55fc3 2466 /* Otherwise return... there is to read the last argument
2467 * from the socket. */
ed9b544e 2468 return 1;
2469 }
2470 }
942a3961 2471 /* Let's try to encode the bulk object to save space. */
2472 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2473 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2474
e63943a4 2475 /* Check if the user is authenticated */
2476 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2477 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2478 resetClient(c);
2479 return 1;
2480 }
2481
b61a28fe 2482 /* Handle the maxmemory directive */
2483 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2484 zmalloc_used_memory() > server.maxmemory)
2485 {
2486 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2487 resetClient(c);
2488 return 1;
2489 }
2490
d6cc8867 2491 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
e6cca5db 2492 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2493 &&
ffc6b7f8 2494 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2495 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2496 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2497 resetClient(c);
2498 return 1;
2499 }
2500
ed9b544e 2501 /* Exec the command */
6531c94d 2502 if (c->flags & REDIS_MULTI &&
2503 cmd->proc != execCommand && cmd->proc != discardCommand &&
2504 cmd->proc != multiCommand && cmd->proc != watchCommand)
2505 {
6e469882 2506 queueMultiCommand(c,cmd);
2507 addReply(c,shared.queued);
2508 } else {
d5d55fc3 2509 if (server.vm_enabled && server.vm_max_threads > 0 &&
0a6f3f0f 2510 blockClientOnSwappedKeys(c,cmd)) return 1;
6e469882 2511 call(c,cmd);
2512 }
ed9b544e 2513
2514 /* Prepare the client for the next command */
ed9b544e 2515 resetClient(c);
2516 return 1;
2517}
2518
248ea310 2519static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2520 listNode *ln;
c7df85a4 2521 listIter li;
ed9b544e 2522 int outc = 0, j;
93ea3759 2523 robj **outv;
248ea310 2524 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2525 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2526 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2527 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2528 robj *lenobj;
93ea3759 2529
2530 if (argc <= REDIS_STATIC_ARGS) {
2531 outv = static_outv;
2532 } else {
248ea310 2533 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2534 }
248ea310 2535
2536 lenobj = createObject(REDIS_STRING,
2537 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2538 lenobj->refcount = 0;
2539 outv[outc++] = lenobj;
ed9b544e 2540 for (j = 0; j < argc; j++) {
248ea310 2541 lenobj = createObject(REDIS_STRING,
2542 sdscatprintf(sdsempty(),"$%lu\r\n",
2543 (unsigned long) stringObjectLen(argv[j])));
2544 lenobj->refcount = 0;
2545 outv[outc++] = lenobj;
ed9b544e 2546 outv[outc++] = argv[j];
248ea310 2547 outv[outc++] = shared.crlf;
ed9b544e 2548 }
ed9b544e 2549
40d224a9 2550 /* Increment all the refcounts at start and decrement at end in order to
2551 * be sure to free objects if there is no slave in a replication state
2552 * able to be feed with commands */
2553 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2554 listRewind(slaves,&li);
2555 while((ln = listNext(&li))) {
ed9b544e 2556 redisClient *slave = ln->value;
40d224a9 2557
2558 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2559 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2560
2561 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2562 if (slave->slaveseldb != dictid) {
2563 robj *selectcmd;
2564
2565 switch(dictid) {
2566 case 0: selectcmd = shared.select0; break;
2567 case 1: selectcmd = shared.select1; break;
2568 case 2: selectcmd = shared.select2; break;
2569 case 3: selectcmd = shared.select3; break;
2570 case 4: selectcmd = shared.select4; break;
2571 case 5: selectcmd = shared.select5; break;
2572 case 6: selectcmd = shared.select6; break;
2573 case 7: selectcmd = shared.select7; break;
2574 case 8: selectcmd = shared.select8; break;
2575 case 9: selectcmd = shared.select9; break;
2576 default:
2577 selectcmd = createObject(REDIS_STRING,
2578 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2579 selectcmd->refcount = 0;
2580 break;
2581 }
2582 addReply(slave,selectcmd);
2583 slave->slaveseldb = dictid;
2584 }
2585 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2586 }
40d224a9 2587 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2588 if (outv != static_outv) zfree(outv);
ed9b544e 2589}
2590
dd142b9c 2591static sds sdscatrepr(sds s, char *p, size_t len) {
2592 s = sdscatlen(s,"\"",1);
2593 while(len--) {
2594 switch(*p) {
2595 case '\\':
2596 case '"':
2597 s = sdscatprintf(s,"\\%c",*p);
2598 break;
2599 case '\n': s = sdscatlen(s,"\\n",1); break;
2600 case '\r': s = sdscatlen(s,"\\r",1); break;
2601 case '\t': s = sdscatlen(s,"\\t",1); break;
2602 case '\a': s = sdscatlen(s,"\\a",1); break;
2603 case '\b': s = sdscatlen(s,"\\b",1); break;
2604 default:
2605 if (isprint(*p))
2606 s = sdscatprintf(s,"%c",*p);
2607 else
2608 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2609 break;
2610 }
2611 p++;
2612 }
2613 return sdscatlen(s,"\"",1);
2614}
2615
2616static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2617 listNode *ln;
2618 listIter li;
2619 int j;
2620 sds cmdrepr = sdsnew("+");
2621 robj *cmdobj;
2622 struct timeval tv;
2623
2624 gettimeofday(&tv,NULL);
2625 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2626 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2627
2628 for (j = 0; j < argc; j++) {
2629 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2630 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2631 } else {
2632 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2633 sdslen(argv[j]->ptr));
2634 }
2635 if (j != argc-1)
2636 cmdrepr = sdscatlen(cmdrepr," ",1);
2637 }
2638 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2639 cmdobj = createObject(REDIS_STRING,cmdrepr);
2640
2641 listRewind(monitors,&li);
2642 while((ln = listNext(&li))) {
2643 redisClient *monitor = ln->value;
2644 addReply(monitor,cmdobj);
2645 }
2646 decrRefCount(cmdobj);
2647}
2648
638e42ac 2649static void processInputBuffer(redisClient *c) {
ed9b544e 2650again:
4409877e 2651 /* Before to process the input buffer, make sure the client is not
2652 * waitig for a blocking operation such as BLPOP. Note that the first
2653 * iteration the client is never blocked, otherwise the processInputBuffer
2654 * would not be called at all, but after the execution of the first commands
2655 * in the input buffer the client may be blocked, and the "goto again"
2656 * will try to reiterate. The following line will make it return asap. */
92f8e882 2657 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2658 if (c->bulklen == -1) {
2659 /* Read the first line of the query */
2660 char *p = strchr(c->querybuf,'\n');
2661 size_t querylen;
644fafa3 2662
ed9b544e 2663 if (p) {
2664 sds query, *argv;
2665 int argc, j;
e0a62c7f 2666
ed9b544e 2667 query = c->querybuf;
2668 c->querybuf = sdsempty();
2669 querylen = 1+(p-(query));
2670 if (sdslen(query) > querylen) {
2671 /* leave data after the first line of the query in the buffer */
2672 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2673 }
2674 *p = '\0'; /* remove "\n" */
2675 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2676 sdsupdatelen(query);
2677
2678 /* Now we can split the query in arguments */
ed9b544e 2679 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2680 sdsfree(query);
2681
2682 if (c->argv) zfree(c->argv);
2683 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2684
2685 for (j = 0; j < argc; j++) {
ed9b544e 2686 if (sdslen(argv[j])) {
2687 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2688 c->argc++;
2689 } else {
2690 sdsfree(argv[j]);
2691 }
2692 }
2693 zfree(argv);
7c49733c 2694 if (c->argc) {
2695 /* Execute the command. If the client is still valid
2696 * after processCommand() return and there is something
2697 * on the query buffer try to process the next command. */
2698 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2699 } else {
2700 /* Nothing to process, argc == 0. Just process the query
2701 * buffer if it's not empty or return to the caller */
2702 if (sdslen(c->querybuf)) goto again;
2703 }
ed9b544e 2704 return;
644fafa3 2705 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2706 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2707 freeClient(c);
2708 return;
2709 }
2710 } else {
2711 /* Bulk read handling. Note that if we are at this point
2712 the client already sent a command terminated with a newline,
2713 we are reading the bulk data that is actually the last
2714 argument of the command. */
2715 int qbl = sdslen(c->querybuf);
2716
2717 if (c->bulklen <= qbl) {
2718 /* Copy everything but the final CRLF as final argument */
2719 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2720 c->argc++;
2721 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2722 /* Process the command. If the client is still valid after
2723 * the processing and there is more data in the buffer
2724 * try to parse it. */
2725 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2726 return;
2727 }
2728 }
2729}
2730
638e42ac 2731static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2732 redisClient *c = (redisClient*) privdata;
2733 char buf[REDIS_IOBUF_LEN];
2734 int nread;
2735 REDIS_NOTUSED(el);
2736 REDIS_NOTUSED(mask);
2737
2738 nread = read(fd, buf, REDIS_IOBUF_LEN);
2739 if (nread == -1) {
2740 if (errno == EAGAIN) {
2741 nread = 0;
2742 } else {
f870935d 2743 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2744 freeClient(c);
2745 return;
2746 }
2747 } else if (nread == 0) {
f870935d 2748 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2749 freeClient(c);
2750 return;
2751 }
2752 if (nread) {
2753 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2754 c->lastinteraction = time(NULL);
2755 } else {
2756 return;
2757 }
168ac5c6 2758 processInputBuffer(c);
638e42ac 2759}
2760
ed9b544e 2761static int selectDb(redisClient *c, int id) {
2762 if (id < 0 || id >= server.dbnum)
2763 return REDIS_ERR;
3305306f 2764 c->db = &server.db[id];
ed9b544e 2765 return REDIS_OK;
2766}
2767
40d224a9 2768static void *dupClientReplyValue(void *o) {
2769 incrRefCount((robj*)o);
12d090d2 2770 return o;
40d224a9 2771}
2772
/* Match method installed on the pubsub patterns list (see createClient()):
 * returns the result of equalStringObjects() on the two values. */
static int listMatchObjects(void *a, void *b) {
    int same = equalStringObjects(a,b);

    return same;
}
2776
ed9b544e 2777static redisClient *createClient(int fd) {
2778 redisClient *c = zmalloc(sizeof(*c));
2779
2780 anetNonBlock(NULL,fd);
2781 anetTcpNoDelay(NULL,fd);
2782 if (!c) return NULL;
2783 selectDb(c,0);
2784 c->fd = fd;
2785 c->querybuf = sdsempty();
2786 c->argc = 0;
93ea3759 2787 c->argv = NULL;
ed9b544e 2788 c->bulklen = -1;
e8a74421 2789 c->multibulk = 0;
2790 c->mbargc = 0;
2791 c->mbargv = NULL;
ed9b544e 2792 c->sentlen = 0;
2793 c->flags = 0;
2794 c->lastinteraction = time(NULL);
abcb223e 2795 c->authenticated = 0;
40d224a9 2796 c->replstate = REDIS_REPL_NONE;
6b47e12e 2797 c->reply = listCreate();
ed9b544e 2798 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2799 listSetDupMethod(c->reply,dupClientReplyValue);
37ab76c9 2800 c->blocking_keys = NULL;
2801 c->blocking_keys_num = 0;
92f8e882 2802 c->io_keys = listCreate();
87c68815 2803 c->watched_keys = listCreate();
92f8e882 2804 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2805 c->pubsub_channels = dictCreate(&setDictType,NULL);
2806 c->pubsub_patterns = listCreate();
2807 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2808 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2809 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2810 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2811 freeClient(c);
2812 return NULL;
2813 }
6b47e12e 2814 listAddNodeTail(server.clients,c);
6e469882 2815 initClientMultiState(c);
ed9b544e 2816 return c;
2817}
2818
2819static void addReply(redisClient *c, robj *obj) {
2820 if (listLength(c->reply) == 0 &&
6208b3a7 2821 (c->replstate == REDIS_REPL_NONE ||
2822 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2823 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2824 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2825
2826 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2827 obj = dupStringObject(obj);
2828 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2829 }
9d65a1bb 2830 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2831}
2832
2833static void addReplySds(redisClient *c, sds s) {
2834 robj *o = createObject(REDIS_STRING,s);
2835 addReply(c,o);
2836 decrRefCount(o);
2837}
2838
e2665397 2839static void addReplyDouble(redisClient *c, double d) {
2840 char buf[128];
2841
2842 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2843 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2844 (unsigned long) strlen(buf),buf));
e2665397 2845}
2846
aa7c2934
PN
2847static void addReplyLongLong(redisClient *c, long long ll) {
2848 char buf[128];
2849 size_t len;
2850
2851 if (ll == 0) {
2852 addReply(c,shared.czero);
2853 return;
2854 } else if (ll == 1) {
2855 addReply(c,shared.cone);
2856 return;
2857 }
482b672d 2858 buf[0] = ':';
2859 len = ll2string(buf+1,sizeof(buf)-1,ll);
2860 buf[len+1] = '\r';
2861 buf[len+2] = '\n';
2862 addReplySds(c,sdsnewlen(buf,len+3));
aa7c2934
PN
2863}
2864
92b27fe9 2865static void addReplyUlong(redisClient *c, unsigned long ul) {
2866 char buf[128];
2867 size_t len;
2868
dd88747b 2869 if (ul == 0) {
2870 addReply(c,shared.czero);
2871 return;
2872 } else if (ul == 1) {
2873 addReply(c,shared.cone);
2874 return;
2875 }
92b27fe9 2876 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2877 addReplySds(c,sdsnewlen(buf,len));
2878}
2879
942a3961 2880static void addReplyBulkLen(redisClient *c, robj *obj) {
482b672d 2881 size_t len, intlen;
2882 char buf[128];
942a3961 2883
2884 if (obj->encoding == REDIS_ENCODING_RAW) {
2885 len = sdslen(obj->ptr);
2886 } else {
2887 long n = (long)obj->ptr;
2888
e054afda 2889 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2890 len = 1;
2891 if (n < 0) {
2892 len++;
2893 n = -n;
2894 }
2895 while((n = n/10) != 0) {
2896 len++;
2897 }
2898 }
482b672d 2899 buf[0] = '$';
2900 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2901 buf[intlen+1] = '\r';
2902 buf[intlen+2] = '\n';
2903 addReplySds(c,sdsnewlen(buf,intlen+3));
942a3961 2904}
2905
dd88747b 2906static void addReplyBulk(redisClient *c, robj *obj) {
2907 addReplyBulkLen(c,obj);
2908 addReply(c,obj);
2909 addReply(c,shared.crlf);
2910}
2911
500ece7c 2912/* In the CONFIG command we need to add vanilla C string as bulk replies */
2913static void addReplyBulkCString(redisClient *c, char *s) {
2914 if (s == NULL) {
2915 addReply(c,shared.nullbulk);
2916 } else {
2917 robj *o = createStringObject(s,strlen(s));
2918 addReplyBulk(c,o);
2919 decrRefCount(o);
2920 }
2921}
2922
ed9b544e 2923static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2924 int cport, cfd;
2925 char cip[128];
285add55 2926 redisClient *c;
ed9b544e 2927 REDIS_NOTUSED(el);
2928 REDIS_NOTUSED(mask);
2929 REDIS_NOTUSED(privdata);
2930
2931 cfd = anetAccept(server.neterr, fd, cip, &cport);
2932 if (cfd == AE_ERR) {
f870935d 2933 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2934 return;
2935 }
f870935d 2936 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2937 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2938 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2939 close(cfd); /* May be already closed, just ingore errors */
2940 return;
2941 }
285add55 2942 /* If maxclient directive is set and this is one client more... close the
2943 * connection. Note that we create the client instead to check before
2944 * for this condition, since now the socket is already set in nonblocking
2945 * mode and we can send an error for free using the Kernel I/O */
2946 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2947 char *err = "-ERR max number of clients reached\r\n";
2948
2949 /* That's a best effort error message, don't check write errors */
fee803ba 2950 if (write(c->fd,err,strlen(err)) == -1) {
2951 /* Nothing to do, Just to avoid the warning... */
2952 }
285add55 2953 freeClient(c);
2954 return;
2955 }
ed9b544e 2956 server.stat_numconnections++;
2957}
2958
2959/* ======================= Redis objects implementation ===================== */
2960
2961static robj *createObject(int type, void *ptr) {
2962 robj *o;
2963
a5819310 2964 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2965 if (listLength(server.objfreelist)) {
2966 listNode *head = listFirst(server.objfreelist);
2967 o = listNodeValue(head);
2968 listDelNode(server.objfreelist,head);
a5819310 2969 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2970 } else {
560db612 2971 if (server.vm_enabled)
a5819310 2972 pthread_mutex_unlock(&server.obj_freelist_mutex);
560db612 2973 o = zmalloc(sizeof(*o));
ed9b544e 2974 }
ed9b544e 2975 o->type = type;
942a3961 2976 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2977 o->ptr = ptr;
2978 o->refcount = 1;
3a66edc7 2979 if (server.vm_enabled) {
1064ef87 2980 /* Note that this code may run in the context of an I/O thread
560db612 2981 * and accessing server.lruclock in theory is an error
1064ef87 2982 * (no locks). But in practice this is safe, and even if we read
560db612 2983 * garbage Redis will not fail. */
2984 o->lru = server.lruclock;
3a66edc7 2985 o->storage = REDIS_VM_MEMORY;
2986 }
ed9b544e 2987 return o;
2988}
2989
2990static robj *createStringObject(char *ptr, size_t len) {
2991 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2992}
2993
3f973463
PN
2994static robj *createStringObjectFromLongLong(long long value) {
2995 robj *o;
2996 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2997 incrRefCount(shared.integers[value]);
2998 o = shared.integers[value];
2999 } else {
3f973463 3000 if (value >= LONG_MIN && value <= LONG_MAX) {
10dea8dc 3001 o = createObject(REDIS_STRING, NULL);
3f973463
PN
3002 o->encoding = REDIS_ENCODING_INT;
3003 o->ptr = (void*)((long)value);
3004 } else {
ee14da56 3005 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3f973463
PN
3006 }
3007 }
3008 return o;
3009}
3010
4ef8de8a 3011static robj *dupStringObject(robj *o) {
b9bc0eef 3012 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 3013 return createStringObject(o->ptr,sdslen(o->ptr));
3014}
3015
ed9b544e 3016static robj *createListObject(void) {
3017 list *l = listCreate();
3018
ed9b544e 3019 listSetFreeMethod(l,decrRefCount);
3020 return createObject(REDIS_LIST,l);
3021}
3022
3023static robj *createSetObject(void) {
3024 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 3025 return createObject(REDIS_SET,d);
3026}
3027
5234952b 3028static robj *createHashObject(void) {
3029 /* All the Hashes start as zipmaps. Will be automatically converted
3030 * into hash tables if there are enough elements or big elements
3031 * inside. */
3032 unsigned char *zm = zipmapNew();
3033 robj *o = createObject(REDIS_HASH,zm);
3034 o->encoding = REDIS_ENCODING_ZIPMAP;
3035 return o;
3036}
3037
1812e024 3038static robj *createZsetObject(void) {
6b47e12e 3039 zset *zs = zmalloc(sizeof(*zs));
3040
3041 zs->dict = dictCreate(&zsetDictType,NULL);
3042 zs->zsl = zslCreate();
3043 return createObject(REDIS_ZSET,zs);
1812e024 3044}
3045
ed9b544e 3046static void freeStringObject(robj *o) {
942a3961 3047 if (o->encoding == REDIS_ENCODING_RAW) {
3048 sdsfree(o->ptr);
3049 }
ed9b544e 3050}
3051
3052static void freeListObject(robj *o) {
3053 listRelease((list*) o->ptr);
3054}
3055
3056static void freeSetObject(robj *o) {
3057 dictRelease((dict*) o->ptr);
3058}
3059
fd8ccf44 3060static void freeZsetObject(robj *o) {
3061 zset *zs = o->ptr;
3062
3063 dictRelease(zs->dict);
3064 zslFree(zs->zsl);
3065 zfree(zs);
3066}
3067
ed9b544e 3068static void freeHashObject(robj *o) {
cbba7dd7 3069 switch (o->encoding) {
3070 case REDIS_ENCODING_HT:
3071 dictRelease((dict*) o->ptr);
3072 break;
3073 case REDIS_ENCODING_ZIPMAP:
3074 zfree(o->ptr);
3075 break;
3076 default:
f83c6cb5 3077 redisPanic("Unknown hash encoding type");
cbba7dd7 3078 break;
3079 }
ed9b544e 3080}
3081
3082static void incrRefCount(robj *o) {
3083 o->refcount++;
3084}
3085
3086static void decrRefCount(void *obj) {
3087 robj *o = obj;
94754ccc 3088
560db612 3089 /* Object is a swapped out value, or in the process of being loaded. */
996cb5f7 3090 if (server.vm_enabled &&
3091 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3092 {
560db612 3093 vmpointer *vp = obj;
3094 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o);
3095 vmMarkPagesFree(vp->page,vp->usedpages);
7d98e08c 3096 server.vm_stats_swapped_objects--;
560db612 3097 zfree(vp);
a35ddf12 3098 return;
3099 }
560db612 3100
3101 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
e4ed181d 3102 /* Object is in memory, or in the process of being swapped out.
3103 *
3104 * If the object is being swapped out, abort the operation on
3105 * decrRefCount even if the refcount does not drop to 0: the object
3106 * is referenced at least two times, as value of the key AND as
3107 * job->val in the iojob. So if we don't invalidate the iojob, when it is
3108 * done but the relevant key was removed in the meantime, the
3109 * complete jobs handler will not find the key about the job and the
3110 * assert will fail. */
3111 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3112 vmCancelThreadedIOJob(o);
ed9b544e 3113 if (--(o->refcount) == 0) {
3114 switch(o->type) {
3115 case REDIS_STRING: freeStringObject(o); break;
3116 case REDIS_LIST: freeListObject(o); break;
3117 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 3118 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 3119 case REDIS_HASH: freeHashObject(o); break;
f83c6cb5 3120 default: redisPanic("Unknown object type"); break;
ed9b544e 3121 }
a5819310 3122 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 3123 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3124 !listAddNodeHead(server.objfreelist,o))
3125 zfree(o);
a5819310 3126 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3127 }
3128}
3129
942a3961 3130static robj *lookupKey(redisDb *db, robj *key) {
3131 dictEntry *de = dictFind(db->dict,key);
3a66edc7 3132 if (de) {
55cf8433 3133 robj *key = dictGetEntryKey(de);
3134 robj *val = dictGetEntryVal(de);
3a66edc7 3135
55cf8433 3136 if (server.vm_enabled) {
560db612 3137 if (val->storage == REDIS_VM_MEMORY ||
3138 val->storage == REDIS_VM_SWAPPING)
996cb5f7 3139 {
560db612 3140 /* If we were swapping the object out, cancel the operation */
3141 if (val->storage == REDIS_VM_SWAPPING)
3142 vmCancelThreadedIOJob(val);
55cf8433 3143 /* Update the access time of the key for the aging algorithm. */
560db612 3144 val->lru = server.lruclock;
55cf8433 3145 } else {
560db612 3146 int notify = (val->storage == REDIS_VM_LOADING);
d5d55fc3 3147
55cf8433 3148 /* Our value was swapped on disk. Bring it at home. */
560db612 3149 redisAssert(val->type == REDIS_VMPOINTER);
3150 val = vmLoadObject(val);
55cf8433 3151 dictGetEntryVal(de) = val;
d5d55fc3 3152
3153 /* Clients blocked by the VM subsystem may be waiting for
3154 * this key... */
3155 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 3156 }
3157 }
3158 return val;
3a66edc7 3159 } else {
3160 return NULL;
3161 }
942a3961 3162}
3163
3164static robj *lookupKeyRead(redisDb *db, robj *key) {
3165 expireIfNeeded(db,key);
3166 return lookupKey(db,key);
3167}
3168
3169static robj *lookupKeyWrite(redisDb *db, robj *key) {
3170 deleteIfVolatile(db,key);
37ab76c9 3171 touchWatchedKey(db,key);
942a3961 3172 return lookupKey(db,key);
3173}
3174
92b27fe9 3175static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3176 robj *o = lookupKeyRead(c->db, key);
3177 if (!o) addReply(c,reply);
3178 return o;
3179}
3180
3181static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3182 robj *o = lookupKeyWrite(c->db, key);
3183 if (!o) addReply(c,reply);
3184 return o;
3185}
3186
3187static int checkType(redisClient *c, robj *o, int type) {
3188 if (o->type != type) {
3189 addReply(c,shared.wrongtypeerr);
3190 return 1;
3191 }
3192 return 0;
3193}
3194
942a3961 3195static int deleteKey(redisDb *db, robj *key) {
3196 int retval;
3197
3198 /* We need to protect key from destruction: after the first dictDelete()
3199 * it may happen that 'key' is no longer valid if we don't increment
3200 * it's count. This may happen when we get the object reference directly
3201 * from the hash table with dictRandomKey() or dict iterators */
3202 incrRefCount(key);
3203 if (dictSize(db->expires)) dictDelete(db->expires,key);
3204 retval = dictDelete(db->dict,key);
3205 decrRefCount(key);
3206
3207 return retval == DICT_OK;
3208}
3209
724a51b1 3210/* Check if the nul-terminated string 's' can be represented by a long
3211 * (that is, is a number that fits into long without any other space or
3212 * character before or after the digits).
3213 *
3214 * If so, the function returns REDIS_OK and *longval is set to the value
3215 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 3216static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 3217 char buf[32], *endptr;
3218 long value;
3219 int slen;
e0a62c7f 3220
724a51b1 3221 value = strtol(s, &endptr, 10);
3222 if (endptr[0] != '\0') return REDIS_ERR;
ee14da56 3223 slen = ll2string(buf,32,value);
724a51b1 3224
3225 /* If the number converted back into a string is not identical
3226 * then it's not possible to encode the string as integer */
f69f2cba 3227 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 3228 if (longval) *longval = value;
3229 return REDIS_OK;
3230}
3231
942a3961 3232/* Try to encode a string object in order to save space */
05df7621 3233static robj *tryObjectEncoding(robj *o) {
942a3961 3234 long value;
942a3961 3235 sds s = o->ptr;
3305306f 3236
942a3961 3237 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 3238 return o; /* Already encoded */
3305306f 3239
05df7621 3240 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 3241 * everywhere in the "object space" of Redis. Encoded objects can only
3242 * appear as "values" (and not, for instance, as keys) */
05df7621 3243 if (o->refcount > 1) return o;
3305306f 3244
942a3961 3245 /* Currently we try to encode only strings */
dfc5e96c 3246 redisAssert(o->type == REDIS_STRING);
94754ccc 3247
724a51b1 3248 /* Check if we can represent this string as a long integer */
05df7621 3249 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 3250
3251 /* Ok, this object can be encoded */
05df7621 3252 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3253 decrRefCount(o);
3254 incrRefCount(shared.integers[value]);
3255 return shared.integers[value];
3256 } else {
3257 o->encoding = REDIS_ENCODING_INT;
3258 sdsfree(o->ptr);
3259 o->ptr = (void*) value;
3260 return o;
3261 }
942a3961 3262}
3263
9d65a1bb 3264/* Get a decoded version of an encoded object (returned as a new object).
3265 * If the object is already raw-encoded just increment the ref count. */
3266static robj *getDecodedObject(robj *o) {
942a3961 3267 robj *dec;
e0a62c7f 3268
9d65a1bb 3269 if (o->encoding == REDIS_ENCODING_RAW) {
3270 incrRefCount(o);
3271 return o;
3272 }
942a3961 3273 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3274 char buf[32];
3275
ee14da56 3276 ll2string(buf,32,(long)o->ptr);
942a3961 3277 dec = createStringObject(buf,strlen(buf));
3278 return dec;
3279 } else {
08ee9b57 3280 redisPanic("Unknown encoding type");
942a3961 3281 }
3305306f 3282}
3283
d7f43c08 3284/* Compare two string objects via strcmp() or alike.
3285 * Note that the objects may be integer-encoded. In such a case we
ee14da56 3286 * use ll2string() to get a string representation of the numbers on the stack
1fd9bc8a 3287 * and compare the strings, it's much faster than calling getDecodedObject().
3288 *
3289 * Important note: if objects are not integer encoded, but binary-safe strings,
3290 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3291 * binary safe. */
724a51b1 3292static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3293 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3294 char bufa[128], bufb[128], *astr, *bstr;
3295 int bothsds = 1;
724a51b1 3296
e197b441 3297 if (a == b) return 0;
d7f43c08 3298 if (a->encoding != REDIS_ENCODING_RAW) {
ee14da56 3299 ll2string(bufa,sizeof(bufa),(long) a->ptr);
d7f43c08 3300 astr = bufa;
3301 bothsds = 0;
724a51b1 3302 } else {
d7f43c08 3303 astr = a->ptr;
724a51b1 3304 }
d7f43c08 3305 if (b->encoding != REDIS_ENCODING_RAW) {
ee14da56 3306 ll2string(bufb,sizeof(bufb),(long) b->ptr);
d7f43c08 3307 bstr = bufb;
3308 bothsds = 0;
3309 } else {
3310 bstr = b->ptr;
3311 }
3312 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3313}
3314
bf028098 3315/* Equal string objects return 1 if the two objects are the same from the
3316 * point of view of a string comparison, otherwise 0 is returned. Note that
3317 * this function is faster then checking for (compareStringObject(a,b) == 0)
3318 * because it can perform some more optimization. */
3319static int equalStringObjects(robj *a, robj *b) {
3320 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3321 return a->ptr == b->ptr;
3322 } else {
3323 return compareStringObjects(a,b) == 0;
3324 }
3325}
3326
0ea663ea 3327static size_t stringObjectLen(robj *o) {
dfc5e96c 3328 redisAssert(o->type == REDIS_STRING);
0ea663ea 3329 if (o->encoding == REDIS_ENCODING_RAW) {
3330 return sdslen(o->ptr);
3331 } else {
3332 char buf[32];
3333
ee14da56 3334 return ll2string(buf,32,(long)o->ptr);
0ea663ea 3335 }
3336}
3337
bd79a6bd
PN
3338static int getDoubleFromObject(robj *o, double *target) {
3339 double value;
682c73e8 3340 char *eptr;
bbe025e0 3341
bd79a6bd
PN
3342 if (o == NULL) {
3343 value = 0;
3344 } else {
3345 redisAssert(o->type == REDIS_STRING);
3346 if (o->encoding == REDIS_ENCODING_RAW) {
3347 value = strtod(o->ptr, &eptr);
682c73e8 3348 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3349 } else if (o->encoding == REDIS_ENCODING_INT) {
3350 value = (long)o->ptr;
3351 } else {
946342c1 3352 redisPanic("Unknown string encoding");
bd79a6bd
PN
3353 }
3354 }
3355
bd79a6bd
PN
3356 *target = value;
3357 return REDIS_OK;
3358}
bbe025e0 3359
bd79a6bd
PN
3360static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3361 double value;
3362 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3363 if (msg != NULL) {
3364 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3365 } else {
3366 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3367 }
bbe025e0
AM
3368 return REDIS_ERR;
3369 }
3370
bd79a6bd 3371 *target = value;
bbe025e0
AM
3372 return REDIS_OK;
3373}
3374
bd79a6bd
PN
3375static int getLongLongFromObject(robj *o, long long *target) {
3376 long long value;
682c73e8 3377 char *eptr;
bbe025e0 3378
bd79a6bd
PN
3379 if (o == NULL) {
3380 value = 0;
3381 } else {
3382 redisAssert(o->type == REDIS_STRING);
3383 if (o->encoding == REDIS_ENCODING_RAW) {
3384 value = strtoll(o->ptr, &eptr, 10);
682c73e8 3385 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3386 } else if (o->encoding == REDIS_ENCODING_INT) {
3387 value = (long)o->ptr;
3388 } else {
946342c1 3389 redisPanic("Unknown string encoding");
bd79a6bd
PN
3390 }
3391 }
3392
bd79a6bd
PN
3393 *target = value;
3394 return REDIS_OK;
3395}
bbe025e0 3396
bd79a6bd
PN
3397static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3398 long long value;
3399 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3400 if (msg != NULL) {
3401 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3402 } else {
3403 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3404 }
bbe025e0
AM
3405 return REDIS_ERR;
3406 }
3407
bd79a6bd 3408 *target = value;
bbe025e0
AM
3409 return REDIS_OK;
3410}
3411
bd79a6bd
PN
3412static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3413 long long value;
bbe025e0 3414
bd79a6bd
PN
3415 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3416 if (value < LONG_MIN || value > LONG_MAX) {
3417 if (msg != NULL) {
3418 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3419 } else {
3420 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3421 }
bbe025e0
AM
3422 return REDIS_ERR;
3423 }
3424
bd79a6bd 3425 *target = value;
bbe025e0
AM
3426 return REDIS_OK;
3427}
3428
06233c45 3429/*============================ RDB saving/loading =========================== */
ed9b544e 3430
/* Write the single byte 'type' to 'fp'.
 * Returns 0 on success, -1 if the byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3435
/* Write time 't' to 'fp' as a 4 byte signed integer (the value is converted
 * to int32_t and its in-memory representation is written as is).
 * Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t secs = (int32_t) t;

    return (fwrite(&secs,4,1,fp) == 0) ? -1 : 0;
}
3441
e3566d4b 3442/* check rdbLoadLen() comments for more info */
f78fd11b 3443static int rdbSaveLen(FILE *fp, uint32_t len) {
3444 unsigned char buf[2];
3445
3446 if (len < (1<<6)) {
3447 /* Save a 6 bit len */
10c43610 3448 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3449 if (fwrite(buf,1,1,fp) == 0) return -1;
3450 } else if (len < (1<<14)) {
3451 /* Save a 14 bit len */
10c43610 3452 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3453 buf[1] = len&0xFF;
17be1a4a 3454 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3455 } else {
3456 /* Save a 32 bit len */
10c43610 3457 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3458 if (fwrite(buf,1,1,fp) == 0) return -1;
3459 len = htonl(len);
3460 if (fwrite(&len,4,1,fp) == 0) return -1;
3461 }
3462 return 0;
3463}
3464
32a66513 3465/* Encode 'value' as an integer if possible (if integer will fit the
3466 * supported range). If the function sucessful encoded the integer
3467 * then the (up to 5 bytes) encoded representation is written in the
3468 * string pointed by 'enc' and the length is returned. Otherwise
3469 * 0 is returned. */
3470static int rdbEncodeInteger(long long value, unsigned char *enc) {
e3566d4b 3471 /* Finally check if it fits in our ranges */
3472 if (value >= -(1<<7) && value <= (1<<7)-1) {
3473 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3474 enc[1] = value&0xFF;
3475 return 2;
3476 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3477 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3478 enc[1] = value&0xFF;
3479 enc[2] = (value>>8)&0xFF;
3480 return 3;
3481 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3482 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3483 enc[1] = value&0xFF;
3484 enc[2] = (value>>8)&0xFF;
3485 enc[3] = (value>>16)&0xFF;
3486 enc[4] = (value>>24)&0xFF;
3487 return 5;
3488 } else {
3489 return 0;
3490 }
3491}
3492
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes and is NOT required to be nul terminated.
 * If the string can be encoded, the encoded representation is written
 * into 'enc' and its length is returned. Otherwise 0 is returned. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, buf[32], tmp[32];

    /* The caller hands us a (pointer, length) pair that may come from a
     * buffer without a terminating nul byte, so running strtoll() directly
     * on 's' could read past the end of the data. Parse a bounded, nul
     * terminated copy instead. An empty string, or one too long to be the
     * representation of a long long, can never be encoded. */
    if (len == 0 || len >= sizeof(tmp)) return 0;
    memcpy(tmp,s,len);
    tmp[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(tmp, &endptr, 10);
    if (endptr != tmp+len) return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,tmp,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3511
b1befe6a 3512static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3513 size_t comprlen, outlen;
774e3047 3514 unsigned char byte;
3515 void *out;
3516
3517 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3518 if (len <= 4) return 0;
3519 outlen = len-4;
3a2694c4 3520 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3521 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3522 if (comprlen == 0) {
88e85998 3523 zfree(out);
774e3047 3524 return 0;
3525 }
3526 /* Data compressed! Let's save it on disk */
3527 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3528 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3529 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3530 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3531 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3532 zfree(out);
774e3047 3533 return comprlen;
3534
3535writeerr:
88e85998 3536 zfree(out);
774e3047 3537 return -1;
3538}
3539
e3566d4b 3540/* Save a string objet as [len][data] on disk. If the object is a string
3541 * representation of an integer value we try to safe it in a special form */
b1befe6a 3542static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3543 int enclen;
10c43610 3544
774e3047 3545 /* Try integer encoding */
e3566d4b 3546 if (len <= 11) {
3547 unsigned char buf[5];
b1befe6a 3548 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3549 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3550 return 0;
3551 }
3552 }
774e3047 3553
3554 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3555 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3556 if (server.rdbcompression && len > 20) {
774e3047 3557 int retval;
3558
b1befe6a 3559 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3560 if (retval == -1) return -1;
3561 if (retval > 0) return 0;
3562 /* retval == 0 means data can't be compressed, save the old way */
3563 }
3564
3565 /* Store verbatim */
10c43610 3566 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3567 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3568 return 0;
3569}
3570
942a3961 3571/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3572static int rdbSaveStringObject(FILE *fp, robj *obj) {
3573 int retval;
942a3961 3574
32a66513 3575 /* Avoid to decode the object, then encode it again, if the
3576 * object is alrady integer encoded. */
3577 if (obj->encoding == REDIS_ENCODING_INT) {
3578 long val = (long) obj->ptr;
3579 unsigned char buf[5];
3580 int enclen;
3581
3582 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3583 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3584 return 0;
3585 }
3586 /* otherwise... fall throught and continue with the usual
3587 * code path. */
3588 }
3589
f2d9f50f 3590 /* Avoid incr/decr ref count business when possible.
3591 * This plays well with copy-on-write given that we are probably
3592 * in a child process (BGSAVE). Also this makes sure key objects
3593 * of swapped objects are not incRefCount-ed (an assert does not allow
3594 * this in order to avoid bugs) */
3595 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3596 obj = getDecodedObject(obj);
b1befe6a 3597 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3598 decrRefCount(obj);
3599 } else {
b1befe6a 3600 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3601 }
9d65a1bb 3602 return retval;
942a3961 3603}
3604
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -(2^52)+1 */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The string starts at buf+1, so only sizeof(buf)-1 bytes
             * are available to ll2string(). */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3647
06233c45 3648/* Save a Redis object. */
3649static int rdbSaveObject(FILE *fp, robj *o) {
3650 if (o->type == REDIS_STRING) {
3651 /* Save a string value */
3652 if (rdbSaveStringObject(fp,o) == -1) return -1;
3653 } else if (o->type == REDIS_LIST) {
3654 /* Save a list value */
3655 list *list = o->ptr;
c7df85a4 3656 listIter li;
06233c45 3657 listNode *ln;
3658
06233c45 3659 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3660 listRewind(list,&li);
3661 while((ln = listNext(&li))) {
06233c45 3662 robj *eleobj = listNodeValue(ln);
3663
3664 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3665 }
3666 } else if (o->type == REDIS_SET) {
3667 /* Save a set value */
3668 dict *set = o->ptr;
3669 dictIterator *di = dictGetIterator(set);
3670 dictEntry *de;
3671
3672 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3673 while((de = dictNext(di)) != NULL) {
3674 robj *eleobj = dictGetEntryKey(de);
3675
3676 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3677 }
3678 dictReleaseIterator(di);
3679 } else if (o->type == REDIS_ZSET) {
3680 /* Save a set value */
3681 zset *zs = o->ptr;
3682 dictIterator *di = dictGetIterator(zs->dict);
3683 dictEntry *de;
3684
3685 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3686 while((de = dictNext(di)) != NULL) {
3687 robj *eleobj = dictGetEntryKey(de);
3688 double *score = dictGetEntryVal(de);
3689
3690 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3691 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3692 }
3693 dictReleaseIterator(di);
b1befe6a 3694 } else if (o->type == REDIS_HASH) {
3695 /* Save a hash value */
3696 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3697 unsigned char *p = zipmapRewind(o->ptr);
3698 unsigned int count = zipmapLen(o->ptr);
3699 unsigned char *key, *val;
3700 unsigned int klen, vlen;
3701
3702 if (rdbSaveLen(fp,count) == -1) return -1;
3703 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3704 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3705 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3706 }
3707 } else {
3708 dictIterator *di = dictGetIterator(o->ptr);
3709 dictEntry *de;
3710
3711 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3712 while((de = dictNext(di)) != NULL) {
3713 robj *key = dictGetEntryKey(de);
3714 robj *val = dictGetEntryVal(de);
3715
3716 if (rdbSaveStringObject(fp,key) == -1) return -1;
3717 if (rdbSaveStringObject(fp,val) == -1) return -1;
3718 }
3719 dictReleaseIterator(di);
3720 }
06233c45 3721 } else {
f83c6cb5 3722 redisPanic("Unknown object type");
06233c45 3723 }
3724 return 0;
3725}
3726
3727/* Return the length the object will have on disk if saved with
3728 * the rdbSaveObject() function. Currently we use a trick to get
3729 * this length with very little changes to the code. In the future
3730 * we could switch to a faster solution. */
b9bc0eef 3731static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3732 if (fp == NULL) fp = server.devnull;
06233c45 3733 rewind(fp);
3734 assert(rdbSaveObject(fp,o) != 1);
3735 return ftello(fp);
3736}
3737
06224fec 3738/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3739static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3740 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3741
06224fec 3742 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3743}
3744
ed9b544e 3745/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3746static int rdbSave(char *filename) {
ed9b544e 3747 dictIterator *di = NULL;
3748 dictEntry *de;
ed9b544e 3749 FILE *fp;
3750 char tmpfile[256];
3751 int j;
bb32ede5 3752 time_t now = time(NULL);
ed9b544e 3753
2316bb3b 3754 /* Wait for I/O therads to terminate, just in case this is a
3755 * foreground-saving, to avoid seeking the swap file descriptor at the
3756 * same time. */
3757 if (server.vm_enabled)
3758 waitEmptyIOJobsQueue();
3759
a3b21203 3760 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3761 fp = fopen(tmpfile,"w");
3762 if (!fp) {
3763 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3764 return REDIS_ERR;
3765 }
f78fd11b 3766 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3767 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3768 redisDb *db = server.db+j;
3769 dict *d = db->dict;
3305306f 3770 if (dictSize(d) == 0) continue;
ed9b544e 3771 di = dictGetIterator(d);
3772 if (!di) {
3773 fclose(fp);
3774 return REDIS_ERR;
3775 }
3776
3777 /* Write the SELECT DB opcode */
f78fd11b 3778 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3779 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3780
3781 /* Iterate this DB writing every entry */
3782 while((de = dictNext(di)) != NULL) {
3783 robj *key = dictGetEntryKey(de);
3784 robj *o = dictGetEntryVal(de);
bb32ede5 3785 time_t expiretime = getExpire(db,key);
3786
3787 /* Save the expire time */
3788 if (expiretime != -1) {
3789 /* If this key is already expired skip it */
3790 if (expiretime < now) continue;
3791 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3792 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3793 }
7e69548d 3794 /* Save the key and associated value. This requires special
3795 * handling if the value is swapped out. */
560db612 3796 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
3797 o->storage == REDIS_VM_SWAPPING) {
7e69548d 3798 /* Save type, key, value */
3799 if (rdbSaveType(fp,o->type) == -1) goto werr;
3800 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3801 if (rdbSaveObject(fp,o) == -1) goto werr;
3802 } else {
996cb5f7 3803 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3804 robj *po;
7e69548d 3805 /* Get a preview of the object in memory */
560db612 3806 po = vmPreviewObject(o);
7e69548d 3807 /* Save type, key, value */
560db612 3808 if (rdbSaveType(fp,po->type) == -1) goto werr;
b9bc0eef 3809 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3810 if (rdbSaveObject(fp,po) == -1) goto werr;
3811 /* Remove the loaded object from memory */
3812 decrRefCount(po);
7e69548d 3813 }
ed9b544e 3814 }
3815 dictReleaseIterator(di);
3816 }
3817 /* EOF opcode */
f78fd11b 3818 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3819
3820 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3821 fflush(fp);
3822 fsync(fileno(fp));
3823 fclose(fp);
e0a62c7f 3824
ed9b544e 3825 /* Use RENAME to make sure the DB file is changed atomically only
3826 * if the generate DB file is ok. */
3827 if (rename(tmpfile,filename) == -1) {
325d1eb4 3828 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3829 unlink(tmpfile);
3830 return REDIS_ERR;
3831 }
3832 redisLog(REDIS_NOTICE,"DB saved on disk");
3833 server.dirty = 0;
3834 server.lastsave = time(NULL);
3835 return REDIS_OK;
3836
3837werr:
3838 fclose(fp);
3839 unlink(tmpfile);
3840 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3841 if (di) dictReleaseIterator(di);
3842 return REDIS_ERR;
3843}
3844
f78fd11b 3845static int rdbSaveBackground(char *filename) {
ed9b544e 3846 pid_t childpid;
3847
9d65a1bb 3848 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3849 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3850 if ((childpid = fork()) == 0) {
3851 /* Child */
054e426d 3852 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3853 close(server.fd);
f78fd11b 3854 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3855 _exit(0);
ed9b544e 3856 } else {
478c2c6f 3857 _exit(1);
ed9b544e 3858 }
3859 } else {
3860 /* Parent */
5a7c647e 3861 if (childpid == -1) {
3862 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3863 strerror(errno));
3864 return REDIS_ERR;
3865 }
ed9b544e 3866 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3867 server.bgsavechildpid = childpid;
884d4b39 3868 updateDictResizePolicy();
ed9b544e 3869 return REDIS_OK;
3870 }
3871 return REDIS_OK; /* unreached */
3872}
3873
/* Remove the temporary RDB file named after pid 'childpid' (the same
 * "temp-<pid>.rdb" name used by rdbSave()). The result of unlink() is
 * ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3880
/* Read a single byte from 'fp' and return it as a value in the 0-255
 * range, or -1 if the byte could not be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3886
/* Read a 4 byte signed integer from 'fp' (as written by rdbSaveTime()) and
 * return it as a time_t, or -1 if it could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t secs;

    return (fread(&secs,4,1,fp) != 0) ? (time_t) secs : (time_t) -1;
}
3892
e3566d4b 3893/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3894 * of this file for a description of how this are stored on disk.
3895 *
3896 * isencoded is set to 1 if the readed length is not actually a length but
3897 * an "encoding type", check the above comments for more info */
c78a8ccc 3898static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3899 unsigned char buf[2];
3900 uint32_t len;
c78a8ccc 3901 int type;
f78fd11b 3902
e3566d4b 3903 if (isencoded) *isencoded = 0;
c78a8ccc 3904 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3905 type = (buf[0]&0xC0)>>6;
3906 if (type == REDIS_RDB_6BITLEN) {
3907 /* Read a 6 bit len */
3908 return buf[0]&0x3F;
3909 } else if (type == REDIS_RDB_ENCVAL) {
3910 /* Read a 6 bit len encoding type */
3911 if (isencoded) *isencoded = 1;
3912 return buf[0]&0x3F;
3913 } else if (type == REDIS_RDB_14BITLEN) {
3914 /* Read a 14 bit len */
3915 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3916 return ((buf[0]&0x3F)<<8)|buf[1];
3917 } else {
3918 /* Read a 32 bit len */
f78fd11b 3919 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3920 return ntohl(len);
f78fd11b 3921 }
f78fd11b 3922}
3923
ad30aa60 3924/* Load an integer-encoded object from file 'fp', with the specified
3925 * encoding type 'enctype'. If encode is true the function may return
3926 * an integer-encoded object as reply, otherwise the returned object
3927 * will always be encoded as a raw string. */
3928static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
e3566d4b 3929 unsigned char enc[4];
3930 long long val;
3931
3932 if (enctype == REDIS_RDB_ENC_INT8) {
3933 if (fread(enc,1,1,fp) == 0) return NULL;
3934 val = (signed char)enc[0];
3935 } else if (enctype == REDIS_RDB_ENC_INT16) {
3936 uint16_t v;
3937 if (fread(enc,2,1,fp) == 0) return NULL;
3938 v = enc[0]|(enc[1]<<8);
3939 val = (int16_t)v;
3940 } else if (enctype == REDIS_RDB_ENC_INT32) {
3941 uint32_t v;
3942 if (fread(enc,4,1,fp) == 0) return NULL;
3943 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3944 val = (int32_t)v;
3945 } else {
3946 val = 0; /* anti-warning */
f83c6cb5 3947 redisPanic("Unknown RDB integer encoding type");
e3566d4b 3948 }
ad30aa60 3949 if (encode)
3950 return createStringObjectFromLongLong(val);
3951 else
3952 return createObject(REDIS_STRING,sdsfromlonglong(val));
e3566d4b 3953}
3954
c78a8ccc 3955static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3956 unsigned int len, clen;
3957 unsigned char *c = NULL;
3958 sds val = NULL;
3959
c78a8ccc 3960 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3961 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3962 if ((c = zmalloc(clen)) == NULL) goto err;
3963 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3964 if (fread(c,clen,1,fp) == 0) goto err;
3965 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3966 zfree(c);
88e85998 3967 return createObject(REDIS_STRING,val);
3968err:
3969 zfree(c);
3970 sdsfree(val);
3971 return NULL;
3972}
3973
ad30aa60 3974static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
e3566d4b 3975 int isencoded;
3976 uint32_t len;
f78fd11b 3977 sds val;
3978
c78a8ccc 3979 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3980 if (isencoded) {
3981 switch(len) {
3982 case REDIS_RDB_ENC_INT8:
3983 case REDIS_RDB_ENC_INT16:
3984 case REDIS_RDB_ENC_INT32:
ad30aa60 3985 return rdbLoadIntegerObject(fp,len,encode);
88e85998 3986 case REDIS_RDB_ENC_LZF:
bdcb92f2 3987 return rdbLoadLzfStringObject(fp);
e3566d4b 3988 default:
f83c6cb5 3989 redisPanic("Unknown RDB encoding type");
e3566d4b 3990 }
3991 }
3992
f78fd11b 3993 if (len == REDIS_RDB_LENERR) return NULL;
3994 val = sdsnewlen(NULL,len);
3995 if (len && fread(val,len,1,fp) == 0) {
3996 sdsfree(val);
3997 return NULL;
3998 }
bdcb92f2 3999 return createObject(REDIS_STRING,val);
f78fd11b 4000}
4001
ad30aa60 4002static robj *rdbLoadStringObject(FILE *fp) {
4003 return rdbGenericLoadStringObject(fp,0);
4004}
4005
4006static robj *rdbLoadEncodedStringObject(FILE *fp) {
4007 return rdbGenericLoadStringObject(fp,1);
4008}
4009
a7866db6 4010/* For information about double serialization check rdbSaveDoubleValue() */
4011static int rdbLoadDoubleValue(FILE *fp, double *val) {
4012 char buf[128];
4013 unsigned char len;
4014
4015 if (fread(&len,1,1,fp) == 0) return -1;
4016 switch(len) {
4017 case 255: *val = R_NegInf; return 0;
4018 case 254: *val = R_PosInf; return 0;
4019 case 253: *val = R_Nan; return 0;
4020 default:
4021 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 4022 buf[len] = '\0';
a7866db6 4023 sscanf(buf, "%lg", val);
4024 return 0;
4025 }
4026}
4027
c78a8ccc 4028/* Load a Redis object of the specified type from the specified file.
4029 * On success a newly allocated object is returned, otherwise NULL. */
4030static robj *rdbLoadObject(int type, FILE *fp) {
4031 robj *o;
4032
bcd11906 4033 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 4034 if (type == REDIS_STRING) {
4035 /* Read string value */
ad30aa60 4036 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4037 o = tryObjectEncoding(o);
c78a8ccc 4038 } else if (type == REDIS_LIST || type == REDIS_SET) {
4039 /* Read list/set value */
4040 uint32_t listlen;
4041
4042 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4043 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 4044 /* It's faster to expand the dict to the right size asap in order
4045 * to avoid rehashing */
4046 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
4047 dictExpand(o->ptr,listlen);
c78a8ccc 4048 /* Load every single element of the list/set */
4049 while(listlen--) {
4050 robj *ele;
4051
ad30aa60 4052 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4053 ele = tryObjectEncoding(ele);
c78a8ccc 4054 if (type == REDIS_LIST) {
4055 listAddNodeTail((list*)o->ptr,ele);
4056 } else {
4057 dictAdd((dict*)o->ptr,ele,NULL);
4058 }
4059 }
4060 } else if (type == REDIS_ZSET) {
4061 /* Read list/set value */
ada386b2 4062 size_t zsetlen;
c78a8ccc 4063 zset *zs;
4064
4065 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4066 o = createZsetObject();
4067 zs = o->ptr;
4068 /* Load every single element of the list/set */
4069 while(zsetlen--) {
4070 robj *ele;
4071 double *score = zmalloc(sizeof(double));
4072
ad30aa60 4073 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4074 ele = tryObjectEncoding(ele);
c78a8ccc 4075 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4076 dictAdd(zs->dict,ele,score);
4077 zslInsert(zs->zsl,*score,ele);
4078 incrRefCount(ele); /* added to skiplist */
4079 }
ada386b2 4080 } else if (type == REDIS_HASH) {
4081 size_t hashlen;
4082
4083 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4084 o = createHashObject();
4085 /* Too many entries? Use an hash table. */
4086 if (hashlen > server.hash_max_zipmap_entries)
4087 convertToRealHash(o);
4088 /* Load every key/value, then set it into the zipmap or hash
4089 * table, as needed. */
4090 while(hashlen--) {
4091 robj *key, *val;
4092
4093 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4094 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4095 /* If we are using a zipmap and there are too big values
4096 * the object is converted to real hash table encoding. */
4097 if (o->encoding != REDIS_ENCODING_HT &&
4098 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4099 sdslen(val->ptr) > server.hash_max_zipmap_value))
4100 {
4101 convertToRealHash(o);
4102 }
4103
4104 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4105 unsigned char *zm = o->ptr;
4106
4107 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4108 val->ptr,sdslen(val->ptr),NULL);
4109 o->ptr = zm;
4110 decrRefCount(key);
4111 decrRefCount(val);
4112 } else {
05df7621 4113 key = tryObjectEncoding(key);
4114 val = tryObjectEncoding(val);
ada386b2 4115 dictAdd((dict*)o->ptr,key,val);
ada386b2 4116 }
4117 }
c78a8ccc 4118 } else {
f83c6cb5 4119 redisPanic("Unknown object type");
c78a8ccc 4120 }
4121 return o;
4122}
4123
f78fd11b 4124static int rdbLoad(char *filename) {
ed9b544e 4125 FILE *fp;
f78fd11b 4126 uint32_t dbid;
bb32ede5 4127 int type, retval, rdbver;
585af7e2 4128 int swap_all_values = 0;
3305306f 4129 dict *d = server.db[0].dict;
bb32ede5 4130 redisDb *db = server.db+0;
f78fd11b 4131 char buf[1024];
242a64f3 4132 time_t expiretime, now = time(NULL);
b492cf00 4133 long long loadedkeys = 0;
bb32ede5 4134
ed9b544e 4135 fp = fopen(filename,"r");
4136 if (!fp) return REDIS_ERR;
4137 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 4138 buf[9] = '\0';
4139 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 4140 fclose(fp);
4141 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4142 return REDIS_ERR;
4143 }
f78fd11b 4144 rdbver = atoi(buf+5);
c78a8ccc 4145 if (rdbver != 1) {
f78fd11b 4146 fclose(fp);
4147 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4148 return REDIS_ERR;
4149 }
ed9b544e 4150 while(1) {
585af7e2 4151 robj *key, *val;
ed9b544e 4152
585af7e2 4153 expiretime = -1;
ed9b544e 4154 /* Read type. */
f78fd11b 4155 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 4156 if (type == REDIS_EXPIRETIME) {
4157 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4158 /* We read the time so we need to read the object type again */
4159 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4160 }
ed9b544e 4161 if (type == REDIS_EOF) break;
4162 /* Handle SELECT DB opcode as a special case */
4163 if (type == REDIS_SELECTDB) {
c78a8ccc 4164 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 4165 goto eoferr;
ed9b544e 4166 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 4167 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 4168 exit(1);
4169 }
bb32ede5 4170 db = server.db+dbid;
4171 d = db->dict;
ed9b544e 4172 continue;
4173 }
4174 /* Read key */
585af7e2 4175 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
c78a8ccc 4176 /* Read value */
585af7e2 4177 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
89e689c5 4178 /* Check if the key already expired */
4179 if (expiretime != -1 && expiretime < now) {
4180 decrRefCount(key);
4181 decrRefCount(val);
4182 continue;
4183 }
ed9b544e 4184 /* Add the new object in the hash table */
585af7e2 4185 retval = dictAdd(d,key,val);
ed9b544e 4186 if (retval == DICT_ERR) {
585af7e2 4187 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
ed9b544e 4188 exit(1);
4189 }
242a64f3 4190 loadedkeys++;
bb32ede5 4191 /* Set the expire time if needed */
89e689c5 4192 if (expiretime != -1) setExpire(db,key,expiretime);
242a64f3 4193
b492cf00 4194 /* Handle swapping while loading big datasets when VM is on */
242a64f3 4195
4196 /* If we detecter we are hopeless about fitting something in memory
4197 * we just swap every new key on disk. Directly...
4198 * Note that's important to check for this condition before resorting
4199 * to random sampling, otherwise we may try to swap already
4200 * swapped keys. */
585af7e2 4201 if (swap_all_values) {
4202 dictEntry *de = dictFind(d,key);
242a64f3 4203
4204 /* de may be NULL since the key already expired */
4205 if (de) {
560db612 4206 vmpointer *vp;
585af7e2 4207 key = dictGetEntryKey(de);
4208 val = dictGetEntryVal(de);
242a64f3 4209
560db612 4210 if (val->refcount == 1 &&
4211 (vp = vmSwapObjectBlocking(val)) != NULL)
4212 dictGetEntryVal(de) = vp;
242a64f3 4213 }
4214 continue;
4215 }
4216
4217 /* If we have still some hope of having some value fitting memory
4218 * then we try random sampling. */
585af7e2 4219 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
b492cf00 4220 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 4221 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 4222 }
242a64f3 4223 if (zmalloc_used_memory() > server.vm_max_memory)
585af7e2 4224 swap_all_values = 1; /* We are already using too much mem */
b492cf00 4225 }
ed9b544e 4226 }
4227 fclose(fp);
4228 return REDIS_OK;
4229
4230eoferr: /* unexpected end of file is handled here with a fatal exit */
f80dff62 4231 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 4232 exit(1);
4233 return REDIS_ERR; /* Just to avoid warning */
4234}
4235
b58ba105 4236/*================================== Shutdown =============================== */
fab43727 4237static int prepareForShutdown() {
b58ba105
AM
4238 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4239 /* Kill the saving child if there is a background saving in progress.
4240 We want to avoid race conditions, for instance our saving child may
4241 overwrite the synchronous saving did by SHUTDOWN. */
4242 if (server.bgsavechildpid != -1) {
4243 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4244 kill(server.bgsavechildpid,SIGKILL);
4245 rdbRemoveTempFile(server.bgsavechildpid);
4246 }
4247 if (server.appendonly) {
4248 /* Append only file: fsync() the AOF and exit */
b0bd87f6 4249 aof_fsync(server.appendfd);
b58ba105 4250 if (server.vm_enabled) unlink(server.vm_swap_file);
b58ba105
AM
4251 } else {
4252 /* Snapshotting. Perform a SYNC SAVE and exit */
4253 if (rdbSave(server.dbfilename) == REDIS_OK) {
4254 if (server.daemonize)
4255 unlink(server.pidfile);
4256 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
b58ba105
AM
4257 } else {
4258 /* Ooops.. error saving! The best we can do is to continue
4259 * operating. Note that if there was a background saving process,
4260 * in the next cron() Redis will be notified that the background
4261 * saving aborted, handling special stuff like slaves pending for
4262 * synchronization... */
4263 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
fab43727 4264 return REDIS_ERR;
b58ba105
AM
4265 }
4266 }
8513a757 4267 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
fab43727 4268 return REDIS_OK;
b58ba105
AM
4269}
4270
ed9b544e 4271/*================================== Commands =============================== */
4272
abcb223e 4273static void authCommand(redisClient *c) {
2e77c2ee 4274 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
4275 c->authenticated = 1;
4276 addReply(c,shared.ok);
4277 } else {
4278 c->authenticated = 0;
fa4c0aba 4279 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
4280 }
4281}
4282
ed9b544e 4283static void pingCommand(redisClient *c) {
4284 addReply(c,shared.pong);
4285}
4286
4287static void echoCommand(redisClient *c) {
dd88747b 4288 addReplyBulk(c,c->argv[1]);
ed9b544e 4289}
4290
4291/*=================================== Strings =============================== */
4292
526d00a5 4293static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
ed9b544e 4294 int retval;
10ce1276 4295 long seconds = 0; /* initialized to avoid an harmness warning */
ed9b544e 4296
526d00a5 4297 if (expire) {
4298 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4299 return;
4300 if (seconds <= 0) {
4301 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4302 return;
4303 }
4304 }
4305
37ab76c9 4306 touchWatchedKey(c->db,key);
526d00a5 4307 if (nx) deleteIfVolatile(c->db,key);
4308 retval = dictAdd(c->db->dict,key,val);
ed9b544e 4309 if (retval == DICT_ERR) {
4310 if (!nx) {
1b03836c 4311 /* If the key is about a swapped value, we want a new key object
4312 * to overwrite the old. So we delete the old key in the database.
4313 * This will also make sure that swap pages about the old object
4314 * will be marked as free. */
526d00a5 4315 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4316 incrRefCount(key);
4317 dictReplace(c->db->dict,key,val);
4318 incrRefCount(val);
ed9b544e 4319 } else {
c937aa89 4320 addReply(c,shared.czero);
ed9b544e 4321 return;
4322 }
4323 } else {
526d00a5 4324 incrRefCount(key);
4325 incrRefCount(val);
ed9b544e 4326 }
4327 server.dirty++;
526d00a5 4328 removeExpire(c->db,key);
4329 if (expire) setExpire(c->db,key,time(NULL)+seconds);
c937aa89 4330 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 4331}
4332
4333static void setCommand(redisClient *c) {
526d00a5 4334 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
ed9b544e 4335}
4336
4337static void setnxCommand(redisClient *c) {
526d00a5 4338 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4339}
4340
4341static void setexCommand(redisClient *c) {
4342 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
ed9b544e 4343}
4344
322fc7d8 4345static int getGenericCommand(redisClient *c) {
dd88747b 4346 robj *o;
e0a62c7f 4347
dd88747b 4348 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 4349 return REDIS_OK;
dd88747b 4350
4351 if (o->type != REDIS_STRING) {
4352 addReply(c,shared.wrongtypeerr);
4353 return REDIS_ERR;
ed9b544e 4354 } else {
dd88747b 4355 addReplyBulk(c,o);
4356 return REDIS_OK;
ed9b544e 4357 }
4358}
4359
322fc7d8 4360static void getCommand(redisClient *c) {
4361 getGenericCommand(c);
4362}
4363
f6b141c5 4364static void getsetCommand(redisClient *c) {
322fc7d8 4365 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 4366 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4367 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4368 } else {
4369 incrRefCount(c->argv[1]);
4370 }
4371 incrRefCount(c->argv[2]);
4372 server.dirty++;
4373 removeExpire(c->db,c->argv[1]);
4374}
4375
70003d28 4376static void mgetCommand(redisClient *c) {
70003d28 4377 int j;
e0a62c7f 4378
c937aa89 4379 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 4380 for (j = 1; j < c->argc; j++) {
3305306f 4381 robj *o = lookupKeyRead(c->db,c->argv[j]);
4382 if (o == NULL) {
c937aa89 4383 addReply(c,shared.nullbulk);
70003d28 4384 } else {
70003d28 4385 if (o->type != REDIS_STRING) {
c937aa89 4386 addReply(c,shared.nullbulk);
70003d28 4387 } else {
dd88747b 4388 addReplyBulk(c,o);
70003d28 4389 }
4390 }
4391 }
4392}
4393
6c446631 4394static void msetGenericCommand(redisClient *c, int nx) {
906573e7 4395 int j, busykeys = 0;
6c446631 4396
4397 if ((c->argc % 2) == 0) {
454d4e43 4398 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 4399 return;
4400 }
4401 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4402 * set nothing at all if at least one already key exists. */
4403 if (nx) {
4404 for (j = 1; j < c->argc; j += 2) {
906573e7 4405 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4406 busykeys++;
6c446631 4407 }
4408 }
4409 }
906573e7 4410 if (busykeys) {
4411 addReply(c, shared.czero);
4412 return;
4413 }
6c446631 4414
4415 for (j = 1; j < c->argc; j += 2) {
4416 int retval;
4417
05df7621 4418 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
6c446631 4419 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4420 if (retval == DICT_ERR) {
4421 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4422 incrRefCount(c->argv[j+1]);
4423 } else {
4424 incrRefCount(c->argv[j]);
4425 incrRefCount(c->argv[j+1]);
4426 }
4427 removeExpire(c->db,c->argv[j]);
4428 }
4429 server.dirty += (c->argc-1)/2;
4430 addReply(c, nx ? shared.cone : shared.ok);
4431}
4432
4433static void msetCommand(redisClient *c) {
4434 msetGenericCommand(c,0);
4435}
4436
4437static void msetnxCommand(redisClient *c) {
4438 msetGenericCommand(c,1);
4439}
4440
d68ed120 4441static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4442 long long value;
4443 int retval;
4444 robj *o;
e0a62c7f 4445
3305306f 4446 o = lookupKeyWrite(c->db,c->argv[1]);
6485f293
PN
4447 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4448 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
ed9b544e 4449
4450 value += incr;
d6f4c262 4451 o = createStringObjectFromLongLong(value);
3305306f 4452 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 4453 if (retval == DICT_ERR) {
3305306f 4454 dictReplace(c->db->dict,c->argv[1],o);
4455 removeExpire(c->db,c->argv[1]);
ed9b544e 4456 } else {
4457 incrRefCount(c->argv[1]);
4458 }
4459 server.dirty++;
c937aa89 4460 addReply(c,shared.colon);
ed9b544e 4461 addReply(c,o);
4462 addReply(c,shared.crlf);
4463}
4464
4465static void incrCommand(redisClient *c) {
a4d1ba9a 4466 incrDecrCommand(c,1);
ed9b544e 4467}
4468
4469static void decrCommand(redisClient *c) {
a4d1ba9a 4470 incrDecrCommand(c,-1);
ed9b544e 4471}
4472
4473static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4474 long long incr;
4475
bd79a6bd 4476 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4477 incrDecrCommand(c,incr);
ed9b544e 4478}
4479
4480static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4481 long long incr;
4482
bd79a6bd 4483 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4484 incrDecrCommand(c,-incr);
ed9b544e 4485}
4486
4b00bebd 4487static void appendCommand(redisClient *c) {
4488 int retval;
4489 size_t totlen;
4490 robj *o;
4491
4492 o = lookupKeyWrite(c->db,c->argv[1]);
4493 if (o == NULL) {
4494 /* Create the key */
4495 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4496 incrRefCount(c->argv[1]);
4497 incrRefCount(c->argv[2]);
4498 totlen = stringObjectLen(c->argv[2]);
4499 } else {
4500 dictEntry *de;
e0a62c7f 4501
4b00bebd 4502 de = dictFind(c->db->dict,c->argv[1]);
4503 assert(de != NULL);
4504
4505 o = dictGetEntryVal(de);
4506 if (o->type != REDIS_STRING) {
4507 addReply(c,shared.wrongtypeerr);
4508 return;
4509 }
4510 /* If the object is specially encoded or shared we have to make
4511 * a copy */
4512 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4513 robj *decoded = getDecodedObject(o);
4514
4515 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4516 decrRefCount(decoded);
4517 dictReplace(c->db->dict,c->argv[1],o);
4518 }
4519 /* APPEND! */
4520 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4521 o->ptr = sdscatlen(o->ptr,
4522 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4523 } else {
4524 o->ptr = sdscatprintf(o->ptr, "%ld",
4525 (unsigned long) c->argv[2]->ptr);
4526 }
4527 totlen = sdslen(o->ptr);
4528 }
4529 server.dirty++;
4530 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4531}
4532
39191553 4533static void substrCommand(redisClient *c) {
4534 robj *o;
4535 long start = atoi(c->argv[2]->ptr);
4536 long end = atoi(c->argv[3]->ptr);
dd88747b 4537 size_t rangelen, strlen;
4538 sds range;
39191553 4539
dd88747b 4540 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4541 checkType(c,o,REDIS_STRING)) return;
39191553 4542
dd88747b 4543 o = getDecodedObject(o);
4544 strlen = sdslen(o->ptr);
8fe7fad7 4545
dd88747b 4546 /* convert negative indexes */
4547 if (start < 0) start = strlen+start;
4548 if (end < 0) end = strlen+end;
4549 if (start < 0) start = 0;
4550 if (end < 0) end = 0;
39191553 4551
dd88747b 4552 /* indexes sanity checks */
4553 if (start > end || (size_t)start >= strlen) {
4554 /* Out of range start or start > end result in null reply */
4555 addReply(c,shared.nullbulk);
4556 decrRefCount(o);
4557 return;
39191553 4558 }
dd88747b 4559 if ((size_t)end >= strlen) end = strlen-1;
4560 rangelen = (end-start)+1;
4561
4562 /* Return the result */
4563 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4564 range = sdsnewlen((char*)o->ptr+start,rangelen);
4565 addReplySds(c,range);
4566 addReply(c,shared.crlf);
4567 decrRefCount(o);
39191553 4568}
4569
ed9b544e 4570/* ========================= Type agnostic commands ========================= */
4571
4572static void delCommand(redisClient *c) {
5109cdff 4573 int deleted = 0, j;
4574
4575 for (j = 1; j < c->argc; j++) {
4576 if (deleteKey(c->db,c->argv[j])) {
37ab76c9 4577 touchWatchedKey(c->db,c->argv[j]);
5109cdff 4578 server.dirty++;
4579 deleted++;
4580 }
4581 }
482b672d 4582 addReplyLongLong(c,deleted);
ed9b544e 4583}
4584
4585static void existsCommand(redisClient *c) {
f4f06efc
PN
4586 expireIfNeeded(c->db,c->argv[1]);
4587 if (dictFind(c->db->dict,c->argv[1])) {
4588 addReply(c, shared.cone);
4589 } else {
4590 addReply(c, shared.czero);
4591 }
ed9b544e 4592}
4593
4594static void selectCommand(redisClient *c) {
4595 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4596
ed9b544e 4597 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4598 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4599 } else {
4600 addReply(c,shared.ok);
4601 }
4602}
4603
4604static void randomkeyCommand(redisClient *c) {
4605 dictEntry *de;
dc4be23e 4606 robj *key;
e0a62c7f 4607
3305306f 4608 while(1) {
4609 de = dictGetRandomKey(c->db->dict);
ce7bef07 4610 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4611 }
2b619329 4612
ed9b544e 4613 if (de == NULL) {
dc4be23e 4614 addReply(c,shared.nullbulk);
4615 return;
4616 }
4617
4618 key = dictGetEntryKey(de);
4619 if (server.vm_enabled) {
4620 key = dupStringObject(key);
4621 addReplyBulk(c,key);
4622 decrRefCount(key);
ed9b544e 4623 } else {
dc4be23e 4624 addReplyBulk(c,key);
ed9b544e 4625 }
4626}
4627
4628static void keysCommand(redisClient *c) {
4629 dictIterator *di;
4630 dictEntry *de;
4631 sds pattern = c->argv[1]->ptr;
4632 int plen = sdslen(pattern);
a3f9eec2 4633 unsigned long numkeys = 0;
ed9b544e 4634 robj *lenobj = createObject(REDIS_STRING,NULL);
4635
3305306f 4636 di = dictGetIterator(c->db->dict);
ed9b544e 4637 addReply(c,lenobj);
4638 decrRefCount(lenobj);
4639 while((de = dictNext(di)) != NULL) {
4640 robj *keyobj = dictGetEntryKey(de);
3305306f 4641
ed9b544e 4642 sds key = keyobj->ptr;
4643 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4644 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4645 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4646 addReplyBulk(c,keyobj);
3305306f 4647 numkeys++;
3305306f 4648 }
ed9b544e 4649 }
4650 }
4651 dictReleaseIterator(di);
a3f9eec2 4652 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4653}
4654
4655static void dbsizeCommand(redisClient *c) {
4656 addReplySds(c,
3305306f 4657 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4658}
4659
4660static void lastsaveCommand(redisClient *c) {
4661 addReplySds(c,
c937aa89 4662 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4663}
4664
4665static void typeCommand(redisClient *c) {
3305306f 4666 robj *o;
ed9b544e 4667 char *type;
3305306f 4668
4669 o = lookupKeyRead(c->db,c->argv[1]);
4670 if (o == NULL) {
c937aa89 4671 type = "+none";
ed9b544e 4672 } else {
ed9b544e 4673 switch(o->type) {
c937aa89 4674 case REDIS_STRING: type = "+string"; break;
4675 case REDIS_LIST: type = "+list"; break;
4676 case REDIS_SET: type = "+set"; break;
412a8bce 4677 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4678 case REDIS_HASH: type = "+hash"; break;
4679 default: type = "+unknown"; break;
ed9b544e 4680 }
4681 }
4682 addReplySds(c,sdsnew(type));
4683 addReply(c,shared.crlf);
4684}
4685
4686static void saveCommand(redisClient *c) {
9d65a1bb 4687 if (server.bgsavechildpid != -1) {
05557f6d 4688 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4689 return;
4690 }
f78fd11b 4691 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4692 addReply(c,shared.ok);
4693 } else {
4694 addReply(c,shared.err);
4695 }
4696}
4697
4698static void bgsaveCommand(redisClient *c) {
9d65a1bb 4699 if (server.bgsavechildpid != -1) {
ed9b544e 4700 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4701 return;
4702 }
f78fd11b 4703 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4704 char *status = "+Background saving started\r\n";
4705 addReplySds(c,sdsnew(status));
ed9b544e 4706 } else {
4707 addReply(c,shared.err);
4708 }
4709}
4710
4711static void shutdownCommand(redisClient *c) {
fab43727 4712 if (prepareForShutdown() == REDIS_OK)
4713 exit(0);
4714 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
ed9b544e 4715}
4716
4717static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4718 robj *o;
4719
4720 /* To use the same key as src and dst is probably an error */
4721 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4722 addReply(c,shared.sameobjecterr);
ed9b544e 4723 return;
4724 }
4725
dd88747b 4726 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4727 return;
dd88747b 4728
ed9b544e 4729 incrRefCount(o);
3305306f 4730 deleteIfVolatile(c->db,c->argv[2]);
4731 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4732 if (nx) {
4733 decrRefCount(o);
c937aa89 4734 addReply(c,shared.czero);
ed9b544e 4735 return;
4736 }
3305306f 4737 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4738 } else {
4739 incrRefCount(c->argv[2]);
4740 }
3305306f 4741 deleteKey(c->db,c->argv[1]);
b167f877 4742 touchWatchedKey(c->db,c->argv[2]);
ed9b544e 4743 server.dirty++;
c937aa89 4744 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4745}
4746
4747static void renameCommand(redisClient *c) {
4748 renameGenericCommand(c,0);
4749}
4750
4751static void renamenxCommand(redisClient *c) {
4752 renameGenericCommand(c,1);
4753}
4754
4755static void moveCommand(redisClient *c) {
3305306f 4756 robj *o;
4757 redisDb *src, *dst;
ed9b544e 4758 int srcid;
4759
4760 /* Obtain source and target DB pointers */
3305306f 4761 src = c->db;
4762 srcid = c->db->id;
ed9b544e 4763 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4764 addReply(c,shared.outofrangeerr);
ed9b544e 4765 return;
4766 }
3305306f 4767 dst = c->db;
4768 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4769
4770 /* If the user is moving using as target the same
4771 * DB as the source DB it is probably an error. */
4772 if (src == dst) {
c937aa89 4773 addReply(c,shared.sameobjecterr);
ed9b544e 4774 return;
4775 }
4776
4777 /* Check if the element exists and get a reference */
3305306f 4778 o = lookupKeyWrite(c->db,c->argv[1]);
4779 if (!o) {
c937aa89 4780 addReply(c,shared.czero);
ed9b544e 4781 return;
4782 }
4783
4784 /* Try to add the element to the target DB */
3305306f 4785 deleteIfVolatile(dst,c->argv[1]);
4786 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4787 addReply(c,shared.czero);
ed9b544e 4788 return;
4789 }
3305306f 4790 incrRefCount(c->argv[1]);
ed9b544e 4791 incrRefCount(o);
4792
4793 /* OK! key moved, free the entry in the source DB */
3305306f 4794 deleteKey(src,c->argv[1]);
ed9b544e 4795 server.dirty++;
c937aa89 4796 addReply(c,shared.cone);
ed9b544e 4797}
4798
4799/* =================================== Lists ================================ */
4800static void pushGenericCommand(redisClient *c, int where) {
4801 robj *lobj;
ed9b544e 4802 list *list;
3305306f 4803
4804 lobj = lookupKeyWrite(c->db,c->argv[1]);
4805 if (lobj == NULL) {
95242ab5 4806 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4807 addReply(c,shared.cone);
95242ab5 4808 return;
4809 }
ed9b544e 4810 lobj = createListObject();
4811 list = lobj->ptr;
4812 if (where == REDIS_HEAD) {
6b47e12e 4813 listAddNodeHead(list,c->argv[2]);
ed9b544e 4814 } else {
6b47e12e 4815 listAddNodeTail(list,c->argv[2]);
ed9b544e 4816 }
3305306f 4817 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4818 incrRefCount(c->argv[1]);
4819 incrRefCount(c->argv[2]);
4820 } else {
ed9b544e 4821 if (lobj->type != REDIS_LIST) {
4822 addReply(c,shared.wrongtypeerr);
4823 return;
4824 }
95242ab5 4825 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4826 addReply(c,shared.cone);
95242ab5 4827 return;
4828 }
ed9b544e 4829 list = lobj->ptr;
4830 if (where == REDIS_HEAD) {
6b47e12e 4831 listAddNodeHead(list,c->argv[2]);
ed9b544e 4832 } else {
6b47e12e 4833 listAddNodeTail(list,c->argv[2]);
ed9b544e 4834 }
4835 incrRefCount(c->argv[2]);
4836 }
4837 server.dirty++;
482b672d 4838 addReplyLongLong(c,listLength(list));
ed9b544e 4839}
4840
4841static void lpushCommand(redisClient *c) {
4842 pushGenericCommand(c,REDIS_HEAD);
4843}
4844
4845static void rpushCommand(redisClient *c) {
4846 pushGenericCommand(c,REDIS_TAIL);
4847}
4848
4849static void llenCommand(redisClient *c) {
3305306f 4850 robj *o;
ed9b544e 4851 list *l;
dd88747b 4852
4853 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4854 checkType(c,o,REDIS_LIST)) return;
e0a62c7f 4855
dd88747b 4856 l = o->ptr;
4857 addReplyUlong(c,listLength(l));
ed9b544e 4858}
4859
4860static void lindexCommand(redisClient *c) {
3305306f 4861 robj *o;
ed9b544e 4862 int index = atoi(c->argv[2]->ptr);
dd88747b 4863 list *list;
4864 listNode *ln;
4865
4866 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4867 checkType(c,o,REDIS_LIST)) return;
4868 list = o->ptr;
4869
4870 ln = listIndex(list, index);
4871 if (ln == NULL) {
c937aa89 4872 addReply(c,shared.nullbulk);
ed9b544e 4873 } else {
dd88747b 4874 robj *ele = listNodeValue(ln);
4875 addReplyBulk(c,ele);
ed9b544e 4876 }
4877}
4878
4879static void lsetCommand(redisClient *c) {
3305306f 4880 robj *o;
ed9b544e 4881 int index = atoi(c->argv[2]->ptr);
dd88747b 4882 list *list;
4883 listNode *ln;
4884
4885 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4886 checkType(c,o,REDIS_LIST)) return;
4887 list = o->ptr;
4888
4889 ln = listIndex(list, index);
4890 if (ln == NULL) {
4891 addReply(c,shared.outofrangeerr);
ed9b544e 4892 } else {
dd88747b 4893 robj *ele = listNodeValue(ln);
ed9b544e 4894
dd88747b 4895 decrRefCount(ele);
4896 listNodeValue(ln) = c->argv[3];
4897 incrRefCount(c->argv[3]);
4898 addReply(c,shared.ok);
4899 server.dirty++;
ed9b544e 4900 }
4901}
4902
4903static void popGenericCommand(redisClient *c, int where) {
3305306f 4904 robj *o;
dd88747b 4905 list *list;
4906 listNode *ln;
3305306f 4907
dd88747b 4908 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4909 checkType(c,o,REDIS_LIST)) return;
4910 list = o->ptr;
ed9b544e 4911
dd88747b 4912 if (where == REDIS_HEAD)
4913 ln = listFirst(list);
4914 else
4915 ln = listLast(list);
ed9b544e 4916
dd88747b 4917 if (ln == NULL) {
4918 addReply(c,shared.nullbulk);
4919 } else {
4920 robj *ele = listNodeValue(ln);
4921 addReplyBulk(c,ele);
4922 listDelNode(list,ln);
3ea27d37 4923 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4924 server.dirty++;
ed9b544e 4925 }
4926}
4927
4928static void lpopCommand(redisClient *c) {
4929 popGenericCommand(c,REDIS_HEAD);
4930}
4931
4932static void rpopCommand(redisClient *c) {
4933 popGenericCommand(c,REDIS_TAIL);
4934}
4935
4936static void lrangeCommand(redisClient *c) {
3305306f 4937 robj *o;
ed9b544e 4938 int start = atoi(c->argv[2]->ptr);
4939 int end = atoi(c->argv[3]->ptr);
dd88747b 4940 int llen;
4941 int rangelen, j;
4942 list *list;
4943 listNode *ln;
4944 robj *ele;
4945
4e27f268 4946 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4947 || checkType(c,o,REDIS_LIST)) return;
dd88747b 4948 list = o->ptr;
4949 llen = listLength(list);
4950
4951 /* convert negative indexes */
4952 if (start < 0) start = llen+start;
4953 if (end < 0) end = llen+end;
4954 if (start < 0) start = 0;
4955 if (end < 0) end = 0;
4956
4957 /* indexes sanity checks */
4958 if (start > end || start >= llen) {
4959 /* Out of range start or start > end result in empty list */
4960 addReply(c,shared.emptymultibulk);
4961 return;
4962 }
4963 if (end >= llen) end = llen-1;
4964 rangelen = (end-start)+1;
3305306f 4965
dd88747b 4966 /* Return the result in form of a multi-bulk reply */
4967 ln = listIndex(list, start);
4968 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4969 for (j = 0; j < rangelen; j++) {
4970 ele = listNodeValue(ln);
4971 addReplyBulk(c,ele);
4972 ln = ln->next;
ed9b544e 4973 }
4974}
4975
4976static void ltrimCommand(redisClient *c) {
3305306f 4977 robj *o;
ed9b544e 4978 int start = atoi(c->argv[2]->ptr);
4979 int end = atoi(c->argv[3]->ptr);
dd88747b 4980 int llen;
4981 int j, ltrim, rtrim;
4982 list *list;
4983 listNode *ln;
4984
4985 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4986 checkType(c,o,REDIS_LIST)) return;
4987 list = o->ptr;
4988 llen = listLength(list);
4989
4990 /* convert negative indexes */
4991 if (start < 0) start = llen+start;
4992 if (end < 0) end = llen+end;
4993 if (start < 0) start = 0;
4994 if (end < 0) end = 0;
4995
4996 /* indexes sanity checks */
4997 if (start > end || start >= llen) {
4998 /* Out of range start or start > end result in empty list */
4999 ltrim = llen;
5000 rtrim = 0;
ed9b544e 5001 } else {
dd88747b 5002 if (end >= llen) end = llen-1;
5003 ltrim = start;
5004 rtrim = llen-end-1;
5005 }
ed9b544e 5006
dd88747b 5007 /* Remove list elements to perform the trim */
5008 for (j = 0; j < ltrim; j++) {
5009 ln = listFirst(list);
5010 listDelNode(list,ln);
5011 }
5012 for (j = 0; j < rtrim; j++) {
5013 ln = listLast(list);
5014 listDelNode(list,ln);
ed9b544e 5015 }
3ea27d37 5016 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5017 server.dirty++;
5018 addReply(c,shared.ok);
ed9b544e 5019}
5020
5021static void lremCommand(redisClient *c) {
3305306f 5022 robj *o;
dd88747b 5023 list *list;
5024 listNode *ln, *next;
5025 int toremove = atoi(c->argv[2]->ptr);
5026 int removed = 0;
5027 int fromtail = 0;
a4d1ba9a 5028
dd88747b 5029 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5030 checkType(c,o,REDIS_LIST)) return;
5031 list = o->ptr;
5032
5033 if (toremove < 0) {
5034 toremove = -toremove;
5035 fromtail = 1;
5036 }
5037 ln = fromtail ? list->tail : list->head;
5038 while (ln) {
5039 robj *ele = listNodeValue(ln);
5040
5041 next = fromtail ? ln->prev : ln->next;
bf028098 5042 if (equalStringObjects(ele,c->argv[3])) {
dd88747b 5043 listDelNode(list,ln);
5044 server.dirty++;
5045 removed++;
5046 if (toremove && removed == toremove) break;
ed9b544e 5047 }
dd88747b 5048 ln = next;
ed9b544e 5049 }
3ea27d37 5050 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5051 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 5052}
5053
12f9d551 5054/* This is the semantic of this command:
0f5f7e9a 5055 * RPOPLPUSH srclist dstlist:
12f9d551 5056 * IF LLEN(srclist) > 0
5057 * element = RPOP srclist
5058 * LPUSH dstlist element
5059 * RETURN element
5060 * ELSE
5061 * RETURN nil
5062 * END
5063 * END
5064 *
5065 * The idea is to be able to get an element from a list in a reliable way
5066 * since the element is not just returned but pushed against another list
5067 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5068 */
0f5f7e9a 5069static void rpoplpushcommand(redisClient *c) {
12f9d551 5070 robj *sobj;
dd88747b 5071 list *srclist;
5072 listNode *ln;
5073
5074 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5075 checkType(c,sobj,REDIS_LIST)) return;
5076 srclist = sobj->ptr;
5077 ln = listLast(srclist);
12f9d551 5078
dd88747b 5079 if (ln == NULL) {
12f9d551 5080 addReply(c,shared.nullbulk);
5081 } else {
dd88747b 5082 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
5083 robj *ele = listNodeValue(ln);
5084 list *dstlist;
e20fb74f 5085
dd88747b 5086 if (dobj && dobj->type != REDIS_LIST) {
5087 addReply(c,shared.wrongtypeerr);
5088 return;
5089 }
12f9d551 5090
dd88747b 5091 /* Add the element to the target list (unless it's directly
5092 * passed to some BLPOP-ing client */
5093 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
5094 if (dobj == NULL) {
5095 /* Create the list if the key does not exist */
5096 dobj = createListObject();
5097 dictAdd(c->db->dict,c->argv[2],dobj);
5098 incrRefCount(c->argv[2]);
12f9d551 5099 }
dd88747b 5100 dstlist = dobj->ptr;
5101 listAddNodeHead(dstlist,ele);
5102 incrRefCount(ele);
12f9d551 5103 }
dd88747b 5104
5105 /* Send the element to the client as reply as well */
5106 addReplyBulk(c,ele);
5107
5108 /* Finally remove the element from the source list */
5109 listDelNode(srclist,ln);
3ea27d37 5110 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5111 server.dirty++;
12f9d551 5112 }
5113}
5114
ed9b544e 5115/* ==================================== Sets ================================ */
5116
5117static void saddCommand(redisClient *c) {
ed9b544e 5118 robj *set;
5119
3305306f 5120 set = lookupKeyWrite(c->db,c->argv[1]);
5121 if (set == NULL) {
ed9b544e 5122 set = createSetObject();
3305306f 5123 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 5124 incrRefCount(c->argv[1]);
5125 } else {
ed9b544e 5126 if (set->type != REDIS_SET) {
c937aa89 5127 addReply(c,shared.wrongtypeerr);
ed9b544e 5128 return;
5129 }
5130 }
5131 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5132 incrRefCount(c->argv[2]);
5133 server.dirty++;
c937aa89 5134 addReply(c,shared.cone);
ed9b544e 5135 } else {
c937aa89 5136 addReply(c,shared.czero);
ed9b544e 5137 }
5138}
5139
5140static void sremCommand(redisClient *c) {
3305306f 5141 robj *set;
ed9b544e 5142
dd88747b 5143 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5144 checkType(c,set,REDIS_SET)) return;
5145
5146 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5147 server.dirty++;
5148 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5149 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5150 addReply(c,shared.cone);
ed9b544e 5151 } else {
dd88747b 5152 addReply(c,shared.czero);
ed9b544e 5153 }
5154}
5155
a4460ef4 5156static void smoveCommand(redisClient *c) {
5157 robj *srcset, *dstset;
5158
5159 srcset = lookupKeyWrite(c->db,c->argv[1]);
5160 dstset = lookupKeyWrite(c->db,c->argv[2]);
5161
5162 /* If the source key does not exist return 0, if it's of the wrong type
5163 * raise an error */
5164 if (srcset == NULL || srcset->type != REDIS_SET) {
5165 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5166 return;
5167 }
5168 /* Error if the destination key is not a set as well */
5169 if (dstset && dstset->type != REDIS_SET) {
5170 addReply(c,shared.wrongtypeerr);
5171 return;
5172 }
5173 /* Remove the element from the source set */
5174 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5175 /* Key not found in the src set! return zero */
5176 addReply(c,shared.czero);
5177 return;
5178 }
3ea27d37 5179 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5180 deleteKey(c->db,c->argv[1]);
a4460ef4 5181 server.dirty++;
5182 /* Add the element to the destination set */
5183 if (!dstset) {
5184 dstset = createSetObject();
5185 dictAdd(c->db->dict,c->argv[2],dstset);
5186 incrRefCount(c->argv[2]);
5187 }
5188 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5189 incrRefCount(c->argv[3]);
5190 addReply(c,shared.cone);
5191}
5192
ed9b544e 5193static void sismemberCommand(redisClient *c) {
3305306f 5194 robj *set;
ed9b544e 5195
dd88747b 5196 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5197 checkType(c,set,REDIS_SET)) return;
5198
5199 if (dictFind(set->ptr,c->argv[2]))
5200 addReply(c,shared.cone);
5201 else
c937aa89 5202 addReply(c,shared.czero);
ed9b544e 5203}
5204
5205static void scardCommand(redisClient *c) {
3305306f 5206 robj *o;
ed9b544e 5207 dict *s;
dd88747b 5208
5209 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5210 checkType(c,o,REDIS_SET)) return;
e0a62c7f 5211
dd88747b 5212 s = o->ptr;
5213 addReplyUlong(c,dictSize(s));
ed9b544e 5214}
5215
12fea928 5216static void spopCommand(redisClient *c) {
5217 robj *set;
5218 dictEntry *de;
5219
dd88747b 5220 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5221 checkType(c,set,REDIS_SET)) return;
5222
5223 de = dictGetRandomKey(set->ptr);
5224 if (de == NULL) {
12fea928 5225 addReply(c,shared.nullbulk);
5226 } else {
dd88747b 5227 robj *ele = dictGetEntryKey(de);
12fea928 5228
dd88747b 5229 addReplyBulk(c,ele);
5230 dictDelete(set->ptr,ele);
5231 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5232 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5233 server.dirty++;
12fea928 5234 }
5235}
5236
2abb95a9 5237static void srandmemberCommand(redisClient *c) {
5238 robj *set;
5239 dictEntry *de;
5240
dd88747b 5241 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5242 checkType(c,set,REDIS_SET)) return;
5243
5244 de = dictGetRandomKey(set->ptr);
5245 if (de == NULL) {
2abb95a9 5246 addReply(c,shared.nullbulk);
5247 } else {
dd88747b 5248 robj *ele = dictGetEntryKey(de);
2abb95a9 5249
dd88747b 5250 addReplyBulk(c,ele);
2abb95a9 5251 }
5252}
5253
ed9b544e 5254static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5255 dict **d1 = (void*) s1, **d2 = (void*) s2;
5256
3305306f 5257 return dictSize(*d1)-dictSize(*d2);
ed9b544e 5258}
5259
682ac724 5260static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 5261 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5262 dictIterator *di;
5263 dictEntry *de;
5264 robj *lenobj = NULL, *dstset = NULL;
682ac724 5265 unsigned long j, cardinality = 0;
ed9b544e 5266
ed9b544e 5267 for (j = 0; j < setsnum; j++) {
5268 robj *setobj;
3305306f 5269
5270 setobj = dstkey ?
5271 lookupKeyWrite(c->db,setskeys[j]) :
5272 lookupKeyRead(c->db,setskeys[j]);
5273 if (!setobj) {
ed9b544e 5274 zfree(dv);
5faa6025 5275 if (dstkey) {
fdcaae84 5276 if (deleteKey(c->db,dstkey))
5277 server.dirty++;
0d36ded0 5278 addReply(c,shared.czero);
5faa6025 5279 } else {
4e27f268 5280 addReply(c,shared.emptymultibulk);
5faa6025 5281 }
ed9b544e 5282 return;
5283 }
ed9b544e 5284 if (setobj->type != REDIS_SET) {
5285 zfree(dv);
c937aa89 5286 addReply(c,shared.wrongtypeerr);
ed9b544e 5287 return;
5288 }
5289 dv[j] = setobj->ptr;
5290 }
5291 /* Sort sets from the smallest to largest, this will improve our
5292 * algorithm's performace */
5293 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5294
5295 /* The first thing we should output is the total number of elements...
5296 * since this is a multi-bulk write, but at this stage we don't know
5297 * the intersection set size, so we use a trick, append an empty object
5298 * to the output list and save the pointer to later modify it with the
5299 * right length */
5300 if (!dstkey) {
5301 lenobj = createObject(REDIS_STRING,NULL);
5302 addReply(c,lenobj);
5303 decrRefCount(lenobj);
5304 } else {
5305 /* If we have a target key where to store the resulting set
5306 * create this key with an empty set inside */
5307 dstset = createSetObject();
ed9b544e 5308 }
5309
5310 /* Iterate all the elements of the first (smallest) set, and test
5311 * the element against all the other sets, if at least one set does
5312 * not include the element it is discarded */
5313 di = dictGetIterator(dv[0]);
ed9b544e 5314
5315 while((de = dictNext(di)) != NULL) {
5316 robj *ele;
5317
5318 for (j = 1; j < setsnum; j++)
5319 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5320 if (j != setsnum)
5321 continue; /* at least one set does not contain the member */
5322 ele = dictGetEntryKey(de);
5323 if (!dstkey) {
dd88747b 5324 addReplyBulk(c,ele);
ed9b544e 5325 cardinality++;
5326 } else {
5327 dictAdd(dstset->ptr,ele,NULL);
5328 incrRefCount(ele);
5329 }
5330 }
5331 dictReleaseIterator(di);
5332
83cdfe18 5333 if (dstkey) {
3ea27d37 5334 /* Store the resulting set into the target, if the intersection
5335 * is not an empty set. */
83cdfe18 5336 deleteKey(c->db,dstkey);
3ea27d37 5337 if (dictSize((dict*)dstset->ptr) > 0) {
5338 dictAdd(c->db->dict,dstkey,dstset);
5339 incrRefCount(dstkey);
482b672d 5340 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5341 } else {
5342 decrRefCount(dstset);
d36c4e97 5343 addReply(c,shared.czero);
3ea27d37 5344 }
40d224a9 5345 server.dirty++;
d36c4e97 5346 } else {
5347 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 5348 }
ed9b544e 5349 zfree(dv);
5350}
5351
5352static void sinterCommand(redisClient *c) {
5353 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5354}
5355
5356static void sinterstoreCommand(redisClient *c) {
5357 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5358}
5359
f4f56e1d 5360#define REDIS_OP_UNION 0
5361#define REDIS_OP_DIFF 1
2830ca53 5362#define REDIS_OP_INTER 2
f4f56e1d 5363
5364static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 5365 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5366 dictIterator *di;
5367 dictEntry *de;
f4f56e1d 5368 robj *dstset = NULL;
40d224a9 5369 int j, cardinality = 0;
5370
40d224a9 5371 for (j = 0; j < setsnum; j++) {
5372 robj *setobj;
5373
5374 setobj = dstkey ?
5375 lookupKeyWrite(c->db,setskeys[j]) :
5376 lookupKeyRead(c->db,setskeys[j]);
5377 if (!setobj) {
5378 dv[j] = NULL;
5379 continue;
5380 }
5381 if (setobj->type != REDIS_SET) {
5382 zfree(dv);
5383 addReply(c,shared.wrongtypeerr);
5384 return;
5385 }
5386 dv[j] = setobj->ptr;
5387 }
5388
5389 /* We need a temp set object to store our union. If the dstkey
5390 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5391 * this set object will be the resulting object to set into the target key*/
5392 dstset = createSetObject();
5393
40d224a9 5394 /* Iterate all the elements of all the sets, add every element a single
5395 * time to the result set */
5396 for (j = 0; j < setsnum; j++) {
51829ed3 5397 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 5398 if (!dv[j]) continue; /* non existing keys are like empty sets */
5399
5400 di = dictGetIterator(dv[j]);
40d224a9 5401
5402 while((de = dictNext(di)) != NULL) {
5403 robj *ele;
5404
5405 /* dictAdd will not add the same element multiple times */
5406 ele = dictGetEntryKey(de);
f4f56e1d 5407 if (op == REDIS_OP_UNION || j == 0) {
5408 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5409 incrRefCount(ele);
40d224a9 5410 cardinality++;
5411 }
f4f56e1d 5412 } else if (op == REDIS_OP_DIFF) {
5413 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5414 cardinality--;
5415 }
40d224a9 5416 }
5417 }
5418 dictReleaseIterator(di);
51829ed3 5419
d36c4e97 5420 /* result set is empty? Exit asap. */
5421 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5422 }
5423
f4f56e1d 5424 /* Output the content of the resulting set, if not in STORE mode */
5425 if (!dstkey) {
5426 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5427 di = dictGetIterator(dstset->ptr);
f4f56e1d 5428 while((de = dictNext(di)) != NULL) {
5429 robj *ele;
5430
5431 ele = dictGetEntryKey(de);
dd88747b 5432 addReplyBulk(c,ele);
f4f56e1d 5433 }
5434 dictReleaseIterator(di);
d36c4e97 5435 decrRefCount(dstset);
83cdfe18
AG
5436 } else {
5437 /* If we have a target key where to store the resulting set
5438 * create this key with the result set inside */
5439 deleteKey(c->db,dstkey);
3ea27d37 5440 if (dictSize((dict*)dstset->ptr) > 0) {
5441 dictAdd(c->db->dict,dstkey,dstset);
5442 incrRefCount(dstkey);
482b672d 5443 addReplyLongLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5444 } else {
5445 decrRefCount(dstset);
d36c4e97 5446 addReply(c,shared.czero);
3ea27d37 5447 }
40d224a9 5448 server.dirty++;
5449 }
5450 zfree(dv);
5451}
5452
5453static void sunionCommand(redisClient *c) {
f4f56e1d 5454 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 5455}
5456
5457static void sunionstoreCommand(redisClient *c) {
f4f56e1d 5458 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5459}
5460
5461static void sdiffCommand(redisClient *c) {
5462 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5463}
5464
5465static void sdiffstoreCommand(redisClient *c) {
5466 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 5467}
5468
6b47e12e 5469/* ==================================== ZSets =============================== */
5470
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
5478
5479/* This skiplist implementation is almost a C translation of the original
5480 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5481 * Alternative to Balanced Trees", modified in three ways:
5482 * a) this implementation allows for repeated values.
5483 * b) the comparison is not just by key (our 'score') but by satellite data.
5484 * c) there is a back pointer, so it's a doubly linked list with the back
5485 * pointers being only at "level 1". This allows to traverse the list
5486 * from tail to head, useful for ZREVRANGE. */
5487
5488static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5489 zskiplistNode *zn = zmalloc(sizeof(*zn));
5490
5491 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2f4dd7e0 5492 if (level > 1)
2b37892e 5493 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
2f4dd7e0 5494 else
5495 zn->span = NULL;
6b47e12e 5496 zn->score = score;
5497 zn->obj = obj;
5498 return zn;
5499}
5500
5501static zskiplist *zslCreate(void) {
5502 int j;
5503 zskiplist *zsl;
e0a62c7f 5504
6b47e12e 5505 zsl = zmalloc(sizeof(*zsl));
5506 zsl->level = 1;
cc812361 5507 zsl->length = 0;
6b47e12e 5508 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5509 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5510 zsl->header->forward[j] = NULL;
94e543b5 5511
5512 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5513 if (j < ZSKIPLIST_MAXLEVEL-1)
5514 zsl->header->span[j] = 0;
69d95c3e 5515 }
e3870fab 5516 zsl->header->backward = NULL;
5517 zsl->tail = NULL;
6b47e12e 5518 return zsl;
5519}
5520
fd8ccf44 5521static void zslFreeNode(zskiplistNode *node) {
5522 decrRefCount(node->obj);
ad807e6f 5523 zfree(node->forward);
69d95c3e 5524 zfree(node->span);
fd8ccf44 5525 zfree(node);
5526}
5527
5528static void zslFree(zskiplist *zsl) {
ad807e6f 5529 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5530
ad807e6f 5531 zfree(zsl->header->forward);
69d95c3e 5532 zfree(zsl->header->span);
ad807e6f 5533 zfree(zsl->header);
fd8ccf44 5534 while(node) {
599379dd 5535 next = node->forward[0];
fd8ccf44 5536 zslFreeNode(node);
5537 node = next;
5538 }
ad807e6f 5539 zfree(zsl);
fd8ccf44 5540}
5541
6b47e12e 5542static int zslRandomLevel(void) {
5543 int level = 1;
5544 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5545 level += 1;
10c2baa5 5546 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5547}
5548
5549static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5550 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5551 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5552 int i, level;
5553
5554 x = zsl->header;
5555 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5556 /* store rank that is crossed to reach the insert position */
5557 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5558
9d60e6e4 5559 while (x->forward[i] &&
5560 (x->forward[i]->score < score ||
5561 (x->forward[i]->score == score &&
69d95c3e 5562 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5563 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5564 x = x->forward[i];
69d95c3e 5565 }
6b47e12e 5566 update[i] = x;
5567 }
6b47e12e 5568 /* we assume the key is not already inside, since we allow duplicated
5569 * scores, and the re-insertion of score and redis object should never
5570 * happpen since the caller of zslInsert() should test in the hash table
5571 * if the element is already inside or not. */
5572 level = zslRandomLevel();
5573 if (level > zsl->level) {
69d95c3e 5574 for (i = zsl->level; i < level; i++) {
2b37892e 5575 rank[i] = 0;
6b47e12e 5576 update[i] = zsl->header;
2b37892e 5577 update[i]->span[i-1] = zsl->length;
69d95c3e 5578 }
6b47e12e 5579 zsl->level = level;
5580 }
5581 x = zslCreateNode(level,score,obj);
5582 for (i = 0; i < level; i++) {
5583 x->forward[i] = update[i]->forward[i];
5584 update[i]->forward[i] = x;
69d95c3e
PN
5585
5586 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5587 if (i > 0) {
5588 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5589 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5590 }
6b47e12e 5591 }
69d95c3e
PN
5592
5593 /* increment span for untouched levels */
5594 for (i = level; i < zsl->level; i++) {
2b37892e 5595 update[i]->span[i-1]++;
69d95c3e
PN
5596 }
5597
bb975144 5598 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5599 if (x->forward[0])
5600 x->forward[0]->backward = x;
5601 else
5602 zsl->tail = x;
cc812361 5603 zsl->length++;
6b47e12e 5604}
5605
84105336
PN
5606/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5607void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5608 int i;
5609 for (i = 0; i < zsl->level; i++) {
5610 if (update[i]->forward[i] == x) {
5611 if (i > 0) {
5612 update[i]->span[i-1] += x->span[i-1] - 1;
5613 }
5614 update[i]->forward[i] = x->forward[i];
5615 } else {
5616 /* invariant: i > 0, because update[0]->forward[0]
5617 * is always equal to x */
5618 update[i]->span[i-1] -= 1;
5619 }
5620 }
5621 if (x->forward[0]) {
5622 x->forward[0]->backward = x->backward;
5623 } else {
5624 zsl->tail = x->backward;
5625 }
5626 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5627 zsl->level--;
5628 zsl->length--;
5629}
5630
50c55df5 5631/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5632static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5633 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5634 int i;
5635
5636 x = zsl->header;
5637 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5638 while (x->forward[i] &&
5639 (x->forward[i]->score < score ||
5640 (x->forward[i]->score == score &&
5641 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5642 x = x->forward[i];
5643 update[i] = x;
5644 }
5645 /* We may have multiple elements with the same score, what we need
5646 * is to find the element with both the right score and object. */
5647 x = x->forward[0];
bf028098 5648 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
84105336 5649 zslDeleteNode(zsl, x, update);
9d60e6e4 5650 zslFreeNode(x);
9d60e6e4 5651 return 1;
5652 } else {
5653 return 0; /* not found */
e197b441 5654 }
5655 return 0; /* not found */
fd8ccf44 5656}
5657
1807985b 5658/* Delete all the elements with score between min and max from the skiplist.
5659 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5660 * Note that this function takes the reference to the hash table view of the
5661 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5662static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5663 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5664 unsigned long removed = 0;
5665 int i;
5666
5667 x = zsl->header;
5668 for (i = zsl->level-1; i >= 0; i--) {
5669 while (x->forward[i] && x->forward[i]->score < min)
5670 x = x->forward[i];
5671 update[i] = x;
5672 }
5673 /* We may have multiple elements with the same score, what we need
5674 * is to find the element with both the right score and object. */
5675 x = x->forward[0];
5676 while (x && x->score <= max) {
84105336
PN
5677 zskiplistNode *next = x->forward[0];
5678 zslDeleteNode(zsl, x, update);
1807985b 5679 dictDelete(dict,x->obj);
5680 zslFreeNode(x);
1807985b 5681 removed++;
5682 x = next;
5683 }
5684 return removed; /* not found */
5685}
1807985b 5686
9212eafd 5687/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5688 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5689static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5690 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5691 unsigned long traversed = 0, removed = 0;
5692 int i;
5693
9212eafd
PN
5694 x = zsl->header;
5695 for (i = zsl->level-1; i >= 0; i--) {
5696 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5697 traversed += i > 0 ? x->span[i-1] : 1;
5698 x = x->forward[i];
1807985b 5699 }
9212eafd
PN
5700 update[i] = x;
5701 }
5702
5703 traversed++;
5704 x = x->forward[0];
5705 while (x && traversed <= end) {
84105336
PN
5706 zskiplistNode *next = x->forward[0];
5707 zslDeleteNode(zsl, x, update);
1807985b 5708 dictDelete(dict,x->obj);
5709 zslFreeNode(x);
1807985b 5710 removed++;
9212eafd 5711 traversed++;
1807985b 5712 x = next;
5713 }
9212eafd 5714 return removed;
1807985b 5715}
5716
50c55df5 5717/* Find the first node having a score equal or greater than the specified one.
5718 * Returns NULL if there is no match. */
5719static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5720 zskiplistNode *x;
5721 int i;
5722
5723 x = zsl->header;
5724 for (i = zsl->level-1; i >= 0; i--) {
5725 while (x->forward[i] && x->forward[i]->score < score)
5726 x = x->forward[i];
5727 }
5728 /* We may have multiple elements with the same score, what we need
5729 * is to find the element with both the right score and object. */
5730 return x->forward[0];
5731}
5732
27b0ccca
PN
5733/* Find the rank for an element by both score and key.
5734 * Returns 0 when the element cannot be found, rank otherwise.
5735 * Note that the rank is 1-based due to the span of zsl->header to the
5736 * first element. */
5737static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5738 zskiplistNode *x;
5739 unsigned long rank = 0;
5740 int i;
5741
5742 x = zsl->header;
5743 for (i = zsl->level-1; i >= 0; i--) {
5744 while (x->forward[i] &&
5745 (x->forward[i]->score < score ||
5746 (x->forward[i]->score == score &&
5747 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5748 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5749 x = x->forward[i];
5750 }
5751
5752 /* x might be equal to zsl->header, so test if obj is non-NULL */
bf028098 5753 if (x->obj && equalStringObjects(x->obj,o)) {
27b0ccca
PN
5754 return rank;
5755 }
5756 }
5757 return 0;
5758}
5759
e74825c2
PN
5760/* Finds an element by its rank. The rank argument needs to be 1-based. */
5761zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5762 zskiplistNode *x;
5763 unsigned long traversed = 0;
5764 int i;
5765
5766 x = zsl->header;
5767 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 5768 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5769 {
a50ea45c 5770 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5771 x = x->forward[i];
5772 }
e74825c2
PN
5773 if (traversed == rank) {
5774 return x;
5775 }
5776 }
5777 return NULL;
5778}
5779
fd8ccf44 5780/* The actual Z-commands implementations */
5781
7db723ad 5782/* This generic command implements both ZADD and ZINCRBY.
e2665397 5783 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5784 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5785static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5786 robj *zsetobj;
5787 zset *zs;
5788 double *score;
5789
5fc9229c 5790 if (isnan(scoreval)) {
5791 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
5792 return;
5793 }
5794
e2665397 5795 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5796 if (zsetobj == NULL) {
5797 zsetobj = createZsetObject();
e2665397 5798 dictAdd(c->db->dict,key,zsetobj);
5799 incrRefCount(key);
fd8ccf44 5800 } else {
5801 if (zsetobj->type != REDIS_ZSET) {
5802 addReply(c,shared.wrongtypeerr);
5803 return;
5804 }
5805 }
fd8ccf44 5806 zs = zsetobj->ptr;
e2665397 5807
7db723ad 5808 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5809 * needs to handle the two different conditions. It's all about setting
5810 * '*score', that is, the new score to set, to the right value. */
5811 score = zmalloc(sizeof(double));
5812 if (doincrement) {
5813 dictEntry *de;
5814
5815 /* Read the old score. If the element was not present starts from 0 */
5816 de = dictFind(zs->dict,ele);
5817 if (de) {
5818 double *oldscore = dictGetEntryVal(de);
5819 *score = *oldscore + scoreval;
5820 } else {
5821 *score = scoreval;
5822 }
5fc9229c 5823 if (isnan(*score)) {
5824 addReplySds(c,
5825 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
5826 zfree(score);
5827 /* Note that we don't need to check if the zset may be empty and
5828 * should be removed here, as we can only obtain Nan as score if
5829 * there was already an element in the sorted set. */
5830 return;
5831 }
e2665397 5832 } else {
5833 *score = scoreval;
5834 }
5835
5836 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5837 * to both ZADD and ZINCRBY... */
e2665397 5838 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5839 /* case 1: New element */
e2665397 5840 incrRefCount(ele); /* added to hash */
5841 zslInsert(zs->zsl,*score,ele);
5842 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5843 server.dirty++;
e2665397 5844 if (doincrement)
e2665397 5845 addReplyDouble(c,*score);
91d71bfc 5846 else
5847 addReply(c,shared.cone);
fd8ccf44 5848 } else {
5849 dictEntry *de;
5850 double *oldscore;
e0a62c7f 5851
fd8ccf44 5852 /* case 2: Score update operation */
e2665397 5853 de = dictFind(zs->dict,ele);
dfc5e96c 5854 redisAssert(de != NULL);
fd8ccf44 5855 oldscore = dictGetEntryVal(de);
5856 if (*score != *oldscore) {
5857 int deleted;
5858
e2665397 5859 /* Remove and insert the element in the skip list with new score */
5860 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5861 redisAssert(deleted != 0);
e2665397 5862 zslInsert(zs->zsl,*score,ele);
5863 incrRefCount(ele);
5864 /* Update the score in the hash table */
5865 dictReplace(zs->dict,ele,score);
fd8ccf44 5866 server.dirty++;
2161a965 5867 } else {
5868 zfree(score);
fd8ccf44 5869 }
e2665397 5870 if (doincrement)
5871 addReplyDouble(c,*score);
5872 else
5873 addReply(c,shared.czero);
fd8ccf44 5874 }
5875}
5876
e2665397 5877static void zaddCommand(redisClient *c) {
5878 double scoreval;
5879
bd79a6bd 5880 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5881 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5882}
5883
7db723ad 5884static void zincrbyCommand(redisClient *c) {
e2665397 5885 double scoreval;
5886
bd79a6bd 5887 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5888 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5889}
5890
1b7106e7 5891static void zremCommand(redisClient *c) {
5892 robj *zsetobj;
5893 zset *zs;
dd88747b 5894 dictEntry *de;
5895 double *oldscore;
5896 int deleted;
1b7106e7 5897
dd88747b 5898 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5899 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 5900
dd88747b 5901 zs = zsetobj->ptr;
5902 de = dictFind(zs->dict,c->argv[2]);
5903 if (de == NULL) {
5904 addReply(c,shared.czero);
5905 return;
1b7106e7 5906 }
dd88747b 5907 /* Delete from the skiplist */
5908 oldscore = dictGetEntryVal(de);
5909 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5910 redisAssert(deleted != 0);
5911
5912 /* Delete from the hash table */
5913 dictDelete(zs->dict,c->argv[2]);
5914 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5915 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5916 server.dirty++;
5917 addReply(c,shared.cone);
1b7106e7 5918}
5919
1807985b 5920static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
5921 double min;
5922 double max;
dd88747b 5923 long deleted;
1807985b 5924 robj *zsetobj;
5925 zset *zs;
5926
bd79a6bd
PN
5927 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5928 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
bbe025e0 5929
dd88747b 5930 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5931 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 5932
dd88747b 5933 zs = zsetobj->ptr;
5934 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5935 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5936 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5937 server.dirty += deleted;
482b672d 5938 addReplyLongLong(c,deleted);
1807985b 5939}
5940
9212eafd 5941static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
5942 long start;
5943 long end;
dd88747b 5944 int llen;
5945 long deleted;
9212eafd
PN
5946 robj *zsetobj;
5947 zset *zs;
5948
bd79a6bd
PN
5949 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5950 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 5951
dd88747b 5952 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5953 checkType(c,zsetobj,REDIS_ZSET)) return;
5954 zs = zsetobj->ptr;
5955 llen = zs->zsl->length;
9212eafd 5956
dd88747b 5957 /* convert negative indexes */
5958 if (start < 0) start = llen+start;
5959 if (end < 0) end = llen+end;
5960 if (start < 0) start = 0;
5961 if (end < 0) end = 0;
9212eafd 5962
dd88747b 5963 /* indexes sanity checks */
5964 if (start > end || start >= llen) {
5965 addReply(c,shared.czero);
5966 return;
9212eafd 5967 }
dd88747b 5968 if (end >= llen) end = llen-1;
5969
5970 /* increment start and end because zsl*Rank functions
5971 * use 1-based rank */
5972 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5973 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5974 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5975 server.dirty += deleted;
482b672d 5976 addReplyLongLong(c, deleted);
9212eafd
PN
5977}
5978
8f92e768
PN
5979typedef struct {
5980 dict *dict;
5981 double weight;
5982} zsetopsrc;
5983
5984static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5985 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5986 unsigned long size1, size2;
5987 size1 = d1->dict ? dictSize(d1->dict) : 0;
5988 size2 = d2->dict ? dictSize(d2->dict) : 0;
5989 return size1 - size2;
5990}
5991
d2764cd6
PN
/* Aggregation modes for ZUNIONSTORE/ZINTERSTORE. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score of a dict entry: entries with a NULL value count as 1.0, otherwise
 * the value is read as a pointer to double. */
#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))

/* Fold 'val' into '*target' according to 'aggregate': add it
 * (REDIS_AGGR_SUM), keep the smaller of the two (REDIS_AGGR_MIN) or keep
 * the larger of the two (REDIS_AGGR_MAX). Any other mode is a panic. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
    }
}
6009
2830ca53 6010static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
bc000c1d 6011 int i, j, setnum;
d2764cd6 6012 int aggregate = REDIS_AGGR_SUM;
8f92e768 6013 zsetopsrc *src;
2830ca53
PN
6014 robj *dstobj;
6015 zset *dstzset;
b287c9bb
PN
6016 dictIterator *di;
6017 dictEntry *de;
6018
bc000c1d
JC
6019 /* expect setnum input keys to be given */
6020 setnum = atoi(c->argv[2]->ptr);
6021 if (setnum < 1) {
5d373da9 6022 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
2830ca53 6023 return;
b287c9bb 6024 }
2830ca53
PN
6025
6026 /* test if the expected number of keys would overflow */
bc000c1d 6027 if (3+setnum > c->argc) {
b287c9bb
PN
6028 addReply(c,shared.syntaxerr);
6029 return;
6030 }
6031
2830ca53 6032 /* read keys to be used for input */
bc000c1d
JC
6033 src = zmalloc(sizeof(zsetopsrc) * setnum);
6034 for (i = 0, j = 3; i < setnum; i++, j++) {
6035 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6036 if (!obj) {
8f92e768 6037 src[i].dict = NULL;
b287c9bb 6038 } else {
bc000c1d
JC
6039 if (obj->type == REDIS_ZSET) {
6040 src[i].dict = ((zset*)obj->ptr)->dict;
6041 } else if (obj->type == REDIS_SET) {
6042 src[i].dict = (obj->ptr);
6043 } else {
8f92e768 6044 zfree(src);
b287c9bb
PN
6045 addReply(c,shared.wrongtypeerr);
6046 return;
6047 }
b287c9bb 6048 }
2830ca53
PN
6049
6050 /* default all weights to 1 */
8f92e768 6051 src[i].weight = 1.0;
b287c9bb
PN
6052 }
6053
2830ca53
PN
6054 /* parse optional extra arguments */
6055 if (j < c->argc) {
d2764cd6 6056 int remaining = c->argc - j;
b287c9bb 6057
2830ca53 6058 while (remaining) {
bc000c1d 6059 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 6060 j++; remaining--;
bc000c1d 6061 for (i = 0; i < setnum; i++, j++, remaining--) {
bd79a6bd 6062 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
bbe025e0 6063 return;
2830ca53 6064 }
d2764cd6
PN
6065 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6066 j++; remaining--;
6067 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6068 aggregate = REDIS_AGGR_SUM;
6069 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6070 aggregate = REDIS_AGGR_MIN;
6071 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6072 aggregate = REDIS_AGGR_MAX;
6073 } else {
6074 zfree(src);
6075 addReply(c,shared.syntaxerr);
6076 return;
6077 }
6078 j++; remaining--;
2830ca53 6079 } else {
8f92e768 6080 zfree(src);
2830ca53
PN
6081 addReply(c,shared.syntaxerr);
6082 return;
6083 }
6084 }
6085 }
b287c9bb 6086
d2764cd6
PN
6087 /* sort sets from the smallest to largest, this will improve our
6088 * algorithm's performance */
bc000c1d 6089 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
d2764cd6 6090
2830ca53
PN
6091 dstobj = createZsetObject();
6092 dstzset = dstobj->ptr;
6093
6094 if (op == REDIS_OP_INTER) {
8f92e768
PN
6095 /* skip going over all entries if the smallest zset is NULL or empty */
6096 if (src[0].dict && dictSize(src[0].dict) > 0) {
6097 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6098 * from small to large, all src[i > 0].dict are non-empty too */
6099 di = dictGetIterator(src[0].dict);
2830ca53 6100 while((de = dictNext(di)) != NULL) {
d2764cd6 6101 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6102 *score = src[0].weight * zunionInterDictValue(de);
2830ca53 6103
bc000c1d 6104 for (j = 1; j < setnum; j++) {
d2764cd6 6105 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6106 if (other) {
bc000c1d 6107 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6108 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6109 } else {
6110 break;
6111 }
6112 }
b287c9bb 6113
2830ca53 6114 /* skip entry when not present in every source dict */
bc000c1d 6115 if (j != setnum) {
2830ca53
PN
6116 zfree(score);
6117 } else {
6118 robj *o = dictGetEntryKey(de);
6119 dictAdd(dstzset->dict,o,score);
6120 incrRefCount(o); /* added to dictionary */
6121 zslInsert(dstzset->zsl,*score,o);
6122 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
6123 }
6124 }
2830ca53
PN
6125 dictReleaseIterator(di);
6126 }
6127 } else if (op == REDIS_OP_UNION) {
bc000c1d 6128 for (i = 0; i < setnum; i++) {
8f92e768 6129 if (!src[i].dict) continue;
2830ca53 6130
8f92e768 6131 di = dictGetIterator(src[i].dict);
2830ca53
PN
6132 while((de = dictNext(di)) != NULL) {
6133 /* skip key when already processed */
6134 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6135
d2764cd6 6136 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6137 *score = src[i].weight * zunionInterDictValue(de);
2830ca53 6138
d2764cd6
PN
6139 /* because the zsets are sorted by size, its only possible
6140 * for sets at larger indices to hold this entry */
bc000c1d 6141 for (j = (i+1); j < setnum; j++) {
d2764cd6 6142 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6143 if (other) {
bc000c1d 6144 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6145 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6146 }
6147 }
b287c9bb 6148
2830ca53
PN
6149 robj *o = dictGetEntryKey(de);
6150 dictAdd(dstzset->dict,o,score);
6151 incrRefCount(o); /* added to dictionary */
6152 zslInsert(dstzset->zsl,*score,o);
6153 incrRefCount(o); /* added to skiplist */
6154 }
6155 dictReleaseIterator(di);
b287c9bb 6156 }
2830ca53
PN
6157 } else {
6158 /* unknown operator */
6159 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
6160 }
6161
6162 deleteKey(c->db,dstkey);
3ea27d37 6163 if (dstzset->zsl->length) {
6164 dictAdd(c->db->dict,dstkey,dstobj);
6165 incrRefCount(dstkey);
482b672d 6166 addReplyLongLong(c, dstzset->zsl->length);
3ea27d37 6167 server.dirty++;
6168 } else {
8bca8773 6169 decrRefCount(dstobj);
3ea27d37 6170 addReply(c, shared.czero);
6171 }
8f92e768 6172 zfree(src);
b287c9bb
PN
6173}
6174
5d373da9 6175static void zunionstoreCommand(redisClient *c) {
2830ca53 6176 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
6177}
6178
5d373da9 6179static void zinterstoreCommand(redisClient *c) {
2830ca53 6180 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
6181}
6182
e3870fab 6183static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 6184 robj *o;
bbe025e0
AM
6185 long start;
6186 long end;
752da584 6187 int withscores = 0;
dd88747b 6188 int llen;
6189 int rangelen, j;
6190 zset *zsetobj;
6191 zskiplist *zsl;
6192 zskiplistNode *ln;
6193 robj *ele;
752da584 6194
bd79a6bd
PN
6195 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6196 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6197
752da584 6198 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6199 withscores = 1;
6200 } else if (c->argc >= 5) {
6201 addReply(c,shared.syntaxerr);
6202 return;
6203 }
cc812361 6204
4e27f268 6205 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6206 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 6207 zsetobj = o->ptr;
6208 zsl = zsetobj->zsl;
6209 llen = zsl->length;
cc812361 6210
dd88747b 6211 /* convert negative indexes */
6212 if (start < 0) start = llen+start;
6213 if (end < 0) end = llen+end;
6214 if (start < 0) start = 0;
6215 if (end < 0) end = 0;
cc812361 6216
dd88747b 6217 /* indexes sanity checks */
6218 if (start > end || start >= llen) {
6219 /* Out of range start or start > end result in empty list */
6220 addReply(c,shared.emptymultibulk);
6221 return;
6222 }
6223 if (end >= llen) end = llen-1;
6224 rangelen = (end-start)+1;
cc812361 6225
dd88747b 6226 /* check if starting point is trivial, before searching
6227 * the element in log(N) time */
6228 if (reverse) {
6229 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6230 } else {
6231 ln = start == 0 ?
6232 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6233 }
cc812361 6234
dd88747b 6235 /* Return the result in form of a multi-bulk reply */
6236 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6237 withscores ? (rangelen*2) : rangelen));
6238 for (j = 0; j < rangelen; j++) {
6239 ele = ln->obj;
6240 addReplyBulk(c,ele);
6241 if (withscores)
6242 addReplyDouble(c,ln->score);
6243 ln = reverse ? ln->backward : ln->forward[0];
cc812361 6244 }
6245}
6246
e3870fab 6247static void zrangeCommand(redisClient *c) {
6248 zrangeGenericCommand(c,0);
6249}
6250
6251static void zrevrangeCommand(redisClient *c) {
6252 zrangeGenericCommand(c,1);
6253}
6254
f44dd428 6255/* This command implements both ZRANGEBYSCORE and ZCOUNT.
6256 * If justcount is non-zero, just the count is returned. */
6257static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 6258 robj *o;
f44dd428 6259 double min, max;
6260 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 6261 int offset = 0, limit = -1;
0500ef27
SH
6262 int withscores = 0;
6263 int badsyntax = 0;
6264
f44dd428 6265 /* Parse the min-max interval. If one of the values is prefixed
6266 * by the "(" character, it's considered "open". For instance
6267 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6268 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6269 if (((char*)c->argv[2]->ptr)[0] == '(') {
6270 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6271 minex = 1;
6272 } else {
6273 min = strtod(c->argv[2]->ptr,NULL);
6274 }
6275 if (((char*)c->argv[3]->ptr)[0] == '(') {
6276 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6277 maxex = 1;
6278 } else {
6279 max = strtod(c->argv[3]->ptr,NULL);
6280 }
6281
6282 /* Parse "WITHSCORES": note that if the command was called with
6283 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6284 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 6285 if (c->argc == 5 || c->argc == 8) {
3a3978b1 6286 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6287 withscores = 1;
6288 else
6289 badsyntax = 1;
0500ef27 6290 }
3a3978b1 6291 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 6292 badsyntax = 1;
0500ef27 6293 if (badsyntax) {
454d4e43 6294 addReplySds(c,
6295 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 6296 return;
0500ef27
SH
6297 }
6298
f44dd428 6299 /* Parse "LIMIT" */
0500ef27 6300 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 6301 addReply(c,shared.syntaxerr);
6302 return;
0500ef27 6303 } else if (c->argc == (7 + withscores)) {
80181f78 6304 offset = atoi(c->argv[5]->ptr);
6305 limit = atoi(c->argv[6]->ptr);
0b13687c 6306 if (offset < 0) offset = 0;
80181f78 6307 }
50c55df5 6308
f44dd428 6309 /* Ok, lookup the key and get the range */
50c55df5 6310 o = lookupKeyRead(c->db,c->argv[1]);
6311 if (o == NULL) {
4e27f268 6312 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6313 } else {
6314 if (o->type != REDIS_ZSET) {
6315 addReply(c,shared.wrongtypeerr);
6316 } else {
6317 zset *zsetobj = o->ptr;
6318 zskiplist *zsl = zsetobj->zsl;
6319 zskiplistNode *ln;
f44dd428 6320 robj *ele, *lenobj = NULL;
6321 unsigned long rangelen = 0;
50c55df5 6322
f44dd428 6323 /* Get the first node with the score >= min, or with
6324 * score > min if 'minex' is true. */
50c55df5 6325 ln = zslFirstWithScore(zsl,min);
f44dd428 6326 while (minex && ln && ln->score == min) ln = ln->forward[0];
6327
50c55df5 6328 if (ln == NULL) {
6329 /* No element matching the speciifed interval */
f44dd428 6330 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6331 return;
6332 }
6333
6334 /* We don't know in advance how many matching elements there
6335 * are in the list, so we push this object that will represent
6336 * the multi-bulk length in the output buffer, and will "fix"
6337 * it later */
f44dd428 6338 if (!justcount) {
6339 lenobj = createObject(REDIS_STRING,NULL);
6340 addReply(c,lenobj);
6341 decrRefCount(lenobj);
6342 }
50c55df5 6343
f44dd428 6344 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 6345 if (offset) {
6346 offset--;
6347 ln = ln->forward[0];
6348 continue;
6349 }
6350 if (limit == 0) break;
f44dd428 6351 if (!justcount) {
6352 ele = ln->obj;
dd88747b 6353 addReplyBulk(c,ele);
f44dd428 6354 if (withscores)
6355 addReplyDouble(c,ln->score);
6356 }
50c55df5 6357 ln = ln->forward[0];
6358 rangelen++;
80181f78 6359 if (limit > 0) limit--;
50c55df5 6360 }
f44dd428 6361 if (justcount) {
482b672d 6362 addReplyLongLong(c,(long)rangelen);
f44dd428 6363 } else {
6364 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6365 withscores ? (rangelen*2) : rangelen);
6366 }
50c55df5 6367 }
6368 }
6369}
6370
f44dd428 6371static void zrangebyscoreCommand(redisClient *c) {
6372 genericZrangebyscoreCommand(c,0);
6373}
6374
6375static void zcountCommand(redisClient *c) {
6376 genericZrangebyscoreCommand(c,1);
6377}
6378
3c41331e 6379static void zcardCommand(redisClient *c) {
e197b441 6380 robj *o;
6381 zset *zs;
dd88747b 6382
6383 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6384 checkType(c,o,REDIS_ZSET)) return;
6385
6386 zs = o->ptr;
6387 addReplyUlong(c,zs->zsl->length);
e197b441 6388}
6389
6e333bbe 6390static void zscoreCommand(redisClient *c) {
6391 robj *o;
6392 zset *zs;
dd88747b 6393 dictEntry *de;
6394
6395 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6396 checkType(c,o,REDIS_ZSET)) return;
6397
6398 zs = o->ptr;
6399 de = dictFind(zs->dict,c->argv[2]);
6400 if (!de) {
96d8b4ee 6401 addReply(c,shared.nullbulk);
6e333bbe 6402 } else {
dd88747b 6403 double *score = dictGetEntryVal(de);
6e333bbe 6404
dd88747b 6405 addReplyDouble(c,*score);
6e333bbe 6406 }
6407}
6408
798d9e55 6409static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 6410 robj *o;
dd88747b 6411 zset *zs;
6412 zskiplist *zsl;
6413 dictEntry *de;
6414 unsigned long rank;
6415 double *score;
6416
6417 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6418 checkType(c,o,REDIS_ZSET)) return;
6419
6420 zs = o->ptr;
6421 zsl = zs->zsl;
6422 de = dictFind(zs->dict,c->argv[2]);
6423 if (!de) {
69d95c3e
PN
6424 addReply(c,shared.nullbulk);
6425 return;
6426 }
69d95c3e 6427
dd88747b 6428 score = dictGetEntryVal(de);
6429 rank = zslGetRank(zsl, *score, c->argv[2]);
6430 if (rank) {
6431 if (reverse) {
482b672d 6432 addReplyLongLong(c, zsl->length - rank);
27b0ccca 6433 } else {
482b672d 6434 addReplyLongLong(c, rank-1);
69d95c3e 6435 }
dd88747b 6436 } else {
6437 addReply(c,shared.nullbulk);
978c2c94 6438 }
6439}
6440
798d9e55
PN
6441static void zrankCommand(redisClient *c) {
6442 zrankGenericCommand(c, 0);
6443}
6444
6445static void zrevrankCommand(redisClient *c) {
6446 zrankGenericCommand(c, 1);
6447}
6448
7fb16bac
PN
6449/* ========================= Hashes utility functions ======================= */
/* Bit flags selecting which part of a hash entry is wanted; they may be
 * OR-ed together. */
#define REDIS_HASH_KEY 1
#define REDIS_HASH_VALUE 2
978c2c94 6452
7fb16bac
PN
6453/* Check the length of a number of objects to see if we need to convert a
6454 * zipmap to a real hash. Note that we only check string encoded objects
6455 * as their string length can be queried in constant time. */
6456static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6457 int i;
6458 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
978c2c94 6459
7fb16bac
PN
6460 for (i = start; i <= end; i++) {
6461 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6462 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6463 {
6464 convertToRealHash(subject);
978c2c94 6465 return;
6466 }
6467 }
7fb16bac 6468}
bae2c7ec 6469
97224de7
PN
6470/* Encode given objects in-place when the hash uses a dict. */
6471static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6472 if (subject->encoding == REDIS_ENCODING_HT) {
3f973463
PN
6473 if (o1) *o1 = tryObjectEncoding(*o1);
6474 if (o2) *o2 = tryObjectEncoding(*o2);
97224de7
PN
6475 }
6476}
6477
7fb16bac 6478/* Get the value from a hash identified by key. Returns either a string
a3f3af86
PN
6479 * object or NULL if the value cannot be found. The refcount of the object
6480 * is always increased by 1 when the value was found. */
7fb16bac
PN
6481static robj *hashGet(robj *o, robj *key) {
6482 robj *value = NULL;
978c2c94 6483 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7fb16bac
PN
6484 unsigned char *v;
6485 unsigned int vlen;
6486 key = getDecodedObject(key);
6487 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6488 value = createStringObject((char*)v,vlen);
6489 }
6490 decrRefCount(key);
6491 } else {
6492 dictEntry *de = dictFind(o->ptr,key);
6493 if (de != NULL) {
6494 value = dictGetEntryVal(de);
a3f3af86 6495 incrRefCount(value);
7fb16bac
PN
6496 }
6497 }
6498 return value;
6499}
978c2c94 6500
7fb16bac
PN
6501/* Test if the key exists in the given hash. Returns 1 if the key
6502 * exists and 0 when it doesn't. */
6503static int hashExists(robj *o, robj *key) {
6504 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6505 key = getDecodedObject(key);
6506 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6507 decrRefCount(key);
6508 return 1;
6509 }
6510 decrRefCount(key);
6511 } else {
6512 if (dictFind(o->ptr,key) != NULL) {
6513 return 1;
6514 }
6515 }
6516 return 0;
6517}
bae2c7ec 6518
7fb16bac
PN
6519/* Add an element, discard the old if the key already exists.
6520 * Return 0 on insert and 1 on update. */
feb8d7e6 6521static int hashSet(robj *o, robj *key, robj *value) {
7fb16bac
PN
6522 int update = 0;
6523 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6524 key = getDecodedObject(key);
6525 value = getDecodedObject(value);
6526 o->ptr = zipmapSet(o->ptr,
6527 key->ptr,sdslen(key->ptr),
6528 value->ptr,sdslen(value->ptr), &update);
6529 decrRefCount(key);
6530 decrRefCount(value);
6531
6532 /* Check if the zipmap needs to be upgraded to a real hash table */
6533 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
bae2c7ec 6534 convertToRealHash(o);
978c2c94 6535 } else {
7fb16bac
PN
6536 if (dictReplace(o->ptr,key,value)) {
6537 /* Insert */
6538 incrRefCount(key);
978c2c94 6539 } else {
7fb16bac 6540 /* Update */
978c2c94 6541 update = 1;
6542 }
7fb16bac 6543 incrRefCount(value);
978c2c94 6544 }
7fb16bac 6545 return update;
978c2c94 6546}
6547
7fb16bac
PN
6548/* Delete an element from a hash.
6549 * Return 1 on deleted and 0 on not found. */
6550static int hashDelete(robj *o, robj *key) {
6551 int deleted = 0;
6552 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6553 key = getDecodedObject(key);
6554 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6555 decrRefCount(key);
6556 } else {
6557 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6558 /* Always check if the dictionary needs a resize after a delete. */
6559 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
d33278d1 6560 }
7fb16bac
PN
6561 return deleted;
6562}
d33278d1 6563
7fb16bac 6564/* Return the number of elements in a hash. */
c811bb38 6565static unsigned long hashLength(robj *o) {
7fb16bac
PN
6566 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6567 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6568}
6569
6570/* Structure to hold hash iteration abstration. Note that iteration over
6571 * hashes involves both fields and values. Because it is possible that
6572 * not both are required, store pointers in the iterator to avoid
6573 * unnecessary memory allocation for fields/values. */
6574typedef struct {
6575 int encoding;
6576 unsigned char *zi;
6577 unsigned char *zk, *zv;
6578 unsigned int zklen, zvlen;
6579
6580 dictIterator *di;
6581 dictEntry *de;
6582} hashIterator;
6583
c44d3b56
PN
6584static hashIterator *hashInitIterator(robj *subject) {
6585 hashIterator *hi = zmalloc(sizeof(hashIterator));
7fb16bac
PN
6586 hi->encoding = subject->encoding;
6587 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6588 hi->zi = zipmapRewind(subject->ptr);
6589 } else if (hi->encoding == REDIS_ENCODING_HT) {
6590 hi->di = dictGetIterator(subject->ptr);
d33278d1 6591 } else {
7fb16bac 6592 redisAssert(NULL);
d33278d1 6593 }
c44d3b56 6594 return hi;
7fb16bac 6595}
d33278d1 6596
7fb16bac
PN
6597static void hashReleaseIterator(hashIterator *hi) {
6598 if (hi->encoding == REDIS_ENCODING_HT) {
6599 dictReleaseIterator(hi->di);
d33278d1 6600 }
c44d3b56 6601 zfree(hi);
7fb16bac 6602}
d33278d1 6603
7fb16bac
PN
6604/* Move to the next entry in the hash. Return REDIS_OK when the next entry
6605 * could be found and REDIS_ERR when the iterator reaches the end. */
c811bb38 6606static int hashNext(hashIterator *hi) {
7fb16bac
PN
6607 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6608 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6609 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6610 } else {
6611 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6612 }
6613 return REDIS_OK;
6614}
d33278d1 6615
0c390abc 6616/* Get key or value object at current iteration position.
a3f3af86 6617 * This increases the refcount of the field object by 1. */
c811bb38 6618static robj *hashCurrent(hashIterator *hi, int what) {
7fb16bac
PN
6619 robj *o;
6620 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6621 if (what & REDIS_HASH_KEY) {
6622 o = createStringObject((char*)hi->zk,hi->zklen);
6623 } else {
6624 o = createStringObject((char*)hi->zv,hi->zvlen);
d33278d1 6625 }
d33278d1 6626 } else {
7fb16bac
PN
6627 if (what & REDIS_HASH_KEY) {
6628 o = dictGetEntryKey(hi->de);
6629 } else {
6630 o = dictGetEntryVal(hi->de);
d33278d1 6631 }
a3f3af86 6632 incrRefCount(o);
d33278d1 6633 }
7fb16bac 6634 return o;
d33278d1
PN
6635}
6636
7fb16bac
PN
6637static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6638 robj *o = lookupKeyWrite(c->db,key);
01426b05
PN
6639 if (o == NULL) {
6640 o = createHashObject();
7fb16bac
PN
6641 dictAdd(c->db->dict,key,o);
6642 incrRefCount(key);
01426b05
PN
6643 } else {
6644 if (o->type != REDIS_HASH) {
6645 addReply(c,shared.wrongtypeerr);
7fb16bac 6646 return NULL;
01426b05
PN
6647 }
6648 }
7fb16bac
PN
6649 return o;
6650}
01426b05 6651
7fb16bac
PN
6652/* ============================= Hash commands ============================== */
6653static void hsetCommand(redisClient *c) {
6e9e463f 6654 int update;
7fb16bac 6655 robj *o;
bbe025e0 6656
7fb16bac
PN
6657 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6658 hashTryConversion(o,c->argv,2,3);
97224de7 6659 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6660 update = hashSet(o,c->argv[2],c->argv[3]);
6e9e463f 6661 addReply(c, update ? shared.czero : shared.cone);
7fb16bac
PN
6662 server.dirty++;
6663}
01426b05 6664
1f1c7695
PN
6665static void hsetnxCommand(redisClient *c) {
6666 robj *o;
6667 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6668 hashTryConversion(o,c->argv,2,3);
6669
6670 if (hashExists(o, c->argv[2])) {
6671 addReply(c, shared.czero);
01426b05 6672 } else {
97224de7 6673 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6674 hashSet(o,c->argv[2],c->argv[3]);
1f1c7695
PN
6675 addReply(c, shared.cone);
6676 server.dirty++;
6677 }
6678}
01426b05 6679
7fb16bac
PN
6680static void hmsetCommand(redisClient *c) {
6681 int i;
6682 robj *o;
01426b05 6683
7fb16bac
PN
6684 if ((c->argc % 2) == 1) {
6685 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6686 return;
6687 }
01426b05 6688
7fb16bac
PN
6689 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6690 hashTryConversion(o,c->argv,2,c->argc-1);
6691 for (i = 2; i < c->argc; i += 2) {
97224de7 6692 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
feb8d7e6 6693 hashSet(o,c->argv[i],c->argv[i+1]);
7fb16bac
PN
6694 }
6695 addReply(c, shared.ok);
edc2f63a 6696 server.dirty++;
7fb16bac
PN
6697}
6698
6699static void hincrbyCommand(redisClient *c) {
6700 long long value, incr;
6701 robj *o, *current, *new;
6702
bd79a6bd 6703 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7fb16bac
PN
6704 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6705 if ((current = hashGet(o,c->argv[2])) != NULL) {
946342c1
PN
6706 if (getLongLongFromObjectOrReply(c,current,&value,
6707 "hash value is not an integer") != REDIS_OK) {
6708 decrRefCount(current);
6709 return;
6710 }
a3f3af86 6711 decrRefCount(current);
7fb16bac
PN
6712 } else {
6713 value = 0;
01426b05
PN
6714 }
6715
7fb16bac 6716 value += incr;
3f973463
PN
6717 new = createStringObjectFromLongLong(value);
6718 hashTryObjectEncoding(o,&c->argv[2],NULL);
feb8d7e6 6719 hashSet(o,c->argv[2],new);
7fb16bac
PN
6720 decrRefCount(new);
6721 addReplyLongLong(c,value);
01426b05 6722 server.dirty++;
01426b05
PN
6723}
6724
978c2c94 6725static void hgetCommand(redisClient *c) {
7fb16bac 6726 robj *o, *value;
dd88747b 6727 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6728 checkType(c,o,REDIS_HASH)) return;
6729
7fb16bac
PN
6730 if ((value = hashGet(o,c->argv[2])) != NULL) {
6731 addReplyBulk(c,value);
a3f3af86 6732 decrRefCount(value);
dd88747b 6733 } else {
7fb16bac 6734 addReply(c,shared.nullbulk);
69d95c3e 6735 }
69d95c3e
PN
6736}
6737
09aeb579
PN
6738static void hmgetCommand(redisClient *c) {
6739 int i;
7fb16bac
PN
6740 robj *o, *value;
6741 o = lookupKeyRead(c->db,c->argv[1]);
6742 if (o != NULL && o->type != REDIS_HASH) {
6743 addReply(c,shared.wrongtypeerr);
09aeb579
PN
6744 }
6745
7fb16bac
PN
6746 /* Note the check for o != NULL happens inside the loop. This is
6747 * done because objects that cannot be found are considered to be
6748 * an empty hash. The reply should then be a series of NULLs. */
09aeb579 6749 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7fb16bac
PN
6750 for (i = 2; i < c->argc; i++) {
6751 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6752 addReplyBulk(c,value);
a3f3af86 6753 decrRefCount(value);
7fb16bac
PN
6754 } else {
6755 addReply(c,shared.nullbulk);
09aeb579
PN
6756 }
6757 }
6758}
6759
07efaf74 6760static void hdelCommand(redisClient *c) {
dd88747b 6761 robj *o;
dd88747b 6762 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6763 checkType(c,o,REDIS_HASH)) return;
07efaf74 6764
7fb16bac
PN
6765 if (hashDelete(o,c->argv[2])) {
6766 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6767 addReply(c,shared.cone);
6768 server.dirty++;
dd88747b 6769 } else {
7fb16bac 6770 addReply(c,shared.czero);
07efaf74 6771 }
6772}
6773
92b27fe9 6774static void hlenCommand(redisClient *c) {
6775 robj *o;
dd88747b 6776 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 6777 checkType(c,o,REDIS_HASH)) return;
6778
7fb16bac 6779 addReplyUlong(c,hashLength(o));
92b27fe9 6780}
6781
78409a0f 6782static void genericHgetallCommand(redisClient *c, int flags) {
7fb16bac 6783 robj *o, *lenobj, *obj;
78409a0f 6784 unsigned long count = 0;
c44d3b56 6785 hashIterator *hi;
78409a0f 6786
4e27f268 6787 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 6788 || checkType(c,o,REDIS_HASH)) return;
6789
6790 lenobj = createObject(REDIS_STRING,NULL);
6791 addReply(c,lenobj);
6792 decrRefCount(lenobj);
6793
c44d3b56
PN
6794 hi = hashInitIterator(o);
6795 while (hashNext(hi) != REDIS_ERR) {
7fb16bac 6796 if (flags & REDIS_HASH_KEY) {
c44d3b56 6797 obj = hashCurrent(hi,REDIS_HASH_KEY);
7fb16bac 6798 addReplyBulk(c,obj);
a3f3af86 6799 decrRefCount(obj);
7fb16bac 6800 count++;
78409a0f 6801 }
7fb16bac 6802 if (flags & REDIS_HASH_VALUE) {
c44d3b56 6803 obj = hashCurrent(hi,REDIS_HASH_VALUE);
7fb16bac 6804 addReplyBulk(c,obj);
a3f3af86 6805 decrRefCount(obj);
7fb16bac 6806 count++;
78409a0f 6807 }
78409a0f 6808 }
c44d3b56 6809 hashReleaseIterator(hi);
7fb16bac 6810
78409a0f 6811 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6812}
6813
6814static void hkeysCommand(redisClient *c) {
7fb16bac 6815 genericHgetallCommand(c,REDIS_HASH_KEY);
78409a0f 6816}
6817
6818static void hvalsCommand(redisClient *c) {
7fb16bac 6819 genericHgetallCommand(c,REDIS_HASH_VALUE);
78409a0f 6820}
6821
6822static void hgetallCommand(redisClient *c) {
7fb16bac 6823 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
78409a0f 6824}
6825
a86f14b1 6826static void hexistsCommand(redisClient *c) {
6827 robj *o;
a86f14b1 6828 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6829 checkType(c,o,REDIS_HASH)) return;
6830
7fb16bac 6831 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
a86f14b1 6832}
6833
ada386b2 6834static void convertToRealHash(robj *o) {
6835 unsigned char *key, *val, *p, *zm = o->ptr;
6836 unsigned int klen, vlen;
6837 dict *dict = dictCreate(&hashDictType,NULL);
6838
6839 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6840 p = zipmapRewind(zm);
6841 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6842 robj *keyobj, *valobj;
6843
6844 keyobj = createStringObject((char*)key,klen);
6845 valobj = createStringObject((char*)val,vlen);
05df7621 6846 keyobj = tryObjectEncoding(keyobj);
6847 valobj = tryObjectEncoding(valobj);
ada386b2 6848 dictAdd(dict,keyobj,valobj);
6849 }
6850 o->encoding = REDIS_ENCODING_HT;
6851 o->ptr = dict;
6852 zfree(zm);
6853}
6854
6b47e12e 6855/* ========================= Non type-specific commands ==================== */
6856
ed9b544e 6857static void flushdbCommand(redisClient *c) {
ca37e9cd 6858 server.dirty += dictSize(c->db->dict);
9b30e1a2 6859 touchWatchedKeysOnFlush(c->db->id);
3305306f 6860 dictEmpty(c->db->dict);
6861 dictEmpty(c->db->expires);
ed9b544e 6862 addReply(c,shared.ok);
ed9b544e 6863}
6864
6865static void flushallCommand(redisClient *c) {
9b30e1a2 6866 touchWatchedKeysOnFlush(-1);
ca37e9cd 6867 server.dirty += emptyDb();
ed9b544e 6868 addReply(c,shared.ok);
500ece7c 6869 if (server.bgsavechildpid != -1) {
6870 kill(server.bgsavechildpid,SIGKILL);
6871 rdbRemoveTempFile(server.bgsavechildpid);
6872 }
f78fd11b 6873 rdbSave(server.dbfilename);
ca37e9cd 6874 server.dirty++;
ed9b544e 6875}
6876
56906eef 6877static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 6878 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 6879 so->type = type;
6880 so->pattern = pattern;
6881 return so;
6882}
6883
6884/* Return the value associated to the key with a name obtained
55017f9d
PN
6885 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6886 * The returned object will always have its refcount increased by 1
6887 * when it is non-NULL. */
56906eef 6888static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6d7d1370 6889 char *p, *f;
ed9b544e 6890 sds spat, ssub;
6d7d1370
PN
6891 robj keyobj, fieldobj, *o;
6892 int prefixlen, sublen, postfixlen, fieldlen;
ed9b544e 6893 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6894 struct {
f1017b3f 6895 long len;
6896 long free;
ed9b544e 6897 char buf[REDIS_SORTKEY_MAX+1];
6d7d1370 6898 } keyname, fieldname;
ed9b544e 6899
28173a49 6900 /* If the pattern is "#" return the substitution object itself in order
6901 * to implement the "SORT ... GET #" feature. */
6902 spat = pattern->ptr;
6903 if (spat[0] == '#' && spat[1] == '\0') {
55017f9d 6904 incrRefCount(subst);
28173a49 6905 return subst;
6906 }
6907
6908 /* The substitution object may be specially encoded. If so we create
9d65a1bb 6909 * a decoded object on the fly. Otherwise getDecodedObject will just
6910 * increment the ref count, that we'll decrement later. */
6911 subst = getDecodedObject(subst);
942a3961 6912
ed9b544e 6913 ssub = subst->ptr;
6914 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6915 p = strchr(spat,'*');
ed5a857a 6916 if (!p) {
6917 decrRefCount(subst);
6918 return NULL;
6919 }
ed9b544e 6920
6d7d1370
PN
6921 /* Find out if we're dealing with a hash dereference. */
6922 if ((f = strstr(p+1, "->")) != NULL) {
6923 fieldlen = sdslen(spat)-(f-spat);
6924 /* this also copies \0 character */
6925 memcpy(fieldname.buf,f+2,fieldlen-1);
6926 fieldname.len = fieldlen-2;
6927 } else {
6928 fieldlen = 0;
6929 }
6930
ed9b544e 6931 prefixlen = p-spat;
6932 sublen = sdslen(ssub);
6d7d1370 6933 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
ed9b544e 6934 memcpy(keyname.buf,spat,prefixlen);
6935 memcpy(keyname.buf+prefixlen,ssub,sublen);
6936 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6937 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6938 keyname.len = prefixlen+sublen+postfixlen;
942a3961 6939 decrRefCount(subst);
6940
6d7d1370
PN
6941 /* Lookup substituted key */
6942 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6943 o = lookupKeyRead(db,&keyobj);
55017f9d
PN
6944 if (o == NULL) return NULL;
6945
6946 if (fieldlen > 0) {
6947 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6d7d1370 6948
705dad38
PN
6949 /* Retrieve value from hash by the field name. This operation
6950 * already increases the refcount of the returned object. */
6d7d1370
PN
6951 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6952 o = hashGet(o, &fieldobj);
705dad38 6953 } else {
55017f9d 6954 if (o->type != REDIS_STRING) return NULL;
b6f07345 6955
705dad38
PN
6956 /* Every object that this function returns needs to have its refcount
6957 * increased. sortCommand decreases it again. */
6958 incrRefCount(o);
6d7d1370
PN
6959 }
6960
6961 return o;
ed9b544e 6962}
6963
6964/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6965 * the additional parameter is not standard but a BSD-specific we have to
6966 * pass sorting parameters via the global 'server' structure */
6967static int sortCompare(const void *s1, const void *s2) {
6968 const redisSortObject *so1 = s1, *so2 = s2;
6969 int cmp;
6970
6971 if (!server.sort_alpha) {
6972 /* Numeric sorting. Here it's trivial as we precomputed scores */
6973 if (so1->u.score > so2->u.score) {
6974 cmp = 1;
6975 } else if (so1->u.score < so2->u.score) {
6976 cmp = -1;
6977 } else {
6978 cmp = 0;
6979 }
6980 } else {
6981 /* Alphanumeric sorting */
6982 if (server.sort_bypattern) {
6983 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6984 /* At least one compare object is NULL */
6985 if (so1->u.cmpobj == so2->u.cmpobj)
6986 cmp = 0;
6987 else if (so1->u.cmpobj == NULL)
6988 cmp = -1;
6989 else
6990 cmp = 1;
6991 } else {
6992 /* We have both the objects, use strcoll */
6993 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6994 }
6995 } else {
08ee9b57 6996 /* Compare elements directly. */
6997 cmp = compareStringObjects(so1->obj,so2->obj);
ed9b544e 6998 }
6999 }
7000 return server.sort_desc ? -cmp : cmp;
7001}
7002
7003/* The SORT command is the most complex command in Redis. Warning: this code
7004 * is optimized for speed and a bit less for readability */
7005static void sortCommand(redisClient *c) {
ed9b544e 7006 list *operations;
7007 int outputlen = 0;
7008 int desc = 0, alpha = 0;
7009 int limit_start = 0, limit_count = -1, start, end;
7010 int j, dontsort = 0, vectorlen;
7011 int getop = 0; /* GET operation counter */
443c6409 7012 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 7013 redisSortObject *vector; /* Resulting vector to sort */
7014
7015 /* Lookup the key to sort. It must be of the right types */
3305306f 7016 sortval = lookupKeyRead(c->db,c->argv[1]);
7017 if (sortval == NULL) {
4e27f268 7018 addReply(c,shared.emptymultibulk);
ed9b544e 7019 return;
7020 }
a5eb649b 7021 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
7022 sortval->type != REDIS_ZSET)
7023 {
c937aa89 7024 addReply(c,shared.wrongtypeerr);
ed9b544e 7025 return;
7026 }
7027
7028 /* Create a list of operations to perform for every sorted element.
7029 * Operations can be GET/DEL/INCR/DECR */
7030 operations = listCreate();
092dac2a 7031 listSetFreeMethod(operations,zfree);
ed9b544e 7032 j = 2;
7033
7034 /* Now we need to protect sortval incrementing its count, in the future
7035 * SORT may have options able to overwrite/delete keys during the sorting
7036 * and the sorted key itself may get destroied */
7037 incrRefCount(sortval);
7038
7039 /* The SORT command has an SQL-alike syntax, parse it */
7040 while(j < c->argc) {
7041 int leftargs = c->argc-j-1;
7042 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7043 desc = 0;
7044 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7045 desc = 1;
7046 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7047 alpha = 1;
7048 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7049 limit_start = atoi(c->argv[j+1]->ptr);
7050 limit_count = atoi(c->argv[j+2]->ptr);
7051 j+=2;
443c6409 7052 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7053 storekey = c->argv[j+1];
7054 j++;
ed9b544e 7055 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7056 sortby = c->argv[j+1];
7057 /* If the BY pattern does not contain '*', i.e. it is constant,
7058 * we don't need to sort nor to lookup the weight keys. */
7059 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7060 j++;
7061 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7062 listAddNodeTail(operations,createSortOperation(
7063 REDIS_SORT_GET,c->argv[j+1]));
7064 getop++;
7065 j++;
ed9b544e 7066 } else {
7067 decrRefCount(sortval);
7068 listRelease(operations);
c937aa89 7069 addReply(c,shared.syntaxerr);
ed9b544e 7070 return;
7071 }
7072 j++;
7073 }
7074
7075 /* Load the sorting vector with all the objects to sort */
a5eb649b 7076 switch(sortval->type) {
7077 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
7078 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7079 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
f83c6cb5 7080 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
a5eb649b 7081 }
ed9b544e 7082 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 7083 j = 0;
a5eb649b 7084
ed9b544e 7085 if (sortval->type == REDIS_LIST) {
7086 list *list = sortval->ptr;
6208b3a7 7087 listNode *ln;
c7df85a4 7088 listIter li;
6208b3a7 7089
c7df85a4 7090 listRewind(list,&li);
7091 while((ln = listNext(&li))) {
ed9b544e 7092 robj *ele = ln->value;
7093 vector[j].obj = ele;
7094 vector[j].u.score = 0;
7095 vector[j].u.cmpobj = NULL;
ed9b544e 7096 j++;
7097 }
7098 } else {
a5eb649b 7099 dict *set;
ed9b544e 7100 dictIterator *di;
7101 dictEntry *setele;
7102
a5eb649b 7103 if (sortval->type == REDIS_SET) {
7104 set = sortval->ptr;
7105 } else {
7106 zset *zs = sortval->ptr;
7107 set = zs->dict;
7108 }
7109
ed9b544e 7110 di = dictGetIterator(set);
ed9b544e 7111 while((setele = dictNext(di)) != NULL) {
7112 vector[j].obj = dictGetEntryKey(setele);
7113 vector[j].u.score = 0;
7114 vector[j].u.cmpobj = NULL;
7115 j++;
7116 }
7117 dictReleaseIterator(di);
7118 }
dfc5e96c 7119 redisAssert(j == vectorlen);
ed9b544e 7120
7121 /* Now it's time to load the right scores in the sorting vector */
7122 if (dontsort == 0) {
7123 for (j = 0; j < vectorlen; j++) {
6d7d1370 7124 robj *byval;
ed9b544e 7125 if (sortby) {
6d7d1370 7126 /* lookup value to sort by */
3305306f 7127 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
705dad38 7128 if (!byval) continue;
ed9b544e 7129 } else {
6d7d1370
PN
7130 /* use object itself to sort by */
7131 byval = vector[j].obj;
7132 }
7133
7134 if (alpha) {
08ee9b57 7135 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6d7d1370
PN
7136 } else {
7137 if (byval->encoding == REDIS_ENCODING_RAW) {
7138 vector[j].u.score = strtod(byval->ptr,NULL);
16fa22f1 7139 } else if (byval->encoding == REDIS_ENCODING_INT) {
6d7d1370
PN
7140 /* Don't need to decode the object if it's
7141 * integer-encoded (the only encoding supported) so
7142 * far. We can just cast it */
16fa22f1
PN
7143 vector[j].u.score = (long)byval->ptr;
7144 } else {
7145 redisAssert(1 != 1);
942a3961 7146 }
ed9b544e 7147 }
6d7d1370 7148
705dad38
PN
7149 /* when the object was retrieved using lookupKeyByPattern,
7150 * its refcount needs to be decreased. */
7151 if (sortby) {
7152 decrRefCount(byval);
ed9b544e 7153 }
7154 }
7155 }
7156
7157 /* We are ready to sort the vector... perform a bit of sanity check
7158 * on the LIMIT option too. We'll use a partial version of quicksort. */
7159 start = (limit_start < 0) ? 0 : limit_start;
7160 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7161 if (start >= vectorlen) {
7162 start = vectorlen-1;
7163 end = vectorlen-2;
7164 }
7165 if (end >= vectorlen) end = vectorlen-1;
7166
7167 if (dontsort == 0) {
7168 server.sort_desc = desc;
7169 server.sort_alpha = alpha;
7170 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 7171 if (sortby && (start != 0 || end != vectorlen-1))
7172 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7173 else
7174 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 7175 }
7176
7177 /* Send command output to the output buffer, performing the specified
7178 * GET/DEL/INCR/DECR operations if any. */
7179 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 7180 if (storekey == NULL) {
7181 /* STORE option not specified, sent the sorting result to client */
7182 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7183 for (j = start; j <= end; j++) {
7184 listNode *ln;
c7df85a4 7185 listIter li;
7186
dd88747b 7187 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 7188 listRewind(operations,&li);
7189 while((ln = listNext(&li))) {
443c6409 7190 redisSortOperation *sop = ln->value;
7191 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7192 vector[j].obj);
7193
7194 if (sop->type == REDIS_SORT_GET) {
55017f9d 7195 if (!val) {
443c6409 7196 addReply(c,shared.nullbulk);
7197 } else {
dd88747b 7198 addReplyBulk(c,val);
55017f9d 7199 decrRefCount(val);
443c6409 7200 }
7201 } else {
dfc5e96c 7202 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 7203 }
7204 }
ed9b544e 7205 }
443c6409 7206 } else {
7207 robj *listObject = createListObject();
7208 list *listPtr = (list*) listObject->ptr;
7209
7210 /* STORE option specified, set the sorting result as a List object */
7211 for (j = start; j <= end; j++) {
7212 listNode *ln;
c7df85a4 7213 listIter li;
7214
443c6409 7215 if (!getop) {
7216 listAddNodeTail(listPtr,vector[j].obj);
7217 incrRefCount(vector[j].obj);
7218 }
c7df85a4 7219 listRewind(operations,&li);
7220 while((ln = listNext(&li))) {
443c6409 7221 redisSortOperation *sop = ln->value;
7222 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7223 vector[j].obj);
7224
7225 if (sop->type == REDIS_SORT_GET) {
55017f9d 7226 if (!val) {
443c6409 7227 listAddNodeTail(listPtr,createStringObject("",0));
7228 } else {
55017f9d
PN
7229 /* We should do a incrRefCount on val because it is
7230 * added to the list, but also a decrRefCount because
7231 * it is returned by lookupKeyByPattern. This results
7232 * in doing nothing at all. */
443c6409 7233 listAddNodeTail(listPtr,val);
443c6409 7234 }
ed9b544e 7235 } else {
dfc5e96c 7236 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 7237 }
ed9b544e 7238 }
ed9b544e 7239 }
121796f7 7240 if (dictReplace(c->db->dict,storekey,listObject)) {
7241 incrRefCount(storekey);
7242 }
443c6409 7243 /* Note: we add 1 because the DB is dirty anyway since even if the
7244 * SORT result is empty a new key is set and maybe the old content
7245 * replaced. */
7246 server.dirty += 1+outputlen;
7247 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 7248 }
7249
7250 /* Cleanup */
7251 decrRefCount(sortval);
7252 listRelease(operations);
7253 for (j = 0; j < vectorlen; j++) {
16fa22f1 7254 if (alpha && vector[j].u.cmpobj)
ed9b544e 7255 decrRefCount(vector[j].u.cmpobj);
7256 }
7257 zfree(vector);
7258}
7259
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * Values below 1024 are printed as an integer followed by 'B'; larger
 * values are printed with two decimals followed by the biggest binary unit
 * (K, M, G, T, P, E) that keeps the number below 1024 when possible.
 *
 * 's' must point to a buffer large enough for the result: at most
 * 8 characters plus the terminating NUL are written. The buffer is always
 * written, for every possible value of 'n'. */
static void bytesToHuman(char *s, unsigned long long n) {
    static const char units[] = "KMGTPE";
    double d;
    int u = 0;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
        return;
    }

    /* Divide by 1024 until the value fits the current unit or we run out
     * of units. Dividing by a power of two is exact in floating point, so
     * this yields the same digits as a single division by 1024^k. */
    d = (double)n/1024;
    while (d >= 1024 && units[u+1] != '\0') {
        d /= 1024;
        u++;
    }
    sprintf(s,"%.2f%c",d,units[u]);
}
7280
1c85b79f 7281/* Create the string returned by the INFO command. This is decoupled
7282 * by the INFO command itself as we need to report the same information
7283 * on memory corruption problems. */
7284static sds genRedisInfoString(void) {
ed9b544e 7285 sds info;
7286 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 7287 int j;
ec6c7a1d 7288 char hmem[64];
55a8298f 7289
b72f6a4b 7290 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 7291 info = sdscatprintf(sdsempty(),
7292 "redis_version:%s\r\n"
5436146c
PN
7293 "redis_git_sha1:%s\r\n"
7294 "redis_git_dirty:%d\r\n"
f1017b3f 7295 "arch_bits:%s\r\n"
7a932b74 7296 "multiplexing_api:%s\r\n"
0d7170a4 7297 "process_id:%ld\r\n"
682ac724 7298 "uptime_in_seconds:%ld\r\n"
7299 "uptime_in_days:%ld\r\n"
ed9b544e 7300 "connected_clients:%d\r\n"
7301 "connected_slaves:%d\r\n"
f86a74e9 7302 "blocked_clients:%d\r\n"
5fba9f71 7303 "used_memory:%zu\r\n"
ec6c7a1d 7304 "used_memory_human:%s\r\n"
ed9b544e 7305 "changes_since_last_save:%lld\r\n"
be2bb6b0 7306 "bgsave_in_progress:%d\r\n"
682ac724 7307 "last_save_time:%ld\r\n"
b3fad521 7308 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 7309 "total_connections_received:%lld\r\n"
7310 "total_commands_processed:%lld\r\n"
2a6a2ed1 7311 "expired_keys:%lld\r\n"
3be2c9d7 7312 "hash_max_zipmap_entries:%zu\r\n"
7313 "hash_max_zipmap_value:%zu\r\n"
ffc6b7f8 7314 "pubsub_channels:%ld\r\n"
7315 "pubsub_patterns:%u\r\n"
7d98e08c 7316 "vm_enabled:%d\r\n"
a0f643ea 7317 "role:%s\r\n"
ed9b544e 7318 ,REDIS_VERSION,
5436146c 7319 REDIS_GIT_SHA1,
274e45e3 7320 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
f1017b3f 7321 (sizeof(long) == 8) ? "64" : "32",
7a932b74 7322 aeGetApiName(),
0d7170a4 7323 (long) getpid(),
a0f643ea 7324 uptime,
7325 uptime/(3600*24),
ed9b544e 7326 listLength(server.clients)-listLength(server.slaves),
7327 listLength(server.slaves),
d5d55fc3 7328 server.blpop_blocked_clients,
b72f6a4b 7329 zmalloc_used_memory(),
ec6c7a1d 7330 hmem,
ed9b544e 7331 server.dirty,
9d65a1bb 7332 server.bgsavechildpid != -1,
ed9b544e 7333 server.lastsave,
b3fad521 7334 server.bgrewritechildpid != -1,
ed9b544e 7335 server.stat_numconnections,
7336 server.stat_numcommands,
2a6a2ed1 7337 server.stat_expiredkeys,
55a8298f 7338 server.hash_max_zipmap_entries,
7339 server.hash_max_zipmap_value,
ffc6b7f8 7340 dictSize(server.pubsub_channels),
7341 listLength(server.pubsub_patterns),
7d98e08c 7342 server.vm_enabled != 0,
a0f643ea 7343 server.masterhost == NULL ? "master" : "slave"
ed9b544e 7344 );
a0f643ea 7345 if (server.masterhost) {
7346 info = sdscatprintf(info,
7347 "master_host:%s\r\n"
7348 "master_port:%d\r\n"
7349 "master_link_status:%s\r\n"
7350 "master_last_io_seconds_ago:%d\r\n"
7351 ,server.masterhost,
7352 server.masterport,
7353 (server.replstate == REDIS_REPL_CONNECTED) ?
7354 "up" : "down",
f72b934d 7355 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 7356 );
7357 }
7d98e08c 7358 if (server.vm_enabled) {
1064ef87 7359 lockThreadedIO();
7d98e08c 7360 info = sdscatprintf(info,
7361 "vm_conf_max_memory:%llu\r\n"
7362 "vm_conf_page_size:%llu\r\n"
7363 "vm_conf_pages:%llu\r\n"
7364 "vm_stats_used_pages:%llu\r\n"
7365 "vm_stats_swapped_objects:%llu\r\n"
7366 "vm_stats_swappin_count:%llu\r\n"
7367 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 7368 "vm_stats_io_newjobs_len:%lu\r\n"
7369 "vm_stats_io_processing_len:%lu\r\n"
7370 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 7371 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 7372 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 7373 ,(unsigned long long) server.vm_max_memory,
7374 (unsigned long long) server.vm_page_size,
7375 (unsigned long long) server.vm_pages,
7376 (unsigned long long) server.vm_stats_used_pages,
7377 (unsigned long long) server.vm_stats_swapped_objects,
7378 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 7379 (unsigned long long) server.vm_stats_swapouts,
7380 (unsigned long) listLength(server.io_newjobs),
7381 (unsigned long) listLength(server.io_processing),
7382 (unsigned long) listLength(server.io_processed),
d5d55fc3 7383 (unsigned long) server.io_active_threads,
7384 (unsigned long) server.vm_blocked_clients
7d98e08c 7385 );
1064ef87 7386 unlockThreadedIO();
7d98e08c 7387 }
c3cb078d 7388 for (j = 0; j < server.dbnum; j++) {
7389 long long keys, vkeys;
7390
7391 keys = dictSize(server.db[j].dict);
7392 vkeys = dictSize(server.db[j].expires);
7393 if (keys || vkeys) {
9d65a1bb 7394 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 7395 j, keys, vkeys);
7396 }
7397 }
1c85b79f 7398 return info;
7399}
7400
7401static void infoCommand(redisClient *c) {
7402 sds info = genRedisInfoString();
83c6a618 7403 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7404 (unsigned long)sdslen(info)));
ed9b544e 7405 addReplySds(c,info);
70003d28 7406 addReply(c,shared.crlf);
ed9b544e 7407}
7408
3305306f 7409static void monitorCommand(redisClient *c) {
7410 /* ignore MONITOR if aleady slave or in monitor mode */
7411 if (c->flags & REDIS_SLAVE) return;
7412
7413 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7414 c->slaveseldb = 0;
6b47e12e 7415 listAddNodeTail(server.monitors,c);
3305306f 7416 addReply(c,shared.ok);
7417}
7418
7419/* ================================= Expire ================================= */
7420static int removeExpire(redisDb *db, robj *key) {
7421 if (dictDelete(db->expires,key) == DICT_OK) {
7422 return 1;
7423 } else {
7424 return 0;
7425 }
7426}
7427
7428static int setExpire(redisDb *db, robj *key, time_t when) {
7429 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7430 return 0;
7431 } else {
7432 incrRefCount(key);
7433 return 1;
7434 }
7435}
7436
bb32ede5 7437/* Return the expire time of the specified key, or -1 if no expire
7438 * is associated with this key (i.e. the key is non volatile) */
7439static time_t getExpire(redisDb *db, robj *key) {
7440 dictEntry *de;
7441
7442 /* No expire? return ASAP */
7443 if (dictSize(db->expires) == 0 ||
7444 (de = dictFind(db->expires,key)) == NULL) return -1;
7445
7446 return (time_t) dictGetEntryVal(de);
7447}
7448
3305306f 7449static int expireIfNeeded(redisDb *db, robj *key) {
7450 time_t when;
7451 dictEntry *de;
7452
7453 /* No expire? return ASAP */
7454 if (dictSize(db->expires) == 0 ||
7455 (de = dictFind(db->expires,key)) == NULL) return 0;
7456
7457 /* Lookup the expire */
7458 when = (time_t) dictGetEntryVal(de);
7459 if (time(NULL) <= when) return 0;
7460
7461 /* Delete the key */
7462 dictDelete(db->expires,key);
2a6a2ed1 7463 server.stat_expiredkeys++;
3305306f 7464 return dictDelete(db->dict,key) == DICT_OK;
7465}
7466
7467static int deleteIfVolatile(redisDb *db, robj *key) {
7468 dictEntry *de;
7469
7470 /* No expire? return ASAP */
7471 if (dictSize(db->expires) == 0 ||
7472 (de = dictFind(db->expires,key)) == NULL) return 0;
7473
7474 /* Delete the key */
0c66a471 7475 server.dirty++;
2a6a2ed1 7476 server.stat_expiredkeys++;
3305306f 7477 dictDelete(db->expires,key);
7478 return dictDelete(db->dict,key) == DICT_OK;
7479}
7480
bbe025e0 7481static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 7482 dictEntry *de;
bbe025e0
AM
7483 time_t seconds;
7484
bd79a6bd 7485 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
bbe025e0
AM
7486
7487 seconds -= offset;
3305306f 7488
802e8373 7489 de = dictFind(c->db->dict,key);
3305306f 7490 if (de == NULL) {
7491 addReply(c,shared.czero);
7492 return;
7493 }
d4dd6556 7494 if (seconds <= 0) {
43e5ccdf 7495 if (deleteKey(c->db,key)) server.dirty++;
7496 addReply(c, shared.cone);
3305306f 7497 return;
7498 } else {
7499 time_t when = time(NULL)+seconds;
802e8373 7500 if (setExpire(c->db,key,when)) {
3305306f 7501 addReply(c,shared.cone);
77423026 7502 server.dirty++;
7503 } else {
3305306f 7504 addReply(c,shared.czero);
77423026 7505 }
3305306f 7506 return;
7507 }
7508}
7509
802e8373 7510static void expireCommand(redisClient *c) {
bbe025e0 7511 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 7512}
7513
7514static void expireatCommand(redisClient *c) {
bbe025e0 7515 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 7516}
7517
fd88489a 7518static void ttlCommand(redisClient *c) {
7519 time_t expire;
7520 int ttl = -1;
7521
7522 expire = getExpire(c->db,c->argv[1]);
7523 if (expire != -1) {
7524 ttl = (int) (expire-time(NULL));
7525 if (ttl < 0) ttl = -1;
7526 }
7527 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7528}
7529
6e469882 7530/* ================================ MULTI/EXEC ============================== */
7531
7532/* Client state initialization for MULTI/EXEC */
7533static void initClientMultiState(redisClient *c) {
7534 c->mstate.commands = NULL;
7535 c->mstate.count = 0;
7536}
7537
7538/* Release all the resources associated with MULTI/EXEC state */
7539static void freeClientMultiState(redisClient *c) {
7540 int j;
7541
7542 for (j = 0; j < c->mstate.count; j++) {
7543 int i;
7544 multiCmd *mc = c->mstate.commands+j;
7545
7546 for (i = 0; i < mc->argc; i++)
7547 decrRefCount(mc->argv[i]);
7548 zfree(mc->argv);
7549 }
7550 zfree(c->mstate.commands);
7551}
7552
7553/* Add a new command into the MULTI commands queue */
7554static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7555 multiCmd *mc;
7556 int j;
7557
7558 c->mstate.commands = zrealloc(c->mstate.commands,
7559 sizeof(multiCmd)*(c->mstate.count+1));
7560 mc = c->mstate.commands+c->mstate.count;
7561 mc->cmd = cmd;
7562 mc->argc = c->argc;
7563 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7564 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7565 for (j = 0; j < c->argc; j++)
7566 incrRefCount(mc->argv[j]);
7567 c->mstate.count++;
7568}
7569
7570static void multiCommand(redisClient *c) {
6531c94d 7571 if (c->flags & REDIS_MULTI) {
7572 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
7573 return;
7574 }
6e469882 7575 c->flags |= REDIS_MULTI;
36c548f0 7576 addReply(c,shared.ok);
6e469882 7577}
7578
18b6cb76
DJ
7579static void discardCommand(redisClient *c) {
7580 if (!(c->flags & REDIS_MULTI)) {
7581 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7582 return;
7583 }
7584
7585 freeClientMultiState(c);
7586 initClientMultiState(c);
7587 c->flags &= (~REDIS_MULTI);
7588 addReply(c,shared.ok);
7589}
7590
66c8853f 7591/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7592 * implememntation for more information. */
7593static void execCommandReplicateMulti(redisClient *c) {
7594 struct redisCommand *cmd;
7595 robj *multistring = createStringObject("MULTI",5);
7596
7597 cmd = lookupCommand("multi");
7598 if (server.appendonly)
7599 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7600 if (listLength(server.slaves))
7601 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7602 decrRefCount(multistring);
7603}
7604
6e469882 7605static void execCommand(redisClient *c) {
7606 int j;
7607 robj **orig_argv;
7608 int orig_argc;
7609
7610 if (!(c->flags & REDIS_MULTI)) {
7611 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7612 return;
7613 }
7614
37ab76c9 7615 /* Check if we need to abort the EXEC if some WATCHed key was touched.
7616 * A failed EXEC will return a multi bulk nil object. */
7617 if (c->flags & REDIS_DIRTY_CAS) {
7618 freeClientMultiState(c);
7619 initClientMultiState(c);
7620 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
7621 unwatchAllKeys(c);
7622 addReply(c,shared.nullmultibulk);
7623 return;
7624 }
7625
66c8853f 7626 /* Replicate a MULTI request now that we are sure the block is executed.
7627 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7628 * both the AOF and the replication link will have the same consistency
7629 * and atomicity guarantees. */
7630 execCommandReplicateMulti(c);
7631
7632 /* Exec all the queued commands */
1ad4d316 7633 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
6e469882 7634 orig_argv = c->argv;
7635 orig_argc = c->argc;
7636 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7637 for (j = 0; j < c->mstate.count; j++) {
7638 c->argc = c->mstate.commands[j].argc;
7639 c->argv = c->mstate.commands[j].argv;
7640 call(c,c->mstate.commands[j].cmd);
7641 }
7642 c->argv = orig_argv;
7643 c->argc = orig_argc;
7644 freeClientMultiState(c);
7645 initClientMultiState(c);
1ad4d316 7646 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
66c8853f 7647 /* Make sure the EXEC command is always replicated / AOF, since we
7648 * always send the MULTI command (we can't know beforehand if the
7649 * next operations will contain at least a modification to the DB). */
7650 server.dirty++;
6e469882 7651}
7652
4409877e 7653/* =========================== Blocking Operations ========================= */
7654
7655/* Currently Redis blocking operations support is limited to list POP ops,
7656 * so the current implementation is not fully generic, but it is also not
7657 * completely specific so it will not require a rewrite to support new
7658 * kind of blocking operations in the future.
7659 *
7660 * Still it's important to note that list blocking operations can be already
7661 * used as a notification mechanism in order to implement other blocking
7662 * operations at application level, so there must be a very strong evidence
7663 * of usefulness and generality before new blocking operations are implemented.
7664 *
7665 * This is how the current blocking POP works, we use BLPOP as example:
7666 * - If the user calls BLPOP and the key exists and contains a non empty list
7667 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7668 * if there is no need to block.
7669 * - If instead BLPOP is called and the key does not exist or the list is
7670 * empty we need to block. In order to do so we remove the notification for
7671 * new data to read in the client socket (so that we'll not serve new
7672 * requests if the blocking request is not served). Also we put the client
37ab76c9 7673 * in a dictionary (db->blocking_keys) mapping keys to a list of clients
4409877e 7674 * blocking for these keys.
7675 * - If a PUSH operation against a key with blocked clients waiting is
7676 * performed, we serve the first in the list: basically instead to push
7677 * the new element inside the list we return it to the (first / oldest)
7678 * blocking client, unblock the client, and remove it from the list.
7679 *
7680 * The above comment and the source code should be enough in order to understand
7681 * the implementation and modify / fix it later.
7682 */
7683
7684/* Set a client in blocking mode for the specified key, with the specified
7685 * timeout */
b177fd30 7686static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 7687 dictEntry *de;
7688 list *l;
b177fd30 7689 int j;
4409877e 7690
37ab76c9 7691 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
7692 c->blocking_keys_num = numkeys;
4409877e 7693 c->blockingto = timeout;
b177fd30 7694 for (j = 0; j < numkeys; j++) {
7695 /* Add the key in the client structure, to map clients -> keys */
37ab76c9 7696 c->blocking_keys[j] = keys[j];
b177fd30 7697 incrRefCount(keys[j]);
4409877e 7698
b177fd30 7699 /* And in the other "side", to map keys -> clients */
37ab76c9 7700 de = dictFind(c->db->blocking_keys,keys[j]);
b177fd30 7701 if (de == NULL) {
7702 int retval;
7703
7704 /* For every key we take a list of clients blocked for it */
7705 l = listCreate();
37ab76c9 7706 retval = dictAdd(c->db->blocking_keys,keys[j],l);
b177fd30 7707 incrRefCount(keys[j]);
7708 assert(retval == DICT_OK);
7709 } else {
7710 l = dictGetEntryVal(de);
7711 }
7712 listAddNodeTail(l,c);
4409877e 7713 }
b177fd30 7714 /* Mark the client as a blocked client */
4409877e 7715 c->flags |= REDIS_BLOCKED;
d5d55fc3 7716 server.blpop_blocked_clients++;
4409877e 7717}
7718
7719/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 7720static void unblockClientWaitingData(redisClient *c) {
4409877e 7721 dictEntry *de;
7722 list *l;
b177fd30 7723 int j;
4409877e 7724
37ab76c9 7725 assert(c->blocking_keys != NULL);
b177fd30 7726 /* The client may wait for multiple keys, so unblock it for every key. */
37ab76c9 7727 for (j = 0; j < c->blocking_keys_num; j++) {
b177fd30 7728 /* Remove this client from the list of clients waiting for this key. */
37ab76c9 7729 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
b177fd30 7730 assert(de != NULL);
7731 l = dictGetEntryVal(de);
7732 listDelNode(l,listSearchKey(l,c));
7733 /* If the list is empty we need to remove it to avoid wasting memory */
7734 if (listLength(l) == 0)
37ab76c9 7735 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
7736 decrRefCount(c->blocking_keys[j]);
b177fd30 7737 }
7738 /* Cleanup the client structure */
37ab76c9 7739 zfree(c->blocking_keys);
7740 c->blocking_keys = NULL;
4409877e 7741 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 7742 server.blpop_blocked_clients--;
5921aa36 7743 /* We want to process data if there is some command waiting
b0d8747d 7744 * in the input buffer. Note that this is safe even if
7745 * unblockClientWaitingData() gets called from freeClient() because
7746 * freeClient() will be smart enough to call this function
7747 * *after* c->querybuf was set to NULL. */
4409877e 7748 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7749}
7750
7751/* This should be called from any function PUSHing into lists.
7752 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7753 * 'ele' is the element pushed.
7754 *
7755 * If the function returns 0 there was no client waiting for a list push
7756 * against this key.
7757 *
7758 * If the function returns 1 there was a client waiting for a list push
7759 * against this key, the element was passed to this client thus it's not
7760 * needed to actually add it to the list and the caller should return asap. */
7761static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7762 struct dictEntry *de;
7763 redisClient *receiver;
7764 list *l;
7765 listNode *ln;
7766
37ab76c9 7767 de = dictFind(c->db->blocking_keys,key);
4409877e 7768 if (de == NULL) return 0;
7769 l = dictGetEntryVal(de);
7770 ln = listFirst(l);
7771 assert(ln != NULL);
7772 receiver = ln->value;
4409877e 7773
b177fd30 7774 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 7775 addReplyBulk(receiver,key);
7776 addReplyBulk(receiver,ele);
b0d8747d 7777 unblockClientWaitingData(receiver);
4409877e 7778 return 1;
7779}
7780
7781/* Blocking RPOP/LPOP */
7782static void blockingPopGenericCommand(redisClient *c, int where) {
7783 robj *o;
7784 time_t timeout;
b177fd30 7785 int j;
4409877e 7786
b177fd30 7787 for (j = 1; j < c->argc-1; j++) {
7788 o = lookupKeyWrite(c->db,c->argv[j]);
7789 if (o != NULL) {
7790 if (o->type != REDIS_LIST) {
7791 addReply(c,shared.wrongtypeerr);
4409877e 7792 return;
b177fd30 7793 } else {
7794 list *list = o->ptr;
7795 if (listLength(list) != 0) {
7796 /* If the list contains elements fall back to the usual
7797 * non-blocking POP operation */
7798 robj *argv[2], **orig_argv;
7799 int orig_argc;
e0a62c7f 7800
b177fd30 7801 /* We need to alter the command arguments before to call
7802 * popGenericCommand() as the command takes a single key. */
7803 orig_argv = c->argv;
7804 orig_argc = c->argc;
7805 argv[1] = c->argv[j];
7806 c->argv = argv;
7807 c->argc = 2;
7808
7809 /* Also the return value is different, we need to output
7810 * the multi bulk reply header and the key name. The
7811 * "real" command will add the last element (the value)
7812 * for us. If this souds like an hack to you it's just
7813 * because it is... */
7814 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 7815 addReplyBulk(c,argv[1]);
b177fd30 7816 popGenericCommand(c,where);
7817
7818 /* Fix the client structure with the original stuff */
7819 c->argv = orig_argv;
7820 c->argc = orig_argc;
7821 return;
7822 }
4409877e 7823 }
7824 }
7825 }
7826 /* If the list is empty or the key does not exists we must block */
b177fd30 7827 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 7828 if (timeout > 0) timeout += time(NULL);
b177fd30 7829 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 7830}
7831
7832static void blpopCommand(redisClient *c) {
7833 blockingPopGenericCommand(c,REDIS_HEAD);
7834}
7835
7836static void brpopCommand(redisClient *c) {
7837 blockingPopGenericCommand(c,REDIS_TAIL);
7838}
7839
ed9b544e 7840/* =============================== Replication ============================= */
7841
a4d1ba9a 7842static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7843 ssize_t nwritten, ret = size;
7844 time_t start = time(NULL);
7845
7846 timeout++;
7847 while(size) {
7848 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7849 nwritten = write(fd,ptr,size);
7850 if (nwritten == -1) return -1;
7851 ptr += nwritten;
7852 size -= nwritten;
7853 }
7854 if ((time(NULL)-start) > timeout) {
7855 errno = ETIMEDOUT;
7856 return -1;
7857 }
7858 }
7859 return ret;
7860}
7861
a4d1ba9a 7862static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7863 ssize_t nread, totread = 0;
7864 time_t start = time(NULL);
7865
7866 timeout++;
7867 while(size) {
7868 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7869 nread = read(fd,ptr,size);
7870 if (nread == -1) return -1;
7871 ptr += nread;
7872 size -= nread;
7873 totread += nread;
7874 }
7875 if ((time(NULL)-start) > timeout) {
7876 errno = ETIMEDOUT;
7877 return -1;
7878 }
7879 }
7880 return totread;
7881}
7882
/* Read a line from 'fd' one byte at a time into 'ptr', a buffer of 'size'
 * bytes. The terminating "\n" (and a preceding "\r", if any) is stripped
 * and the result is always nul terminated.
 *
 * Returns the number of bytes stored before the line terminator was seen
 * (the "\r", when present, is counted), or -1 if syncRead() fails or 'size'
 * leaves no room for the nul terminator. Lines longer than size-1 bytes are
 * truncated: the function returns as soon as the buffer is full. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return -1;
    *ptr = '\0';
    size--; /* reserve room for the nul terminator */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* Account for the byte just stored, otherwise an overlong
             * line would be written past the end of the buffer. */
            size--;
        }
    }
    return nread;
}
7903
7904static void syncCommand(redisClient *c) {
40d224a9 7905 /* ignore SYNC if aleady slave or in monitor mode */
7906 if (c->flags & REDIS_SLAVE) return;
7907
7908 /* SYNC can't be issued when the server has pending data to send to
7909 * the client about already issued commands. We need a fresh reply
7910 * buffer registering the differences between the BGSAVE and the current
7911 * dataset, so that we can copy to other slaves if needed. */
7912 if (listLength(c->reply) != 0) {
7913 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7914 return;
7915 }
7916
7917 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7918 /* Here we need to check if there is a background saving operation
7919 * in progress, or if it is required to start one */
9d65a1bb 7920 if (server.bgsavechildpid != -1) {
40d224a9 7921 /* Ok a background save is in progress. Let's check if it is a good
7922 * one for replication, i.e. if there is another slave that is
7923 * registering differences since the server forked to save */
7924 redisClient *slave;
7925 listNode *ln;
c7df85a4 7926 listIter li;
40d224a9 7927
c7df85a4 7928 listRewind(server.slaves,&li);
7929 while((ln = listNext(&li))) {
40d224a9 7930 slave = ln->value;
7931 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 7932 }
7933 if (ln) {
7934 /* Perfect, the server is already registering differences for
7935 * another slave. Set the right state, and copy the buffer. */
7936 listRelease(c->reply);
7937 c->reply = listDup(slave->reply);
40d224a9 7938 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7939 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7940 } else {
7941 /* No way, we need to wait for the next BGSAVE in order to
7942 * register differences */
7943 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7944 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7945 }
7946 } else {
7947 /* Ok we don't have a BGSAVE in progress, let's start one */
7948 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7949 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7950 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7951 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7952 return;
7953 }
7954 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7955 }
6208b3a7 7956 c->repldbfd = -1;
40d224a9 7957 c->flags |= REDIS_SLAVE;
7958 c->slaveseldb = 0;
6b47e12e 7959 listAddNodeTail(server.slaves,c);
40d224a9 7960 return;
7961}
7962
6208b3a7 7963static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7964 redisClient *slave = privdata;
7965 REDIS_NOTUSED(el);
7966 REDIS_NOTUSED(mask);
7967 char buf[REDIS_IOBUF_LEN];
7968 ssize_t nwritten, buflen;
7969
7970 if (slave->repldboff == 0) {
7971 /* Write the bulk write count before to transfer the DB. In theory here
7972 * we don't know how much room there is in the output buffer of the
7973 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7974 * operations) will never be smaller than the few bytes we need. */
7975 sds bulkcount;
7976
7977 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7978 slave->repldbsize);
7979 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7980 {
7981 sdsfree(bulkcount);
7982 freeClient(slave);
7983 return;
7984 }
7985 sdsfree(bulkcount);
7986 }
7987 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7988 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7989 if (buflen <= 0) {
7990 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7991 (buflen == 0) ? "premature EOF" : strerror(errno));
7992 freeClient(slave);
7993 return;
7994 }
7995 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 7996 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 7997 strerror(errno));
7998 freeClient(slave);
7999 return;
8000 }
8001 slave->repldboff += nwritten;
8002 if (slave->repldboff == slave->repldbsize) {
8003 close(slave->repldbfd);
8004 slave->repldbfd = -1;
8005 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8006 slave->replstate = REDIS_REPL_ONLINE;
8007 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 8008 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 8009 freeClient(slave);
8010 return;
8011 }
8012 addReplySds(slave,sdsempty());
8013 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
8014 }
8015}
ed9b544e 8016
a3b21203 8017/* This function is called at the end of every backgrond saving.
8018 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
8019 * otherwise REDIS_ERR is passed to the function.
8020 *
8021 * The goal of this function is to handle slaves waiting for a successful
8022 * background saving in order to perform non-blocking synchronization. */
8023static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 8024 listNode *ln;
8025 int startbgsave = 0;
c7df85a4 8026 listIter li;
ed9b544e 8027
c7df85a4 8028 listRewind(server.slaves,&li);
8029 while((ln = listNext(&li))) {
6208b3a7 8030 redisClient *slave = ln->value;
ed9b544e 8031
6208b3a7 8032 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8033 startbgsave = 1;
8034 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8035 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 8036 struct redis_stat buf;
e0a62c7f 8037
6208b3a7 8038 if (bgsaveerr != REDIS_OK) {
8039 freeClient(slave);
8040 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8041 continue;
8042 }
8043 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 8044 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 8045 freeClient(slave);
8046 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8047 continue;
8048 }
8049 slave->repldboff = 0;
8050 slave->repldbsize = buf.st_size;
8051 slave->replstate = REDIS_REPL_SEND_BULK;
8052 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 8053 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 8054 freeClient(slave);
8055 continue;
8056 }
8057 }
ed9b544e 8058 }
6208b3a7 8059 if (startbgsave) {
8060 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 8061 listIter li;
8062
8063 listRewind(server.slaves,&li);
6208b3a7 8064 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 8065 while((ln = listNext(&li))) {
6208b3a7 8066 redisClient *slave = ln->value;
ed9b544e 8067
6208b3a7 8068 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8069 freeClient(slave);
8070 }
8071 }
8072 }
ed9b544e 8073}
8074
8075static int syncWithMaster(void) {
d0ccebcf 8076 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 8077 long dumpsize;
ed9b544e 8078 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 8079 int dfd, maxtries = 5;
ed9b544e 8080
8081 if (fd == -1) {
8082 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8083 strerror(errno));
8084 return REDIS_ERR;
8085 }
d0ccebcf 8086
8087 /* AUTH with the master if required. */
8088 if(server.masterauth) {
8089 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8090 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8091 close(fd);
8092 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8093 strerror(errno));
8094 return REDIS_ERR;
8095 }
8096 /* Read the AUTH result. */
8097 if (syncReadLine(fd,buf,1024,3600) == -1) {
8098 close(fd);
8099 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8100 strerror(errno));
8101 return REDIS_ERR;
8102 }
8103 if (buf[0] != '+') {
8104 close(fd);
8105 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8106 return REDIS_ERR;
8107 }
8108 }
8109
ed9b544e 8110 /* Issue the SYNC command */
8111 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8112 close(fd);
8113 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8114 strerror(errno));
8115 return REDIS_ERR;
8116 }
8117 /* Read the bulk write count */
8c4d91fc 8118 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 8119 close(fd);
8120 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8121 strerror(errno));
8122 return REDIS_ERR;
8123 }
4aa701c1 8124 if (buf[0] != '$') {
8125 close(fd);
8126 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8127 return REDIS_ERR;
8128 }
18e61fa2 8129 dumpsize = strtol(buf+1,NULL,10);
8130 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 8131 /* Read the bulk write data on a temp file */
8c5abee8 8132 while(maxtries--) {
8133 snprintf(tmpfile,256,
8134 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8135 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8136 if (dfd != -1) break;
5de9ad7c 8137 sleep(1);
8c5abee8 8138 }
ed9b544e 8139 if (dfd == -1) {
8140 close(fd);
8141 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8142 return REDIS_ERR;
8143 }
8144 while(dumpsize) {
8145 int nread, nwritten;
8146
8147 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8148 if (nread == -1) {
8149 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8150 strerror(errno));
8151 close(fd);
8152 close(dfd);
8153 return REDIS_ERR;
8154 }
8155 nwritten = write(dfd,buf,nread);
8156 if (nwritten == -1) {
8157 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8158 close(fd);
8159 close(dfd);
8160 return REDIS_ERR;
8161 }
8162 dumpsize -= nread;
8163 }
8164 close(dfd);
8165 if (rename(tmpfile,server.dbfilename) == -1) {
8166 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8167 unlink(tmpfile);
8168 close(fd);
8169 return REDIS_ERR;
8170 }
8171 emptyDb();
f78fd11b 8172 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 8173 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8174 close(fd);
8175 return REDIS_ERR;
8176 }
8177 server.master = createClient(fd);
8178 server.master->flags |= REDIS_MASTER;
179b3952 8179 server.master->authenticated = 1;
ed9b544e 8180 server.replstate = REDIS_REPL_CONNECTED;
8181 return REDIS_OK;
8182}
8183
321b0e13 8184static void slaveofCommand(redisClient *c) {
8185 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8186 !strcasecmp(c->argv[2]->ptr,"one")) {
8187 if (server.masterhost) {
8188 sdsfree(server.masterhost);
8189 server.masterhost = NULL;
8190 if (server.master) freeClient(server.master);
8191 server.replstate = REDIS_REPL_NONE;
8192 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8193 }
8194 } else {
8195 sdsfree(server.masterhost);
8196 server.masterhost = sdsdup(c->argv[1]->ptr);
8197 server.masterport = atoi(c->argv[2]->ptr);
8198 if (server.master) freeClient(server.master);
8199 server.replstate = REDIS_REPL_CONNECT;
8200 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8201 server.masterhost, server.masterport);
8202 }
8203 addReply(c,shared.ok);
8204}
8205
3fd78bcd 8206/* ============================ Maxmemory directive ======================== */
8207
a5819310 8208/* Try to free one object form the pre-allocated objects free list.
8209 * This is useful under low mem conditions as by default we take 1 million
8210 * free objects allocated. On success REDIS_OK is returned, otherwise
8211 * REDIS_ERR. */
8212static int tryFreeOneObjectFromFreelist(void) {
f870935d 8213 robj *o;
8214
a5819310 8215 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8216 if (listLength(server.objfreelist)) {
8217 listNode *head = listFirst(server.objfreelist);
8218 o = listNodeValue(head);
8219 listDelNode(server.objfreelist,head);
8220 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8221 zfree(o);
8222 return REDIS_OK;
8223 } else {
8224 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8225 return REDIS_ERR;
8226 }
f870935d 8227}
8228
3fd78bcd 8229/* This function gets called when 'maxmemory' is set on the config file to limit
8230 * the max memory used by the server, and we are out of memory.
8231 * This function will try to, in order:
8232 *
8233 * - Free objects from the free list
8234 * - Try to remove keys with an EXPIRE set
8235 *
8236 * It is not possible to free enough memory to reach used-memory < maxmemory
8237 * the server will start refusing commands that will enlarge even more the
8238 * memory usage.
8239 */
8240static void freeMemoryIfNeeded(void) {
8241 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 8242 int j, k, freed = 0;
8243
8244 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8245 for (j = 0; j < server.dbnum; j++) {
8246 int minttl = -1;
8247 robj *minkey = NULL;
8248 struct dictEntry *de;
8249
8250 if (dictSize(server.db[j].expires)) {
8251 freed = 1;
8252 /* From a sample of three keys drop the one nearest to
8253 * the natural expire */
8254 for (k = 0; k < 3; k++) {
8255 time_t t;
8256
8257 de = dictGetRandomKey(server.db[j].expires);
8258 t = (time_t) dictGetEntryVal(de);
8259 if (minttl == -1 || t < minttl) {
8260 minkey = dictGetEntryKey(de);
8261 minttl = t;
3fd78bcd 8262 }
3fd78bcd 8263 }
a5819310 8264 deleteKey(server.db+j,minkey);
3fd78bcd 8265 }
3fd78bcd 8266 }
a5819310 8267 if (!freed) return; /* nothing to free... */
3fd78bcd 8268 }
8269}
8270
f80dff62 8271/* ============================== Append Only file ========================== */
8272
560db612 8273/* Called when the user switches from "appendonly yes" to "appendonly no"
8274 * at runtime using the CONFIG command. */
8275static void stopAppendOnly(void) {
8276 flushAppendOnlyFile();
8277 aof_fsync(server.appendfd);
8278 close(server.appendfd);
8279
8280 server.appendfd = -1;
8281 server.appendseldb = -1;
8282 server.appendonly = 0;
8283 /* rewrite operation in progress? kill it, wait child exit */
8284 if (server.bgsavechildpid != -1) {
8285 int statloc;
8286
8287 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8288 wait3(&statloc,0,NULL);
8289 /* reset the buffer accumulating changes while the child saves */
8290 sdsfree(server.bgrewritebuf);
8291 server.bgrewritebuf = sdsempty();
8292 server.bgsavechildpid = -1;
8293 }
8294}
8295
8296/* Called when the user switches from "appendonly no" to "appendonly yes"
8297 * at runtime using the CONFIG command. */
8298static int startAppendOnly(void) {
8299 server.appendonly = 1;
8300 server.lastfsync = time(NULL);
8301 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8302 if (server.appendfd == -1) {
8303 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8304 return REDIS_ERR;
8305 }
8306 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8307 server.appendonly = 0;
8308 close(server.appendfd);
8309 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8310 return REDIS_ERR;
8311 }
8312 return REDIS_OK;
8313}
8314
28ed1f33 8315/* Write the append only file buffer on disk.
8316 *
8317 * Since we are required to write the AOF before replying to the client,
8318 * and the only way the client socket can get a write is entering when the
8319 * the event loop, we accumulate all the AOF writes in a memory
8320 * buffer and write it on disk using this function just before entering
8321 * the event loop again. */
8322static void flushAppendOnlyFile(void) {
8323 time_t now;
8324 ssize_t nwritten;
8325
8326 if (sdslen(server.aofbuf) == 0) return;
8327
8328 /* We want to perform a single write. This should be guaranteed atomic
8329 * at least if the filesystem we are writing is a real physical one.
8330 * While this will save us against the server being killed I don't think
8331 * there is much to do about the whole server stopping for power problems
8332 * or alike */
8333 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8334 if (nwritten != (signed)sdslen(server.aofbuf)) {
8335 /* Ooops, we are in troubles. The best thing to do for now is
8336 * aborting instead of giving the illusion that everything is
8337 * working as expected. */
8338 if (nwritten == -1) {
8339 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8340 } else {
8341 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8342 }
8343 exit(1);
8344 }
8345 sdsfree(server.aofbuf);
8346 server.aofbuf = sdsempty();
8347
38db9171 8348 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8349 * childs performing heavy I/O on disk. */
8350 if (server.no_appendfsync_on_rewrite &&
8351 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8352 return;
28ed1f33 8353 /* Fsync if needed */
8354 now = time(NULL);
8355 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8356 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8357 now-server.lastfsync > 1))
8358 {
8359 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8360 * flushing metadata. */
8361 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8362 server.lastfsync = now;
8363 }
8364}
8365
9376e434
PN
8366static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8367 int j;
8368 buf = sdscatprintf(buf,"*%d\r\n",argc);
8369 for (j = 0; j < argc; j++) {
8370 robj *o = getDecodedObject(argv[j]);
8371 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8372 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8373 buf = sdscatlen(buf,"\r\n",2);
8374 decrRefCount(o);
8375 }
8376 return buf;
8377}
8378
8379static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8380 int argc = 3;
8381 long when;
8382 robj *argv[3];
8383
8384 /* Make sure we can use strtol */
8385 seconds = getDecodedObject(seconds);
8386 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8387 decrRefCount(seconds);
8388
8389 argv[0] = createStringObject("EXPIREAT",8);
8390 argv[1] = key;
8391 argv[2] = createObject(REDIS_STRING,
8392 sdscatprintf(sdsempty(),"%ld",when));
8393 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8394 decrRefCount(argv[0]);
8395 decrRefCount(argv[2]);
8396 return buf;
8397}
8398
f80dff62 8399static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8400 sds buf = sdsempty();
f80dff62 8401 robj *tmpargv[3];
8402
8403 /* The DB this command was targetting is not the same as the last command
8404 * we appendend. To issue a SELECT command is needed. */
8405 if (dictid != server.appendseldb) {
8406 char seldb[64];
8407
8408 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 8409 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 8410 (unsigned long)strlen(seldb),seldb);
f80dff62 8411 server.appendseldb = dictid;
8412 }
8413
f80dff62 8414 if (cmd->proc == expireCommand) {
9376e434
PN
8415 /* Translate EXPIRE into EXPIREAT */
8416 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8417 } else if (cmd->proc == setexCommand) {
8418 /* Translate SETEX to SET and EXPIREAT */
8419 tmpargv[0] = createStringObject("SET",3);
f80dff62 8420 tmpargv[1] = argv[1];
9376e434
PN
8421 tmpargv[2] = argv[3];
8422 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8423 decrRefCount(tmpargv[0]);
8424 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8425 } else {
8426 buf = catAppendOnlyGenericCommand(buf,argc,argv);
f80dff62 8427 }
8428
28ed1f33 8429 /* Append to the AOF buffer. This will be flushed on disk just before
8430 * of re-entering the event loop, so before the client will get a
8431 * positive reply about the operation performed. */
8432 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8433
85a83172 8434 /* If a background append only file rewriting is in progress we want to
8435 * accumulate the differences between the child DB and the current one
8436 * in a buffer, so that when the child process will do its work we
8437 * can append the differences to the new append only file. */
8438 if (server.bgrewritechildpid != -1)
8439 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8440
8441 sdsfree(buf);
f80dff62 8442}
8443
8444/* In Redis commands are always executed in the context of a client, so in
8445 * order to load the append only file we need to create a fake client. */
8446static struct redisClient *createFakeClient(void) {
8447 struct redisClient *c = zmalloc(sizeof(*c));
8448
8449 selectDb(c,0);
8450 c->fd = -1;
8451 c->querybuf = sdsempty();
8452 c->argc = 0;
8453 c->argv = NULL;
8454 c->flags = 0;
9387d17d 8455 /* We set the fake client as a slave waiting for the synchronization
8456 * so that Redis will not try to send replies to this client. */
8457 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 8458 c->reply = listCreate();
8459 listSetFreeMethod(c->reply,decrRefCount);
8460 listSetDupMethod(c->reply,dupClientReplyValue);
4132ad8d 8461 initClientMultiState(c);
f80dff62 8462 return c;
8463}
8464
8465static void freeFakeClient(struct redisClient *c) {
8466 sdsfree(c->querybuf);
8467 listRelease(c->reply);
4132ad8d 8468 freeClientMultiState(c);
f80dff62 8469 zfree(c);
8470}
8471
8472/* Replay the append log file. On error REDIS_OK is returned. On non fatal
8473 * error (the append only file is zero-length) REDIS_ERR is returned. On
8474 * fatal error an error message is logged and the program exists. */
8475int loadAppendOnlyFile(char *filename) {
8476 struct redisClient *fakeClient;
8477 FILE *fp = fopen(filename,"r");
8478 struct redis_stat sb;
b492cf00 8479 unsigned long long loadedkeys = 0;
4132ad8d 8480 int appendonly = server.appendonly;
f80dff62 8481
8482 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8483 return REDIS_ERR;
8484
8485 if (fp == NULL) {
8486 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8487 exit(1);
8488 }
8489
4132ad8d
PN
8490 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8491 * to the same file we're about to read. */
8492 server.appendonly = 0;
8493
f80dff62 8494 fakeClient = createFakeClient();
8495 while(1) {
8496 int argc, j;
8497 unsigned long len;
8498 robj **argv;
8499 char buf[128];
8500 sds argsds;
8501 struct redisCommand *cmd;
8502
8503 if (fgets(buf,sizeof(buf),fp) == NULL) {
8504 if (feof(fp))
8505 break;
8506 else
8507 goto readerr;
8508 }
8509 if (buf[0] != '*') goto fmterr;
8510 argc = atoi(buf+1);
8511 argv = zmalloc(sizeof(robj*)*argc);
8512 for (j = 0; j < argc; j++) {
8513 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8514 if (buf[0] != '$') goto fmterr;
8515 len = strtol(buf+1,NULL,10);
8516 argsds = sdsnewlen(NULL,len);
0f151ef1 8517 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 8518 argv[j] = createObject(REDIS_STRING,argsds);
8519 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8520 }
8521
8522 /* Command lookup */
8523 cmd = lookupCommand(argv[0]->ptr);
8524 if (!cmd) {
8525 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8526 exit(1);
8527 }
bdcb92f2 8528 /* Try object encoding */
f80dff62 8529 if (cmd->flags & REDIS_CMD_BULK)
05df7621 8530 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 8531 /* Run the command in the context of a fake client */
8532 fakeClient->argc = argc;
8533 fakeClient->argv = argv;
8534 cmd->proc(fakeClient);
8535 /* Discard the reply objects list from the fake client */
8536 while(listLength(fakeClient->reply))
8537 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8538 /* Clean up, ready for the next command */
8539 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8540 zfree(argv);
b492cf00 8541 /* Handle swapping while loading big datasets when VM is on */
8542 loadedkeys++;
8543 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8544 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 8545 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 8546 }
8547 }
f80dff62 8548 }
4132ad8d
PN
8549
8550 /* This point can only be reached when EOF is reached without errors.
8551 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8552 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8553
f80dff62 8554 fclose(fp);
8555 freeFakeClient(fakeClient);
4132ad8d 8556 server.appendonly = appendonly;
f80dff62 8557 return REDIS_OK;
8558
8559readerr:
8560 if (feof(fp)) {
8561 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8562 } else {
8563 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8564 }
8565 exit(1);
8566fmterr:
8567 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8568 exit(1);
8569}
8570
9d65a1bb 8571/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
9c8e3cee 8572static int fwriteBulkObject(FILE *fp, robj *obj) {
9d65a1bb 8573 char buf[128];
b9bc0eef 8574 int decrrc = 0;
8575
f2d9f50f 8576 /* Avoid the incr/decr ref count business if possible to help
8577 * copy-on-write (we are often in a child process when this function
8578 * is called).
8579 * Also makes sure that key objects don't get incrRefCount-ed when VM
8580 * is enabled */
8581 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 8582 obj = getDecodedObject(obj);
8583 decrrc = 1;
8584 }
9d65a1bb 8585 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8586 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 8587 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8588 goto err;
9d65a1bb 8589 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 8590 if (decrrc) decrRefCount(obj);
9d65a1bb 8591 return 1;
8592err:
b9bc0eef 8593 if (decrrc) decrRefCount(obj);
9d65a1bb 8594 return 0;
8595}
8596
/* Write binary-safe string into a file in the bulkformat
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes that are written verbatim (embedded nul bytes
 * included). Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* %lu: the conversion must match the unsigned long argument. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() with a zero size returns 0, so skip empty payloads. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8608
/* Write a double value in bulk format $<count>\r\n<payload>\r\n
 * The payload is the value formatted with "%.17g". Returns 1 on success,
 * 0 if any of the writes failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    unsigned long count;

    /* The payload buffer already carries its trailing CRLF, which is not
     * part of the announced length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    count = (unsigned long)strlen(payload)-2;
    snprintf(header,sizeof(header),"$%lu\r\n",count);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,strlen(payload),1,fp) == 0)
        return 0;
    return 1;
}
8619
/* Write a long value in bulk format $<count>\r\n<payload>\r\n
 * The payload is the decimal representation of 'l'. Returns 1 on success,
 * 0 if any of the writes failed. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    unsigned long count;

    /* The payload buffer already carries its trailing CRLF, which is not
     * part of the announced length. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    count = (unsigned long)strlen(payload)-2;
    snprintf(header,sizeof(header),"$%lu\r\n",count);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,strlen(payload),1,fp) == 0)
        return 0;
    return 1;
}
8630
8631/* Write a sequence of commands able to fully rebuild the dataset into
8632 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8633static int rewriteAppendOnlyFile(char *filename) {
8634 dictIterator *di = NULL;
8635 dictEntry *de;
8636 FILE *fp;
8637 char tmpfile[256];
8638 int j;
8639 time_t now = time(NULL);
8640
8641 /* Note that we have to use a different temp name here compared to the
8642 * one used by rewriteAppendOnlyFileBackground() function. */
8643 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8644 fp = fopen(tmpfile,"w");
8645 if (!fp) {
8646 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8647 return REDIS_ERR;
8648 }
8649 for (j = 0; j < server.dbnum; j++) {
8650 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8651 redisDb *db = server.db+j;
8652 dict *d = db->dict;
8653 if (dictSize(d) == 0) continue;
8654 di = dictGetIterator(d);
8655 if (!di) {
8656 fclose(fp);
8657 return REDIS_ERR;
8658 }
8659
8660 /* SELECT the new DB */
8661 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 8662 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 8663
8664 /* Iterate this DB writing every entry */
8665 while((de = dictNext(di)) != NULL) {
e7546c63 8666 robj *key, *o;
8667 time_t expiretime;
8668 int swapped;
8669
8670 key = dictGetEntryKey(de);
560db612 8671 o = dictGetEntryVal(de);
b9bc0eef 8672 /* If the value for this key is swapped, load a preview in memory.
8673 * We use a "swapped" flag to remember if we need to free the
8674 * value object instead to just increment the ref count anyway
8675 * in order to avoid copy-on-write of pages if we are forked() */
560db612 8676 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
8677 o->storage == REDIS_VM_SWAPPING) {
e7546c63 8678 swapped = 0;
8679 } else {
560db612 8680 o = vmPreviewObject(o);
e7546c63 8681 swapped = 1;
8682 }
8683 expiretime = getExpire(db,key);
9d65a1bb 8684
8685 /* Save the key and associated value */
9d65a1bb 8686 if (o->type == REDIS_STRING) {
8687 /* Emit a SET command */
8688 char cmd[]="*3\r\n$3\r\nSET\r\n";
8689 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8690 /* Key and value */
9c8e3cee 8691 if (fwriteBulkObject(fp,key) == 0) goto werr;
8692 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 8693 } else if (o->type == REDIS_LIST) {
8694 /* Emit the RPUSHes needed to rebuild the list */
8695 list *list = o->ptr;
8696 listNode *ln;
c7df85a4 8697 listIter li;
9d65a1bb 8698
c7df85a4 8699 listRewind(list,&li);
8700 while((ln = listNext(&li))) {
9d65a1bb 8701 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8702 robj *eleobj = listNodeValue(ln);
8703
8704 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8705 if (fwriteBulkObject(fp,key) == 0) goto werr;
8706 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8707 }
8708 } else if (o->type == REDIS_SET) {
8709 /* Emit the SADDs needed to rebuild the set */
8710 dict *set = o->ptr;
8711 dictIterator *di = dictGetIterator(set);
8712 dictEntry *de;
8713
8714 while((de = dictNext(di)) != NULL) {
8715 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8716 robj *eleobj = dictGetEntryKey(de);
8717
8718 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8719 if (fwriteBulkObject(fp,key) == 0) goto werr;
8720 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8721 }
8722 dictReleaseIterator(di);
8723 } else if (o->type == REDIS_ZSET) {
8724 /* Emit the ZADDs needed to rebuild the sorted set */
8725 zset *zs = o->ptr;
8726 dictIterator *di = dictGetIterator(zs->dict);
8727 dictEntry *de;
8728
8729 while((de = dictNext(di)) != NULL) {
8730 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8731 robj *eleobj = dictGetEntryKey(de);
8732 double *score = dictGetEntryVal(de);
8733
8734 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8735 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8736 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 8737 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8738 }
8739 dictReleaseIterator(di);
9c8e3cee 8740 } else if (o->type == REDIS_HASH) {
8741 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8742
8743 /* Emit the HSETs needed to rebuild the hash */
8744 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8745 unsigned char *p = zipmapRewind(o->ptr);
8746 unsigned char *field, *val;
8747 unsigned int flen, vlen;
8748
8749 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8750 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8751 if (fwriteBulkObject(fp,key) == 0) goto werr;
8752 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8753 return -1;
8754 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8755 return -1;
8756 }
8757 } else {
8758 dictIterator *di = dictGetIterator(o->ptr);
8759 dictEntry *de;
8760
8761 while((de = dictNext(di)) != NULL) {
8762 robj *field = dictGetEntryKey(de);
8763 robj *val = dictGetEntryVal(de);
8764
8765 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8766 if (fwriteBulkObject(fp,key) == 0) goto werr;
8767 if (fwriteBulkObject(fp,field) == -1) return -1;
8768 if (fwriteBulkObject(fp,val) == -1) return -1;
8769 }
8770 dictReleaseIterator(di);
8771 }
9d65a1bb 8772 } else {
f83c6cb5 8773 redisPanic("Unknown object type");
9d65a1bb 8774 }
8775 /* Save the expire time */
8776 if (expiretime != -1) {
e96e4fbf 8777 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 8778 /* If this key is already expired skip it */
8779 if (expiretime < now) continue;
8780 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8781 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8782 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8783 }
b9bc0eef 8784 if (swapped) decrRefCount(o);
9d65a1bb 8785 }
8786 dictReleaseIterator(di);
8787 }
8788
8789 /* Make sure data will not remain on the OS's output buffers */
8790 fflush(fp);
b0bd87f6 8791 aof_fsync(fileno(fp));
9d65a1bb 8792 fclose(fp);
e0a62c7f 8793
9d65a1bb 8794 /* Use RENAME to make sure the DB file is changed atomically only
8795 * if the generate DB file is ok. */
8796 if (rename(tmpfile,filename) == -1) {
8797 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8798 unlink(tmpfile);
8799 return REDIS_ERR;
8800 }
8801 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8802 return REDIS_OK;
8803
8804werr:
8805 fclose(fp);
8806 unlink(tmpfile);
e96e4fbf 8807 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 8808 if (di) dictReleaseIterator(di);
8809 return REDIS_ERR;
8810}
8811
8812/* This is how rewriting of the append only file in background works:
8813 *
8814 * 1) The user calls BGREWRITEAOF
8815 * 2) Redis calls this function, that forks():
8816 * 2a) the child rewrite the append only file in a temp file.
8817 * 2b) the parent accumulates differences in server.bgrewritebuf.
8818 * 3) When the child finished '2a' exists.
8819 * 4) The parent will trap the exit code, if it's OK, will append the
8820 * data accumulated into server.bgrewritebuf into the temp file, and
8821 * finally will rename(2) the temp file in the actual file name.
8822 * The the new file is reopened as the new append only file. Profit!
8823 */
8824static int rewriteAppendOnlyFileBackground(void) {
8825 pid_t childpid;
8826
8827 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 8828 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 8829 if ((childpid = fork()) == 0) {
8830 /* Child */
8831 char tmpfile[256];
9d65a1bb 8832
054e426d 8833 if (server.vm_enabled) vmReopenSwapFile();
8834 close(server.fd);
9d65a1bb 8835 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8836 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 8837 _exit(0);
9d65a1bb 8838 } else {
478c2c6f 8839 _exit(1);
9d65a1bb 8840 }
8841 } else {
8842 /* Parent */
8843 if (childpid == -1) {
8844 redisLog(REDIS_WARNING,
8845 "Can't rewrite append only file in background: fork: %s",
8846 strerror(errno));
8847 return REDIS_ERR;
8848 }
8849 redisLog(REDIS_NOTICE,
8850 "Background append only file rewriting started by pid %d",childpid);
8851 server.bgrewritechildpid = childpid;
884d4b39 8852 updateDictResizePolicy();
85a83172 8853 /* We set appendseldb to -1 in order to force the next call to the
8854 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8855 * accumulated by the parent into server.bgrewritebuf will start
8856 * with a SELECT statement and it will be safe to merge. */
8857 server.appendseldb = -1;
9d65a1bb 8858 return REDIS_OK;
8859 }
8860 return REDIS_OK; /* unreached */
8861}
8862
8863static void bgrewriteaofCommand(redisClient *c) {
8864 if (server.bgrewritechildpid != -1) {
8865 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8866 return;
8867 }
8868 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 8869 char *status = "+Background append only file rewriting started\r\n";
8870 addReplySds(c,sdsnew(status));
9d65a1bb 8871 } else {
8872 addReply(c,shared.err);
8873 }
8874}
8875
/* Unlink the temp file named "temp-rewriteaof-bg-<childpid>.aof" (relative
 * to the current working directory). The result of unlink(2) is not
 * checked, so a missing file is silently ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
8882
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make more clear what functionality is about the blocking VM and what about
 * the threaded (not blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
8903
560db612 8904/* =================== Virtual Memory - Blocking Side ====================== */
2e5eb04e 8905
560db612 8906/* Create a VM pointer object. This kind of objects are used in place of
8907 * values in the key -> value hash table, for swapped out objects. */
8908static vmpointer *createVmPointer(int vtype) {
8909 vmpointer *vp = zmalloc(sizeof(vmpointer));
2e5eb04e 8910
560db612 8911 vp->type = REDIS_VMPOINTER;
8912 vp->storage = REDIS_VM_SWAPPED;
8913 vp->vtype = vtype;
8914 return vp;
2e5eb04e 8915}
8916
75680a3c 8917static void vmInit(void) {
8918 off_t totsize;
996cb5f7 8919 int pipefds[2];
bcaa7a4f 8920 size_t stacksize;
8b5bb414 8921 struct flock fl;
75680a3c 8922
4ad37480 8923 if (server.vm_max_threads != 0)
8924 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8925
054e426d 8926 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8b5bb414 8927 /* Try to open the old swap file, otherwise create it */
6fa987e3 8928 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8929 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8930 }
75680a3c 8931 if (server.vm_fp == NULL) {
6fa987e3 8932 redisLog(REDIS_WARNING,
8b5bb414 8933 "Can't open the swap file: %s. Exiting.",
6fa987e3 8934 strerror(errno));
75680a3c 8935 exit(1);
8936 }
8937 server.vm_fd = fileno(server.vm_fp);
8b5bb414 8938 /* Lock the swap file for writing, this is useful in order to avoid
8939 * another instance to use the same swap file for a config error. */
8940 fl.l_type = F_WRLCK;
8941 fl.l_whence = SEEK_SET;
8942 fl.l_start = fl.l_len = 0;
8943 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8944 redisLog(REDIS_WARNING,
8945 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8946 exit(1);
8947 }
8948 /* Initialize */
75680a3c 8949 server.vm_next_page = 0;
8950 server.vm_near_pages = 0;
7d98e08c 8951 server.vm_stats_used_pages = 0;
8952 server.vm_stats_swapped_objects = 0;
8953 server.vm_stats_swapouts = 0;
8954 server.vm_stats_swapins = 0;
75680a3c 8955 totsize = server.vm_pages*server.vm_page_size;
8956 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8957 if (ftruncate(server.vm_fd,totsize) == -1) {
8958 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8959 strerror(errno));
8960 exit(1);
8961 } else {
8962 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8963 }
7d30035d 8964 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 8965 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 8966 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 8967 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 8968
996cb5f7 8969 /* Initialize threaded I/O (used by Virtual Memory) */
8970 server.io_newjobs = listCreate();
8971 server.io_processing = listCreate();
8972 server.io_processed = listCreate();
d5d55fc3 8973 server.io_ready_clients = listCreate();
92f8e882 8974 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 8975 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8976 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 8977 server.io_active_threads = 0;
996cb5f7 8978 if (pipe(pipefds) == -1) {
8979 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8980 ,strerror(errno));
8981 exit(1);
8982 }
8983 server.io_ready_pipe_read = pipefds[0];
8984 server.io_ready_pipe_write = pipefds[1];
8985 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 8986 /* LZF requires a lot of stack */
8987 pthread_attr_init(&server.io_threads_attr);
8988 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8989 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8990 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 8991 /* Listen for events in the threaded I/O pipe */
8992 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8993 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8994 oom("creating file event");
75680a3c 8995}
8996
06224fec 8997/* Mark the page as used */
8998static void vmMarkPageUsed(off_t page) {
8999 off_t byte = page/8;
9000 int bit = page&7;
970e10bb 9001 redisAssert(vmFreePage(page) == 1);
06224fec 9002 server.vm_bitmap[byte] |= 1<<bit;
9003}
9004
9005/* Mark N contiguous pages as used, with 'page' being the first. */
9006static void vmMarkPagesUsed(off_t page, off_t count) {
9007 off_t j;
9008
9009 for (j = 0; j < count; j++)
7d30035d 9010 vmMarkPageUsed(page+j);
7d98e08c 9011 server.vm_stats_used_pages += count;
7c775e09 9012 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
9013 (long long)count, (long long)page);
06224fec 9014}
9015
9016/* Mark the page as free */
9017static void vmMarkPageFree(off_t page) {
9018 off_t byte = page/8;
9019 int bit = page&7;
970e10bb 9020 redisAssert(vmFreePage(page) == 0);
06224fec 9021 server.vm_bitmap[byte] &= ~(1<<bit);
9022}
9023
9024/* Mark N contiguous pages as free, with 'page' being the first. */
9025static void vmMarkPagesFree(off_t page, off_t count) {
9026 off_t j;
9027
9028 for (j = 0; j < count; j++)
7d30035d 9029 vmMarkPageFree(page+j);
7d98e08c 9030 server.vm_stats_used_pages -= count;
7c775e09 9031 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
9032 (long long)count, (long long)page);
06224fec 9033}
9034
9035/* Test if the page is free */
9036static int vmFreePage(off_t page) {
9037 off_t byte = page/8;
9038 int bit = page&7;
7d30035d 9039 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 9040}
9041
9042/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 9043 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 9044 * REDIS_ERR is returned.
06224fec 9045 *
9046 * This function uses a simple algorithm: we try to allocate
9047 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9048 * again from the start of the swap file searching for free spaces.
9049 *
9050 * If it looks pretty clear that there are no free pages near our offset
9051 * we try to find less populated places doing a forward jump of
9052 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9053 * without hurry, and then we jump again and so forth...
e0a62c7f 9054 *
06224fec 9055 * This function can be improved using a free list to avoid to guess
9056 * too much, since we could collect data about freed pages.
9057 *
9058 * note: I implemented this function just after watching an episode of
9059 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9060 */
c7df85a4 9061static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 9062 off_t base, offset = 0, since_jump = 0, numfree = 0;
9063
9064 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9065 server.vm_near_pages = 0;
9066 server.vm_next_page = 0;
9067 }
9068 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9069 base = server.vm_next_page;
9070
9071 while(offset < server.vm_pages) {
9072 off_t this = base+offset;
9073
9074 /* If we overflow, restart from page zero */
9075 if (this >= server.vm_pages) {
9076 this -= server.vm_pages;
9077 if (this == 0) {
9078 /* Just overflowed, what we found on tail is no longer
9079 * interesting, as it's no longer contiguous. */
9080 numfree = 0;
9081 }
9082 }
9083 if (vmFreePage(this)) {
9084 /* This is a free page */
9085 numfree++;
9086 /* Already got N free pages? Return to the caller, with success */
9087 if (numfree == n) {
7d30035d 9088 *first = this-(n-1);
9089 server.vm_next_page = this+1;
7c775e09 9090 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 9091 return REDIS_OK;
06224fec 9092 }
9093 } else {
9094 /* The current one is not a free page */
9095 numfree = 0;
9096 }
9097
9098 /* Fast-forward if the current page is not free and we already
9099 * searched enough near this place. */
9100 since_jump++;
9101 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9102 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9103 since_jump = 0;
9104 /* Note that even if we rewind after the jump, we are don't need
9105 * to make sure numfree is set to zero as we only jump *if* it
9106 * is set to zero. */
9107 } else {
9108 /* Otherwise just check the next page */
9109 offset++;
9110 }
9111 }
3a66edc7 9112 return REDIS_ERR;
9113}
9114
a5819310 9115/* Write the specified object at the specified page of the swap file */
9116static int vmWriteObjectOnSwap(robj *o, off_t page) {
9117 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9118 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9119 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9120 redisLog(REDIS_WARNING,
9ebed7cf 9121 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 9122 strerror(errno));
9123 return REDIS_ERR;
9124 }
9125 rdbSaveObject(server.vm_fp,o);
ba76a8f9 9126 fflush(server.vm_fp);
a5819310 9127 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9128 return REDIS_OK;
9129}
9130
a4798f73 9131/* Transfers the 'val' object to disk. Store all the information
9132 * a 'vmpointer' object containing all the information needed to load the
9133 * object back later is returned.
9134 *
3a66edc7 9135 * If we can't find enough contiguous empty pages to swap the object on disk
a4798f73 9136 * NULL is returned. */
560db612 9137static vmpointer *vmSwapObjectBlocking(robj *val) {
b9bc0eef 9138 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 9139 off_t page;
560db612 9140 vmpointer *vp;
3a66edc7 9141
560db612 9142 assert(val->storage == REDIS_VM_MEMORY);
9143 assert(val->refcount == 1);
9144 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return NULL;
9145 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return NULL;
9146
9147 vp = createVmPointer(val->type);
9148 vp->page = page;
9149 vp->usedpages = pages;
3a66edc7 9150 decrRefCount(val); /* Deallocate the object from memory. */
9151 vmMarkPagesUsed(page,pages);
560db612 9152 redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)",
9153 (void*) val,
7d30035d 9154 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 9155 server.vm_stats_swapped_objects++;
9156 server.vm_stats_swapouts++;
560db612 9157 return vp;
3a66edc7 9158}
9159
a5819310 9160static robj *vmReadObjectFromSwap(off_t page, int type) {
9161 robj *o;
3a66edc7 9162
a5819310 9163 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9164 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 9165 redisLog(REDIS_WARNING,
d5d55fc3 9166 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 9167 strerror(errno));
478c2c6f 9168 _exit(1);
3a66edc7 9169 }
a5819310 9170 o = rdbLoadObject(type,server.vm_fp);
9171 if (o == NULL) {
d5d55fc3 9172 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 9173 _exit(1);
3a66edc7 9174 }
a5819310 9175 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9176 return o;
9177}
9178
560db612 9179/* Load the specified object from swap to memory.
a5819310 9180 * The newly allocated object is returned.
9181 *
9182 * If preview is true the unserialized object is returned to the caller but
560db612 9183 * the pages are not marked as freed, nor the vp object is freed. */
9184static robj *vmGenericLoadObject(vmpointer *vp, int preview) {
a5819310 9185 robj *val;
9186
560db612 9187 redisAssert(vp->type == REDIS_VMPOINTER &&
9188 (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING));
9189 val = vmReadObjectFromSwap(vp->page,vp->vtype);
7e69548d 9190 if (!preview) {
560db612 9191 redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp);
9192 vmMarkPagesFree(vp->page,vp->usedpages);
9193 zfree(vp);
7d98e08c 9194 server.vm_stats_swapped_objects--;
38aba9a1 9195 } else {
560db612 9196 redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp);
7e69548d 9197 }
7d98e08c 9198 server.vm_stats_swapins++;
3a66edc7 9199 return val;
06224fec 9200}
9201
560db612 9202/* Plain object loading, from swap to memory.
9203 *
9204 * 'o' is actually a redisVmPointer structure that will be freed by the call.
9205 * The return value is the loaded object. */
9206static robj *vmLoadObject(robj *o) {
996cb5f7 9207 /* If we are loading the object in background, stop it, we
9208 * need to load this object synchronously ASAP. */
560db612 9209 if (o->storage == REDIS_VM_LOADING)
9210 vmCancelThreadedIOJob(o);
9211 return vmGenericLoadObject((vmpointer*)o,0);
7e69548d 9212}
9213
9214/* Just load the value on disk, without to modify the key.
9215 * This is useful when we want to perform some operation on the value
9216 * without to really bring it from swap to memory, like while saving the
9217 * dataset or rewriting the append only log. */
560db612 9218static robj *vmPreviewObject(robj *o) {
9219 return vmGenericLoadObject((vmpointer*)o,1);
7e69548d 9220}
9221
4ef8de8a 9222/* How a good candidate is this object for swapping?
9223 * The better candidate it is, the greater the returned value.
9224 *
9225 * Currently we try to perform a fast estimation of the object size in
9226 * memory, and combine it with aging informations.
9227 *
9228 * Basically swappability = idle-time * log(estimated size)
9229 *
9230 * Bigger objects are preferred over smaller objects, but not
9231 * proportionally, this is why we use the logarithm. This algorithm is
9232 * just a first try and will probably be tuned later. */
9233static double computeObjectSwappability(robj *o) {
560db612 9234 /* actual age can be >= minage, but not < minage. As we use wrapping
9235 * 21 bit clocks with minutes resolution for the LRU. */
9236 time_t minage = abs(server.lruclock - o->lru);
4ef8de8a 9237 long asize = 0;
9238 list *l;
9239 dict *d;
9240 struct dictEntry *de;
9241 int z;
9242
560db612 9243 if (minage <= 0) return 0;
4ef8de8a 9244 switch(o->type) {
9245 case REDIS_STRING:
9246 if (o->encoding != REDIS_ENCODING_RAW) {
9247 asize = sizeof(*o);
9248 } else {
9249 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9250 }
9251 break;
9252 case REDIS_LIST:
9253 l = o->ptr;
9254 listNode *ln = listFirst(l);
9255
9256 asize = sizeof(list);
9257 if (ln) {
9258 robj *ele = ln->value;
9259 long elesize;
9260
9261 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9262 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
4ef8de8a 9263 asize += (sizeof(listNode)+elesize)*listLength(l);
9264 }
9265 break;
9266 case REDIS_SET:
9267 case REDIS_ZSET:
9268 z = (o->type == REDIS_ZSET);
9269 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9270
9271 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9272 if (z) asize += sizeof(zset)-sizeof(dict);
9273 if (dictSize(d)) {
9274 long elesize;
9275 robj *ele;
9276
9277 de = dictGetRandomKey(d);
9278 ele = dictGetEntryKey(de);
9279 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9280 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
4ef8de8a 9281 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9282 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9283 }
9284 break;
a97b9060 9285 case REDIS_HASH:
9286 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9287 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9288 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9289 unsigned int klen, vlen;
9290 unsigned char *key, *val;
9291
9292 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9293 klen = 0;
9294 vlen = 0;
9295 }
9296 asize = len*(klen+vlen+3);
9297 } else if (o->encoding == REDIS_ENCODING_HT) {
9298 d = o->ptr;
9299 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9300 if (dictSize(d)) {
9301 long elesize;
9302 robj *ele;
9303
9304 de = dictGetRandomKey(d);
9305 ele = dictGetEntryKey(de);
9306 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9307 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
a97b9060 9308 ele = dictGetEntryVal(de);
9309 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9310 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
a97b9060 9311 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9312 }
9313 }
9314 break;
4ef8de8a 9315 }
560db612 9316 return (double)minage*log(1+asize);
4ef8de8a 9317}
9318
9319/* Try to swap an object that's a good candidate for swapping.
9320 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 9321 * to swap any object at all.
9322 *
9323 * If 'usethreaded' is true, Redis will try to swap the object in background
9324 * using I/O threads. */
9325static int vmSwapOneObject(int usethreads) {
4ef8de8a 9326 int j, i;
9327 struct dictEntry *best = NULL;
9328 double best_swappability = 0;
b9bc0eef 9329 redisDb *best_db = NULL;
4ef8de8a 9330 robj *key, *val;
9331
9332 for (j = 0; j < server.dbnum; j++) {
9333 redisDb *db = server.db+j;
b72f6a4b 9334 /* Why maxtries is set to 100?
9335 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9336 * are swappable objects */
b0d8747d 9337 int maxtries = 100;
4ef8de8a 9338
9339 if (dictSize(db->dict) == 0) continue;
9340 for (i = 0; i < 5; i++) {
9341 dictEntry *de;
9342 double swappability;
9343
e3cadb8a 9344 if (maxtries) maxtries--;
4ef8de8a 9345 de = dictGetRandomKey(db->dict);
9346 key = dictGetEntryKey(de);
9347 val = dictGetEntryVal(de);
1064ef87 9348 /* Only swap objects that are currently in memory.
9349 *
560db612 9350 * Also don't swap shared objects: not a good idea in general and
9351 * we need to ensure that the main thread does not touch the
1064ef87 9352 * object while the I/O thread is using it, but we can't
9353 * control other keys without adding additional mutex. */
560db612 9354 if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) {
e3cadb8a 9355 if (maxtries) i--; /* don't count this try */
9356 continue;
9357 }
4ef8de8a 9358 swappability = computeObjectSwappability(val);
9359 if (!best || swappability > best_swappability) {
9360 best = de;
9361 best_swappability = swappability;
b9bc0eef 9362 best_db = db;
4ef8de8a 9363 }
9364 }
9365 }
7c775e09 9366 if (best == NULL) return REDIS_ERR;
4ef8de8a 9367 key = dictGetEntryKey(best);
9368 val = dictGetEntryVal(best);
9369
e3cadb8a 9370 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 9371 key->ptr, best_swappability);
9372
4ef8de8a 9373 /* Swap it */
a69a0c9c 9374 if (usethreads) {
b9bc0eef 9375 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 9376 return REDIS_OK;
9377 } else {
560db612 9378 vmpointer *vp;
9379
9380 if ((vp = vmSwapObjectBlocking(val)) != NULL) {
9381 dictGetEntryVal(best) = vp;
a69a0c9c 9382 return REDIS_OK;
9383 } else {
9384 return REDIS_ERR;
9385 }
4ef8de8a 9386 }
9387}
9388
/* Swap one object synchronously: shortcut for vmSwapOneObject(0).
 * Returns whatever vmSwapOneObject() returns. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9392
/* Swap one object using the I/O threads: shortcut for vmSwapOneObject(1).
 * Returns whatever vmSwapOneObject() returns. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9396
7e69548d 9397/* Return true if it's safe to swap out objects in a given moment.
9398 * Basically we don't want to swap objects out while there is a BGSAVE
9399 * or a BGAEOREWRITE running in backgroud. */
9400static int vmCanSwapOut(void) {
9401 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9402}
9403
1b03836c 9404/* Delete a key if swapped. Returns 1 if the key was found, was swapped
9405 * and was deleted. Otherwise 0 is returned. */
9406static int deleteIfSwapped(redisDb *db, robj *key) {
560db612 9407 robj *val;
1b03836c 9408
560db612 9409 if ((val = dictFetchValue(db->dict,key)) == NULL) return 0;
9410 if (val->storage == REDIS_VM_MEMORY) return 0;
1b03836c 9411 deleteKey(db,key);
9412 return 1;
9413}
9414
996cb5f7 9415/* =================== Virtual Memory - Threaded I/O ======================= */
9416
b9bc0eef 9417static void freeIOJob(iojob *j) {
d5d55fc3 9418 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9419 j->type == REDIS_IOJOB_DO_SWAP ||
9420 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
560db612 9421 {
e4ed181d 9422 /* we fix the storage type, otherwise decrRefCount() will try to
9423 * kill the I/O thread Job (that does no longer exists). */
9424 if (j->val->storage == REDIS_VM_SWAPPING)
560db612 9425 j->val->storage = REDIS_VM_MEMORY;
b9bc0eef 9426 decrRefCount(j->val);
560db612 9427 }
9428 decrRefCount(j->key);
b9bc0eef 9429 zfree(j);
9430}
9431
996cb5f7 9432/* Every time a thread finished a Job, it writes a byte into the write side
9433 * of an unix pipe in order to "awake" the main thread, and this function
9434 * is called. */
9435static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9436 int mask)
9437{
9438 char buf[1];
b0d8747d 9439 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 9440 REDIS_NOTUSED(el);
9441 REDIS_NOTUSED(mask);
9442 REDIS_NOTUSED(privdata);
9443
9444 /* For every byte we read in the read side of the pipe, there is one
9445 * I/O job completed to process. */
9446 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 9447 iojob *j;
9448 listNode *ln;
b9bc0eef 9449 struct dictEntry *de;
9450
996cb5f7 9451 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 9452
9453 /* Get the processed element (the oldest one) */
9454 lockThreadedIO();
1064ef87 9455 assert(listLength(server.io_processed) != 0);
f6c0bba8 9456 if (toprocess == -1) {
9457 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9458 if (toprocess <= 0) toprocess = 1;
9459 }
b9bc0eef 9460 ln = listFirst(server.io_processed);
9461 j = ln->value;
9462 listDelNode(server.io_processed,ln);
9463 unlockThreadedIO();
9464 /* If this job is marked as canceled, just ignore it */
9465 if (j->canceled) {
9466 freeIOJob(j);
9467 continue;
9468 }
9469 /* Post process it in the main thread, as there are things we
9470 * can do just here to avoid race conditions and/or invasive locks */
560db612 9471 redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr);
b9bc0eef 9472 de = dictFind(j->db->dict,j->key);
e4ed181d 9473 redisAssert(de != NULL);
b9bc0eef 9474 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9475 redisDb *db;
560db612 9476 vmpointer *vp = dictGetEntryVal(de);
d5d55fc3 9477
b9bc0eef 9478 /* Key loaded, bring it at home */
560db612 9479 vmMarkPagesFree(vp->page,vp->usedpages);
b9bc0eef 9480 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
560db612 9481 (unsigned char*) j->key->ptr);
b9bc0eef 9482 server.vm_stats_swapped_objects--;
9483 server.vm_stats_swapins++;
d5d55fc3 9484 dictGetEntryVal(de) = j->val;
9485 incrRefCount(j->val);
9486 db = j->db;
d5d55fc3 9487 /* Handle clients waiting for this key to be loaded. */
560db612 9488 handleClientsBlockedOnSwappedKey(db,j->key);
9489 freeIOJob(j);
9490 zfree(vp);
b9bc0eef 9491 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9492 /* Now we know the amount of pages required to swap this object.
9493 * Let's find some space for it, and queue this task again
9494 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 9495 if (!vmCanSwapOut() ||
9496 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9497 {
9498 /* Ooops... no space or we can't swap as there is
9499 * a fork()ed Redis trying to save stuff on disk. */
560db612 9500 j->val->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 9501 freeIOJob(j);
9502 } else {
c7df85a4 9503 /* Note that we need to mark this pages as used now,
9504 * if the job will be canceled, we'll mark them as freed
9505 * again. */
9506 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 9507 j->type = REDIS_IOJOB_DO_SWAP;
9508 lockThreadedIO();
9509 queueIOJob(j);
9510 unlockThreadedIO();
9511 }
9512 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
560db612 9513 vmpointer *vp;
b9bc0eef 9514
9515 /* Key swapped. We can finally free some memory. */
560db612 9516 if (j->val->storage != REDIS_VM_SWAPPING) {
9517 vmpointer *vp = (vmpointer*) j->id;
9518 printf("storage: %d\n",vp->storage);
9519 printf("key->name: %s\n",(char*)j->key->ptr);
6c96ba7d 9520 printf("val: %p\n",(void*)j->val);
9521 printf("val->type: %d\n",j->val->type);
9522 printf("val->ptr: %s\n",(char*)j->val->ptr);
9523 }
560db612 9524 redisAssert(j->val->storage == REDIS_VM_SWAPPING);
9525 vp = createVmPointer(j->val->type);
9526 vp->page = j->page;
9527 vp->usedpages = j->pages;
9528 dictGetEntryVal(de) = vp;
e4ed181d 9529 /* Fix the storage otherwise decrRefCount will attempt to
9530 * remove the associated I/O job */
9531 j->val->storage = REDIS_VM_MEMORY;
560db612 9532 decrRefCount(j->val);
b9bc0eef 9533 redisLog(REDIS_DEBUG,
9534 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
560db612 9535 (unsigned char*) j->key->ptr,
b9bc0eef 9536 (unsigned long long) j->page, (unsigned long long) j->pages);
9537 server.vm_stats_swapped_objects++;
9538 server.vm_stats_swapouts++;
9539 freeIOJob(j);
f11b8647 9540 /* Put a few more swap requests in queue if we are still
9541 * out of memory */
b0d8747d 9542 if (trytoswap && vmCanSwapOut() &&
9543 zmalloc_used_memory() > server.vm_max_memory)
9544 {
f11b8647 9545 int more = 1;
9546 while(more) {
9547 lockThreadedIO();
9548 more = listLength(server.io_newjobs) <
9549 (unsigned) server.vm_max_threads;
9550 unlockThreadedIO();
9551 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 9552 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9553 trytoswap = 0;
9554 break;
9555 }
f11b8647 9556 }
9557 }
b9bc0eef 9558 }
c953f24b 9559 processed++;
f6c0bba8 9560 if (processed == toprocess) return;
996cb5f7 9561 }
9562 if (retval < 0 && errno != EAGAIN) {
9563 redisLog(REDIS_WARNING,
9564 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9565 strerror(errno));
9566 }
9567}
9568
9569static void lockThreadedIO(void) {
9570 pthread_mutex_lock(&server.io_mutex);
9571}
9572
9573static void unlockThreadedIO(void) {
9574 pthread_mutex_unlock(&server.io_mutex);
9575}
9576
9577/* Remove the specified object from the threaded I/O queue if still not
9578 * processed, otherwise make sure to flag it as canceled. */
9579static void vmCancelThreadedIOJob(robj *o) {
9580 list *lists[3] = {
6c96ba7d 9581 server.io_newjobs, /* 0 */
9582 server.io_processing, /* 1 */
9583 server.io_processed /* 2 */
996cb5f7 9584 };
9585 int i;
9586
9587 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 9588again:
996cb5f7 9589 lockThreadedIO();
560db612 9590 /* Search for a matching object in one of the queues */
996cb5f7 9591 for (i = 0; i < 3; i++) {
9592 listNode *ln;
c7df85a4 9593 listIter li;
996cb5f7 9594
c7df85a4 9595 listRewind(lists[i],&li);
9596 while ((ln = listNext(&li)) != NULL) {
996cb5f7 9597 iojob *job = ln->value;
9598
6c96ba7d 9599 if (job->canceled) continue; /* Skip this, already canceled. */
560db612 9600 if (job->id == o) {
dbc289ae 9601 redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n",
9602 (void*)job, (char*)job->key->ptr, job->type, i);
427a2153 9603 /* Mark the pages as free since the swap didn't happened
9604 * or happened but is now discarded. */
970e10bb 9605 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 9606 vmMarkPagesFree(job->page,job->pages);
9607 /* Cancel the job. It depends on the list the job is
9608 * living in. */
996cb5f7 9609 switch(i) {
9610 case 0: /* io_newjobs */
6c96ba7d 9611 /* If the job was yet not processed the best thing to do
996cb5f7 9612 * is to remove it from the queue at all */
6c96ba7d 9613 freeIOJob(job);
996cb5f7 9614 listDelNode(lists[i],ln);
9615 break;
9616 case 1: /* io_processing */
d5d55fc3 9617 /* Oh Shi- the thread is messing with the Job:
9618 *
9619 * Probably it's accessing the object if this is a
9620 * PREPARE_SWAP or DO_SWAP job.
9621 * If it's a LOAD job it may be reading from disk and
9622 * if we don't wait for the job to terminate before to
9623 * cancel it, maybe in a few microseconds data can be
9624 * corrupted in this pages. So the short story is:
9625 *
9626 * Better to wait for the job to move into the
9627 * next queue (processed)... */
9628
9629 /* We try again and again until the job is completed. */
9630 unlockThreadedIO();
9631 /* But let's wait some time for the I/O thread
9632 * to finish with this job. After all this condition
9633 * should be very rare. */
9634 usleep(1);
9635 goto again;
996cb5f7 9636 case 2: /* io_processed */
2e111efe 9637 /* The job was already processed, that's easy...
9638 * just mark it as canceled so that we'll ignore it
9639 * when processing completed jobs. */
996cb5f7 9640 job->canceled = 1;
9641 break;
9642 }
c7df85a4 9643 /* Finally we have to adjust the storage type of the object
9644 * in order to "UNDO" the operaiton. */
996cb5f7 9645 if (o->storage == REDIS_VM_LOADING)
9646 o->storage = REDIS_VM_SWAPPED;
9647 else if (o->storage == REDIS_VM_SWAPPING)
9648 o->storage = REDIS_VM_MEMORY;
9649 unlockThreadedIO();
e4ed181d 9650 redisLog(REDIS_DEBUG,"*** DONE");
996cb5f7 9651 return;
9652 }
9653 }
9654 }
9655 unlockThreadedIO();
560db612 9656 printf("Not found: %p\n", (void*)o);
9657 redisAssert(1 != 1); /* We should never reach this */
996cb5f7 9658}
9659
b9bc0eef 9660static void *IOThreadEntryPoint(void *arg) {
9661 iojob *j;
9662 listNode *ln;
9663 REDIS_NOTUSED(arg);
9664
9665 pthread_detach(pthread_self());
9666 while(1) {
9667 /* Get a new job to process */
9668 lockThreadedIO();
9669 if (listLength(server.io_newjobs) == 0) {
9670 /* No new jobs in queue, exit. */
9ebed7cf 9671 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9672 (long) pthread_self());
b9bc0eef 9673 server.io_active_threads--;
9674 unlockThreadedIO();
9675 return NULL;
9676 }
9677 ln = listFirst(server.io_newjobs);
9678 j = ln->value;
9679 listDelNode(server.io_newjobs,ln);
9680 /* Add the job in the processing queue */
9681 j->thread = pthread_self();
9682 listAddNodeTail(server.io_processing,j);
9683 ln = listLast(server.io_processing); /* We use ln later to remove it */
9684 unlockThreadedIO();
9ebed7cf 9685 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9686 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 9687
9688 /* Process the Job */
9689 if (j->type == REDIS_IOJOB_LOAD) {
560db612 9690 vmpointer *vp = (vmpointer*)j->id;
9691 j->val = vmReadObjectFromSwap(j->page,vp->vtype);
b9bc0eef 9692 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9693 FILE *fp = fopen("/dev/null","w+");
9694 j->pages = rdbSavedObjectPages(j->val,fp);
9695 fclose(fp);
9696 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 9697 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9698 j->canceled = 1;
b9bc0eef 9699 }
9700
9701 /* Done: insert the job into the processed queue */
9ebed7cf 9702 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9703 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 9704 lockThreadedIO();
9705 listDelNode(server.io_processing,ln);
9706 listAddNodeTail(server.io_processed,j);
9707 unlockThreadedIO();
e0a62c7f 9708
b9bc0eef 9709 /* Signal the main thread there is new stuff to process */
9710 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9711 }
9712 return NULL; /* never reached */
9713}
9714
9715static void spawnIOThread(void) {
9716 pthread_t thread;
478c2c6f 9717 sigset_t mask, omask;
a97b9060 9718 int err;
b9bc0eef 9719
478c2c6f 9720 sigemptyset(&mask);
9721 sigaddset(&mask,SIGCHLD);
9722 sigaddset(&mask,SIGHUP);
9723 sigaddset(&mask,SIGPIPE);
9724 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 9725 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9726 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9727 strerror(err));
9728 usleep(1000000);
9729 }
478c2c6f 9730 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 9731 server.io_active_threads++;
9732}
9733
4ee9488d 9734/* We need to wait for the last thread to exit before we are able to
9735 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 9736static void waitEmptyIOJobsQueue(void) {
4ee9488d 9737 while(1) {
76b7233a 9738 int io_processed_len;
9739
4ee9488d 9740 lockThreadedIO();
054e426d 9741 if (listLength(server.io_newjobs) == 0 &&
9742 listLength(server.io_processing) == 0 &&
9743 server.io_active_threads == 0)
9744 {
4ee9488d 9745 unlockThreadedIO();
9746 return;
9747 }
76b7233a 9748 /* While waiting for empty jobs queue condition we post-process some
9749 * finshed job, as I/O threads may be hanging trying to write against
9750 * the io_ready_pipe_write FD but there are so much pending jobs that
9751 * it's blocking. */
9752 io_processed_len = listLength(server.io_processed);
4ee9488d 9753 unlockThreadedIO();
76b7233a 9754 if (io_processed_len) {
9755 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9756 usleep(1000); /* 1 millisecond */
9757 } else {
9758 usleep(10000); /* 10 milliseconds */
9759 }
4ee9488d 9760 }
9761}
9762
054e426d 9763static void vmReopenSwapFile(void) {
478c2c6f 9764 /* Note: we don't close the old one as we are in the child process
9765 * and don't want to mess at all with the original file object. */
054e426d 9766 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9767 if (server.vm_fp == NULL) {
9768 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9769 server.vm_swap_file);
478c2c6f 9770 _exit(1);
054e426d 9771 }
9772 server.vm_fd = fileno(server.vm_fp);
9773}
9774
b9bc0eef 9775/* This function must be called while with threaded IO locked */
9776static void queueIOJob(iojob *j) {
6c96ba7d 9777 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9778 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 9779 listAddNodeTail(server.io_newjobs,j);
9780 if (server.io_active_threads < server.vm_max_threads)
9781 spawnIOThread();
9782}
9783
9784static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9785 iojob *j;
e0a62c7f 9786
b9bc0eef 9787 assert(key->storage == REDIS_VM_MEMORY);
b9bc0eef 9788
9789 j = zmalloc(sizeof(*j));
9790 j->type = REDIS_IOJOB_PREPARE_SWAP;
9791 j->db = db;
78ebe4c8 9792 j->key = key;
7dd8e7cf 9793 incrRefCount(key);
560db612 9794 j->id = j->val = val;
b9bc0eef 9795 incrRefCount(val);
9796 j->canceled = 0;
9797 j->thread = (pthread_t) -1;
560db612 9798 val->storage = REDIS_VM_SWAPPING;
b9bc0eef 9799
9800 lockThreadedIO();
9801 queueIOJob(j);
9802 unlockThreadedIO();
9803 return REDIS_OK;
9804}
9805
b0d8747d 9806/* ============ Virtual Memory - Blocking clients on missing keys =========== */
9807
d5d55fc3 9808/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9809 * If there is not already a job loading the key, it is craeted.
9810 * The key is added to the io_keys list in the client structure, and also
9811 * in the hash table mapping swapped keys to waiting clients, that is,
9812 * server.io_waited_keys. */
9813static int waitForSwappedKey(redisClient *c, robj *key) {
9814 struct dictEntry *de;
9815 robj *o;
9816 list *l;
9817
9818 /* If the key does not exist or is already in RAM we don't need to
9819 * block the client at all. */
9820 de = dictFind(c->db->dict,key);
9821 if (de == NULL) return 0;
560db612 9822 o = dictGetEntryVal(de);
d5d55fc3 9823 if (o->storage == REDIS_VM_MEMORY) {
9824 return 0;
9825 } else if (o->storage == REDIS_VM_SWAPPING) {
9826 /* We were swapping the key, undo it! */
9827 vmCancelThreadedIOJob(o);
9828 return 0;
9829 }
e0a62c7f 9830
d5d55fc3 9831 /* OK: the key is either swapped, or being loaded just now. */
9832
9833 /* Add the key to the list of keys this client is waiting for.
9834 * This maps clients to keys they are waiting for. */
9835 listAddNodeTail(c->io_keys,key);
9836 incrRefCount(key);
9837
9838 /* Add the client to the swapped keys => clients waiting map. */
9839 de = dictFind(c->db->io_keys,key);
9840 if (de == NULL) {
9841 int retval;
9842
9843 /* For every key we take a list of clients blocked for it */
9844 l = listCreate();
9845 retval = dictAdd(c->db->io_keys,key,l);
9846 incrRefCount(key);
9847 assert(retval == DICT_OK);
9848 } else {
9849 l = dictGetEntryVal(de);
9850 }
9851 listAddNodeTail(l,c);
9852
9853 /* Are we already loading the key from disk? If not create a job */
9854 if (o->storage == REDIS_VM_SWAPPED) {
9855 iojob *j;
560db612 9856 vmpointer *vp = (vmpointer*)o;
d5d55fc3 9857
9858 o->storage = REDIS_VM_LOADING;
9859 j = zmalloc(sizeof(*j));
9860 j->type = REDIS_IOJOB_LOAD;
9861 j->db = c->db;
560db612 9862 j->id = (robj*)vp;
9863 j->key = key;
9864 incrRefCount(key);
9865 j->page = vp->page;
d5d55fc3 9866 j->val = NULL;
9867 j->canceled = 0;
9868 j->thread = (pthread_t) -1;
9869 lockThreadedIO();
9870 queueIOJob(j);
9871 unlockThreadedIO();
9872 }
9873 return 1;
9874}
9875
6f078746
PN
9876/* Preload keys for any command with first, last and step values for
9877 * the command keys prototype, as defined in the command table. */
9878static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9879 int j, last;
9880 if (cmd->vm_firstkey == 0) return;
9881 last = cmd->vm_lastkey;
9882 if (last < 0) last = argc+last;
9883 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
9884 redisAssert(j < argc);
9885 waitForSwappedKey(c,argv[j]);
9886 }
9887}
9888
5d373da9 9889/* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
739ba0d2
PN
9890 * Note that the number of keys to preload is user-defined, so we need to
9891 * apply a sanity check against argc. */
ca1788b5 9892static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
76583ea4 9893 int i, num;
ca1788b5 9894 REDIS_NOTUSED(cmd);
ca1788b5
PN
9895
9896 num = atoi(argv[2]->ptr);
739ba0d2 9897 if (num > (argc-3)) return;
76583ea4 9898 for (i = 0; i < num; i++) {
ca1788b5 9899 waitForSwappedKey(c,argv[3+i]);
76583ea4
PN
9900 }
9901}
9902
3805e04f
PN
9903/* Preload keys needed to execute the entire MULTI/EXEC block.
9904 *
9905 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
9906 * and will block the client when any command requires a swapped out value. */
9907static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
9908 int i, margc;
9909 struct redisCommand *mcmd;
9910 robj **margv;
9911 REDIS_NOTUSED(cmd);
9912 REDIS_NOTUSED(argc);
9913 REDIS_NOTUSED(argv);
9914
9915 if (!(c->flags & REDIS_MULTI)) return;
9916 for (i = 0; i < c->mstate.count; i++) {
9917 mcmd = c->mstate.commands[i].cmd;
9918 margc = c->mstate.commands[i].argc;
9919 margv = c->mstate.commands[i].argv;
9920
9921 if (mcmd->vm_preload_proc != NULL) {
9922 mcmd->vm_preload_proc(c,mcmd,margc,margv);
9923 } else {
9924 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
9925 }
76583ea4
PN
9926 }
9927}
9928
b0d8747d 9929/* Is this client attempting to run a command against swapped keys?
d5d55fc3 9930 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 9931 *
d5d55fc3 9932 * The important idea about this function is that it can fail! If keys will
9933 * still be swapped when the client is resumed, this key lookups will
9934 * just block loading keys from disk. In practical terms this should only
9935 * happen with SORT BY command or if there is a bug in this function.
9936 *
9937 * Return 1 if the client is marked as blocked, 0 if the client can
9938 * continue as the keys it is going to access appear to be in memory. */
0a6f3f0f 9939static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
76583ea4 9940 if (cmd->vm_preload_proc != NULL) {
ca1788b5 9941 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
76583ea4 9942 } else {
6f078746 9943 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
76583ea4
PN
9944 }
9945
d5d55fc3 9946 /* If the client was blocked for at least one key, mark it as blocked. */
9947 if (listLength(c->io_keys)) {
9948 c->flags |= REDIS_IO_WAIT;
9949 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9950 server.vm_blocked_clients++;
9951 return 1;
9952 } else {
9953 return 0;
9954 }
9955}
9956
9957/* Remove the 'key' from the list of blocked keys for a given client.
9958 *
9959 * The function returns 1 when there are no longer blocking keys after
9960 * the current one was removed (and the client can be unblocked). */
9961static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9962 list *l;
9963 listNode *ln;
9964 listIter li;
9965 struct dictEntry *de;
9966
9967 /* Remove the key from the list of keys this client is waiting for. */
9968 listRewind(c->io_keys,&li);
9969 while ((ln = listNext(&li)) != NULL) {
bf028098 9970 if (equalStringObjects(ln->value,key)) {
d5d55fc3 9971 listDelNode(c->io_keys,ln);
9972 break;
9973 }
9974 }
9975 assert(ln != NULL);
9976
9977 /* Remove the client form the key => waiting clients map. */
9978 de = dictFind(c->db->io_keys,key);
9979 assert(de != NULL);
9980 l = dictGetEntryVal(de);
9981 ln = listSearchKey(l,c);
9982 assert(ln != NULL);
9983 listDelNode(l,ln);
9984 if (listLength(l) == 0)
9985 dictDelete(c->db->io_keys,key);
9986
9987 return listLength(c->io_keys) == 0;
9988}
9989
560db612 9990/* Every time we now a key was loaded back in memory, we handle clients
9991 * waiting for this key if any. */
d5d55fc3 9992static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9993 struct dictEntry *de;
9994 list *l;
9995 listNode *ln;
9996 int len;
9997
9998 de = dictFind(db->io_keys,key);
9999 if (!de) return;
10000
10001 l = dictGetEntryVal(de);
10002 len = listLength(l);
10003 /* Note: we can't use something like while(listLength(l)) as the list
10004 * can be freed by the calling function when we remove the last element. */
10005 while (len--) {
10006 ln = listFirst(l);
10007 redisClient *c = ln->value;
10008
10009 if (dontWaitForSwappedKey(c,key)) {
10010 /* Put the client in the list of clients ready to go as we
10011 * loaded all the keys about it. */
10012 listAddNodeTail(server.io_ready_clients,c);
10013 }
10014 }
b0d8747d 10015}
b0d8747d 10016
500ece7c 10017/* =========================== Remote Configuration ========================= */
10018
10019static void configSetCommand(redisClient *c) {
10020 robj *o = getDecodedObject(c->argv[3]);
2e5eb04e 10021 long long ll;
10022
500ece7c 10023 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
10024 zfree(server.dbfilename);
10025 server.dbfilename = zstrdup(o->ptr);
10026 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
10027 zfree(server.requirepass);
10028 server.requirepass = zstrdup(o->ptr);
10029 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
10030 zfree(server.masterauth);
10031 server.masterauth = zstrdup(o->ptr);
10032 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
2e5eb04e 10033 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10034 ll < 0) goto badfmt;
10035 server.maxmemory = ll;
10036 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
10037 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10038 ll < 0 || ll > LONG_MAX) goto badfmt;
10039 server.maxidletime = ll;
1b677732 10040 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
10041 if (!strcasecmp(o->ptr,"no")) {
10042 server.appendfsync = APPENDFSYNC_NO;
10043 } else if (!strcasecmp(o->ptr,"everysec")) {
10044 server.appendfsync = APPENDFSYNC_EVERYSEC;
10045 } else if (!strcasecmp(o->ptr,"always")) {
10046 server.appendfsync = APPENDFSYNC_ALWAYS;
10047 } else {
10048 goto badfmt;
10049 }
38db9171 10050 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
10051 int yn = yesnotoi(o->ptr);
10052
10053 if (yn == -1) goto badfmt;
10054 server.no_appendfsync_on_rewrite = yn;
2e5eb04e 10055 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10056 int old = server.appendonly;
10057 int new = yesnotoi(o->ptr);
10058
10059 if (new == -1) goto badfmt;
10060 if (old != new) {
10061 if (new == 0) {
10062 stopAppendOnly();
10063 } else {
10064 if (startAppendOnly() == REDIS_ERR) {
10065 addReplySds(c,sdscatprintf(sdsempty(),
10066 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10067 decrRefCount(o);
10068 return;
10069 }
10070 }
10071 }
a34e0a25 10072 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10073 int vlen, j;
10074 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10075
10076 /* Perform sanity check before setting the new config:
10077 * - Even number of args
10078 * - Seconds >= 1, changes >= 0 */
10079 if (vlen & 1) {
10080 sdsfreesplitres(v,vlen);
10081 goto badfmt;
10082 }
10083 for (j = 0; j < vlen; j++) {
10084 char *eptr;
10085 long val;
10086
10087 val = strtoll(v[j], &eptr, 10);
10088 if (eptr[0] != '\0' ||
10089 ((j & 1) == 0 && val < 1) ||
10090 ((j & 1) == 1 && val < 0)) {
10091 sdsfreesplitres(v,vlen);
10092 goto badfmt;
10093 }
10094 }
10095 /* Finally set the new config */
10096 resetServerSaveParams();
10097 for (j = 0; j < vlen; j += 2) {
10098 time_t seconds;
10099 int changes;
10100
10101 seconds = strtoll(v[j],NULL,10);
10102 changes = strtoll(v[j+1],NULL,10);
10103 appendServerSaveParams(seconds, changes);
10104 }
10105 sdsfreesplitres(v,vlen);
500ece7c 10106 } else {
10107 addReplySds(c,sdscatprintf(sdsempty(),
10108 "-ERR not supported CONFIG parameter %s\r\n",
10109 (char*)c->argv[2]->ptr));
10110 decrRefCount(o);
10111 return;
10112 }
10113 decrRefCount(o);
10114 addReply(c,shared.ok);
a34e0a25 10115 return;
10116
10117badfmt: /* Bad format errors */
10118 addReplySds(c,sdscatprintf(sdsempty(),
10119 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10120 (char*)o->ptr,
10121 (char*)c->argv[2]->ptr));
10122 decrRefCount(o);
500ece7c 10123}
10124
10125static void configGetCommand(redisClient *c) {
10126 robj *o = getDecodedObject(c->argv[2]);
10127 robj *lenobj = createObject(REDIS_STRING,NULL);
10128 char *pattern = o->ptr;
10129 int matches = 0;
10130
10131 addReply(c,lenobj);
10132 decrRefCount(lenobj);
10133
10134 if (stringmatch(pattern,"dbfilename",0)) {
10135 addReplyBulkCString(c,"dbfilename");
10136 addReplyBulkCString(c,server.dbfilename);
10137 matches++;
10138 }
10139 if (stringmatch(pattern,"requirepass",0)) {
10140 addReplyBulkCString(c,"requirepass");
10141 addReplyBulkCString(c,server.requirepass);
10142 matches++;
10143 }
10144 if (stringmatch(pattern,"masterauth",0)) {
10145 addReplyBulkCString(c,"masterauth");
10146 addReplyBulkCString(c,server.masterauth);
10147 matches++;
10148 }
10149 if (stringmatch(pattern,"maxmemory",0)) {
10150 char buf[128];
10151
2e5eb04e 10152 ll2string(buf,128,server.maxmemory);
500ece7c 10153 addReplyBulkCString(c,"maxmemory");
10154 addReplyBulkCString(c,buf);
10155 matches++;
10156 }
2e5eb04e 10157 if (stringmatch(pattern,"timeout",0)) {
10158 char buf[128];
10159
10160 ll2string(buf,128,server.maxidletime);
10161 addReplyBulkCString(c,"timeout");
10162 addReplyBulkCString(c,buf);
10163 matches++;
10164 }
10165 if (stringmatch(pattern,"appendonly",0)) {
10166 addReplyBulkCString(c,"appendonly");
10167 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10168 matches++;
10169 }
38db9171 10170 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10171 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10172 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10173 matches++;
10174 }
1b677732 10175 if (stringmatch(pattern,"appendfsync",0)) {
10176 char *policy;
10177
10178 switch(server.appendfsync) {
10179 case APPENDFSYNC_NO: policy = "no"; break;
10180 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10181 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10182 default: policy = "unknown"; break; /* too harmless to panic */
10183 }
10184 addReplyBulkCString(c,"appendfsync");
10185 addReplyBulkCString(c,policy);
10186 matches++;
10187 }
a34e0a25 10188 if (stringmatch(pattern,"save",0)) {
10189 sds buf = sdsempty();
10190 int j;
10191
10192 for (j = 0; j < server.saveparamslen; j++) {
10193 buf = sdscatprintf(buf,"%ld %d",
10194 server.saveparams[j].seconds,
10195 server.saveparams[j].changes);
10196 if (j != server.saveparamslen-1)
10197 buf = sdscatlen(buf," ",1);
10198 }
10199 addReplyBulkCString(c,"save");
10200 addReplyBulkCString(c,buf);
10201 sdsfree(buf);
10202 matches++;
10203 }
500ece7c 10204 decrRefCount(o);
10205 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10206}
10207
10208static void configCommand(redisClient *c) {
10209 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10210 if (c->argc != 4) goto badarity;
10211 configSetCommand(c);
10212 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10213 if (c->argc != 3) goto badarity;
10214 configGetCommand(c);
10215 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10216 if (c->argc != 2) goto badarity;
10217 server.stat_numcommands = 0;
10218 server.stat_numconnections = 0;
10219 server.stat_expiredkeys = 0;
10220 server.stat_starttime = time(NULL);
10221 addReply(c,shared.ok);
10222 } else {
10223 addReplySds(c,sdscatprintf(sdsempty(),
10224 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10225 }
10226 return;
10227
10228badarity:
10229 addReplySds(c,sdscatprintf(sdsempty(),
10230 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10231 (char*) c->argv[1]->ptr));
10232}
10233
befec3cd 10234/* =========================== Pubsub implementation ======================== */
10235
ffc6b7f8 10236static void freePubsubPattern(void *p) {
10237 pubsubPattern *pat = p;
10238
10239 decrRefCount(pat->pattern);
10240 zfree(pat);
10241}
10242
10243static int listMatchPubsubPattern(void *a, void *b) {
10244 pubsubPattern *pa = a, *pb = b;
10245
10246 return (pa->client == pb->client) &&
bf028098 10247 (equalStringObjects(pa->pattern,pb->pattern));
ffc6b7f8 10248}
10249
10250/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10251 * 0 if the client was already subscribed to that channel. */
10252static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 10253 struct dictEntry *de;
10254 list *clients = NULL;
10255 int retval = 0;
10256
ffc6b7f8 10257 /* Add the channel to the client -> channels hash table */
10258 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 10259 retval = 1;
ffc6b7f8 10260 incrRefCount(channel);
10261 /* Add the client to the channel -> list of clients hash table */
10262 de = dictFind(server.pubsub_channels,channel);
befec3cd 10263 if (de == NULL) {
10264 clients = listCreate();
ffc6b7f8 10265 dictAdd(server.pubsub_channels,channel,clients);
10266 incrRefCount(channel);
befec3cd 10267 } else {
10268 clients = dictGetEntryVal(de);
10269 }
10270 listAddNodeTail(clients,c);
10271 }
10272 /* Notify the client */
10273 addReply(c,shared.mbulk3);
10274 addReply(c,shared.subscribebulk);
ffc6b7f8 10275 addReplyBulk(c,channel);
482b672d 10276 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 10277 return retval;
10278}
10279
ffc6b7f8 10280/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10281 * 0 if the client was not subscribed to the specified channel. */
10282static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 10283 struct dictEntry *de;
10284 list *clients;
10285 listNode *ln;
10286 int retval = 0;
10287
ffc6b7f8 10288 /* Remove the channel from the client -> channels hash table */
10289 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 10290 we have in the hash tables. Protect it... */
ffc6b7f8 10291 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 10292 retval = 1;
ffc6b7f8 10293 /* Remove the client from the channel -> clients list hash table */
10294 de = dictFind(server.pubsub_channels,channel);
befec3cd 10295 assert(de != NULL);
10296 clients = dictGetEntryVal(de);
10297 ln = listSearchKey(clients,c);
10298 assert(ln != NULL);
10299 listDelNode(clients,ln);
ff767a75 10300 if (listLength(clients) == 0) {
10301 /* Free the list and associated hash entry at all if this was
10302 * the latest client, so that it will be possible to abuse
ffc6b7f8 10303 * Redis PUBSUB creating millions of channels. */
10304 dictDelete(server.pubsub_channels,channel);
ff767a75 10305 }
befec3cd 10306 }
10307 /* Notify the client */
10308 if (notify) {
10309 addReply(c,shared.mbulk3);
10310 addReply(c,shared.unsubscribebulk);
ffc6b7f8 10311 addReplyBulk(c,channel);
482b672d 10312 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10313 listLength(c->pubsub_patterns));
10314
10315 }
10316 decrRefCount(channel); /* it is finally safe to release it */
10317 return retval;
10318}
10319
10320/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10321static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10322 int retval = 0;
10323
10324 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10325 retval = 1;
10326 pubsubPattern *pat;
10327 listAddNodeTail(c->pubsub_patterns,pattern);
10328 incrRefCount(pattern);
10329 pat = zmalloc(sizeof(*pat));
10330 pat->pattern = getDecodedObject(pattern);
10331 pat->client = c;
10332 listAddNodeTail(server.pubsub_patterns,pat);
10333 }
10334 /* Notify the client */
10335 addReply(c,shared.mbulk3);
10336 addReply(c,shared.psubscribebulk);
10337 addReplyBulk(c,pattern);
482b672d 10338 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
ffc6b7f8 10339 return retval;
10340}
10341
10342/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10343 * 0 if the client was not subscribed to the specified channel. */
10344static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10345 listNode *ln;
10346 pubsubPattern pat;
10347 int retval = 0;
10348
10349 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10350 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10351 retval = 1;
10352 listDelNode(c->pubsub_patterns,ln);
10353 pat.client = c;
10354 pat.pattern = pattern;
10355 ln = listSearchKey(server.pubsub_patterns,&pat);
10356 listDelNode(server.pubsub_patterns,ln);
10357 }
10358 /* Notify the client */
10359 if (notify) {
10360 addReply(c,shared.mbulk3);
10361 addReply(c,shared.punsubscribebulk);
10362 addReplyBulk(c,pattern);
482b672d 10363 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10364 listLength(c->pubsub_patterns));
befec3cd 10365 }
ffc6b7f8 10366 decrRefCount(pattern);
befec3cd 10367 return retval;
10368}
10369
ffc6b7f8 10370/* Unsubscribe from all the channels. Return the number of channels the
10371 * client was subscribed from. */
10372static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10373 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 10374 dictEntry *de;
10375 int count = 0;
10376
10377 while((de = dictNext(di)) != NULL) {
ffc6b7f8 10378 robj *channel = dictGetEntryKey(de);
befec3cd 10379
ffc6b7f8 10380 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 10381 }
10382 dictReleaseIterator(di);
10383 return count;
10384}
10385
ffc6b7f8 10386/* Unsubscribe from all the patterns. Return the number of patterns the
10387 * client was subscribed from. */
10388static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10389 listNode *ln;
10390 listIter li;
10391 int count = 0;
10392
10393 listRewind(c->pubsub_patterns,&li);
10394 while ((ln = listNext(&li)) != NULL) {
10395 robj *pattern = ln->value;
10396
10397 count += pubsubUnsubscribePattern(c,pattern,notify);
10398 }
10399 return count;
10400}
10401
befec3cd 10402/* Publish a message */
ffc6b7f8 10403static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 10404 int receivers = 0;
10405 struct dictEntry *de;
ffc6b7f8 10406 listNode *ln;
10407 listIter li;
befec3cd 10408
ffc6b7f8 10409 /* Send to clients listening for that channel */
10410 de = dictFind(server.pubsub_channels,channel);
befec3cd 10411 if (de) {
10412 list *list = dictGetEntryVal(de);
10413 listNode *ln;
10414 listIter li;
10415
10416 listRewind(list,&li);
10417 while ((ln = listNext(&li)) != NULL) {
10418 redisClient *c = ln->value;
10419
10420 addReply(c,shared.mbulk3);
10421 addReply(c,shared.messagebulk);
ffc6b7f8 10422 addReplyBulk(c,channel);
befec3cd 10423 addReplyBulk(c,message);
10424 receivers++;
10425 }
10426 }
ffc6b7f8 10427 /* Send to clients listening to matching channels */
10428 if (listLength(server.pubsub_patterns)) {
10429 listRewind(server.pubsub_patterns,&li);
10430 channel = getDecodedObject(channel);
10431 while ((ln = listNext(&li)) != NULL) {
10432 pubsubPattern *pat = ln->value;
10433
10434 if (stringmatchlen((char*)pat->pattern->ptr,
10435 sdslen(pat->pattern->ptr),
10436 (char*)channel->ptr,
10437 sdslen(channel->ptr),0)) {
c8d0ea0e 10438 addReply(pat->client,shared.mbulk4);
10439 addReply(pat->client,shared.pmessagebulk);
10440 addReplyBulk(pat->client,pat->pattern);
ffc6b7f8 10441 addReplyBulk(pat->client,channel);
10442 addReplyBulk(pat->client,message);
10443 receivers++;
10444 }
10445 }
10446 decrRefCount(channel);
10447 }
befec3cd 10448 return receivers;
10449}
10450
10451static void subscribeCommand(redisClient *c) {
10452 int j;
10453
10454 for (j = 1; j < c->argc; j++)
ffc6b7f8 10455 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 10456}
10457
10458static void unsubscribeCommand(redisClient *c) {
10459 if (c->argc == 1) {
ffc6b7f8 10460 pubsubUnsubscribeAllChannels(c,1);
10461 return;
10462 } else {
10463 int j;
10464
10465 for (j = 1; j < c->argc; j++)
10466 pubsubUnsubscribeChannel(c,c->argv[j],1);
10467 }
10468}
10469
10470static void psubscribeCommand(redisClient *c) {
10471 int j;
10472
10473 for (j = 1; j < c->argc; j++)
10474 pubsubSubscribePattern(c,c->argv[j]);
10475}
10476
10477static void punsubscribeCommand(redisClient *c) {
10478 if (c->argc == 1) {
10479 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 10480 return;
10481 } else {
10482 int j;
10483
10484 for (j = 1; j < c->argc; j++)
ffc6b7f8 10485 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 10486 }
10487}
10488
10489static void publishCommand(redisClient *c) {
10490 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
482b672d 10491 addReplyLongLong(c,receivers);
befec3cd 10492}
10493
37ab76c9 10494/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
10495 *
10496 * The implementation uses a per-DB hash table mapping keys to list of clients
10497 * WATCHing those keys, so that given a key that is going to be modified
10498 * we can mark all the associated clients as dirty.
10499 *
10500 * Also every client contains a list of WATCHed keys so that's possible to
10501 * un-watch such keys when the client is freed or when UNWATCH is called. */
10502
10503/* In the client->watched_keys list we need to use watchedKey structures
10504 * as in order to identify a key in Redis we need both the key name and the
10505 * DB */
10506typedef struct watchedKey {
10507 robj *key;
10508 redisDb *db;
10509} watchedKey;
10510
10511/* Watch for the specified key */
10512static void watchForKey(redisClient *c, robj *key) {
10513 list *clients = NULL;
10514 listIter li;
10515 listNode *ln;
10516 watchedKey *wk;
10517
10518 /* Check if we are already watching for this key */
10519 listRewind(c->watched_keys,&li);
10520 while((ln = listNext(&li))) {
10521 wk = listNodeValue(ln);
10522 if (wk->db == c->db && equalStringObjects(key,wk->key))
10523 return; /* Key already watched */
10524 }
10525 /* This key is not already watched in this DB. Let's add it */
10526 clients = dictFetchValue(c->db->watched_keys,key);
10527 if (!clients) {
10528 clients = listCreate();
10529 dictAdd(c->db->watched_keys,key,clients);
10530 incrRefCount(key);
10531 }
10532 listAddNodeTail(clients,c);
10533 /* Add the new key to the lits of keys watched by this client */
10534 wk = zmalloc(sizeof(*wk));
10535 wk->key = key;
10536 wk->db = c->db;
10537 incrRefCount(key);
10538 listAddNodeTail(c->watched_keys,wk);
10539}
10540
10541/* Unwatch all the keys watched by this client. To clean the EXEC dirty
10542 * flag is up to the caller. */
10543static void unwatchAllKeys(redisClient *c) {
10544 listIter li;
10545 listNode *ln;
10546
10547 if (listLength(c->watched_keys) == 0) return;
10548 listRewind(c->watched_keys,&li);
10549 while((ln = listNext(&li))) {
10550 list *clients;
10551 watchedKey *wk;
10552
10553 /* Lookup the watched key -> clients list and remove the client
10554 * from the list */
10555 wk = listNodeValue(ln);
10556 clients = dictFetchValue(wk->db->watched_keys, wk->key);
10557 assert(clients != NULL);
10558 listDelNode(clients,listSearchKey(clients,c));
10559 /* Kill the entry at all if this was the only client */
10560 if (listLength(clients) == 0)
10561 dictDelete(wk->db->watched_keys, wk->key);
10562 /* Remove this watched key from the client->watched list */
10563 listDelNode(c->watched_keys,ln);
10564 decrRefCount(wk->key);
10565 zfree(wk);
10566 }
10567}
10568
ca3f830b 10569/* "Touch" a key, so that if this key is being WATCHed by some client the
37ab76c9 10570 * next EXEC will fail. */
10571static void touchWatchedKey(redisDb *db, robj *key) {
10572 list *clients;
10573 listIter li;
10574 listNode *ln;
10575
10576 if (dictSize(db->watched_keys) == 0) return;
10577 clients = dictFetchValue(db->watched_keys, key);
10578 if (!clients) return;
10579
10580 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
10581 /* Check if we are already watching for this key */
10582 listRewind(clients,&li);
10583 while((ln = listNext(&li))) {
10584 redisClient *c = listNodeValue(ln);
10585
10586 c->flags |= REDIS_DIRTY_CAS;
10587 }
10588}
10589
9b30e1a2 10590/* On FLUSHDB or FLUSHALL all the watched keys that are present before the
10591 * flush but will be deleted as effect of the flushing operation should
10592 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
10593 * a FLUSHALL operation (all the DBs flushed). */
10594static void touchWatchedKeysOnFlush(int dbid) {
10595 listIter li1, li2;
10596 listNode *ln;
10597
10598 /* For every client, check all the waited keys */
10599 listRewind(server.clients,&li1);
10600 while((ln = listNext(&li1))) {
10601 redisClient *c = listNodeValue(ln);
10602 listRewind(c->watched_keys,&li2);
10603 while((ln = listNext(&li2))) {
10604 watchedKey *wk = listNodeValue(ln);
10605
10606 /* For every watched key matching the specified DB, if the
10607 * key exists, mark the client as dirty, as the key will be
10608 * removed. */
10609 if (dbid == -1 || wk->db->id == dbid) {
10610 if (dictFind(wk->db->dict, wk->key) != NULL)
10611 c->flags |= REDIS_DIRTY_CAS;
10612 }
10613 }
10614 }
10615}
10616
37ab76c9 10617static void watchCommand(redisClient *c) {
10618 int j;
10619
6531c94d 10620 if (c->flags & REDIS_MULTI) {
10621 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
10622 return;
10623 }
37ab76c9 10624 for (j = 1; j < c->argc; j++)
10625 watchForKey(c,c->argv[j]);
10626 addReply(c,shared.ok);
10627}
10628
10629static void unwatchCommand(redisClient *c) {
10630 unwatchAllKeys(c);
10631 c->flags &= (~REDIS_DIRTY_CAS);
10632 addReply(c,shared.ok);
10633}
10634
7f957c92 10635/* ================================= Debugging ============================== */
10636
ba798261 10637/* Compute the sha1 of string at 's' with 'len' bytes long.
10638 * The SHA1 is then xored againt the string pointed by digest.
10639 * Since xor is commutative, this operation is used in order to
10640 * "add" digests relative to unordered elements.
10641 *
10642 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
10643static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
10644 SHA1_CTX ctx;
10645 unsigned char hash[20], *s = ptr;
10646 int j;
10647
10648 SHA1Init(&ctx);
10649 SHA1Update(&ctx,s,len);
10650 SHA1Final(hash,&ctx);
10651
10652 for (j = 0; j < 20; j++)
10653 digest[j] ^= hash[j];
10654}
10655
10656static void xorObjectDigest(unsigned char *digest, robj *o) {
10657 o = getDecodedObject(o);
10658 xorDigest(digest,o->ptr,sdslen(o->ptr));
10659 decrRefCount(o);
10660}
10661
10662/* This function instead of just computing the SHA1 and xoring it
10663 * against diget, also perform the digest of "digest" itself and
10664 * replace the old value with the new one.
10665 *
10666 * So the final digest will be:
10667 *
10668 * digest = SHA1(digest xor SHA1(data))
10669 *
10670 * This function is used every time we want to preserve the order so
10671 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
10672 *
10673 * Also note that mixdigest("foo") followed by mixdigest("bar")
10674 * will lead to a different digest compared to "fo", "obar".
10675 */
10676static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
10677 SHA1_CTX ctx;
10678 char *s = ptr;
10679
10680 xorDigest(digest,s,len);
10681 SHA1Init(&ctx);
10682 SHA1Update(&ctx,digest,20);
10683 SHA1Final(digest,&ctx);
10684}
10685
10686static void mixObjectDigest(unsigned char *digest, robj *o) {
10687 o = getDecodedObject(o);
10688 mixDigest(digest,o->ptr,sdslen(o->ptr));
10689 decrRefCount(o);
10690}
10691
10692/* Compute the dataset digest. Since keys, sets elements, hashes elements
10693 * are not ordered, we use a trick: every aggregate digest is the xor
10694 * of the digests of their elements. This way the order will not change
10695 * the result. For list instead we use a feedback entering the output digest
10696 * as input in order to ensure that a different ordered list will result in
10697 * a different digest. */
10698static void computeDatasetDigest(unsigned char *final) {
10699 unsigned char digest[20];
10700 char buf[128];
10701 dictIterator *di = NULL;
10702 dictEntry *de;
10703 int j;
10704 uint32_t aux;
10705
10706 memset(final,0,20); /* Start with a clean result */
10707
10708 for (j = 0; j < server.dbnum; j++) {
10709 redisDb *db = server.db+j;
10710
10711 if (dictSize(db->dict) == 0) continue;
10712 di = dictGetIterator(db->dict);
10713
10714 /* hash the DB id, so the same dataset moved in a different
10715 * DB will lead to a different digest */
10716 aux = htonl(j);
10717 mixDigest(final,&aux,sizeof(aux));
10718
10719 /* Iterate this DB writing every entry */
10720 while((de = dictNext(di)) != NULL) {
cbae1d34 10721 robj *key, *o, *kcopy;
ba798261 10722 time_t expiretime;
10723
10724 memset(digest,0,20); /* This key-val digest */
10725 key = dictGetEntryKey(de);
cbae1d34 10726
10727 if (!server.vm_enabled) {
10728 mixObjectDigest(digest,key);
ba798261 10729 o = dictGetEntryVal(de);
ba798261 10730 } else {
cbae1d34 10731 /* Don't work with the key directly as when VM is active
10732 * this is unsafe: TODO: fix decrRefCount to check if the
10733 * count really reached 0 to avoid this mess */
10734 kcopy = dupStringObject(key);
10735 mixObjectDigest(digest,kcopy);
10736 o = lookupKeyRead(db,kcopy);
10737 decrRefCount(kcopy);
ba798261 10738 }
10739 aux = htonl(o->type);
10740 mixDigest(digest,&aux,sizeof(aux));
10741 expiretime = getExpire(db,key);
10742
10743 /* Save the key and associated value */
10744 if (o->type == REDIS_STRING) {
10745 mixObjectDigest(digest,o);
10746 } else if (o->type == REDIS_LIST) {
10747 list *list = o->ptr;
10748 listNode *ln;
10749 listIter li;
10750
10751 listRewind(list,&li);
10752 while((ln = listNext(&li))) {
10753 robj *eleobj = listNodeValue(ln);
10754
10755 mixObjectDigest(digest,eleobj);
10756 }
10757 } else if (o->type == REDIS_SET) {
10758 dict *set = o->ptr;
10759 dictIterator *di = dictGetIterator(set);
10760 dictEntry *de;
10761
10762 while((de = dictNext(di)) != NULL) {
10763 robj *eleobj = dictGetEntryKey(de);
10764
10765 xorObjectDigest(digest,eleobj);
10766 }
10767 dictReleaseIterator(di);
10768 } else if (o->type == REDIS_ZSET) {
10769 zset *zs = o->ptr;
10770 dictIterator *di = dictGetIterator(zs->dict);
10771 dictEntry *de;
10772
10773 while((de = dictNext(di)) != NULL) {
10774 robj *eleobj = dictGetEntryKey(de);
10775 double *score = dictGetEntryVal(de);
10776 unsigned char eledigest[20];
10777
10778 snprintf(buf,sizeof(buf),"%.17g",*score);
10779 memset(eledigest,0,20);
10780 mixObjectDigest(eledigest,eleobj);
10781 mixDigest(eledigest,buf,strlen(buf));
10782 xorDigest(digest,eledigest,20);
10783 }
10784 dictReleaseIterator(di);
10785 } else if (o->type == REDIS_HASH) {
10786 hashIterator *hi;
10787 robj *obj;
10788
10789 hi = hashInitIterator(o);
10790 while (hashNext(hi) != REDIS_ERR) {
10791 unsigned char eledigest[20];
10792
10793 memset(eledigest,0,20);
10794 obj = hashCurrent(hi,REDIS_HASH_KEY);
10795 mixObjectDigest(eledigest,obj);
10796 decrRefCount(obj);
10797 obj = hashCurrent(hi,REDIS_HASH_VALUE);
10798 mixObjectDigest(eledigest,obj);
10799 decrRefCount(obj);
10800 xorDigest(digest,eledigest,20);
10801 }
10802 hashReleaseIterator(hi);
10803 } else {
10804 redisPanic("Unknown object type");
10805 }
ba798261 10806 /* If the key has an expire, add it to the mix */
10807 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
10808 /* We can finally xor the key-val digest to the final digest */
10809 xorDigest(final,digest,20);
10810 }
10811 dictReleaseIterator(di);
10812 }
10813}
10814
7f957c92 10815static void debugCommand(redisClient *c) {
10816 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10817 *((char*)-1) = 'x';
210e29f7 10818 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10819 if (rdbSave(server.dbfilename) != REDIS_OK) {
10820 addReply(c,shared.err);
10821 return;
10822 }
10823 emptyDb();
10824 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10825 addReply(c,shared.err);
10826 return;
10827 }
10828 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10829 addReply(c,shared.ok);
71c2b467 10830 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10831 emptyDb();
10832 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10833 addReply(c,shared.err);
10834 return;
10835 }
10836 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10837 addReply(c,shared.ok);
333298da 10838 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10839 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10840 robj *key, *val;
10841
10842 if (!de) {
10843 addReply(c,shared.nokeyerr);
10844 return;
10845 }
10846 key = dictGetEntryKey(de);
10847 val = dictGetEntryVal(de);
560db612 10848 if (!server.vm_enabled || (val->storage == REDIS_VM_MEMORY ||
10849 val->storage == REDIS_VM_SWAPPING)) {
07efaf74 10850 char *strenc;
10851 char buf[128];
10852
10853 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10854 strenc = strencoding[val->encoding];
10855 } else {
10856 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10857 strenc = buf;
10858 }
ace06542 10859 addReplySds(c,sdscatprintf(sdsempty(),
10860 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 10861 "encoding:%s serializedlength:%lld\r\n",
682ac724 10862 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 10863 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 10864 } else {
560db612 10865 vmpointer *vp = (vmpointer*) val;
ace06542 10866 addReplySds(c,sdscatprintf(sdsempty(),
10867 "+Key at:%p refcount:%d, value swapped at: page %llu "
10868 "using %llu pages\r\n",
560db612 10869 (void*)key, key->refcount, (unsigned long long) vp->page,
10870 (unsigned long long) vp->usedpages));
ace06542 10871 }
78ebe4c8 10872 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10873 lookupKeyRead(c->db,c->argv[2]);
10874 addReply(c,shared.ok);
7d30035d 10875 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10876 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10877 robj *key, *val;
560db612 10878 vmpointer *vp;
7d30035d 10879
10880 if (!server.vm_enabled) {
10881 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10882 return;
10883 }
10884 if (!de) {
10885 addReply(c,shared.nokeyerr);
10886 return;
10887 }
10888 key = dictGetEntryKey(de);
10889 val = dictGetEntryVal(de);
4ef8de8a 10890 /* Swap it */
560db612 10891 if (val->storage != REDIS_VM_MEMORY) {
7d30035d 10892 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
560db612 10893 } else if (val->refcount != 1) {
10894 addReplySds(c,sdsnew("-ERR Object is shared\r\n"));
10895 } else if ((vp = vmSwapObjectBlocking(val)) != NULL) {
10896 dictGetEntryVal(de) = vp;
7d30035d 10897 addReply(c,shared.ok);
10898 } else {
10899 addReply(c,shared.err);
10900 }
59305dc7 10901 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10902 long keys, j;
10903 robj *key, *val;
10904 char buf[128];
10905
10906 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10907 return;
10908 for (j = 0; j < keys; j++) {
10909 snprintf(buf,sizeof(buf),"key:%lu",j);
10910 key = createStringObject(buf,strlen(buf));
10911 if (lookupKeyRead(c->db,key) != NULL) {
10912 decrRefCount(key);
10913 continue;
10914 }
10915 snprintf(buf,sizeof(buf),"value:%lu",j);
10916 val = createStringObject(buf,strlen(buf));
10917 dictAdd(c->db->dict,key,val);
10918 }
10919 addReply(c,shared.ok);
ba798261 10920 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
10921 unsigned char digest[20];
10922 sds d = sdsnew("+");
10923 int j;
10924
10925 computeDatasetDigest(digest);
10926 for (j = 0; j < 20; j++)
10927 d = sdscatprintf(d, "%02x",digest[j]);
10928
10929 d = sdscatlen(d,"\r\n",2);
10930 addReplySds(c,d);
7f957c92 10931 } else {
333298da 10932 addReplySds(c,sdsnew(
bdcb92f2 10933 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 10934 }
10935}
56906eef 10936
6c96ba7d 10937static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 10938 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
fdfb02e7 10939 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
dfc5e96c 10940#ifdef HAVE_BACKTRACE
10941 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10942 *((char*)-1) = 'x';
10943#endif
10944}
10945
c651fd9e 10946static void _redisPanic(char *msg, char *file, int line) {
10947 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
17772754 10948 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
c651fd9e 10949#ifdef HAVE_BACKTRACE
10950 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10951 *((char*)-1) = 'x';
10952#endif
10953}
10954
bcfc686d 10955/* =================================== Main! ================================ */
56906eef 10956
bcfc686d 10957#ifdef __linux__
/* Return the content of /proc/sys/vm/overcommit_memory converted with
 * atoi(), or -1 if the file can't be opened or no line can be read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    if (fgets(line,64,fp) != NULL) value = atoi(line);
    fclose(fp);
    return value;
}
10971
10972void linuxOvercommitMemoryWarning(void) {
10973 if (linuxOvercommitMemoryValue() == 0) {
7ccd2d0a 10974 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
bcfc686d 10975 }
10976}
10977#endif /* __linux__ */
10978
10979static void daemonize(void) {
10980 int fd;
10981 FILE *fp;
10982
10983 if (fork() != 0) exit(0); /* parent exits */
10984 setsid(); /* create a new session */
10985
10986 /* Every output goes to /dev/null. If Redis is daemonized but
10987 * the 'logfile' is set to 'stdout' in the configuration file
10988 * it will not log at all. */
10989 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10990 dup2(fd, STDIN_FILENO);
10991 dup2(fd, STDOUT_FILENO);
10992 dup2(fd, STDERR_FILENO);
10993 if (fd > STDERR_FILENO) close(fd);
10994 }
10995 /* Try to write the pid file */
10996 fp = fopen(server.pidfile,"w");
10997 if (fp) {
10998 fprintf(fp,"%d\n",getpid());
10999 fclose(fp);
56906eef 11000 }
56906eef 11001}
11002
42ab0172 11003static void version() {
8a3b0d2d 11004 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
11005 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
42ab0172
AO
11006 exit(0);
11007}
11008
/* Print the command line synopsis on stderr and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs("       ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
11014
bcfc686d 11015int main(int argc, char **argv) {
9651a787 11016 time_t start;
11017
bcfc686d 11018 initServerConfig();
1a132bbc 11019 sortCommandTable();
bcfc686d 11020 if (argc == 2) {
44efe66e 11021 if (strcmp(argv[1], "-v") == 0 ||
11022 strcmp(argv[1], "--version") == 0) version();
11023 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 11024 resetServerSaveParams();
11025 loadServerConfig(argv[1]);
723fb69b
AO
11026 } else if ((argc > 2)) {
11027 usage();
bcfc686d 11028 } else {
11029 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
11030 }
bcfc686d 11031 if (server.daemonize) daemonize();
71c54b21 11032 initServer();
bcfc686d 11033 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
11034#ifdef __linux__
11035 linuxOvercommitMemoryWarning();
11036#endif
9651a787 11037 start = time(NULL);
bcfc686d 11038 if (server.appendonly) {
11039 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 11040 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 11041 } else {
11042 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 11043 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 11044 }
bcfc686d 11045 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 11046 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 11047 aeMain(server.el);
11048 aeDeleteEventLoop(server.el);
11049 return 0;
11050}
11051
11052/* ============================= Backtrace support ========================= */
11053
11054#ifdef HAVE_BACKTRACE
11055static char *findFuncName(void *pointer, unsigned long *offset);
11056
56906eef 11057static void *getMcontextEip(ucontext_t *uc) {
11058#if defined(__FreeBSD__)
11059 return (void*) uc->uc_mcontext.mc_eip;
11060#elif defined(__dietlibc__)
11061 return (void*) uc->uc_mcontext.eip;
06db1f50 11062#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 11063 #if __x86_64__
11064 return (void*) uc->uc_mcontext->__ss.__rip;
11065 #else
56906eef 11066 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 11067 #endif
06db1f50 11068#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 11069 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 11070 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 11071 #else
11072 return (void*) uc->uc_mcontext->__ss.__eip;
e0a62c7f 11073 #endif
54bac49d 11074#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 11075 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 11076#elif defined(__ia64__) /* Linux IA64 */
11077 return (void*) uc->uc_mcontext.sc_ip;
11078#else
11079 return NULL;
56906eef 11080#endif
11081}
11082
11083static void segvHandler(int sig, siginfo_t *info, void *secret) {
11084 void *trace[100];
11085 char **messages = NULL;
11086 int i, trace_size = 0;
11087 unsigned long offset=0;
56906eef 11088 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 11089 sds infostring;
56906eef 11090 REDIS_NOTUSED(info);
11091
11092 redisLog(REDIS_WARNING,
11093 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 11094 infostring = genRedisInfoString();
11095 redisLog(REDIS_WARNING, "%s",infostring);
11096 /* It's not safe to sdsfree() the returned string under memory
11097 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 11098
56906eef 11099 trace_size = backtrace(trace, 100);
de96dbfe 11100 /* overwrite sigaction with caller's address */
b91cf5ef 11101 if (getMcontextEip(uc) != NULL) {
11102 trace[1] = getMcontextEip(uc);
11103 }
56906eef 11104 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 11105
d76412d1 11106 for (i=1; i<trace_size; ++i) {
56906eef 11107 char *fn = findFuncName(trace[i], &offset), *p;
11108
11109 p = strchr(messages[i],'+');
11110 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11111 redisLog(REDIS_WARNING,"%s", messages[i]);
11112 } else {
11113 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11114 }
11115 }
b177fd30 11116 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 11117 _exit(0);
fe3bbfbe 11118}
56906eef 11119
fab43727 11120static void sigtermHandler(int sig) {
11121 REDIS_NOTUSED(sig);
b58ba105 11122
fab43727 11123 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11124 server.shutdown_asap = 1;
b58ba105
AM
11125}
11126
56906eef 11127static void setupSigSegvAction(void) {
11128 struct sigaction act;
11129
11130 sigemptyset (&act.sa_mask);
11131 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11132 * is used. Otherwise, sa_handler is used */
11133 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11134 act.sa_sigaction = segvHandler;
11135 sigaction (SIGSEGV, &act, NULL);
11136 sigaction (SIGBUS, &act, NULL);
12fea928 11137 sigaction (SIGFPE, &act, NULL);
11138 sigaction (SIGILL, &act, NULL);
11139 sigaction (SIGBUS, &act, NULL);
b58ba105
AM
11140
11141 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
fab43727 11142 act.sa_handler = sigtermHandler;
b58ba105 11143 sigaction (SIGTERM, &act, NULL);
e65fdc78 11144 return;
56906eef 11145}
e65fdc78 11146
bcfc686d 11147#include "staticsymbols.h"
11148/* This function try to convert a pointer into a function name. It's used in
11149 * oreder to provide a backtrace under segmentation fault that's able to
11150 * display functions declared as static (otherwise the backtrace is useless). */
11151static char *findFuncName(void *pointer, unsigned long *offset){
11152 int i, ret = -1;
11153 unsigned long off, minoff = 0;
ed9b544e 11154
bcfc686d 11155 /* Try to match against the Symbol with the smallest offset */
11156 for (i=0; symsTable[i].pointer; i++) {
11157 unsigned long lp = (unsigned long) pointer;
0bc03378 11158
bcfc686d 11159 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11160 off=lp-symsTable[i].pointer;
11161 if (ret < 0 || off < minoff) {
11162 minoff=off;
11163 ret=i;
11164 }
11165 }
0bc03378 11166 }
bcfc686d 11167 if (ret == -1) return NULL;
11168 *offset = minoff;
11169 return symsTable[ret].name;
0bc03378 11170}
bcfc686d 11171#else /* HAVE_BACKTRACE */
/* Version used when HAVE_BACKTRACE is not defined: no handler is
 * installed. */
static void setupSigSegvAction(void) {
    /* Intentionally empty. */
}
bcfc686d 11174#endif /* HAVE_BACKTRACE */
0bc03378 11175
ed9b544e 11176
ed9b544e 11177
bcfc686d 11178/* The End */
11179
11180
ed9b544e 11181