]> git.saurik.com Git - redis.git/blame - redis.c
very strong speedup in saving time performance when there are many integers in the...
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
24df7698 30#define REDIS_VERSION "1.3.10"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
40#include <signal.h>
fbf9bcdb 41
42#ifdef HAVE_BACKTRACE
c9468bcf 43#include <execinfo.h>
44#include <ucontext.h>
fbf9bcdb 45#endif /* HAVE_BACKTRACE */
46
ed9b544e 47#include <sys/wait.h>
48#include <errno.h>
49#include <assert.h>
50#include <ctype.h>
51#include <stdarg.h>
52#include <inttypes.h>
53#include <arpa/inet.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#include <sys/time.h>
57#include <sys/resource.h>
2895e862 58#include <sys/uio.h>
f78fd11b 59#include <limits.h>
fb82e75c 60#include <float.h>
a7866db6 61#include <math.h>
92f8e882 62#include <pthread.h>
0bc1b2f6 63
64#if defined(__sun)
5043dff3 65#include "solarisfixes.h"
66#endif
ed9b544e 67
c9468bcf 68#include "redis.h"
ed9b544e 69#include "ae.h" /* Event driven programming library */
70#include "sds.h" /* Dynamic safe strings */
71#include "anet.h" /* Networking the easy way */
72#include "dict.h" /* Hash tables */
73#include "adlist.h" /* Linked lists */
74#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 75#include "lzf.h" /* LZF compression library */
76#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
5234952b 77#include "zipmap.h"
ed9b544e 78
79/* Error codes */
80#define REDIS_OK 0
81#define REDIS_ERR -1
82
83/* Static server configuration */
84#define REDIS_SERVERPORT 6379 /* TCP port */
85#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 86#define REDIS_IOBUF_LEN 1024
ed9b544e 87#define REDIS_LOADBUF_LEN 1024
248ea310 88#define REDIS_STATIC_ARGS 8
ed9b544e 89#define REDIS_DEFAULT_DBNUM 16
90#define REDIS_CONFIGLINE_MAX 1024
91#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
92#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
8ca3e9d1 93#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
6f376729 94#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 95#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
96
/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
98#define REDIS_WRITEV_THRESHOLD 3
99/* Max number of iovecs used for each writev call */
100#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 101
102/* Hash table parameters */
103#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 104
105/* Command flags */
3fd78bcd 106#define REDIS_CMD_BULK 1 /* Bulk write command */
107#define REDIS_CMD_INLINE 2 /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
 * this flag will return an error when the 'maxmemory' option is set in the
 * config file and the server is using more than maxmemory bytes of memory.
 * In short these commands are denied on low memory conditions. */
112#define REDIS_CMD_DENYOOM 4
4005fef1 113#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 114
115/* Object types */
116#define REDIS_STRING 0
117#define REDIS_LIST 1
118#define REDIS_SET 2
1812e024 119#define REDIS_ZSET 3
120#define REDIS_HASH 4
f78fd11b 121
/* Objects encoding. Some kinds of objects like Strings and Hashes can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of these values for this object. */
942a3961 125#define REDIS_ENCODING_RAW 0 /* Raw representation */
126#define REDIS_ENCODING_INT 1 /* Encoded as integer */
5234952b 127#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
128#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
942a3961 129
/* Printable names of the object encodings, indexed by the numeric value of
 * the REDIS_ENCODING_* defines (0 = raw ... 3 = hashtable). The order of the
 * entries must match those defines. */
static char *strencoding[] = {
    "raw",       /* REDIS_ENCODING_RAW */
    "int",       /* REDIS_ENCODING_INT */
    "zipmap",    /* REDIS_ENCODING_ZIPMAP */
    "hashtable"  /* REDIS_ENCODING_HT */
};
133
f78fd11b 134/* Object types only used for dumping to disk */
bb32ede5 135#define REDIS_EXPIRETIME 253
ed9b544e 136#define REDIS_SELECTDB 254
137#define REDIS_EOF 255
138
/* Defines related to the dump file format. Storing a 32 bit length for short
 * keys would require a lot of space, so we check the two most significant
 * bits of the first byte to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: specially encoded object will follow. The six bits
 *           number specifies the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte, most DB keys, and many
 * values, will fit inside. */
f78fd11b 152#define REDIS_RDB_6BITLEN 0
153#define REDIS_RDB_14BITLEN 1
154#define REDIS_RDB_32BITLEN 2
17be1a4a 155#define REDIS_RDB_ENCVAL 3
f78fd11b 156#define REDIS_RDB_LENERR UINT_MAX
157
/* When a length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * according to the following defines: */
161#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
162#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
163#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 164#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
a4d1ba9a 165
75680a3c 166/* Virtual memory object->where field. */
167#define REDIS_VM_MEMORY 0 /* The object is on memory */
168#define REDIS_VM_SWAPPED 1 /* The object is on disk */
169#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
170#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
171
06224fec 172/* Virtual memory static configuration stuff.
173 * Check vmFindContiguousPages() to know more about this magic numbers. */
174#define REDIS_VM_MAX_NEAR_PAGES 65536
175#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 176#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 177#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
c953f24b 182#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 183
ed9b544e 184/* Client flags */
d5d55fc3 185#define REDIS_SLAVE 1 /* This client is a slave server */
186#define REDIS_MASTER 2 /* This client is a master server */
187#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
188#define REDIS_MULTI 8 /* This client is in a MULTI context */
189#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
190#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
ed9b544e 191
40d224a9 192/* Slave replication state - slave side */
ed9b544e 193#define REDIS_REPL_NONE 0 /* No active replication */
194#define REDIS_REPL_CONNECT 1 /* Must connect to master */
195#define REDIS_REPL_CONNECTED 2 /* Connected to master */
196
40d224a9 197/* Slave replication state - from the point of view of master
198 * Note that in SEND_BULK and ONLINE state the slave receives new updates
199 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
200 * to start the next background saving in order to send updates to it. */
201#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
202#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
203#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
204#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
205
ed9b544e 206/* List related stuff */
207#define REDIS_HEAD 0
208#define REDIS_TAIL 1
209
210/* Sort operations */
211#define REDIS_SORT_GET 0
443c6409 212#define REDIS_SORT_ASC 1
213#define REDIS_SORT_DESC 2
ed9b544e 214#define REDIS_SORTKEY_MAX 1024
215
216/* Log levels */
217#define REDIS_DEBUG 0
f870935d 218#define REDIS_VERBOSE 1
219#define REDIS_NOTICE 2
220#define REDIS_WARNING 3
ed9b544e 221
222/* Anti-warning macro... */
223#define REDIS_NOTUSED(V) ((void) V)
224
6b47e12e 225#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
226#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 227
48f0308a 228/* Append only defines */
229#define APPENDFSYNC_NO 0
230#define APPENDFSYNC_ALWAYS 1
231#define APPENDFSYNC_EVERYSEC 2
232
cbba7dd7 233/* Hashes related defaults */
234#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
235#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
236
/* We can print the stacktrace, so our assert is defined this way:
 *
 * redisAssert(expr) - if expr is false, call _redisAssert() with the
 *                     stringified expression, file and line, then _exit(1).
 * redisPanic(msg)   - unconditionally call _redisPanic() with the
 *                     stringified argument, file and line, then _exit(1).
 *
 * Both macros expand to a single fully parenthesized expression of type
 * void, so they behave like a function call wherever they are used
 * (e.g. as an operand of ?: or of the comma operator). */
#define redisAssert(_e) ((_e) ? (void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
dfc5e96c 242
ed9b544e 243/*================================= Data types ============================== */
244
245/* A redis object, that is a type able to hold a string / list / set */
75680a3c 246
/* The VM object structure: swap-file bookkeeping embedded in every
 * struct redisObject as its 'vm' member.
 * NOTE(review): the trailing declarator below also defines a file-scope
 * variable named 'vm' of this type; it looks unintentional -- confirm that
 * nothing references it before removing it. */
struct redisObjectVM {
    off_t page;        /* The page at which the object is stored on disk */
    off_t usedpages;   /* Number of pages used on disk */
    time_t atime;      /* Last access time */
} vm;
253
254/* The actual Redis Object */
ed9b544e 255typedef struct redisObject {
ed9b544e 256 void *ptr;
942a3961 257 unsigned char type;
258 unsigned char encoding;
d894161b 259 unsigned char storage; /* If this object is a key, where is the value?
260 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
261 unsigned char vtype; /* If this object is a key, and value is swapped out,
262 * this is the type of the swapped out object. */
ed9b544e 263 int refcount;
75680a3c 264 /* VM fields, this are only allocated if VM is active, otherwise the
265 * object allocation function will just allocate
266 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
267 * Redis without VM active will not have any overhead. */
268 struct redisObjectVM vm;
ed9b544e 269} robj;
270
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the object itself (not a pointer to it), _ptr the value stored in
 * its 'ptr' field. The 'storage' field is only written when VM is enabled.
 *
 * The expansion is a do { ... } while(0) WITHOUT a trailing semicolon, so
 * "initStaticStringObject(o,p);" is exactly one statement and can be used as
 * the body of an unbraced if/else. _var is evaluated several times, so it
 * must not have side effects. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
282
3305306f 283typedef struct redisDb {
4409877e 284 dict *dict; /* The keyspace for this DB */
285 dict *expires; /* Timeout of keys with a timeout set */
286 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 287 dict *io_keys; /* Keys with clients waiting for VM I/O */
3305306f 288 int id;
289} redisDb;
290
6e469882 291/* Client MULTI/EXEC state */
292typedef struct multiCmd {
293 robj **argv;
294 int argc;
295 struct redisCommand *cmd;
296} multiCmd;
297
298typedef struct multiState {
299 multiCmd *commands; /* Array of MULTI commands */
300 int count; /* Total number of MULTI commands */
301} multiState;
302
ed9b544e 303/* With multiplexing we need to take per-clinet state.
304 * Clients are taken in a liked list. */
305typedef struct redisClient {
306 int fd;
3305306f 307 redisDb *db;
ed9b544e 308 int dictid;
309 sds querybuf;
e8a74421 310 robj **argv, **mbargv;
311 int argc, mbargc;
40d224a9 312 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 313 int multibulk; /* multi bulk command format active */
ed9b544e 314 list *reply;
315 int sentlen;
316 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 317 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 318 int slaveseldb; /* slave selected db, if this client is a slave */
319 int authenticated; /* when requirepass is non-NULL */
320 int replstate; /* replication state if this is a slave */
321 int repldbfd; /* replication DB file descriptor */
6e469882 322 long repldboff; /* replication DB file offset */
40d224a9 323 off_t repldbsize; /* replication DB file size */
6e469882 324 multiState mstate; /* MULTI/EXEC state */
d5d55fc3 325 robj **blockingkeys; /* The key we are waiting to terminate a blocking
4409877e 326 * operation such as BLPOP. Otherwise NULL. */
b177fd30 327 int blockingkeysnum; /* Number of blocking keys */
4409877e 328 time_t blockingto; /* Blocking operation timeout. If UNIX current time
329 * is >= blockingto then the operation timed out. */
92f8e882 330 list *io_keys; /* Keys this client is waiting to be loaded from the
331 * swap file in order to continue. */
ffc6b7f8 332 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
333 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 334} redisClient;
335
/* Element type of the 'saveparams' array held by struct redisServer:
 * a number of seconds paired with a number of changes. */
struct saveparam {
    time_t seconds;
    int changes;
};
340
341/* Global server state structure */
342struct redisServer {
343 int port;
344 int fd;
3305306f 345 redisDb *db;
ed9b544e 346 long long dirty; /* changes to DB from the last save */
347 list *clients;
87eca727 348 list *slaves, *monitors;
ed9b544e 349 char neterr[ANET_ERR_LEN];
350 aeEventLoop *el;
351 int cronloops; /* number of times the cron function run */
352 list *objfreelist; /* A list of freed objects to avoid malloc() */
353 time_t lastsave; /* Unix time of last save succeeede */
ed9b544e 354 /* Fields used only for stats */
355 time_t stat_starttime; /* server start time */
356 long long stat_numcommands; /* number of processed commands */
357 long long stat_numconnections; /* number of connections received */
2a6a2ed1 358 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 359 /* Configuration */
360 int verbosity;
361 int glueoutputbuf;
362 int maxidletime;
363 int dbnum;
364 int daemonize;
44b38ef4 365 int appendonly;
48f0308a 366 int appendfsync;
367 time_t lastfsync;
44b38ef4 368 int appendfd;
369 int appendseldb;
ed329fcf 370 char *pidfile;
9f3c422c 371 pid_t bgsavechildpid;
9d65a1bb 372 pid_t bgrewritechildpid;
373 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
28ed1f33 374 sds aofbuf; /* AOF buffer, written before entering the event loop */
ed9b544e 375 struct saveparam *saveparams;
376 int saveparamslen;
377 char *logfile;
378 char *bindaddr;
379 char *dbfilename;
44b38ef4 380 char *appendfilename;
abcb223e 381 char *requirepass;
121f70cf 382 int rdbcompression;
8ca3e9d1 383 int activerehashing;
ed9b544e 384 /* Replication related */
385 int isslave;
d0ccebcf 386 char *masterauth;
ed9b544e 387 char *masterhost;
388 int masterport;
40d224a9 389 redisClient *master; /* client that is master for this slave */
ed9b544e 390 int replstate;
285add55 391 unsigned int maxclients;
4ef8de8a 392 unsigned long long maxmemory;
d5d55fc3 393 unsigned int blpop_blocked_clients;
394 unsigned int vm_blocked_clients;
ed9b544e 395 /* Sort parameters - qsort_r() is only available under BSD so we
396 * have to take this state global, in order to pass it to sortCompare() */
397 int sort_desc;
398 int sort_alpha;
399 int sort_bypattern;
75680a3c 400 /* Virtual memory configuration */
401 int vm_enabled;
054e426d 402 char *vm_swap_file;
75680a3c 403 off_t vm_page_size;
404 off_t vm_pages;
4ef8de8a 405 unsigned long long vm_max_memory;
cbba7dd7 406 /* Hashes config */
407 size_t hash_max_zipmap_entries;
408 size_t hash_max_zipmap_value;
75680a3c 409 /* Virtual memory state */
410 FILE *vm_fp;
411 int vm_fd;
412 off_t vm_next_page; /* Next probably empty page */
413 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 414 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 415 time_t unixtime; /* Unix time sampled every second. */
92f8e882 416 /* Virtual memory I/O threads stuff */
92f8e882 417 /* An I/O thread process an element taken from the io_jobs queue and
996cb5f7 418 * put the result of the operation in the io_done list. While the
419 * job is being processed, it's put on io_processing queue. */
420 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
421 list *io_processing; /* List of VM I/O jobs being processed */
422 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 423 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 424 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 425 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
426 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 427 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 428 int io_active_threads; /* Number of running I/O threads */
429 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 430 /* Our main thread is blocked on the event loop, locking for sockets ready
431 * to be read or written, so when a threaded I/O operation is ready to be
432 * processed by the main thread, the I/O thread will use a unix pipe to
433 * awake the main thread. The followings are the two pipe FDs. */
434 int io_ready_pipe_read;
435 int io_ready_pipe_write;
7d98e08c 436 /* Virtual memory stats */
437 unsigned long long vm_stats_used_pages;
438 unsigned long long vm_stats_swapped_objects;
439 unsigned long long vm_stats_swapouts;
440 unsigned long long vm_stats_swapins;
befec3cd 441 /* Pubsub */
ffc6b7f8 442 dict *pubsub_channels; /* Map channels to list of subscribed clients */
443 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 444 /* Misc */
b9bc0eef 445 FILE *devnull;
ed9b544e 446};
447
ffc6b7f8 448typedef struct pubsubPattern {
449 redisClient *client;
450 robj *pattern;
451} pubsubPattern;
452
ed9b544e 453typedef void redisCommandProc(redisClient *c);
454struct redisCommand {
455 char *name;
456 redisCommandProc *proc;
457 int arity;
458 int flags;
76583ea4
PN
459 /* Use a function to determine which keys need to be loaded
460 * in the background prior to executing this command. Takes precedence
461 * over vm_firstkey and others, ignored when NULL */
462 redisCommandProc *vm_preload_proc;
7c775e09 463 /* What keys should be loaded in background when calling this command? */
464 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
465 int vm_lastkey; /* THe last argument that's a key */
466 int vm_keystep; /* The step between first and last key */
ed9b544e 467};
468
/* A name paired with a pointer value stored as an unsigned long.
 * NOTE(review): presumably a function-name -> address entry used when
 * printing stack traces -- confirm against the code that fills it. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
473
ed9b544e 474typedef struct _redisSortObject {
475 robj *obj;
476 union {
477 double score;
478 robj *cmpobj;
479 } u;
480} redisSortObject;
481
482typedef struct _redisSortOperation {
483 int type;
484 robj *pattern;
485} redisSortOperation;
486
6b47e12e 487/* ZSETs use a specialized version of Skiplists */
488
489typedef struct zskiplistNode {
490 struct zskiplistNode **forward;
e3870fab 491 struct zskiplistNode *backward;
912b9165 492 unsigned int *span;
6b47e12e 493 double score;
494 robj *obj;
495} zskiplistNode;
496
/* The skiplist itself: its header and tail nodes, plus a length and a
 * level counter. */
typedef struct zskiplist {
    struct zskiplistNode *header;
    struct zskiplistNode *tail;
    unsigned long length;
    int level;
} zskiplist;
502
1812e024 503typedef struct zset {
504 dict *dict;
6b47e12e 505 zskiplist *zsl;
1812e024 506} zset;
507
6b47e12e 508/* Our shared "common" objects */
509
05df7621 510#define REDIS_SHARED_INTEGERS 10000
ed9b544e 511struct sharedObjectsStruct {
c937aa89 512 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 513 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 514 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
515 *outofrangeerr, *plus,
ed9b544e 516 *select0, *select1, *select2, *select3, *select4,
befec3cd 517 *select5, *select6, *select7, *select8, *select9,
c8d0ea0e 518 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
519 *mbulk4, *psubscribebulk, *punsubscribebulk,
520 *integers[REDIS_SHARED_INTEGERS];
ed9b544e 521} shared;
522
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero;
static double R_PosInf;
static double R_NegInf;
static double R_Nan;
528
92f8e882 529/* VM threaded I/O request message */
b9bc0eef 530#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
531#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
532#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 533typedef struct iojob {
996cb5f7 534 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 535 redisDb *db;/* Redis database */
92f8e882 536 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 537 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
92f8e882 538 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
539 off_t page; /* Swap page where to read/write the object */
248ea310 540 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 541 int canceled; /* True if this command was canceled by blocking side of VM */
542 pthread_t thread; /* ID of the thread processing this entry */
543} iojob;
92f8e882 544
ed9b544e 545/*================================ Prototypes =============================== */
546
547static void freeStringObject(robj *o);
548static void freeListObject(robj *o);
549static void freeSetObject(robj *o);
550static void decrRefCount(void *o);
551static robj *createObject(int type, void *ptr);
552static void freeClient(redisClient *c);
f78fd11b 553static int rdbLoad(char *filename);
ed9b544e 554static void addReply(redisClient *c, robj *obj);
555static void addReplySds(redisClient *c, sds s);
556static void incrRefCount(robj *o);
f78fd11b 557static int rdbSaveBackground(char *filename);
ed9b544e 558static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 559static robj *dupStringObject(robj *o);
248ea310 560static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
dd142b9c 561static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
28ed1f33 562static void flushAppendOnlyFile(void);
44b38ef4 563static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 564static int syncWithMaster(void);
05df7621 565static robj *tryObjectEncoding(robj *o);
9d65a1bb 566static robj *getDecodedObject(robj *o);
3305306f 567static int removeExpire(redisDb *db, robj *key);
568static int expireIfNeeded(redisDb *db, robj *key);
569static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 570static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 571static int deleteKey(redisDb *db, robj *key);
bb32ede5 572static time_t getExpire(redisDb *db, robj *key);
573static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 574static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 575static void freeMemoryIfNeeded(void);
de96dbfe 576static int processCommand(redisClient *c);
56906eef 577static void setupSigSegvAction(void);
a3b21203 578static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 579static void aofRemoveTempFile(pid_t childpid);
0ea663ea 580static size_t stringObjectLen(robj *o);
638e42ac 581static void processInputBuffer(redisClient *c);
6b47e12e 582static zskiplist *zslCreate(void);
fd8ccf44 583static void zslFree(zskiplist *zsl);
2b59cfdf 584static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 585static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 586static void initClientMultiState(redisClient *c);
587static void freeClientMultiState(redisClient *c);
588static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 589static void unblockClientWaitingData(redisClient *c);
4409877e 590static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 591static void vmInit(void);
a35ddf12 592static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 593static robj *vmLoadObject(robj *key);
7e69548d 594static robj *vmPreviewObject(robj *key);
a69a0c9c 595static int vmSwapOneObjectBlocking(void);
596static int vmSwapOneObjectThreaded(void);
7e69548d 597static int vmCanSwapOut(void);
a5819310 598static int tryFreeOneObjectFromFreelist(void);
996cb5f7 599static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
600static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
601static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 602static void lockThreadedIO(void);
603static void unlockThreadedIO(void);
604static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
605static void freeIOJob(iojob *j);
606static void queueIOJob(iojob *j);
a5819310 607static int vmWriteObjectOnSwap(robj *o, off_t page);
608static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 609static void waitEmptyIOJobsQueue(void);
610static void vmReopenSwapFile(void);
970e10bb 611static int vmFreePage(off_t page);
76583ea4 612static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
d5d55fc3 613static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
614static int dontWaitForSwappedKey(redisClient *c, robj *key);
615static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
616static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
617static struct redisCommand *lookupCommand(char *name);
618static void call(redisClient *c, struct redisCommand *cmd);
619static void resetClient(redisClient *c);
ada386b2 620static void convertToRealHash(robj *o);
ffc6b7f8 621static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
622static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
623static void freePubsubPattern(void *p);
624static int listMatchPubsubPattern(void *a, void *b);
625static int compareStringObjects(robj *a, robj *b);
bf028098 626static int equalStringObjects(robj *a, robj *b);
befec3cd 627static void usage();
8f63ddca 628static int rewriteAppendOnlyFileBackground(void);
242a64f3 629static int vmSwapObjectBlocking(robj *key, robj *val);
ed9b544e 630
abcb223e 631static void authCommand(redisClient *c);
ed9b544e 632static void pingCommand(redisClient *c);
633static void echoCommand(redisClient *c);
634static void setCommand(redisClient *c);
635static void setnxCommand(redisClient *c);
526d00a5 636static void setexCommand(redisClient *c);
ed9b544e 637static void getCommand(redisClient *c);
638static void delCommand(redisClient *c);
639static void existsCommand(redisClient *c);
640static void incrCommand(redisClient *c);
641static void decrCommand(redisClient *c);
642static void incrbyCommand(redisClient *c);
643static void decrbyCommand(redisClient *c);
644static void selectCommand(redisClient *c);
645static void randomkeyCommand(redisClient *c);
646static void keysCommand(redisClient *c);
647static void dbsizeCommand(redisClient *c);
648static void lastsaveCommand(redisClient *c);
649static void saveCommand(redisClient *c);
650static void bgsaveCommand(redisClient *c);
9d65a1bb 651static void bgrewriteaofCommand(redisClient *c);
ed9b544e 652static void shutdownCommand(redisClient *c);
653static void moveCommand(redisClient *c);
654static void renameCommand(redisClient *c);
655static void renamenxCommand(redisClient *c);
656static void lpushCommand(redisClient *c);
657static void rpushCommand(redisClient *c);
658static void lpopCommand(redisClient *c);
659static void rpopCommand(redisClient *c);
660static void llenCommand(redisClient *c);
661static void lindexCommand(redisClient *c);
662static void lrangeCommand(redisClient *c);
663static void ltrimCommand(redisClient *c);
664static void typeCommand(redisClient *c);
665static void lsetCommand(redisClient *c);
666static void saddCommand(redisClient *c);
667static void sremCommand(redisClient *c);
a4460ef4 668static void smoveCommand(redisClient *c);
ed9b544e 669static void sismemberCommand(redisClient *c);
670static void scardCommand(redisClient *c);
12fea928 671static void spopCommand(redisClient *c);
2abb95a9 672static void srandmemberCommand(redisClient *c);
ed9b544e 673static void sinterCommand(redisClient *c);
674static void sinterstoreCommand(redisClient *c);
40d224a9 675static void sunionCommand(redisClient *c);
676static void sunionstoreCommand(redisClient *c);
f4f56e1d 677static void sdiffCommand(redisClient *c);
678static void sdiffstoreCommand(redisClient *c);
ed9b544e 679static void syncCommand(redisClient *c);
680static void flushdbCommand(redisClient *c);
681static void flushallCommand(redisClient *c);
682static void sortCommand(redisClient *c);
683static void lremCommand(redisClient *c);
0f5f7e9a 684static void rpoplpushcommand(redisClient *c);
ed9b544e 685static void infoCommand(redisClient *c);
70003d28 686static void mgetCommand(redisClient *c);
87eca727 687static void monitorCommand(redisClient *c);
3305306f 688static void expireCommand(redisClient *c);
802e8373 689static void expireatCommand(redisClient *c);
f6b141c5 690static void getsetCommand(redisClient *c);
fd88489a 691static void ttlCommand(redisClient *c);
321b0e13 692static void slaveofCommand(redisClient *c);
7f957c92 693static void debugCommand(redisClient *c);
f6b141c5 694static void msetCommand(redisClient *c);
695static void msetnxCommand(redisClient *c);
fd8ccf44 696static void zaddCommand(redisClient *c);
7db723ad 697static void zincrbyCommand(redisClient *c);
cc812361 698static void zrangeCommand(redisClient *c);
50c55df5 699static void zrangebyscoreCommand(redisClient *c);
f44dd428 700static void zcountCommand(redisClient *c);
e3870fab 701static void zrevrangeCommand(redisClient *c);
3c41331e 702static void zcardCommand(redisClient *c);
1b7106e7 703static void zremCommand(redisClient *c);
6e333bbe 704static void zscoreCommand(redisClient *c);
1807985b 705static void zremrangebyscoreCommand(redisClient *c);
6e469882 706static void multiCommand(redisClient *c);
707static void execCommand(redisClient *c);
18b6cb76 708static void discardCommand(redisClient *c);
4409877e 709static void blpopCommand(redisClient *c);
710static void brpopCommand(redisClient *c);
4b00bebd 711static void appendCommand(redisClient *c);
39191553 712static void substrCommand(redisClient *c);
69d95c3e 713static void zrankCommand(redisClient *c);
798d9e55 714static void zrevrankCommand(redisClient *c);
978c2c94 715static void hsetCommand(redisClient *c);
1f1c7695 716static void hsetnxCommand(redisClient *c);
978c2c94 717static void hgetCommand(redisClient *c);
09aeb579
PN
718static void hmsetCommand(redisClient *c);
719static void hmgetCommand(redisClient *c);
07efaf74 720static void hdelCommand(redisClient *c);
92b27fe9 721static void hlenCommand(redisClient *c);
9212eafd 722static void zremrangebyrankCommand(redisClient *c);
2830ca53
PN
723static void zunionCommand(redisClient *c);
724static void zinterCommand(redisClient *c);
78409a0f 725static void hkeysCommand(redisClient *c);
726static void hvalsCommand(redisClient *c);
727static void hgetallCommand(redisClient *c);
a86f14b1 728static void hexistsCommand(redisClient *c);
500ece7c 729static void configCommand(redisClient *c);
01426b05 730static void hincrbyCommand(redisClient *c);
befec3cd 731static void subscribeCommand(redisClient *c);
732static void unsubscribeCommand(redisClient *c);
ffc6b7f8 733static void psubscribeCommand(redisClient *c);
734static void punsubscribeCommand(redisClient *c);
befec3cd 735static void publishCommand(redisClient *c);
f6b141c5 736
ed9b544e 737/*================================= Globals ================================= */
738
739/* Global vars */
740static struct redisServer server; /* server global state */
741static struct redisCommand cmdTable[] = {
76583ea4
PN
742 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
743 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
744 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
526d00a5 745 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
746 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
747 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
748 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
749 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
750 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
751 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
752 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
753 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
754 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
755 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
756 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
757 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
758 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
760 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
761 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
762 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
763 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
764 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
765 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
766 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
767 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
768 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
769 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
770 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
771 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
772 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
774 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
775 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
776 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
777 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
778 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
779 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
781 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
782 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
783 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
784 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
785 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
786 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
787 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
790 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
791 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
792 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
793 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
794 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
795 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
1f1c7695 796 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 797 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
d33278d1 798 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 799 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
01426b05 800 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
801 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
802 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 806 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
807 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
808 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
809 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
810 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
811 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
812 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
814 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
815 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
816 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
817 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
818 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
819 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
820 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
823 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
824 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
825 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
826 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
830 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
958cd5f3 831 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
832 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
835 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
837 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
838 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
840 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 842 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 843 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
844 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 845 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
846 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 847 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
76583ea4 848 {NULL,NULL,0,0,NULL,0,0,0}
ed9b544e 849};
bcfc686d 850
ed9b544e 851/*============================ Utility functions ============================ */
852
/* Glob-style pattern matching on length-delimited buffers.
 *
 * Supported syntax:
 *   *        matches any sequence of characters (including none)
 *   ?        matches exactly one character
 *   [abc]    matches one character from the set; [^abc] negates the set,
 *            a-z denotes a range, and \x inside the set matches x literally
 *   \x       matches the character x literally
 *
 * pattern/patternLen and string/stringLen describe the two buffers; neither
 * needs to be nul terminated, and no byte outside the given lengths is read.
 * When nocase is non-zero the comparison is case insensitive (tolower()).
 *
 * Returns 1 on match, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse a run of '*' without looking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing '*' matches whatever is left */
            /* Try to match the rest of the pattern at every position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A character class always consumes one character. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * pattern++ after the switch lands on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* Escape: compare the next pattern character literally. A
             * trailing backslash is compared as itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String exhausted: only trailing '*' may remain. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
975
/* Convenience wrapper around stringmatchlen() for nul terminated C strings:
 * both lengths are taken with strlen(). Returns 1 on match, 0 otherwise. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
979
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-', a run of decimal digits and an optional
 * case insensitive unit: "b" (bytes), "k"/"m"/"g" (powers of 1000) or
 * "kb"/"mb"/"gb" (powers of 1024).
 *
 * If err is not NULL, *err is set to 1 on parsing error and to 0 otherwise.
 * On an unknown unit the numeric part is returned unscaled. If the number has
 * too many digits, or the result does not fit a long long, the value is
 * saturated to LLONG_MAX (or LLONG_MIN) and the error is flagged. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long long mul; /* unit multiplier, always >= 1 */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000LL*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024LL*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';

    errno = 0;
    val = strtoll(buf,NULL,10);
    if (errno == ERANGE) {
        /* strtoll() already saturated val to LLONG_MAX / LLONG_MIN. */
        if (err) *err = 1;
        return val;
    }
    /* Refuse to multiply when the product would overflow. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1026
/* Convert a long long into its decimal string representation.
 *
 * s is the destination buffer and len its total size, terminator included.
 * When the buffer is too small the output is truncated to the first len-1
 * characters. The result is always nul terminated unless len is 0, in which
 * case nothing is written.
 *
 * Returns the number of characters stored in s, not counting the
 * terminator. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Negate in unsigned arithmetic: -value overflows for LLONG_MIN. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+31; /* point to the last character */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = 32-(p-buf);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return l;
}
1050
56906eef 1051static void redisLog(int level, const char *fmt, ...) {
ed9b544e 1052 va_list ap;
1053 FILE *fp;
1054
1055 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1056 if (!fp) return;
1057
1058 va_start(ap, fmt);
1059 if (level >= server.verbosity) {
6766f45e 1060 char *c = ".-*#";
1904ecc1 1061 char buf[64];
1062 time_t now;
1063
1064 now = time(NULL);
6c9385e0 1065 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 1066 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 1067 vfprintf(fp, fmt, ap);
1068 fprintf(fp,"\n");
1069 fflush(fp);
1070 }
1071 va_end(ap);
1072
1073 if (server.logfile) fclose(fp);
1074}
1075
1076/*====================== Hash table type implementation ==================== */
1077
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1081
/* Dict destructor callback that releases val with zfree().
 * privdata is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
1087
4409877e 1088static void dictListDestructor(void *privdata, void *val)
1089{
1090 DICT_NOTUSED(privdata);
1091 listRelease((list*)val);
1092}
1093
ed9b544e 1094static int sdsDictKeyCompare(void *privdata, const void *key1,
1095 const void *key2)
1096{
1097 int l1,l2;
1098 DICT_NOTUSED(privdata);
1099
1100 l1 = sdslen((sds)key1);
1101 l2 = sdslen((sds)key2);
1102 if (l1 != l2) return 0;
1103 return memcmp(key1, key2, l1) == 0;
1104}
1105
/* Dict destructor callback for redis objects: drops one reference with
 * decrRefCount(). NULL is accepted and ignored, since values of swapped out
 * keys are set to NULL. privdata is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1113
942a3961 1114static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1115 const void *key2)
1116{
1117 const robj *o1 = key1, *o2 = key2;
1118 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1119}
1120
942a3961 1121static unsigned int dictObjHash(const void *key) {
ed9b544e 1122 const robj *o = key;
1123 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1124}
1125
942a3961 1126static int dictEncObjKeyCompare(void *privdata, const void *key1,
1127 const void *key2)
1128{
9d65a1bb 1129 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1130 int cmp;
942a3961 1131
2a1198b4 1132 if (o1->encoding == REDIS_ENCODING_INT &&
dc05abde 1133 o2->encoding == REDIS_ENCODING_INT)
1134 return o1->ptr == o2->ptr;
2a1198b4 1135
9d65a1bb 1136 o1 = getDecodedObject(o1);
1137 o2 = getDecodedObject(o2);
1138 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1139 decrRefCount(o1);
1140 decrRefCount(o2);
1141 return cmp;
942a3961 1142}
1143
1144static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1145 robj *o = (robj*) key;
942a3961 1146
ed9e4966 1147 if (o->encoding == REDIS_ENCODING_RAW) {
1148 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1149 } else {
1150 if (o->encoding == REDIS_ENCODING_INT) {
1151 char buf[32];
1152 int len;
1153
ee14da56 1154 len = ll2string(buf,32,(long)o->ptr);
ed9e4966 1155 return dictGenHashFunction((unsigned char*)buf, len);
1156 } else {
1157 unsigned int hash;
1158
1159 o = getDecodedObject(o);
1160 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1161 decrRefCount(o);
1162 return hash;
1163 }
1164 }
942a3961 1165}
1166
f2d9f50f 1167/* Sets type and expires */
ed9b544e 1168static dictType setDictType = {
942a3961 1169 dictEncObjHash, /* hash function */
ed9b544e 1170 NULL, /* key dup */
1171 NULL, /* val dup */
942a3961 1172 dictEncObjKeyCompare, /* key compare */
ed9b544e 1173 dictRedisObjectDestructor, /* key destructor */
1174 NULL /* val destructor */
1175};
1176
f2d9f50f 1177/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1812e024 1178static dictType zsetDictType = {
1179 dictEncObjHash, /* hash function */
1180 NULL, /* key dup */
1181 NULL, /* val dup */
1182 dictEncObjKeyCompare, /* key compare */
1183 dictRedisObjectDestructor, /* key destructor */
da0a1620 1184 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1185};
1186
f2d9f50f 1187/* Db->dict */
5234952b 1188static dictType dbDictType = {
942a3961 1189 dictObjHash, /* hash function */
ed9b544e 1190 NULL, /* key dup */
1191 NULL, /* val dup */
942a3961 1192 dictObjKeyCompare, /* key compare */
ed9b544e 1193 dictRedisObjectDestructor, /* key destructor */
1194 dictRedisObjectDestructor /* val destructor */
1195};
1196
f2d9f50f 1197/* Db->expires */
1198static dictType keyptrDictType = {
1199 dictObjHash, /* hash function */
1200 NULL, /* key dup */
1201 NULL, /* val dup */
1202 dictObjKeyCompare, /* key compare */
1203 dictRedisObjectDestructor, /* key destructor */
1204 NULL /* val destructor */
1205};
1206
5234952b 1207/* Hash type hash table (note that small hashes are represented with zimpaps) */
1208static dictType hashDictType = {
1209 dictEncObjHash, /* hash function */
1210 NULL, /* key dup */
1211 NULL, /* val dup */
1212 dictEncObjKeyCompare, /* key compare */
1213 dictRedisObjectDestructor, /* key destructor */
1214 dictRedisObjectDestructor /* val destructor */
1215};
1216
4409877e 1217/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1218 * lists as values. It's used for blocking operations (BLPOP) and to
1219 * map swapped keys to a list of clients waiting for this keys to be loaded. */
4409877e 1220static dictType keylistDictType = {
1221 dictObjHash, /* hash function */
1222 NULL, /* key dup */
1223 NULL, /* val dup */
1224 dictObjKeyCompare, /* key compare */
1225 dictRedisObjectDestructor, /* key destructor */
1226 dictListDestructor /* val destructor */
1227};
1228
42ab0172
AO
1229static void version();
1230
ed9b544e 1231/* ========================= Random utility functions ======================= */
1232
1233/* Redis generally does not try to recover from out of memory conditions
1234 * when allocating objects or strings, it is not clear if it will be possible
1235 * to report this condition to the client since the networking layer itself
1236 * is based on heap allocation for send buffers, so we simply abort.
1237 * At least the code will be simpler to read... */
1238static void oom(const char *msg) {
71c54b21 1239 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1240 sleep(1);
1241 abort();
1242}
1243
1244/* ====================== Redis server networking stuff ===================== */
56906eef 1245static void closeTimedoutClients(void) {
ed9b544e 1246 redisClient *c;
ed9b544e 1247 listNode *ln;
1248 time_t now = time(NULL);
c7df85a4 1249 listIter li;
ed9b544e 1250
c7df85a4 1251 listRewind(server.clients,&li);
1252 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1253 c = listNodeValue(ln);
f86a74e9 1254 if (server.maxidletime &&
1255 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1256 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1257 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1258 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1259 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1260 {
f870935d 1261 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1262 freeClient(c);
f86a74e9 1263 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1264 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1265 addReply(c,shared.nullmultibulk);
b0d8747d 1266 unblockClientWaitingData(c);
f86a74e9 1267 }
ed9b544e 1268 }
1269 }
ed9b544e 1270}
1271
12fea928 1272static int htNeedsResize(dict *dict) {
1273 long long size, used;
1274
1275 size = dictSlots(dict);
1276 used = dictSize(dict);
1277 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1278 (used*100/size < REDIS_HT_MINFILL));
1279}
1280
0bc03378 1281/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1282 * we resize the hash table to save memory */
56906eef 1283static void tryResizeHashTables(void) {
0bc03378 1284 int j;
1285
1286 for (j = 0; j < server.dbnum; j++) {
5413c40d 1287 if (htNeedsResize(server.db[j].dict))
0bc03378 1288 dictResize(server.db[j].dict);
12fea928 1289 if (htNeedsResize(server.db[j].expires))
1290 dictResize(server.db[j].expires);
0bc03378 1291 }
1292}
1293
8ca3e9d1 1294/* Our hash table implementation performs rehashing incrementally while
1295 * we write/read from the hash table. Still if the server is idle, the hash
1296 * table will use two tables for a long time. So we try to use 1 millisecond
1297 * of CPU time at every serverCron() loop in order to rehash some key. */
1298static void incrementallyRehash(void) {
1299 int j;
1300
1301 for (j = 0; j < server.dbnum; j++) {
1302 if (dictIsRehashing(server.db[j].dict)) {
1303 dictRehashMilliseconds(server.db[j].dict,1);
1304 break; /* already used our millisecond for this loop... */
1305 }
1306 }
1307}
1308
9d65a1bb 1309/* A background saving child (BGSAVE) terminated its work. Handle this. */
1310void backgroundSaveDoneHandler(int statloc) {
1311 int exitcode = WEXITSTATUS(statloc);
1312 int bysignal = WIFSIGNALED(statloc);
1313
1314 if (!bysignal && exitcode == 0) {
1315 redisLog(REDIS_NOTICE,
1316 "Background saving terminated with success");
1317 server.dirty = 0;
1318 server.lastsave = time(NULL);
1319 } else if (!bysignal && exitcode != 0) {
1320 redisLog(REDIS_WARNING, "Background saving error");
1321 } else {
1322 redisLog(REDIS_WARNING,
454eea7c 1323 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1324 rdbRemoveTempFile(server.bgsavechildpid);
1325 }
1326 server.bgsavechildpid = -1;
1327 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1328 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1329 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1330}
1331
1332/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1333 * Handle this. */
1334void backgroundRewriteDoneHandler(int statloc) {
1335 int exitcode = WEXITSTATUS(statloc);
1336 int bysignal = WIFSIGNALED(statloc);
1337
1338 if (!bysignal && exitcode == 0) {
1339 int fd;
1340 char tmpfile[256];
1341
1342 redisLog(REDIS_NOTICE,
1343 "Background append only file rewriting terminated with success");
1344 /* Now it's time to flush the differences accumulated by the parent */
1345 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1346 fd = open(tmpfile,O_WRONLY|O_APPEND);
1347 if (fd == -1) {
1348 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1349 goto cleanup;
1350 }
1351 /* Flush our data... */
1352 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1353 (signed) sdslen(server.bgrewritebuf)) {
1354 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1355 close(fd);
1356 goto cleanup;
1357 }
b32627cd 1358 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1359 /* Now our work is to rename the temp file into the stable file. And
1360 * switch the file descriptor used by the server for append only. */
1361 if (rename(tmpfile,server.appendfilename) == -1) {
1362 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1363 close(fd);
1364 goto cleanup;
1365 }
1366 /* Mission completed... almost */
1367 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1368 if (server.appendfd != -1) {
1369 /* If append only is actually enabled... */
1370 close(server.appendfd);
1371 server.appendfd = fd;
1372 fsync(fd);
85a83172 1373 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1374 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1375 } else {
1376 /* If append only is disabled we just generate a dump in this
1377 * format. Why not? */
1378 close(fd);
1379 }
1380 } else if (!bysignal && exitcode != 0) {
1381 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1382 } else {
1383 redisLog(REDIS_WARNING,
454eea7c 1384 "Background append only file rewriting terminated by signal %d",
1385 WTERMSIG(statloc));
9d65a1bb 1386 }
1387cleanup:
1388 sdsfree(server.bgrewritebuf);
1389 server.bgrewritebuf = sdsempty();
1390 aofRemoveTempFile(server.bgrewritechildpid);
1391 server.bgrewritechildpid = -1;
1392}
1393
884d4b39 1394/* This function is called once a background process of some kind terminates,
1395 * as we want to avoid resizing the hash tables when there is a child in order
1396 * to play well with copy-on-write (otherwise when a resize happens lots of
1397 * memory pages are copied). The goal of this function is to update the ability
1398 * for dict.c to resize the hash tables accordingly to the fact we have o not
1399 * running childs. */
1400static void updateDictResizePolicy(void) {
1401 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1402 dictEnableResize();
1403 else
1404 dictDisableResize();
1405}
1406
56906eef 1407static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1408 int j, loops = server.cronloops++;
ed9b544e 1409 REDIS_NOTUSED(eventLoop);
1410 REDIS_NOTUSED(id);
1411 REDIS_NOTUSED(clientData);
1412
3a66edc7 1413 /* We take a cached value of the unix time in the global state because
1414 * with virtual memory and aging there is to store the current time
1415 * in objects at every object access, and accuracy is not needed.
1416 * To access a global var is faster than calling time(NULL) */
1417 server.unixtime = time(NULL);
1418
0bc03378 1419 /* Show some info about non-empty databases */
ed9b544e 1420 for (j = 0; j < server.dbnum; j++) {
dec423d9 1421 long long size, used, vkeys;
94754ccc 1422
3305306f 1423 size = dictSlots(server.db[j].dict);
1424 used = dictSize(server.db[j].dict);
94754ccc 1425 vkeys = dictSize(server.db[j].expires);
1763929f 1426 if (!(loops % 50) && (used || vkeys)) {
f870935d 1427 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1428 /* dictPrintStats(server.dict); */
ed9b544e 1429 }
ed9b544e 1430 }
1431
0bc03378 1432 /* We don't want to resize the hash tables while a bacground saving
1433 * is in progress: the saving child is created using fork() that is
1434 * implemented with a copy-on-write semantic in most modern systems, so
1435 * if we resize the HT while there is the saving child at work actually
1436 * a lot of memory movements in the parent will cause a lot of pages
1437 * copied. */
8ca3e9d1 1438 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1439 if (!(loops % 10)) tryResizeHashTables();
1440 if (server.activerehashing) incrementallyRehash();
884d4b39 1441 }
0bc03378 1442
ed9b544e 1443 /* Show information about connected clients */
1763929f 1444 if (!(loops % 50)) {
bdcb92f2 1445 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1446 listLength(server.clients)-listLength(server.slaves),
1447 listLength(server.slaves),
bdcb92f2 1448 zmalloc_used_memory());
ed9b544e 1449 }
1450
1451 /* Close connections of timedout clients */
1763929f 1452 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1453 closeTimedoutClients();
1454
9d65a1bb 1455 /* Check if a background saving or AOF rewrite in progress terminated */
1456 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1457 int statloc;
9d65a1bb 1458 pid_t pid;
1459
1460 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1461 if (pid == server.bgsavechildpid) {
1462 backgroundSaveDoneHandler(statloc);
ed9b544e 1463 } else {
9d65a1bb 1464 backgroundRewriteDoneHandler(statloc);
ed9b544e 1465 }
884d4b39 1466 updateDictResizePolicy();
ed9b544e 1467 }
1468 } else {
1469 /* If there is not a background saving in progress check if
1470 * we have to save now */
1471 time_t now = time(NULL);
1472 for (j = 0; j < server.saveparamslen; j++) {
1473 struct saveparam *sp = server.saveparams+j;
1474
1475 if (server.dirty >= sp->changes &&
1476 now-server.lastsave > sp->seconds) {
1477 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1478 sp->changes, sp->seconds);
f78fd11b 1479 rdbSaveBackground(server.dbfilename);
ed9b544e 1480 break;
1481 }
1482 }
1483 }
94754ccc 1484
f2324293 1485 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1486 * will use few CPU cycles if there are few expiring keys, otherwise
1487 * it will get more aggressive to avoid that too much memory is used by
1488 * keys that can be removed from the keyspace. */
94754ccc 1489 for (j = 0; j < server.dbnum; j++) {
f2324293 1490 int expired;
94754ccc 1491 redisDb *db = server.db+j;
94754ccc 1492
f2324293 1493 /* Continue to expire if at the end of the cycle more than 25%
1494 * of the keys were expired. */
1495 do {
4ef8de8a 1496 long num = dictSize(db->expires);
94754ccc 1497 time_t now = time(NULL);
1498
f2324293 1499 expired = 0;
94754ccc 1500 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1501 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1502 while (num--) {
1503 dictEntry *de;
1504 time_t t;
1505
1506 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1507 t = (time_t) dictGetEntryVal(de);
1508 if (now > t) {
1509 deleteKey(db,dictGetEntryKey(de));
f2324293 1510 expired++;
2a6a2ed1 1511 server.stat_expiredkeys++;
94754ccc 1512 }
1513 }
f2324293 1514 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1515 }
1516
4ef8de8a 1517 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1518 * is enbled. Try to free objects from the free list first. */
7e69548d 1519 if (vmCanSwapOut()) {
1520 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1521 server.vm_max_memory)
1522 {
72e9fd40 1523 int retval;
1524
a5819310 1525 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1526 retval = (server.vm_max_threads == 0) ?
1527 vmSwapOneObjectBlocking() :
1528 vmSwapOneObjectThreaded();
1763929f 1529 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1530 zmalloc_used_memory() >
1531 (server.vm_max_memory+server.vm_max_memory/10))
1532 {
1533 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1534 }
72e9fd40 1535 /* Note that when using threaded I/O we free just one object,
1536 * because anyway when the I/O thread in charge to swap this
1537 * object out will finish, the handler of completed jobs
1538 * will try to swap more objects if we are still out of memory. */
1539 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1540 }
1541 }
1542
ed9b544e 1543 /* Check if we should connect to a MASTER */
1763929f 1544 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1545 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1546 if (syncWithMaster() == REDIS_OK) {
1547 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
8f63ddca 1548 if (server.appendonly) rewriteAppendOnlyFileBackground();
ed9b544e 1549 }
1550 }
1763929f 1551 return 100;
ed9b544e 1552}
1553
d5d55fc3 1554/* This function gets called every time Redis is entering the
1555 * main loop of the event driven library, that is, before to sleep
1556 * for ready file descriptors. */
1557static void beforeSleep(struct aeEventLoop *eventLoop) {
1558 REDIS_NOTUSED(eventLoop);
1559
28ed1f33 1560 /* Awake clients that got all the swapped keys they requested */
d5d55fc3 1561 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1562 listIter li;
1563 listNode *ln;
1564
1565 listRewind(server.io_ready_clients,&li);
1566 while((ln = listNext(&li))) {
1567 redisClient *c = ln->value;
1568 struct redisCommand *cmd;
1569
1570 /* Resume the client. */
1571 listDelNode(server.io_ready_clients,ln);
1572 c->flags &= (~REDIS_IO_WAIT);
1573 server.vm_blocked_clients--;
1574 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1575 readQueryFromClient, c);
1576 cmd = lookupCommand(c->argv[0]->ptr);
1577 assert(cmd != NULL);
1578 call(c,cmd);
1579 resetClient(c);
1580 /* There may be more data to process in the input buffer. */
1581 if (c->querybuf && sdslen(c->querybuf) > 0)
1582 processInputBuffer(c);
1583 }
1584 }
28ed1f33 1585 /* Write the AOF buffer on disk */
1586 flushAppendOnlyFile();
d5d55fc3 1587}
1588
ed9b544e 1589static void createSharedObjects(void) {
05df7621 1590 int j;
1591
ed9b544e 1592 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1593 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1594 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1595 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1596 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1597 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1598 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1599 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1600 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1601 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1602 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1603 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1604 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1605 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1606 "-ERR no such key\r\n"));
ed9b544e 1607 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1608 "-ERR syntax error\r\n"));
c937aa89 1609 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1610 "-ERR source and destination objects are the same\r\n"));
1611 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1612 "-ERR index out of range\r\n"));
ed9b544e 1613 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1614 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1615 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1616 shared.select0 = createStringObject("select 0\r\n",10);
1617 shared.select1 = createStringObject("select 1\r\n",10);
1618 shared.select2 = createStringObject("select 2\r\n",10);
1619 shared.select3 = createStringObject("select 3\r\n",10);
1620 shared.select4 = createStringObject("select 4\r\n",10);
1621 shared.select5 = createStringObject("select 5\r\n",10);
1622 shared.select6 = createStringObject("select 6\r\n",10);
1623 shared.select7 = createStringObject("select 7\r\n",10);
1624 shared.select8 = createStringObject("select 8\r\n",10);
1625 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1626 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
c8d0ea0e 1627 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
befec3cd 1628 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1629 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1630 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1631 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1632 shared.mbulk3 = createStringObject("*3\r\n",4);
c8d0ea0e 1633 shared.mbulk4 = createStringObject("*4\r\n",4);
05df7621 1634 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1635 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1636 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1637 }
ed9b544e 1638}
1639
1640static void appendServerSaveParams(time_t seconds, int changes) {
1641 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1642 server.saveparams[server.saveparamslen].seconds = seconds;
1643 server.saveparams[server.saveparamslen].changes = changes;
1644 server.saveparamslen++;
1645}
1646
bcfc686d 1647static void resetServerSaveParams() {
ed9b544e 1648 zfree(server.saveparams);
1649 server.saveparams = NULL;
1650 server.saveparamslen = 0;
1651}
1652
1653static void initServerConfig() {
1654 server.dbnum = REDIS_DEFAULT_DBNUM;
1655 server.port = REDIS_SERVERPORT;
f870935d 1656 server.verbosity = REDIS_VERBOSE;
ed9b544e 1657 server.maxidletime = REDIS_MAXIDLETIME;
1658 server.saveparams = NULL;
1659 server.logfile = NULL; /* NULL = log on standard output */
1660 server.bindaddr = NULL;
1661 server.glueoutputbuf = 1;
1662 server.daemonize = 0;
44b38ef4 1663 server.appendonly = 0;
1b677732 1664 server.appendfsync = APPENDFSYNC_EVERYSEC;
48f0308a 1665 server.lastfsync = time(NULL);
44b38ef4 1666 server.appendfd = -1;
1667 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1668 server.pidfile = zstrdup("/var/run/redis.pid");
1669 server.dbfilename = zstrdup("dump.rdb");
1670 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1671 server.requirepass = NULL;
b0553789 1672 server.rdbcompression = 1;
8ca3e9d1 1673 server.activerehashing = 1;
285add55 1674 server.maxclients = 0;
d5d55fc3 1675 server.blpop_blocked_clients = 0;
3fd78bcd 1676 server.maxmemory = 0;
75680a3c 1677 server.vm_enabled = 0;
054e426d 1678 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1679 server.vm_page_size = 256; /* 256 bytes per page */
1680 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1681 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1682 server.vm_max_threads = 4;
d5d55fc3 1683 server.vm_blocked_clients = 0;
cbba7dd7 1684 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1685 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
75680a3c 1686
bcfc686d 1687 resetServerSaveParams();
ed9b544e 1688
1689 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1690 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1691 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1692 /* Replication related */
1693 server.isslave = 0;
d0ccebcf 1694 server.masterauth = NULL;
ed9b544e 1695 server.masterhost = NULL;
1696 server.masterport = 6379;
1697 server.master = NULL;
1698 server.replstate = REDIS_REPL_NONE;
a7866db6 1699
1700 /* Double constants initialization */
1701 R_Zero = 0.0;
1702 R_PosInf = 1.0/R_Zero;
1703 R_NegInf = -1.0/R_Zero;
1704 R_Nan = R_Zero/R_Zero;
ed9b544e 1705}
1706
1707static void initServer() {
1708 int j;
1709
1710 signal(SIGHUP, SIG_IGN);
1711 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1712 setupSigSegvAction();
ed9b544e 1713
b9bc0eef 1714 server.devnull = fopen("/dev/null","w");
1715 if (server.devnull == NULL) {
1716 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1717 exit(1);
1718 }
ed9b544e 1719 server.clients = listCreate();
1720 server.slaves = listCreate();
87eca727 1721 server.monitors = listCreate();
ed9b544e 1722 server.objfreelist = listCreate();
1723 createSharedObjects();
1724 server.el = aeCreateEventLoop();
3305306f 1725 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1726 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1727 if (server.fd == -1) {
1728 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1729 exit(1);
1730 }
3305306f 1731 for (j = 0; j < server.dbnum; j++) {
5234952b 1732 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1733 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
4409877e 1734 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1735 if (server.vm_enabled)
1736 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1737 server.db[j].id = j;
1738 }
ffc6b7f8 1739 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1740 server.pubsub_patterns = listCreate();
1741 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1742 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1743 server.cronloops = 0;
9f3c422c 1744 server.bgsavechildpid = -1;
9d65a1bb 1745 server.bgrewritechildpid = -1;
1746 server.bgrewritebuf = sdsempty();
28ed1f33 1747 server.aofbuf = sdsempty();
ed9b544e 1748 server.lastsave = time(NULL);
1749 server.dirty = 0;
ed9b544e 1750 server.stat_numcommands = 0;
1751 server.stat_numconnections = 0;
2a6a2ed1 1752 server.stat_expiredkeys = 0;
ed9b544e 1753 server.stat_starttime = time(NULL);
3a66edc7 1754 server.unixtime = time(NULL);
d8f8b666 1755 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1756 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1757 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1758
1759 if (server.appendonly) {
3bb225d6 1760 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1761 if (server.appendfd == -1) {
1762 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1763 strerror(errno));
1764 exit(1);
1765 }
1766 }
75680a3c 1767
1768 if (server.vm_enabled) vmInit();
ed9b544e 1769}
1770
1771/* Empty the whole database */
ca37e9cd 1772static long long emptyDb() {
ed9b544e 1773 int j;
ca37e9cd 1774 long long removed = 0;
ed9b544e 1775
3305306f 1776 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1777 removed += dictSize(server.db[j].dict);
3305306f 1778 dictEmpty(server.db[j].dict);
1779 dictEmpty(server.db[j].expires);
1780 }
ca37e9cd 1781 return removed;
ed9b544e 1782}
1783
/* Convert a "yes" / "no" string (matched case insensitively) into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1789
ed9b544e 1790/* I agree, this is a very rudimental way to load a configuration...
1791 will improve later if the config gets more complex */
1792static void loadServerConfig(char *filename) {
c9a111ac 1793 FILE *fp;
ed9b544e 1794 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1795 int linenum = 0;
1796 sds line = NULL;
c9a111ac 1797
1798 if (filename[0] == '-' && filename[1] == '\0')
1799 fp = stdin;
1800 else {
1801 if ((fp = fopen(filename,"r")) == NULL) {
9a22de82 1802 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
c9a111ac 1803 exit(1);
1804 }
ed9b544e 1805 }
c9a111ac 1806
ed9b544e 1807 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1808 sds *argv;
1809 int argc, j;
1810
1811 linenum++;
1812 line = sdsnew(buf);
1813 line = sdstrim(line," \t\r\n");
1814
1815 /* Skip comments and blank lines*/
1816 if (line[0] == '#' || line[0] == '\0') {
1817 sdsfree(line);
1818 continue;
1819 }
1820
1821 /* Split into arguments */
1822 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1823 sdstolower(argv[0]);
1824
1825 /* Execute config directives */
bb0b03a3 1826 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1827 server.maxidletime = atoi(argv[1]);
0150db36 1828 if (server.maxidletime < 0) {
ed9b544e 1829 err = "Invalid timeout value"; goto loaderr;
1830 }
bb0b03a3 1831 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1832 server.port = atoi(argv[1]);
1833 if (server.port < 1 || server.port > 65535) {
1834 err = "Invalid port"; goto loaderr;
1835 }
bb0b03a3 1836 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1837 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1838 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1839 int seconds = atoi(argv[1]);
1840 int changes = atoi(argv[2]);
1841 if (seconds < 1 || changes < 0) {
1842 err = "Invalid save parameters"; goto loaderr;
1843 }
1844 appendServerSaveParams(seconds,changes);
bb0b03a3 1845 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1846 if (chdir(argv[1]) == -1) {
1847 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1848 argv[1], strerror(errno));
1849 exit(1);
1850 }
bb0b03a3 1851 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1852 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1853 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1854 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1855 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1856 else {
1857 err = "Invalid log level. Must be one of debug, notice, warning";
1858 goto loaderr;
1859 }
bb0b03a3 1860 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1861 FILE *logfp;
ed9b544e 1862
1863 server.logfile = zstrdup(argv[1]);
bb0b03a3 1864 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1865 zfree(server.logfile);
1866 server.logfile = NULL;
1867 }
1868 if (server.logfile) {
1869 /* Test if we are able to open the file. The server will not
1870 * be able to abort just for this problem later... */
c9a111ac 1871 logfp = fopen(server.logfile,"a");
1872 if (logfp == NULL) {
ed9b544e 1873 err = sdscatprintf(sdsempty(),
1874 "Can't open the log file: %s", strerror(errno));
1875 goto loaderr;
1876 }
c9a111ac 1877 fclose(logfp);
ed9b544e 1878 }
bb0b03a3 1879 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1880 server.dbnum = atoi(argv[1]);
1881 if (server.dbnum < 1) {
1882 err = "Invalid number of databases"; goto loaderr;
1883 }
b3f83f12
JZ
1884 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1885 loadServerConfig(argv[1]);
285add55 1886 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1887 server.maxclients = atoi(argv[1]);
3fd78bcd 1888 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
2b619329 1889 server.maxmemory = memtoll(argv[1],NULL);
bb0b03a3 1890 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1891 server.masterhost = sdsnew(argv[1]);
1892 server.masterport = atoi(argv[2]);
1893 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1894 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1895 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1896 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1897 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1898 err = "argument must be 'yes' or 'no'"; goto loaderr;
1899 }
121f70cf 1900 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1901 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
8ca3e9d1 1902 err = "argument must be 'yes' or 'no'"; goto loaderr;
1903 }
1904 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1905 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
121f70cf 1906 err = "argument must be 'yes' or 'no'"; goto loaderr;
1907 }
bb0b03a3 1908 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1909 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1910 err = "argument must be 'yes' or 'no'"; goto loaderr;
1911 }
44b38ef4 1912 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1913 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1914 err = "argument must be 'yes' or 'no'"; goto loaderr;
1915 }
48f0308a 1916 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1917 if (!strcasecmp(argv[1],"no")) {
48f0308a 1918 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1919 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1920 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1921 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1922 server.appendfsync = APPENDFSYNC_EVERYSEC;
1923 } else {
1924 err = "argument must be 'no', 'always' or 'everysec'";
1925 goto loaderr;
1926 }
bb0b03a3 1927 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1928 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1929 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 1930 zfree(server.pidfile);
054e426d 1931 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1932 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 1933 zfree(server.dbfilename);
054e426d 1934 server.dbfilename = zstrdup(argv[1]);
75680a3c 1935 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1936 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1937 err = "argument must be 'yes' or 'no'"; goto loaderr;
1938 }
054e426d 1939 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1940 zfree(server.vm_swap_file);
054e426d 1941 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1942 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2b619329 1943 server.vm_max_memory = memtoll(argv[1],NULL);
4ef8de8a 1944 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2b619329 1945 server.vm_page_size = memtoll(argv[1], NULL);
4ef8de8a 1946 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2b619329 1947 server.vm_pages = memtoll(argv[1], NULL);
92f8e882 1948 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1949 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 1950 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2b619329 1951 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
cbba7dd7 1952 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2b619329 1953 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
ed9b544e 1954 } else {
1955 err = "Bad directive or wrong number of arguments"; goto loaderr;
1956 }
1957 for (j = 0; j < argc; j++)
1958 sdsfree(argv[j]);
1959 zfree(argv);
1960 sdsfree(line);
1961 }
c9a111ac 1962 if (fp != stdin) fclose(fp);
ed9b544e 1963 return;
1964
1965loaderr:
1966 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1967 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1968 fprintf(stderr, ">>> '%s'\n", line);
1969 fprintf(stderr, "%s\n", err);
1970 exit(1);
1971}
1972
1973static void freeClientArgv(redisClient *c) {
1974 int j;
1975
1976 for (j = 0; j < c->argc; j++)
1977 decrRefCount(c->argv[j]);
e8a74421 1978 for (j = 0; j < c->mbargc; j++)
1979 decrRefCount(c->mbargv[j]);
ed9b544e 1980 c->argc = 0;
e8a74421 1981 c->mbargc = 0;
ed9b544e 1982}
1983
1984static void freeClient(redisClient *c) {
1985 listNode *ln;
1986
4409877e 1987 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 1988 * call, we have to set querybuf to NULL *before* to call
1989 * unblockClientWaitingData() to avoid processInputBuffer() will get
1990 * called. Also it is important to remove the file events after
1991 * this, because this call adds the READABLE event. */
4409877e 1992 sdsfree(c->querybuf);
1993 c->querybuf = NULL;
1994 if (c->flags & REDIS_BLOCKED)
b0d8747d 1995 unblockClientWaitingData(c);
4409877e 1996
ffc6b7f8 1997 /* Unsubscribe from all the pubsub channels */
1998 pubsubUnsubscribeAllChannels(c,0);
1999 pubsubUnsubscribeAllPatterns(c,0);
2000 dictRelease(c->pubsub_channels);
2001 listRelease(c->pubsub_patterns);
befec3cd 2002 /* Obvious cleanup */
ed9b544e 2003 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2004 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 2005 listRelease(c->reply);
2006 freeClientArgv(c);
2007 close(c->fd);
92f8e882 2008 /* Remove from the list of clients */
ed9b544e 2009 ln = listSearchKey(server.clients,c);
dfc5e96c 2010 redisAssert(ln != NULL);
ed9b544e 2011 listDelNode(server.clients,ln);
d5d55fc3 2012 /* Remove from the list of clients waiting for swapped keys */
2013 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2014 ln = listSearchKey(server.io_ready_clients,c);
2015 if (ln) {
2016 listDelNode(server.io_ready_clients,ln);
2017 server.vm_blocked_clients--;
2018 }
2019 }
2020 while (server.vm_enabled && listLength(c->io_keys)) {
2021 ln = listFirst(c->io_keys);
2022 dontWaitForSwappedKey(c,ln->value);
92f8e882 2023 }
b3e3d0d7 2024 listRelease(c->io_keys);
befec3cd 2025 /* Master/slave cleanup */
ed9b544e 2026 if (c->flags & REDIS_SLAVE) {
6208b3a7 2027 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2028 close(c->repldbfd);
87eca727 2029 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2030 ln = listSearchKey(l,c);
dfc5e96c 2031 redisAssert(ln != NULL);
87eca727 2032 listDelNode(l,ln);
ed9b544e 2033 }
2034 if (c->flags & REDIS_MASTER) {
2035 server.master = NULL;
2036 server.replstate = REDIS_REPL_CONNECT;
2037 }
befec3cd 2038 /* Release memory */
93ea3759 2039 zfree(c->argv);
e8a74421 2040 zfree(c->mbargv);
6e469882 2041 freeClientMultiState(c);
ed9b544e 2042 zfree(c);
2043}
2044
cc30e368 2045#define GLUEREPLY_UP_TO (1024)
ed9b544e 2046static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 2047 int copylen = 0;
2048 char buf[GLUEREPLY_UP_TO];
6208b3a7 2049 listNode *ln;
c7df85a4 2050 listIter li;
ed9b544e 2051 robj *o;
2052
c7df85a4 2053 listRewind(c->reply,&li);
2054 while((ln = listNext(&li))) {
c28b42ac 2055 int objlen;
2056
ed9b544e 2057 o = ln->value;
c28b42ac 2058 objlen = sdslen(o->ptr);
2059 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2060 memcpy(buf+copylen,o->ptr,objlen);
2061 copylen += objlen;
ed9b544e 2062 listDelNode(c->reply,ln);
c28b42ac 2063 } else {
2064 if (copylen == 0) return;
2065 break;
ed9b544e 2066 }
ed9b544e 2067 }
c28b42ac 2068 /* Now the output buffer is empty, add the new single element */
2069 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2070 listAddNodeHead(c->reply,o);
ed9b544e 2071}
2072
2073static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2074 redisClient *c = privdata;
2075 int nwritten = 0, totwritten = 0, objlen;
2076 robj *o;
2077 REDIS_NOTUSED(el);
2078 REDIS_NOTUSED(mask);
2079
2895e862 2080 /* Use writev() if we have enough buffers to send */
7ea870c0 2081 if (!server.glueoutputbuf &&
e0a62c7f 2082 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 2083 !(c->flags & REDIS_MASTER))
2895e862 2084 {
2085 sendReplyToClientWritev(el, fd, privdata, mask);
2086 return;
2087 }
2895e862 2088
ed9b544e 2089 while(listLength(c->reply)) {
c28b42ac 2090 if (server.glueoutputbuf && listLength(c->reply) > 1)
2091 glueReplyBuffersIfNeeded(c);
2092
ed9b544e 2093 o = listNodeValue(listFirst(c->reply));
2094 objlen = sdslen(o->ptr);
2095
2096 if (objlen == 0) {
2097 listDelNode(c->reply,listFirst(c->reply));
2098 continue;
2099 }
2100
2101 if (c->flags & REDIS_MASTER) {
6f376729 2102 /* Don't reply to a master */
ed9b544e 2103 nwritten = objlen - c->sentlen;
2104 } else {
a4d1ba9a 2105 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2106 if (nwritten <= 0) break;
2107 }
2108 c->sentlen += nwritten;
2109 totwritten += nwritten;
2110 /* If we fully sent the object on head go to the next one */
2111 if (c->sentlen == objlen) {
2112 listDelNode(c->reply,listFirst(c->reply));
2113 c->sentlen = 0;
2114 }
6f376729 2115 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2116 * bytes, in a single threaded server it's a good idea to serve
6f376729 2117 * other clients as well, even if a very large request comes from
2118 * super fast link that is always able to accept data (in real world
12f9d551 2119 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2120 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2121 }
2122 if (nwritten == -1) {
2123 if (errno == EAGAIN) {
2124 nwritten = 0;
2125 } else {
f870935d 2126 redisLog(REDIS_VERBOSE,
ed9b544e 2127 "Error writing to client: %s", strerror(errno));
2128 freeClient(c);
2129 return;
2130 }
2131 }
2132 if (totwritten > 0) c->lastinteraction = time(NULL);
2133 if (listLength(c->reply) == 0) {
2134 c->sentlen = 0;
2135 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2136 }
2137}
2138
2895e862 2139static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2140{
2141 redisClient *c = privdata;
2142 int nwritten = 0, totwritten = 0, objlen, willwrite;
2143 robj *o;
2144 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2145 int offset, ion = 0;
2146 REDIS_NOTUSED(el);
2147 REDIS_NOTUSED(mask);
2148
2149 listNode *node;
2150 while (listLength(c->reply)) {
2151 offset = c->sentlen;
2152 ion = 0;
2153 willwrite = 0;
2154
2155 /* fill-in the iov[] array */
2156 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2157 o = listNodeValue(node);
2158 objlen = sdslen(o->ptr);
2159
e0a62c7f 2160 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2161 break;
2162
2163 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2164 break; /* no more iovecs */
2165
2166 iov[ion].iov_base = ((char*)o->ptr) + offset;
2167 iov[ion].iov_len = objlen - offset;
2168 willwrite += objlen - offset;
2169 offset = 0; /* just for the first item */
2170 ion++;
2171 }
2172
2173 if(willwrite == 0)
2174 break;
2175
2176 /* write all collected blocks at once */
2177 if((nwritten = writev(fd, iov, ion)) < 0) {
2178 if (errno != EAGAIN) {
f870935d 2179 redisLog(REDIS_VERBOSE,
2895e862 2180 "Error writing to client: %s", strerror(errno));
2181 freeClient(c);
2182 return;
2183 }
2184 break;
2185 }
2186
2187 totwritten += nwritten;
2188 offset = c->sentlen;
2189
2190 /* remove written robjs from c->reply */
2191 while (nwritten && listLength(c->reply)) {
2192 o = listNodeValue(listFirst(c->reply));
2193 objlen = sdslen(o->ptr);
2194
2195 if(nwritten >= objlen - offset) {
2196 listDelNode(c->reply, listFirst(c->reply));
2197 nwritten -= objlen - offset;
2198 c->sentlen = 0;
2199 } else {
2200 /* partial write */
2201 c->sentlen += nwritten;
2202 break;
2203 }
2204 offset = 0;
2205 }
2206 }
2207
e0a62c7f 2208 if (totwritten > 0)
2895e862 2209 c->lastinteraction = time(NULL);
2210
2211 if (listLength(c->reply) == 0) {
2212 c->sentlen = 0;
2213 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2214 }
2215}
2216
ed9b544e 2217static struct redisCommand *lookupCommand(char *name) {
2218 int j = 0;
2219 while(cmdTable[j].name != NULL) {
bb0b03a3 2220 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 2221 j++;
2222 }
2223 return NULL;
2224}
2225
2226/* resetClient prepare the client to process the next command */
2227static void resetClient(redisClient *c) {
2228 freeClientArgv(c);
2229 c->bulklen = -1;
e8a74421 2230 c->multibulk = 0;
ed9b544e 2231}
2232
6e469882 2233/* Call() is the core of Redis execution of a command */
2234static void call(redisClient *c, struct redisCommand *cmd) {
2235 long long dirty;
2236
2237 dirty = server.dirty;
2238 cmd->proc(c);
4005fef1 2239 dirty = server.dirty-dirty;
2240
2241 if (server.appendonly && dirty)
6e469882 2242 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2243 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2244 listLength(server.slaves))
248ea310 2245 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2246 if (listLength(server.monitors))
dd142b9c 2247 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2248 server.stat_numcommands++;
2249}
2250
ed9b544e 2251/* If this function gets called we already read a whole
2252 * command, arguments are in the client argv/argc fields.
2253 * processCommand() execute the command or prepare the
2254 * server for a bulk read from the client.
2255 *
2256 * If 1 is returned the client is still alive and valid and
2257 * other operations can be performed by the caller. Otherwise
2258 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2259static int processCommand(redisClient *c) {
2260 struct redisCommand *cmd;
ed9b544e 2261
3fd78bcd 2262 /* Free some memory if needed (maxmemory setting) */
2263 if (server.maxmemory) freeMemoryIfNeeded();
2264
e8a74421 2265 /* Handle the multi bulk command type. This is an alternative protocol
2266 * supported by Redis in order to receive commands that are composed of
2267 * multiple binary-safe "bulk" arguments. The latency of processing is
2268 * a bit higher but this allows things like multi-sets, so if this
2269 * protocol is used only for MSET and similar commands this is a big win. */
2270 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2271 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2272 if (c->multibulk <= 0) {
2273 resetClient(c);
2274 return 1;
2275 } else {
2276 decrRefCount(c->argv[c->argc-1]);
2277 c->argc--;
2278 return 1;
2279 }
2280 } else if (c->multibulk) {
2281 if (c->bulklen == -1) {
2282 if (((char*)c->argv[0]->ptr)[0] != '$') {
2283 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2284 resetClient(c);
2285 return 1;
2286 } else {
2287 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2288 decrRefCount(c->argv[0]);
2289 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2290 c->argc--;
2291 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2292 resetClient(c);
2293 return 1;
2294 }
2295 c->argc--;
2296 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2297 return 1;
2298 }
2299 } else {
2300 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2301 c->mbargv[c->mbargc] = c->argv[0];
2302 c->mbargc++;
2303 c->argc--;
2304 c->multibulk--;
2305 if (c->multibulk == 0) {
2306 robj **auxargv;
2307 int auxargc;
2308
2309 /* Here we need to swap the multi-bulk argc/argv with the
2310 * normal argc/argv of the client structure. */
2311 auxargv = c->argv;
2312 c->argv = c->mbargv;
2313 c->mbargv = auxargv;
2314
2315 auxargc = c->argc;
2316 c->argc = c->mbargc;
2317 c->mbargc = auxargc;
2318
2319 /* We need to set bulklen to something different than -1
2320 * in order for the code below to process the command without
2321 * to try to read the last argument of a bulk command as
2322 * a special argument. */
2323 c->bulklen = 0;
2324 /* continue below and process the command */
2325 } else {
2326 c->bulklen = -1;
2327 return 1;
2328 }
2329 }
2330 }
2331 /* -- end of multi bulk commands processing -- */
2332
ed9b544e 2333 /* The QUIT command is handled as a special case. Normal command
2334 * procs are unable to close the client connection safely */
bb0b03a3 2335 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2336 freeClient(c);
2337 return 0;
2338 }
d5d55fc3 2339
2340 /* Now lookup the command and check ASAP about trivial error conditions
2341 * such wrong arity, bad command name and so forth. */
ed9b544e 2342 cmd = lookupCommand(c->argv[0]->ptr);
2343 if (!cmd) {
2c14807b 2344 addReplySds(c,
2345 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2346 (char*)c->argv[0]->ptr));
ed9b544e 2347 resetClient(c);
2348 return 1;
2349 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2350 (c->argc < -cmd->arity)) {
454d4e43 2351 addReplySds(c,
2352 sdscatprintf(sdsempty(),
2353 "-ERR wrong number of arguments for '%s' command\r\n",
2354 cmd->name));
ed9b544e 2355 resetClient(c);
2356 return 1;
2357 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2358 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2359 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2360
2361 decrRefCount(c->argv[c->argc-1]);
2362 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2363 c->argc--;
2364 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2365 resetClient(c);
2366 return 1;
2367 }
2368 c->argc--;
2369 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2370 /* It is possible that the bulk read is already in the
8d0490e7 2371 * buffer. Check this condition and handle it accordingly.
2372 * This is just a fast path, alternative to call processInputBuffer().
2373 * It's a good idea since the code is small and this condition
2374 * happens most of the times. */
ed9b544e 2375 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2376 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2377 c->argc++;
2378 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2379 } else {
d5d55fc3 2380 /* Otherwise return... there is to read the last argument
2381 * from the socket. */
ed9b544e 2382 return 1;
2383 }
2384 }
942a3961 2385 /* Let's try to encode the bulk object to save space. */
2386 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2387 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2388
e63943a4 2389 /* Check if the user is authenticated */
2390 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2391 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2392 resetClient(c);
2393 return 1;
2394 }
2395
b61a28fe 2396 /* Handle the maxmemory directive */
2397 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2398 zmalloc_used_memory() > server.maxmemory)
2399 {
2400 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2401 resetClient(c);
2402 return 1;
2403 }
2404
d6cc8867 2405 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
e6cca5db 2406 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2407 &&
ffc6b7f8 2408 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2409 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2410 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2411 resetClient(c);
2412 return 1;
2413 }
2414
ed9b544e 2415 /* Exec the command */
18b6cb76 2416 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
6e469882 2417 queueMultiCommand(c,cmd);
2418 addReply(c,shared.queued);
2419 } else {
d5d55fc3 2420 if (server.vm_enabled && server.vm_max_threads > 0 &&
2421 blockClientOnSwappedKeys(cmd,c)) return 1;
6e469882 2422 call(c,cmd);
2423 }
ed9b544e 2424
2425 /* Prepare the client for the next command */
ed9b544e 2426 resetClient(c);
2427 return 1;
2428}
2429
248ea310 2430static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2431 listNode *ln;
c7df85a4 2432 listIter li;
ed9b544e 2433 int outc = 0, j;
93ea3759 2434 robj **outv;
248ea310 2435 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2436 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2437 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2438 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2439 robj *lenobj;
93ea3759 2440
2441 if (argc <= REDIS_STATIC_ARGS) {
2442 outv = static_outv;
2443 } else {
248ea310 2444 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2445 }
248ea310 2446
2447 lenobj = createObject(REDIS_STRING,
2448 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2449 lenobj->refcount = 0;
2450 outv[outc++] = lenobj;
ed9b544e 2451 for (j = 0; j < argc; j++) {
248ea310 2452 lenobj = createObject(REDIS_STRING,
2453 sdscatprintf(sdsempty(),"$%lu\r\n",
2454 (unsigned long) stringObjectLen(argv[j])));
2455 lenobj->refcount = 0;
2456 outv[outc++] = lenobj;
ed9b544e 2457 outv[outc++] = argv[j];
248ea310 2458 outv[outc++] = shared.crlf;
ed9b544e 2459 }
ed9b544e 2460
40d224a9 2461 /* Increment all the refcounts at start and decrement at end in order to
2462 * be sure to free objects if there is no slave in a replication state
2463 * able to be feed with commands */
2464 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2465 listRewind(slaves,&li);
2466 while((ln = listNext(&li))) {
ed9b544e 2467 redisClient *slave = ln->value;
40d224a9 2468
2469 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2470 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2471
2472 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2473 if (slave->slaveseldb != dictid) {
2474 robj *selectcmd;
2475
2476 switch(dictid) {
2477 case 0: selectcmd = shared.select0; break;
2478 case 1: selectcmd = shared.select1; break;
2479 case 2: selectcmd = shared.select2; break;
2480 case 3: selectcmd = shared.select3; break;
2481 case 4: selectcmd = shared.select4; break;
2482 case 5: selectcmd = shared.select5; break;
2483 case 6: selectcmd = shared.select6; break;
2484 case 7: selectcmd = shared.select7; break;
2485 case 8: selectcmd = shared.select8; break;
2486 case 9: selectcmd = shared.select9; break;
2487 default:
2488 selectcmd = createObject(REDIS_STRING,
2489 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2490 selectcmd->refcount = 0;
2491 break;
2492 }
2493 addReply(slave,selectcmd);
2494 slave->slaveseldb = dictid;
2495 }
2496 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2497 }
40d224a9 2498 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2499 if (outv != static_outv) zfree(outv);
ed9b544e 2500}
2501
dd142b9c 2502static sds sdscatrepr(sds s, char *p, size_t len) {
2503 s = sdscatlen(s,"\"",1);
2504 while(len--) {
2505 switch(*p) {
2506 case '\\':
2507 case '"':
2508 s = sdscatprintf(s,"\\%c",*p);
2509 break;
2510 case '\n': s = sdscatlen(s,"\\n",1); break;
2511 case '\r': s = sdscatlen(s,"\\r",1); break;
2512 case '\t': s = sdscatlen(s,"\\t",1); break;
2513 case '\a': s = sdscatlen(s,"\\a",1); break;
2514 case '\b': s = sdscatlen(s,"\\b",1); break;
2515 default:
2516 if (isprint(*p))
2517 s = sdscatprintf(s,"%c",*p);
2518 else
2519 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2520 break;
2521 }
2522 p++;
2523 }
2524 return sdscatlen(s,"\"",1);
2525}
2526
2527static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2528 listNode *ln;
2529 listIter li;
2530 int j;
2531 sds cmdrepr = sdsnew("+");
2532 robj *cmdobj;
2533 struct timeval tv;
2534
2535 gettimeofday(&tv,NULL);
2536 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2537 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2538
2539 for (j = 0; j < argc; j++) {
2540 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2541 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2542 } else {
2543 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2544 sdslen(argv[j]->ptr));
2545 }
2546 if (j != argc-1)
2547 cmdrepr = sdscatlen(cmdrepr," ",1);
2548 }
2549 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2550 cmdobj = createObject(REDIS_STRING,cmdrepr);
2551
2552 listRewind(monitors,&li);
2553 while((ln = listNext(&li))) {
2554 redisClient *monitor = ln->value;
2555 addReply(monitor,cmdobj);
2556 }
2557 decrRefCount(cmdobj);
2558}
2559
638e42ac 2560static void processInputBuffer(redisClient *c) {
ed9b544e 2561again:
4409877e 2562 /* Before to process the input buffer, make sure the client is not
2563 * waitig for a blocking operation such as BLPOP. Note that the first
2564 * iteration the client is never blocked, otherwise the processInputBuffer
2565 * would not be called at all, but after the execution of the first commands
2566 * in the input buffer the client may be blocked, and the "goto again"
2567 * will try to reiterate. The following line will make it return asap. */
92f8e882 2568 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2569 if (c->bulklen == -1) {
2570 /* Read the first line of the query */
2571 char *p = strchr(c->querybuf,'\n');
2572 size_t querylen;
644fafa3 2573
ed9b544e 2574 if (p) {
2575 sds query, *argv;
2576 int argc, j;
e0a62c7f 2577
ed9b544e 2578 query = c->querybuf;
2579 c->querybuf = sdsempty();
2580 querylen = 1+(p-(query));
2581 if (sdslen(query) > querylen) {
2582 /* leave data after the first line of the query in the buffer */
2583 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2584 }
2585 *p = '\0'; /* remove "\n" */
2586 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2587 sdsupdatelen(query);
2588
2589 /* Now we can split the query in arguments */
ed9b544e 2590 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2591 sdsfree(query);
2592
2593 if (c->argv) zfree(c->argv);
2594 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2595
2596 for (j = 0; j < argc; j++) {
ed9b544e 2597 if (sdslen(argv[j])) {
2598 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2599 c->argc++;
2600 } else {
2601 sdsfree(argv[j]);
2602 }
2603 }
2604 zfree(argv);
7c49733c 2605 if (c->argc) {
2606 /* Execute the command. If the client is still valid
2607 * after processCommand() return and there is something
2608 * on the query buffer try to process the next command. */
2609 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2610 } else {
2611 /* Nothing to process, argc == 0. Just process the query
2612 * buffer if it's not empty or return to the caller */
2613 if (sdslen(c->querybuf)) goto again;
2614 }
ed9b544e 2615 return;
644fafa3 2616 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2617 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2618 freeClient(c);
2619 return;
2620 }
2621 } else {
2622 /* Bulk read handling. Note that if we are at this point
2623 the client already sent a command terminated with a newline,
2624 we are reading the bulk data that is actually the last
2625 argument of the command. */
2626 int qbl = sdslen(c->querybuf);
2627
2628 if (c->bulklen <= qbl) {
2629 /* Copy everything but the final CRLF as final argument */
2630 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2631 c->argc++;
2632 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2633 /* Process the command. If the client is still valid after
2634 * the processing and there is more data in the buffer
2635 * try to parse it. */
2636 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2637 return;
2638 }
2639 }
2640}
2641
638e42ac 2642static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2643 redisClient *c = (redisClient*) privdata;
2644 char buf[REDIS_IOBUF_LEN];
2645 int nread;
2646 REDIS_NOTUSED(el);
2647 REDIS_NOTUSED(mask);
2648
2649 nread = read(fd, buf, REDIS_IOBUF_LEN);
2650 if (nread == -1) {
2651 if (errno == EAGAIN) {
2652 nread = 0;
2653 } else {
f870935d 2654 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2655 freeClient(c);
2656 return;
2657 }
2658 } else if (nread == 0) {
f870935d 2659 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2660 freeClient(c);
2661 return;
2662 }
2663 if (nread) {
2664 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2665 c->lastinteraction = time(NULL);
2666 } else {
2667 return;
2668 }
168ac5c6 2669 processInputBuffer(c);
638e42ac 2670}
2671
ed9b544e 2672static int selectDb(redisClient *c, int id) {
2673 if (id < 0 || id >= server.dbnum)
2674 return REDIS_ERR;
3305306f 2675 c->db = &server.db[id];
ed9b544e 2676 return REDIS_OK;
2677}
2678
40d224a9 2679static void *dupClientReplyValue(void *o) {
2680 incrRefCount((robj*)o);
12d090d2 2681 return o;
40d224a9 2682}
2683
/* Match method for lists of string objects: non zero when the two objects
 * are equal according to equalStringObjects(). */
static int listMatchObjects(void *a, void *b) {
    int same = equalStringObjects(a,b);

    return same;
}
2687
ed9b544e 2688static redisClient *createClient(int fd) {
2689 redisClient *c = zmalloc(sizeof(*c));
2690
2691 anetNonBlock(NULL,fd);
2692 anetTcpNoDelay(NULL,fd);
2693 if (!c) return NULL;
2694 selectDb(c,0);
2695 c->fd = fd;
2696 c->querybuf = sdsempty();
2697 c->argc = 0;
93ea3759 2698 c->argv = NULL;
ed9b544e 2699 c->bulklen = -1;
e8a74421 2700 c->multibulk = 0;
2701 c->mbargc = 0;
2702 c->mbargv = NULL;
ed9b544e 2703 c->sentlen = 0;
2704 c->flags = 0;
2705 c->lastinteraction = time(NULL);
abcb223e 2706 c->authenticated = 0;
40d224a9 2707 c->replstate = REDIS_REPL_NONE;
6b47e12e 2708 c->reply = listCreate();
ed9b544e 2709 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2710 listSetDupMethod(c->reply,dupClientReplyValue);
92f8e882 2711 c->blockingkeys = NULL;
2712 c->blockingkeysnum = 0;
2713 c->io_keys = listCreate();
2714 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2715 c->pubsub_channels = dictCreate(&setDictType,NULL);
2716 c->pubsub_patterns = listCreate();
2717 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2718 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2719 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2720 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2721 freeClient(c);
2722 return NULL;
2723 }
6b47e12e 2724 listAddNodeTail(server.clients,c);
6e469882 2725 initClientMultiState(c);
ed9b544e 2726 return c;
2727}
2728
2729static void addReply(redisClient *c, robj *obj) {
2730 if (listLength(c->reply) == 0 &&
6208b3a7 2731 (c->replstate == REDIS_REPL_NONE ||
2732 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2733 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2734 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2735
2736 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2737 obj = dupStringObject(obj);
2738 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2739 }
9d65a1bb 2740 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2741}
2742
2743static void addReplySds(redisClient *c, sds s) {
2744 robj *o = createObject(REDIS_STRING,s);
2745 addReply(c,o);
2746 decrRefCount(o);
2747}
2748
e2665397 2749static void addReplyDouble(redisClient *c, double d) {
2750 char buf[128];
2751
2752 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2753 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2754 (unsigned long) strlen(buf),buf));
e2665397 2755}
2756
f44dd428 2757static void addReplyLong(redisClient *c, long l) {
2758 char buf[128];
2759 size_t len;
2760
dd88747b 2761 if (l == 0) {
2762 addReply(c,shared.czero);
2763 return;
2764 } else if (l == 1) {
2765 addReply(c,shared.cone);
2766 return;
2767 }
f44dd428 2768 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2769 addReplySds(c,sdsnewlen(buf,len));
2770}
2771
aa7c2934
PN
2772static void addReplyLongLong(redisClient *c, long long ll) {
2773 char buf[128];
2774 size_t len;
2775
2776 if (ll == 0) {
2777 addReply(c,shared.czero);
2778 return;
2779 } else if (ll == 1) {
2780 addReply(c,shared.cone);
2781 return;
2782 }
2783 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2784 addReplySds(c,sdsnewlen(buf,len));
2785}
2786
92b27fe9 2787static void addReplyUlong(redisClient *c, unsigned long ul) {
2788 char buf[128];
2789 size_t len;
2790
dd88747b 2791 if (ul == 0) {
2792 addReply(c,shared.czero);
2793 return;
2794 } else if (ul == 1) {
2795 addReply(c,shared.cone);
2796 return;
2797 }
92b27fe9 2798 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2799 addReplySds(c,sdsnewlen(buf,len));
2800}
2801
942a3961 2802static void addReplyBulkLen(redisClient *c, robj *obj) {
2803 size_t len;
2804
2805 if (obj->encoding == REDIS_ENCODING_RAW) {
2806 len = sdslen(obj->ptr);
2807 } else {
2808 long n = (long)obj->ptr;
2809
e054afda 2810 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2811 len = 1;
2812 if (n < 0) {
2813 len++;
2814 n = -n;
2815 }
2816 while((n = n/10) != 0) {
2817 len++;
2818 }
2819 }
83c6a618 2820 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
942a3961 2821}
2822
dd88747b 2823static void addReplyBulk(redisClient *c, robj *obj) {
2824 addReplyBulkLen(c,obj);
2825 addReply(c,obj);
2826 addReply(c,shared.crlf);
2827}
2828
500ece7c 2829/* In the CONFIG command we need to add vanilla C string as bulk replies */
2830static void addReplyBulkCString(redisClient *c, char *s) {
2831 if (s == NULL) {
2832 addReply(c,shared.nullbulk);
2833 } else {
2834 robj *o = createStringObject(s,strlen(s));
2835 addReplyBulk(c,o);
2836 decrRefCount(o);
2837 }
2838}
2839
ed9b544e 2840static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2841 int cport, cfd;
2842 char cip[128];
285add55 2843 redisClient *c;
ed9b544e 2844 REDIS_NOTUSED(el);
2845 REDIS_NOTUSED(mask);
2846 REDIS_NOTUSED(privdata);
2847
2848 cfd = anetAccept(server.neterr, fd, cip, &cport);
2849 if (cfd == AE_ERR) {
f870935d 2850 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2851 return;
2852 }
f870935d 2853 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2854 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2855 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2856 close(cfd); /* May be already closed, just ingore errors */
2857 return;
2858 }
285add55 2859 /* If maxclient directive is set and this is one client more... close the
2860 * connection. Note that we create the client instead to check before
2861 * for this condition, since now the socket is already set in nonblocking
2862 * mode and we can send an error for free using the Kernel I/O */
2863 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2864 char *err = "-ERR max number of clients reached\r\n";
2865
2866 /* That's a best effort error message, don't check write errors */
fee803ba 2867 if (write(c->fd,err,strlen(err)) == -1) {
2868 /* Nothing to do, Just to avoid the warning... */
2869 }
285add55 2870 freeClient(c);
2871 return;
2872 }
ed9b544e 2873 server.stat_numconnections++;
2874}
2875
2876/* ======================= Redis objects implementation ===================== */
2877
2878static robj *createObject(int type, void *ptr) {
2879 robj *o;
2880
a5819310 2881 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2882 if (listLength(server.objfreelist)) {
2883 listNode *head = listFirst(server.objfreelist);
2884 o = listNodeValue(head);
2885 listDelNode(server.objfreelist,head);
a5819310 2886 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2887 } else {
75680a3c 2888 if (server.vm_enabled) {
a5819310 2889 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2890 o = zmalloc(sizeof(*o));
2891 } else {
2892 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2893 }
ed9b544e 2894 }
ed9b544e 2895 o->type = type;
942a3961 2896 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2897 o->ptr = ptr;
2898 o->refcount = 1;
3a66edc7 2899 if (server.vm_enabled) {
1064ef87 2900 /* Note that this code may run in the context of an I/O thread
2901 * and accessing to server.unixtime in theory is an error
2902 * (no locks). But in practice this is safe, and even if we read
2903 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2904 o->vm.atime = server.unixtime;
2905 o->storage = REDIS_VM_MEMORY;
2906 }
ed9b544e 2907 return o;
2908}
2909
2910static robj *createStringObject(char *ptr, size_t len) {
2911 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2912}
2913
3f973463
PN
2914static robj *createStringObjectFromLongLong(long long value) {
2915 robj *o;
2916 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2917 incrRefCount(shared.integers[value]);
2918 o = shared.integers[value];
2919 } else {
2920 o = createObject(REDIS_STRING, NULL);
2921 if (value >= LONG_MIN && value <= LONG_MAX) {
2922 o->encoding = REDIS_ENCODING_INT;
2923 o->ptr = (void*)((long)value);
2924 } else {
ee14da56 2925 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3f973463
PN
2926 }
2927 }
2928 return o;
2929}
2930
4ef8de8a 2931static robj *dupStringObject(robj *o) {
b9bc0eef 2932 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2933 return createStringObject(o->ptr,sdslen(o->ptr));
2934}
2935
ed9b544e 2936static robj *createListObject(void) {
2937 list *l = listCreate();
2938
ed9b544e 2939 listSetFreeMethod(l,decrRefCount);
2940 return createObject(REDIS_LIST,l);
2941}
2942
2943static robj *createSetObject(void) {
2944 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2945 return createObject(REDIS_SET,d);
2946}
2947
5234952b 2948static robj *createHashObject(void) {
2949 /* All the Hashes start as zipmaps. Will be automatically converted
2950 * into hash tables if there are enough elements or big elements
2951 * inside. */
2952 unsigned char *zm = zipmapNew();
2953 robj *o = createObject(REDIS_HASH,zm);
2954 o->encoding = REDIS_ENCODING_ZIPMAP;
2955 return o;
2956}
2957
1812e024 2958static robj *createZsetObject(void) {
6b47e12e 2959 zset *zs = zmalloc(sizeof(*zs));
2960
2961 zs->dict = dictCreate(&zsetDictType,NULL);
2962 zs->zsl = zslCreate();
2963 return createObject(REDIS_ZSET,zs);
1812e024 2964}
2965
ed9b544e 2966static void freeStringObject(robj *o) {
942a3961 2967 if (o->encoding == REDIS_ENCODING_RAW) {
2968 sdsfree(o->ptr);
2969 }
ed9b544e 2970}
2971
2972static void freeListObject(robj *o) {
2973 listRelease((list*) o->ptr);
2974}
2975
2976static void freeSetObject(robj *o) {
2977 dictRelease((dict*) o->ptr);
2978}
2979
fd8ccf44 2980static void freeZsetObject(robj *o) {
2981 zset *zs = o->ptr;
2982
2983 dictRelease(zs->dict);
2984 zslFree(zs->zsl);
2985 zfree(zs);
2986}
2987
ed9b544e 2988static void freeHashObject(robj *o) {
cbba7dd7 2989 switch (o->encoding) {
2990 case REDIS_ENCODING_HT:
2991 dictRelease((dict*) o->ptr);
2992 break;
2993 case REDIS_ENCODING_ZIPMAP:
2994 zfree(o->ptr);
2995 break;
2996 default:
f83c6cb5 2997 redisPanic("Unknown hash encoding type");
cbba7dd7 2998 break;
2999 }
ed9b544e 3000}
3001
3002static void incrRefCount(robj *o) {
3003 o->refcount++;
3004}
3005
3006static void decrRefCount(void *obj) {
3007 robj *o = obj;
94754ccc 3008
c651fd9e 3009 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
970e10bb 3010 /* Object is a key of a swapped out value, or in the process of being
3011 * loaded. */
996cb5f7 3012 if (server.vm_enabled &&
3013 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3014 {
996cb5f7 3015 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 3016 redisAssert(o->type == REDIS_STRING);
a35ddf12 3017 freeStringObject(o);
3018 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 3019 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 3020 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3021 !listAddNodeHead(server.objfreelist,o))
3022 zfree(o);
a5819310 3023 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 3024 server.vm_stats_swapped_objects--;
a35ddf12 3025 return;
3026 }
996cb5f7 3027 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 3028 if (--(o->refcount) == 0) {
996cb5f7 3029 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3030 vmCancelThreadedIOJob(obj);
ed9b544e 3031 switch(o->type) {
3032 case REDIS_STRING: freeStringObject(o); break;
3033 case REDIS_LIST: freeListObject(o); break;
3034 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 3035 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 3036 case REDIS_HASH: freeHashObject(o); break;
f83c6cb5 3037 default: redisPanic("Unknown object type"); break;
ed9b544e 3038 }
a5819310 3039 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 3040 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3041 !listAddNodeHead(server.objfreelist,o))
3042 zfree(o);
a5819310 3043 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3044 }
3045}
3046
942a3961 3047static robj *lookupKey(redisDb *db, robj *key) {
3048 dictEntry *de = dictFind(db->dict,key);
3a66edc7 3049 if (de) {
55cf8433 3050 robj *key = dictGetEntryKey(de);
3051 robj *val = dictGetEntryVal(de);
3a66edc7 3052
55cf8433 3053 if (server.vm_enabled) {
996cb5f7 3054 if (key->storage == REDIS_VM_MEMORY ||
3055 key->storage == REDIS_VM_SWAPPING)
3056 {
3057 /* If we were swapping the object out, stop it, this key
3058 * was requested. */
3059 if (key->storage == REDIS_VM_SWAPPING)
3060 vmCancelThreadedIOJob(key);
55cf8433 3061 /* Update the access time of the key for the aging algorithm. */
3062 key->vm.atime = server.unixtime;
3063 } else {
d5d55fc3 3064 int notify = (key->storage == REDIS_VM_LOADING);
3065
55cf8433 3066 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 3067 redisAssert(val == NULL);
55cf8433 3068 val = vmLoadObject(key);
3069 dictGetEntryVal(de) = val;
d5d55fc3 3070
3071 /* Clients blocked by the VM subsystem may be waiting for
3072 * this key... */
3073 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 3074 }
3075 }
3076 return val;
3a66edc7 3077 } else {
3078 return NULL;
3079 }
942a3961 3080}
3081
3082static robj *lookupKeyRead(redisDb *db, robj *key) {
3083 expireIfNeeded(db,key);
3084 return lookupKey(db,key);
3085}
3086
3087static robj *lookupKeyWrite(redisDb *db, robj *key) {
3088 deleteIfVolatile(db,key);
3089 return lookupKey(db,key);
3090}
3091
92b27fe9 3092static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3093 robj *o = lookupKeyRead(c->db, key);
3094 if (!o) addReply(c,reply);
3095 return o;
3096}
3097
3098static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3099 robj *o = lookupKeyWrite(c->db, key);
3100 if (!o) addReply(c,reply);
3101 return o;
3102}
3103
3104static int checkType(redisClient *c, robj *o, int type) {
3105 if (o->type != type) {
3106 addReply(c,shared.wrongtypeerr);
3107 return 1;
3108 }
3109 return 0;
3110}
3111
942a3961 3112static int deleteKey(redisDb *db, robj *key) {
3113 int retval;
3114
3115 /* We need to protect key from destruction: after the first dictDelete()
3116 * it may happen that 'key' is no longer valid if we don't increment
3117 * it's count. This may happen when we get the object reference directly
3118 * from the hash table with dictRandomKey() or dict iterators */
3119 incrRefCount(key);
3120 if (dictSize(db->expires)) dictDelete(db->expires,key);
3121 retval = dictDelete(db->dict,key);
3122 decrRefCount(key);
3123
3124 return retval == DICT_OK;
3125}
3126
724a51b1 3127/* Check if the nul-terminated string 's' can be represented by a long
3128 * (that is, is a number that fits into long without any other space or
3129 * character before or after the digits).
3130 *
3131 * If so, the function returns REDIS_OK and *longval is set to the value
3132 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 3133static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 3134 char buf[32], *endptr;
3135 long value;
3136 int slen;
e0a62c7f 3137
724a51b1 3138 value = strtol(s, &endptr, 10);
3139 if (endptr[0] != '\0') return REDIS_ERR;
ee14da56 3140 slen = ll2string(buf,32,value);
724a51b1 3141
3142 /* If the number converted back into a string is not identical
3143 * then it's not possible to encode the string as integer */
f69f2cba 3144 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 3145 if (longval) *longval = value;
3146 return REDIS_OK;
3147}
3148
942a3961 3149/* Try to encode a string object in order to save space */
05df7621 3150static robj *tryObjectEncoding(robj *o) {
942a3961 3151 long value;
942a3961 3152 sds s = o->ptr;
3305306f 3153
942a3961 3154 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 3155 return o; /* Already encoded */
3305306f 3156
05df7621 3157 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 3158 * everywhere in the "object space" of Redis. Encoded objects can only
3159 * appear as "values" (and not, for instance, as keys) */
05df7621 3160 if (o->refcount > 1) return o;
3305306f 3161
942a3961 3162 /* Currently we try to encode only strings */
dfc5e96c 3163 redisAssert(o->type == REDIS_STRING);
94754ccc 3164
724a51b1 3165 /* Check if we can represent this string as a long integer */
05df7621 3166 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 3167
3168 /* Ok, this object can be encoded */
05df7621 3169 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3170 decrRefCount(o);
3171 incrRefCount(shared.integers[value]);
3172 return shared.integers[value];
3173 } else {
3174 o->encoding = REDIS_ENCODING_INT;
3175 sdsfree(o->ptr);
3176 o->ptr = (void*) value;
3177 return o;
3178 }
942a3961 3179}
3180
9d65a1bb 3181/* Get a decoded version of an encoded object (returned as a new object).
3182 * If the object is already raw-encoded just increment the ref count. */
3183static robj *getDecodedObject(robj *o) {
942a3961 3184 robj *dec;
e0a62c7f 3185
9d65a1bb 3186 if (o->encoding == REDIS_ENCODING_RAW) {
3187 incrRefCount(o);
3188 return o;
3189 }
942a3961 3190 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3191 char buf[32];
3192
ee14da56 3193 ll2string(buf,32,(long)o->ptr);
942a3961 3194 dec = createStringObject(buf,strlen(buf));
3195 return dec;
3196 } else {
08ee9b57 3197 redisPanic("Unknown encoding type");
942a3961 3198 }
3305306f 3199}
3200
d7f43c08 3201/* Compare two string objects via strcmp() or alike.
3202 * Note that the objects may be integer-encoded. In such a case we
ee14da56 3203 * use ll2string() to get a string representation of the numbers on the stack
1fd9bc8a 3204 * and compare the strings, it's much faster than calling getDecodedObject().
3205 *
3206 * Important note: if objects are not integer encoded, but binary-safe strings,
3207 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3208 * binary safe. */
724a51b1 3209static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3210 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3211 char bufa[128], bufb[128], *astr, *bstr;
3212 int bothsds = 1;
724a51b1 3213
e197b441 3214 if (a == b) return 0;
d7f43c08 3215 if (a->encoding != REDIS_ENCODING_RAW) {
ee14da56 3216 ll2string(bufa,sizeof(bufa),(long) a->ptr);
d7f43c08 3217 astr = bufa;
3218 bothsds = 0;
724a51b1 3219 } else {
d7f43c08 3220 astr = a->ptr;
724a51b1 3221 }
d7f43c08 3222 if (b->encoding != REDIS_ENCODING_RAW) {
ee14da56 3223 ll2string(bufb,sizeof(bufb),(long) b->ptr);
d7f43c08 3224 bstr = bufb;
3225 bothsds = 0;
3226 } else {
3227 bstr = b->ptr;
3228 }
3229 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3230}
3231
bf028098 3232/* Equal string objects return 1 if the two objects are the same from the
3233 * point of view of a string comparison, otherwise 0 is returned. Note that
3234 * this function is faster then checking for (compareStringObject(a,b) == 0)
3235 * because it can perform some more optimization. */
3236static int equalStringObjects(robj *a, robj *b) {
3237 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3238 return a->ptr == b->ptr;
3239 } else {
3240 return compareStringObjects(a,b) == 0;
3241 }
3242}
3243
0ea663ea 3244static size_t stringObjectLen(robj *o) {
dfc5e96c 3245 redisAssert(o->type == REDIS_STRING);
0ea663ea 3246 if (o->encoding == REDIS_ENCODING_RAW) {
3247 return sdslen(o->ptr);
3248 } else {
3249 char buf[32];
3250
ee14da56 3251 return ll2string(buf,32,(long)o->ptr);
0ea663ea 3252 }
3253}
3254
bd79a6bd
PN
3255static int getDoubleFromObject(robj *o, double *target) {
3256 double value;
682c73e8 3257 char *eptr;
bbe025e0 3258
bd79a6bd
PN
3259 if (o == NULL) {
3260 value = 0;
3261 } else {
3262 redisAssert(o->type == REDIS_STRING);
3263 if (o->encoding == REDIS_ENCODING_RAW) {
3264 value = strtod(o->ptr, &eptr);
682c73e8 3265 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3266 } else if (o->encoding == REDIS_ENCODING_INT) {
3267 value = (long)o->ptr;
3268 } else {
946342c1 3269 redisPanic("Unknown string encoding");
bd79a6bd
PN
3270 }
3271 }
3272
bd79a6bd
PN
3273 *target = value;
3274 return REDIS_OK;
3275}
bbe025e0 3276
bd79a6bd
PN
3277static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3278 double value;
3279 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3280 if (msg != NULL) {
3281 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3282 } else {
3283 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3284 }
bbe025e0
AM
3285 return REDIS_ERR;
3286 }
3287
bd79a6bd 3288 *target = value;
bbe025e0
AM
3289 return REDIS_OK;
3290}
3291
bd79a6bd
PN
3292static int getLongLongFromObject(robj *o, long long *target) {
3293 long long value;
682c73e8 3294 char *eptr;
bbe025e0 3295
bd79a6bd
PN
3296 if (o == NULL) {
3297 value = 0;
3298 } else {
3299 redisAssert(o->type == REDIS_STRING);
3300 if (o->encoding == REDIS_ENCODING_RAW) {
3301 value = strtoll(o->ptr, &eptr, 10);
682c73e8 3302 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3303 } else if (o->encoding == REDIS_ENCODING_INT) {
3304 value = (long)o->ptr;
3305 } else {
946342c1 3306 redisPanic("Unknown string encoding");
bd79a6bd
PN
3307 }
3308 }
3309
bd79a6bd
PN
3310 *target = value;
3311 return REDIS_OK;
3312}
bbe025e0 3313
bd79a6bd
PN
3314static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3315 long long value;
3316 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3317 if (msg != NULL) {
3318 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3319 } else {
3320 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3321 }
bbe025e0
AM
3322 return REDIS_ERR;
3323 }
3324
bd79a6bd 3325 *target = value;
bbe025e0
AM
3326 return REDIS_OK;
3327}
3328
bd79a6bd
PN
3329static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3330 long long value;
bbe025e0 3331
bd79a6bd
PN
3332 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3333 if (value < LONG_MIN || value > LONG_MAX) {
3334 if (msg != NULL) {
3335 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3336 } else {
3337 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3338 }
bbe025e0
AM
3339 return REDIS_ERR;
3340 }
3341
bd79a6bd 3342 *target = value;
bbe025e0
AM
3343 return REDIS_OK;
3344}
3345
06233c45 3346/*============================ RDB saving/loading =========================== */
ed9b544e 3347
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 if nothing
 * could be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t nwritten = fwrite(&type,1,1,fp);

    return (nwritten == 0) ? -1 : 0;
}
3352
/* Write the time 't' to 'fp' as a 4 byte int32_t, in the byte order of the
 * host. Returns 0 on success, -1 if the write fails. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;
    size_t nwritten = fwrite(&stamp,4,1,fp);

    return (nwritten == 0) ? -1 : 0;
}
3358
e3566d4b 3359/* check rdbLoadLen() comments for more info */
f78fd11b 3360static int rdbSaveLen(FILE *fp, uint32_t len) {
3361 unsigned char buf[2];
3362
3363 if (len < (1<<6)) {
3364 /* Save a 6 bit len */
10c43610 3365 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3366 if (fwrite(buf,1,1,fp) == 0) return -1;
3367 } else if (len < (1<<14)) {
3368 /* Save a 14 bit len */
10c43610 3369 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3370 buf[1] = len&0xFF;
17be1a4a 3371 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3372 } else {
3373 /* Save a 32 bit len */
10c43610 3374 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3375 if (fwrite(buf,1,1,fp) == 0) return -1;
3376 len = htonl(len);
3377 if (fwrite(&len,4,1,fp) == 0) return -1;
3378 }
3379 return 0;
3380}
3381
32a66513 3382/* Encode 'value' as an integer if possible (if integer will fit the
3383 * supported range). If the function sucessful encoded the integer
3384 * then the (up to 5 bytes) encoded representation is written in the
3385 * string pointed by 'enc' and the length is returned. Otherwise
3386 * 0 is returned. */
3387static int rdbEncodeInteger(long long value, unsigned char *enc) {
e3566d4b 3388 /* Finally check if it fits in our ranges */
3389 if (value >= -(1<<7) && value <= (1<<7)-1) {
3390 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3391 enc[1] = value&0xFF;
3392 return 2;
3393 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3394 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3395 enc[1] = value&0xFF;
3396 enc[2] = (value>>8)&0xFF;
3397 return 3;
3398 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3399 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3400 enc[1] = value&0xFF;
3401 enc[2] = (value>>8)&0xFF;
3402 enc[3] = (value>>16)&0xFF;
3403 enc[4] = (value>>24)&0xFF;
3404 return 5;
3405 } else {
3406 return 0;
3407 }
3408}
3409
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes that are NOT required to be NUL-terminated
 * (rdbSaveRawString() is also called with pointers into a zipmap): the
 * bytes are copied into a local, terminated buffer before being handed to
 * strtoll(), so nothing past s[len-1] is ever read.
 *
 * Returns the length of the encoding written into 'enc' (see
 * rdbEncodeInteger()), or 0 if the string can't be encoded. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, in[32], buf[32];

    /* An empty string is not a number, and anything that does not fit the
     * local buffer is longer than any string ll2string() can produce, so
     * the round-trip check below could never succeed. */
    if (len == 0 || len >= sizeof(in)) return 0;
    memcpy(in,s,len);
    in[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(in, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3428
b1befe6a 3429static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3430 size_t comprlen, outlen;
774e3047 3431 unsigned char byte;
3432 void *out;
3433
3434 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3435 if (len <= 4) return 0;
3436 outlen = len-4;
3a2694c4 3437 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3438 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3439 if (comprlen == 0) {
88e85998 3440 zfree(out);
774e3047 3441 return 0;
3442 }
3443 /* Data compressed! Let's save it on disk */
3444 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3445 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3446 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3447 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3448 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3449 zfree(out);
774e3047 3450 return comprlen;
3451
3452writeerr:
88e85998 3453 zfree(out);
774e3047 3454 return -1;
3455}
3456
e3566d4b 3457/* Save a string objet as [len][data] on disk. If the object is a string
3458 * representation of an integer value we try to safe it in a special form */
b1befe6a 3459static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3460 int enclen;
10c43610 3461
774e3047 3462 /* Try integer encoding */
e3566d4b 3463 if (len <= 11) {
3464 unsigned char buf[5];
b1befe6a 3465 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3466 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3467 return 0;
3468 }
3469 }
774e3047 3470
3471 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3472 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3473 if (server.rdbcompression && len > 20) {
774e3047 3474 int retval;
3475
b1befe6a 3476 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3477 if (retval == -1) return -1;
3478 if (retval > 0) return 0;
3479 /* retval == 0 means data can't be compressed, save the old way */
3480 }
3481
3482 /* Store verbatim */
10c43610 3483 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3484 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3485 return 0;
3486}
3487
942a3961 3488/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3489static int rdbSaveStringObject(FILE *fp, robj *obj) {
3490 int retval;
942a3961 3491
32a66513 3492 /* Avoid to decode the object, then encode it again, if the
3493 * object is alrady integer encoded. */
3494 if (obj->encoding == REDIS_ENCODING_INT) {
3495 long val = (long) obj->ptr;
3496 unsigned char buf[5];
3497 int enclen;
3498
3499 if ((enclen = rdbEncodeInteger(val,buf)) > 0) {
3500 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3501 return 0;
3502 }
3503 /* otherwise... fall throught and continue with the usual
3504 * code path. */
3505 }
3506
f2d9f50f 3507 /* Avoid incr/decr ref count business when possible.
3508 * This plays well with copy-on-write given that we are probably
3509 * in a child process (BGSAVE). Also this makes sure key objects
3510 * of swapped objects are not incRefCount-ed (an assert does not allow
3511 * this in order to avoid bugs) */
3512 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3513 obj = getDecodedObject(obj);
b1befe6a 3514 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3515 decrRefCount(obj);
3516 } else {
b1befe6a 3517 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3518 }
9d65a1bb 3519 return retval;
942a3961 3520}
3521
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The string starts at buf+1, so only sizeof(buf)-1 bytes
             * are available to the formatter. */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3564
06233c45 3565/* Save a Redis object. */
3566static int rdbSaveObject(FILE *fp, robj *o) {
3567 if (o->type == REDIS_STRING) {
3568 /* Save a string value */
3569 if (rdbSaveStringObject(fp,o) == -1) return -1;
3570 } else if (o->type == REDIS_LIST) {
3571 /* Save a list value */
3572 list *list = o->ptr;
c7df85a4 3573 listIter li;
06233c45 3574 listNode *ln;
3575
06233c45 3576 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3577 listRewind(list,&li);
3578 while((ln = listNext(&li))) {
06233c45 3579 robj *eleobj = listNodeValue(ln);
3580
3581 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3582 }
3583 } else if (o->type == REDIS_SET) {
3584 /* Save a set value */
3585 dict *set = o->ptr;
3586 dictIterator *di = dictGetIterator(set);
3587 dictEntry *de;
3588
3589 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3590 while((de = dictNext(di)) != NULL) {
3591 robj *eleobj = dictGetEntryKey(de);
3592
3593 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3594 }
3595 dictReleaseIterator(di);
3596 } else if (o->type == REDIS_ZSET) {
3597 /* Save a set value */
3598 zset *zs = o->ptr;
3599 dictIterator *di = dictGetIterator(zs->dict);
3600 dictEntry *de;
3601
3602 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3603 while((de = dictNext(di)) != NULL) {
3604 robj *eleobj = dictGetEntryKey(de);
3605 double *score = dictGetEntryVal(de);
3606
3607 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3608 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3609 }
3610 dictReleaseIterator(di);
b1befe6a 3611 } else if (o->type == REDIS_HASH) {
3612 /* Save a hash value */
3613 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3614 unsigned char *p = zipmapRewind(o->ptr);
3615 unsigned int count = zipmapLen(o->ptr);
3616 unsigned char *key, *val;
3617 unsigned int klen, vlen;
3618
3619 if (rdbSaveLen(fp,count) == -1) return -1;
3620 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3621 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3622 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3623 }
3624 } else {
3625 dictIterator *di = dictGetIterator(o->ptr);
3626 dictEntry *de;
3627
3628 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3629 while((de = dictNext(di)) != NULL) {
3630 robj *key = dictGetEntryKey(de);
3631 robj *val = dictGetEntryVal(de);
3632
3633 if (rdbSaveStringObject(fp,key) == -1) return -1;
3634 if (rdbSaveStringObject(fp,val) == -1) return -1;
3635 }
3636 dictReleaseIterator(di);
3637 }
06233c45 3638 } else {
f83c6cb5 3639 redisPanic("Unknown object type");
06233c45 3640 }
3641 return 0;
3642}
3643
3644/* Return the length the object will have on disk if saved with
3645 * the rdbSaveObject() function. Currently we use a trick to get
3646 * this length with very little changes to the code. In the future
3647 * we could switch to a faster solution. */
b9bc0eef 3648static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3649 if (fp == NULL) fp = server.devnull;
06233c45 3650 rewind(fp);
3651 assert(rdbSaveObject(fp,o) != 1);
3652 return ftello(fp);
3653}
3654
06224fec 3655/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3656static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3657 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3658
06224fec 3659 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3660}
3661
ed9b544e 3662/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3663static int rdbSave(char *filename) {
ed9b544e 3664 dictIterator *di = NULL;
3665 dictEntry *de;
ed9b544e 3666 FILE *fp;
3667 char tmpfile[256];
3668 int j;
bb32ede5 3669 time_t now = time(NULL);
ed9b544e 3670
2316bb3b 3671 /* Wait for I/O therads to terminate, just in case this is a
3672 * foreground-saving, to avoid seeking the swap file descriptor at the
3673 * same time. */
3674 if (server.vm_enabled)
3675 waitEmptyIOJobsQueue();
3676
a3b21203 3677 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3678 fp = fopen(tmpfile,"w");
3679 if (!fp) {
3680 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3681 return REDIS_ERR;
3682 }
f78fd11b 3683 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3684 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3685 redisDb *db = server.db+j;
3686 dict *d = db->dict;
3305306f 3687 if (dictSize(d) == 0) continue;
ed9b544e 3688 di = dictGetIterator(d);
3689 if (!di) {
3690 fclose(fp);
3691 return REDIS_ERR;
3692 }
3693
3694 /* Write the SELECT DB opcode */
f78fd11b 3695 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3696 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3697
3698 /* Iterate this DB writing every entry */
3699 while((de = dictNext(di)) != NULL) {
3700 robj *key = dictGetEntryKey(de);
3701 robj *o = dictGetEntryVal(de);
bb32ede5 3702 time_t expiretime = getExpire(db,key);
3703
3704 /* Save the expire time */
3705 if (expiretime != -1) {
3706 /* If this key is already expired skip it */
3707 if (expiretime < now) continue;
3708 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3709 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3710 }
7e69548d 3711 /* Save the key and associated value. This requires special
3712 * handling if the value is swapped out. */
996cb5f7 3713 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3714 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3715 /* Save type, key, value */
3716 if (rdbSaveType(fp,o->type) == -1) goto werr;
3717 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3718 if (rdbSaveObject(fp,o) == -1) goto werr;
3719 } else {
996cb5f7 3720 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3721 robj *po;
7e69548d 3722 /* Get a preview of the object in memory */
3723 po = vmPreviewObject(key);
7e69548d 3724 /* Save type, key, value */
3725 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3726 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3727 if (rdbSaveObject(fp,po) == -1) goto werr;
3728 /* Remove the loaded object from memory */
3729 decrRefCount(po);
7e69548d 3730 }
ed9b544e 3731 }
3732 dictReleaseIterator(di);
3733 }
3734 /* EOF opcode */
f78fd11b 3735 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3736
3737 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3738 fflush(fp);
3739 fsync(fileno(fp));
3740 fclose(fp);
e0a62c7f 3741
ed9b544e 3742 /* Use RENAME to make sure the DB file is changed atomically only
3743 * if the generate DB file is ok. */
3744 if (rename(tmpfile,filename) == -1) {
325d1eb4 3745 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3746 unlink(tmpfile);
3747 return REDIS_ERR;
3748 }
3749 redisLog(REDIS_NOTICE,"DB saved on disk");
3750 server.dirty = 0;
3751 server.lastsave = time(NULL);
3752 return REDIS_OK;
3753
3754werr:
3755 fclose(fp);
3756 unlink(tmpfile);
3757 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3758 if (di) dictReleaseIterator(di);
3759 return REDIS_ERR;
3760}
3761
f78fd11b 3762static int rdbSaveBackground(char *filename) {
ed9b544e 3763 pid_t childpid;
3764
9d65a1bb 3765 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3766 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3767 if ((childpid = fork()) == 0) {
3768 /* Child */
054e426d 3769 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3770 close(server.fd);
f78fd11b 3771 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3772 _exit(0);
ed9b544e 3773 } else {
478c2c6f 3774 _exit(1);
ed9b544e 3775 }
3776 } else {
3777 /* Parent */
5a7c647e 3778 if (childpid == -1) {
3779 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3780 strerror(errno));
3781 return REDIS_ERR;
3782 }
ed9b544e 3783 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3784 server.bgsavechildpid = childpid;
884d4b39 3785 updateDictResizePolicy();
ed9b544e 3786 return REDIS_OK;
3787 }
3788 return REDIS_OK; /* unreached */
3789}
3790
/* Remove the temp file that rdbSave() creates when running as the process
 * with pid 'childpid' (the name is built with the same pattern). */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3797
/* Read a one byte type from 'fp'. Returns the type (0-255), or -1 if the
 * byte can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    if (fread(&byte,1,1,fp) != 0) return byte;
    return -1;
}
3803
/* Read a time saved by rdbSaveTime(): a 4 byte int32_t in the byte order of
 * the host. Returns the time, or -1 if the read fails. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,4,1,fp) != 0) return (time_t) stamp;
    return -1;
}
3809
e3566d4b 3810/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3811 * of this file for a description of how this are stored on disk.
3812 *
3813 * isencoded is set to 1 if the readed length is not actually a length but
3814 * an "encoding type", check the above comments for more info */
c78a8ccc 3815static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3816 unsigned char buf[2];
3817 uint32_t len;
c78a8ccc 3818 int type;
f78fd11b 3819
e3566d4b 3820 if (isencoded) *isencoded = 0;
c78a8ccc 3821 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3822 type = (buf[0]&0xC0)>>6;
3823 if (type == REDIS_RDB_6BITLEN) {
3824 /* Read a 6 bit len */
3825 return buf[0]&0x3F;
3826 } else if (type == REDIS_RDB_ENCVAL) {
3827 /* Read a 6 bit len encoding type */
3828 if (isencoded) *isencoded = 1;
3829 return buf[0]&0x3F;
3830 } else if (type == REDIS_RDB_14BITLEN) {
3831 /* Read a 14 bit len */
3832 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3833 return ((buf[0]&0x3F)<<8)|buf[1];
3834 } else {
3835 /* Read a 32 bit len */
f78fd11b 3836 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3837 return ntohl(len);
f78fd11b 3838 }
f78fd11b 3839}
3840
ad30aa60 3841/* Load an integer-encoded object from file 'fp', with the specified
3842 * encoding type 'enctype'. If encode is true the function may return
3843 * an integer-encoded object as reply, otherwise the returned object
3844 * will always be encoded as a raw string. */
3845static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
e3566d4b 3846 unsigned char enc[4];
3847 long long val;
3848
3849 if (enctype == REDIS_RDB_ENC_INT8) {
3850 if (fread(enc,1,1,fp) == 0) return NULL;
3851 val = (signed char)enc[0];
3852 } else if (enctype == REDIS_RDB_ENC_INT16) {
3853 uint16_t v;
3854 if (fread(enc,2,1,fp) == 0) return NULL;
3855 v = enc[0]|(enc[1]<<8);
3856 val = (int16_t)v;
3857 } else if (enctype == REDIS_RDB_ENC_INT32) {
3858 uint32_t v;
3859 if (fread(enc,4,1,fp) == 0) return NULL;
3860 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3861 val = (int32_t)v;
3862 } else {
3863 val = 0; /* anti-warning */
f83c6cb5 3864 redisPanic("Unknown RDB integer encoding type");
e3566d4b 3865 }
ad30aa60 3866 if (encode)
3867 return createStringObjectFromLongLong(val);
3868 else
3869 return createObject(REDIS_STRING,sdsfromlonglong(val));
e3566d4b 3870}
3871
c78a8ccc 3872static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3873 unsigned int len, clen;
3874 unsigned char *c = NULL;
3875 sds val = NULL;
3876
c78a8ccc 3877 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3878 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3879 if ((c = zmalloc(clen)) == NULL) goto err;
3880 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3881 if (fread(c,clen,1,fp) == 0) goto err;
3882 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3883 zfree(c);
88e85998 3884 return createObject(REDIS_STRING,val);
3885err:
3886 zfree(c);
3887 sdsfree(val);
3888 return NULL;
3889}
3890
ad30aa60 3891static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
e3566d4b 3892 int isencoded;
3893 uint32_t len;
f78fd11b 3894 sds val;
3895
c78a8ccc 3896 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3897 if (isencoded) {
3898 switch(len) {
3899 case REDIS_RDB_ENC_INT8:
3900 case REDIS_RDB_ENC_INT16:
3901 case REDIS_RDB_ENC_INT32:
ad30aa60 3902 return rdbLoadIntegerObject(fp,len,encode);
88e85998 3903 case REDIS_RDB_ENC_LZF:
bdcb92f2 3904 return rdbLoadLzfStringObject(fp);
e3566d4b 3905 default:
f83c6cb5 3906 redisPanic("Unknown RDB encoding type");
e3566d4b 3907 }
3908 }
3909
f78fd11b 3910 if (len == REDIS_RDB_LENERR) return NULL;
3911 val = sdsnewlen(NULL,len);
3912 if (len && fread(val,len,1,fp) == 0) {
3913 sdsfree(val);
3914 return NULL;
3915 }
bdcb92f2 3916 return createObject(REDIS_STRING,val);
f78fd11b 3917}
3918
ad30aa60 3919static robj *rdbLoadStringObject(FILE *fp) {
3920 return rdbGenericLoadStringObject(fp,0);
3921}
3922
3923static robj *rdbLoadEncodedStringObject(FILE *fp) {
3924 return rdbGenericLoadStringObject(fp,1);
3925}
3926
a7866db6 3927/* For information about double serialization check rdbSaveDoubleValue() */
3928static int rdbLoadDoubleValue(FILE *fp, double *val) {
3929 char buf[128];
3930 unsigned char len;
3931
3932 if (fread(&len,1,1,fp) == 0) return -1;
3933 switch(len) {
3934 case 255: *val = R_NegInf; return 0;
3935 case 254: *val = R_PosInf; return 0;
3936 case 253: *val = R_Nan; return 0;
3937 default:
3938 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3939 buf[len] = '\0';
a7866db6 3940 sscanf(buf, "%lg", val);
3941 return 0;
3942 }
3943}
3944
c78a8ccc 3945/* Load a Redis object of the specified type from the specified file.
3946 * On success a newly allocated object is returned, otherwise NULL. */
3947static robj *rdbLoadObject(int type, FILE *fp) {
3948 robj *o;
3949
bcd11906 3950 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 3951 if (type == REDIS_STRING) {
3952 /* Read string value */
ad30aa60 3953 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 3954 o = tryObjectEncoding(o);
c78a8ccc 3955 } else if (type == REDIS_LIST || type == REDIS_SET) {
3956 /* Read list/set value */
3957 uint32_t listlen;
3958
3959 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3960 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 3961 /* It's faster to expand the dict to the right size asap in order
3962 * to avoid rehashing */
3963 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3964 dictExpand(o->ptr,listlen);
c78a8ccc 3965 /* Load every single element of the list/set */
3966 while(listlen--) {
3967 robj *ele;
3968
ad30aa60 3969 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 3970 ele = tryObjectEncoding(ele);
c78a8ccc 3971 if (type == REDIS_LIST) {
3972 listAddNodeTail((list*)o->ptr,ele);
3973 } else {
3974 dictAdd((dict*)o->ptr,ele,NULL);
3975 }
3976 }
3977 } else if (type == REDIS_ZSET) {
3978 /* Read list/set value */
ada386b2 3979 size_t zsetlen;
c78a8ccc 3980 zset *zs;
3981
3982 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3983 o = createZsetObject();
3984 zs = o->ptr;
3985 /* Load every single element of the list/set */
3986 while(zsetlen--) {
3987 robj *ele;
3988 double *score = zmalloc(sizeof(double));
3989
ad30aa60 3990 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 3991 ele = tryObjectEncoding(ele);
c78a8ccc 3992 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3993 dictAdd(zs->dict,ele,score);
3994 zslInsert(zs->zsl,*score,ele);
3995 incrRefCount(ele); /* added to skiplist */
3996 }
ada386b2 3997 } else if (type == REDIS_HASH) {
3998 size_t hashlen;
3999
4000 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4001 o = createHashObject();
4002 /* Too many entries? Use an hash table. */
4003 if (hashlen > server.hash_max_zipmap_entries)
4004 convertToRealHash(o);
4005 /* Load every key/value, then set it into the zipmap or hash
4006 * table, as needed. */
4007 while(hashlen--) {
4008 robj *key, *val;
4009
4010 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
4011 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
4012 /* If we are using a zipmap and there are too big values
4013 * the object is converted to real hash table encoding. */
4014 if (o->encoding != REDIS_ENCODING_HT &&
4015 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4016 sdslen(val->ptr) > server.hash_max_zipmap_value))
4017 {
4018 convertToRealHash(o);
4019 }
4020
4021 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4022 unsigned char *zm = o->ptr;
4023
4024 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4025 val->ptr,sdslen(val->ptr),NULL);
4026 o->ptr = zm;
4027 decrRefCount(key);
4028 decrRefCount(val);
4029 } else {
05df7621 4030 key = tryObjectEncoding(key);
4031 val = tryObjectEncoding(val);
ada386b2 4032 dictAdd((dict*)o->ptr,key,val);
ada386b2 4033 }
4034 }
c78a8ccc 4035 } else {
f83c6cb5 4036 redisPanic("Unknown object type");
c78a8ccc 4037 }
4038 return o;
4039}
4040
f78fd11b 4041static int rdbLoad(char *filename) {
ed9b544e 4042 FILE *fp;
f78fd11b 4043 uint32_t dbid;
bb32ede5 4044 int type, retval, rdbver;
585af7e2 4045 int swap_all_values = 0;
3305306f 4046 dict *d = server.db[0].dict;
bb32ede5 4047 redisDb *db = server.db+0;
f78fd11b 4048 char buf[1024];
242a64f3 4049 time_t expiretime, now = time(NULL);
b492cf00 4050 long long loadedkeys = 0;
bb32ede5 4051
ed9b544e 4052 fp = fopen(filename,"r");
4053 if (!fp) return REDIS_ERR;
4054 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 4055 buf[9] = '\0';
4056 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 4057 fclose(fp);
4058 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4059 return REDIS_ERR;
4060 }
f78fd11b 4061 rdbver = atoi(buf+5);
c78a8ccc 4062 if (rdbver != 1) {
f78fd11b 4063 fclose(fp);
4064 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4065 return REDIS_ERR;
4066 }
ed9b544e 4067 while(1) {
585af7e2 4068 robj *key, *val;
ed9b544e 4069
585af7e2 4070 expiretime = -1;
ed9b544e 4071 /* Read type. */
f78fd11b 4072 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 4073 if (type == REDIS_EXPIRETIME) {
4074 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4075 /* We read the time so we need to read the object type again */
4076 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4077 }
ed9b544e 4078 if (type == REDIS_EOF) break;
4079 /* Handle SELECT DB opcode as a special case */
4080 if (type == REDIS_SELECTDB) {
c78a8ccc 4081 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 4082 goto eoferr;
ed9b544e 4083 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 4084 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 4085 exit(1);
4086 }
bb32ede5 4087 db = server.db+dbid;
4088 d = db->dict;
ed9b544e 4089 continue;
4090 }
4091 /* Read key */
585af7e2 4092 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
c78a8ccc 4093 /* Read value */
585af7e2 4094 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
89e689c5 4095 /* Check if the key already expired */
4096 if (expiretime != -1 && expiretime < now) {
4097 decrRefCount(key);
4098 decrRefCount(val);
4099 continue;
4100 }
ed9b544e 4101 /* Add the new object in the hash table */
585af7e2 4102 retval = dictAdd(d,key,val);
ed9b544e 4103 if (retval == DICT_ERR) {
585af7e2 4104 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
ed9b544e 4105 exit(1);
4106 }
242a64f3 4107 loadedkeys++;
bb32ede5 4108 /* Set the expire time if needed */
89e689c5 4109 if (expiretime != -1) setExpire(db,key,expiretime);
242a64f3 4110
b492cf00 4111 /* Handle swapping while loading big datasets when VM is on */
242a64f3 4112
4113 /* If we detecter we are hopeless about fitting something in memory
4114 * we just swap every new key on disk. Directly...
4115 * Note that's important to check for this condition before resorting
4116 * to random sampling, otherwise we may try to swap already
4117 * swapped keys. */
585af7e2 4118 if (swap_all_values) {
4119 dictEntry *de = dictFind(d,key);
242a64f3 4120
4121 /* de may be NULL since the key already expired */
4122 if (de) {
585af7e2 4123 key = dictGetEntryKey(de);
4124 val = dictGetEntryVal(de);
242a64f3 4125
585af7e2 4126 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
242a64f3 4127 dictGetEntryVal(de) = NULL;
4128 }
4129 }
4130 continue;
4131 }
4132
4133 /* If we have still some hope of having some value fitting memory
4134 * then we try random sampling. */
585af7e2 4135 if (!swap_all_values && server.vm_enabled && (loadedkeys % 5000) == 0) {
b492cf00 4136 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 4137 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 4138 }
242a64f3 4139 if (zmalloc_used_memory() > server.vm_max_memory)
585af7e2 4140 swap_all_values = 1; /* We are already using too much mem */
b492cf00 4141 }
ed9b544e 4142 }
4143 fclose(fp);
4144 return REDIS_OK;
4145
4146eoferr: /* unexpected end of file is handled here with a fatal exit */
f80dff62 4147 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 4148 exit(1);
4149 return REDIS_ERR; /* Just to avoid warning */
4150}
4151
4152/*================================== Commands =============================== */
4153
abcb223e 4154static void authCommand(redisClient *c) {
2e77c2ee 4155 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
4156 c->authenticated = 1;
4157 addReply(c,shared.ok);
4158 } else {
4159 c->authenticated = 0;
fa4c0aba 4160 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
4161 }
4162}
4163
ed9b544e 4164static void pingCommand(redisClient *c) {
4165 addReply(c,shared.pong);
4166}
4167
4168static void echoCommand(redisClient *c) {
dd88747b 4169 addReplyBulk(c,c->argv[1]);
ed9b544e 4170}
4171
4172/*=================================== Strings =============================== */
4173
526d00a5 4174static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
ed9b544e 4175 int retval;
10ce1276 4176 long seconds = 0; /* initialized to avoid an harmness warning */
ed9b544e 4177
526d00a5 4178 if (expire) {
4179 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4180 return;
4181 if (seconds <= 0) {
4182 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4183 return;
4184 }
4185 }
4186
4187 if (nx) deleteIfVolatile(c->db,key);
4188 retval = dictAdd(c->db->dict,key,val);
ed9b544e 4189 if (retval == DICT_ERR) {
4190 if (!nx) {
1b03836c 4191 /* If the key is about a swapped value, we want a new key object
4192 * to overwrite the old. So we delete the old key in the database.
4193 * This will also make sure that swap pages about the old object
4194 * will be marked as free. */
526d00a5 4195 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4196 incrRefCount(key);
4197 dictReplace(c->db->dict,key,val);
4198 incrRefCount(val);
ed9b544e 4199 } else {
c937aa89 4200 addReply(c,shared.czero);
ed9b544e 4201 return;
4202 }
4203 } else {
526d00a5 4204 incrRefCount(key);
4205 incrRefCount(val);
ed9b544e 4206 }
4207 server.dirty++;
526d00a5 4208 removeExpire(c->db,key);
4209 if (expire) setExpire(c->db,key,time(NULL)+seconds);
c937aa89 4210 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 4211}
4212
4213static void setCommand(redisClient *c) {
526d00a5 4214 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
ed9b544e 4215}
4216
4217static void setnxCommand(redisClient *c) {
526d00a5 4218 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4219}
4220
4221static void setexCommand(redisClient *c) {
4222 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
ed9b544e 4223}
4224
322fc7d8 4225static int getGenericCommand(redisClient *c) {
dd88747b 4226 robj *o;
e0a62c7f 4227
dd88747b 4228 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 4229 return REDIS_OK;
dd88747b 4230
4231 if (o->type != REDIS_STRING) {
4232 addReply(c,shared.wrongtypeerr);
4233 return REDIS_ERR;
ed9b544e 4234 } else {
dd88747b 4235 addReplyBulk(c,o);
4236 return REDIS_OK;
ed9b544e 4237 }
4238}
4239
322fc7d8 4240static void getCommand(redisClient *c) {
4241 getGenericCommand(c);
4242}
4243
f6b141c5 4244static void getsetCommand(redisClient *c) {
322fc7d8 4245 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 4246 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4247 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4248 } else {
4249 incrRefCount(c->argv[1]);
4250 }
4251 incrRefCount(c->argv[2]);
4252 server.dirty++;
4253 removeExpire(c->db,c->argv[1]);
4254}
4255
70003d28 4256static void mgetCommand(redisClient *c) {
70003d28 4257 int j;
e0a62c7f 4258
c937aa89 4259 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 4260 for (j = 1; j < c->argc; j++) {
3305306f 4261 robj *o = lookupKeyRead(c->db,c->argv[j]);
4262 if (o == NULL) {
c937aa89 4263 addReply(c,shared.nullbulk);
70003d28 4264 } else {
70003d28 4265 if (o->type != REDIS_STRING) {
c937aa89 4266 addReply(c,shared.nullbulk);
70003d28 4267 } else {
dd88747b 4268 addReplyBulk(c,o);
70003d28 4269 }
4270 }
4271 }
4272}
4273
6c446631 4274static void msetGenericCommand(redisClient *c, int nx) {
906573e7 4275 int j, busykeys = 0;
6c446631 4276
4277 if ((c->argc % 2) == 0) {
454d4e43 4278 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 4279 return;
4280 }
4281 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4282 * set nothing at all if at least one already key exists. */
4283 if (nx) {
4284 for (j = 1; j < c->argc; j += 2) {
906573e7 4285 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4286 busykeys++;
6c446631 4287 }
4288 }
4289 }
906573e7 4290 if (busykeys) {
4291 addReply(c, shared.czero);
4292 return;
4293 }
6c446631 4294
4295 for (j = 1; j < c->argc; j += 2) {
4296 int retval;
4297
05df7621 4298 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
6c446631 4299 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4300 if (retval == DICT_ERR) {
4301 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4302 incrRefCount(c->argv[j+1]);
4303 } else {
4304 incrRefCount(c->argv[j]);
4305 incrRefCount(c->argv[j+1]);
4306 }
4307 removeExpire(c->db,c->argv[j]);
4308 }
4309 server.dirty += (c->argc-1)/2;
4310 addReply(c, nx ? shared.cone : shared.ok);
4311}
4312
4313static void msetCommand(redisClient *c) {
4314 msetGenericCommand(c,0);
4315}
4316
4317static void msetnxCommand(redisClient *c) {
4318 msetGenericCommand(c,1);
4319}
4320
d68ed120 4321static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4322 long long value;
4323 int retval;
4324 robj *o;
e0a62c7f 4325
3305306f 4326 o = lookupKeyWrite(c->db,c->argv[1]);
6485f293
PN
4327 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4328 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
ed9b544e 4329
4330 value += incr;
4331 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
05df7621 4332 o = tryObjectEncoding(o);
3305306f 4333 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 4334 if (retval == DICT_ERR) {
3305306f 4335 dictReplace(c->db->dict,c->argv[1],o);
4336 removeExpire(c->db,c->argv[1]);
ed9b544e 4337 } else {
4338 incrRefCount(c->argv[1]);
4339 }
4340 server.dirty++;
c937aa89 4341 addReply(c,shared.colon);
ed9b544e 4342 addReply(c,o);
4343 addReply(c,shared.crlf);
4344}
4345
4346static void incrCommand(redisClient *c) {
a4d1ba9a 4347 incrDecrCommand(c,1);
ed9b544e 4348}
4349
4350static void decrCommand(redisClient *c) {
a4d1ba9a 4351 incrDecrCommand(c,-1);
ed9b544e 4352}
4353
4354static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4355 long long incr;
4356
bd79a6bd 4357 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4358 incrDecrCommand(c,incr);
ed9b544e 4359}
4360
4361static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4362 long long incr;
4363
bd79a6bd 4364 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4365 incrDecrCommand(c,-incr);
ed9b544e 4366}
4367
4b00bebd 4368static void appendCommand(redisClient *c) {
4369 int retval;
4370 size_t totlen;
4371 robj *o;
4372
4373 o = lookupKeyWrite(c->db,c->argv[1]);
4374 if (o == NULL) {
4375 /* Create the key */
4376 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4377 incrRefCount(c->argv[1]);
4378 incrRefCount(c->argv[2]);
4379 totlen = stringObjectLen(c->argv[2]);
4380 } else {
4381 dictEntry *de;
e0a62c7f 4382
4b00bebd 4383 de = dictFind(c->db->dict,c->argv[1]);
4384 assert(de != NULL);
4385
4386 o = dictGetEntryVal(de);
4387 if (o->type != REDIS_STRING) {
4388 addReply(c,shared.wrongtypeerr);
4389 return;
4390 }
4391 /* If the object is specially encoded or shared we have to make
4392 * a copy */
4393 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4394 robj *decoded = getDecodedObject(o);
4395
4396 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4397 decrRefCount(decoded);
4398 dictReplace(c->db->dict,c->argv[1],o);
4399 }
4400 /* APPEND! */
4401 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4402 o->ptr = sdscatlen(o->ptr,
4403 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4404 } else {
4405 o->ptr = sdscatprintf(o->ptr, "%ld",
4406 (unsigned long) c->argv[2]->ptr);
4407 }
4408 totlen = sdslen(o->ptr);
4409 }
4410 server.dirty++;
4411 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4412}
4413
39191553 4414static void substrCommand(redisClient *c) {
4415 robj *o;
4416 long start = atoi(c->argv[2]->ptr);
4417 long end = atoi(c->argv[3]->ptr);
dd88747b 4418 size_t rangelen, strlen;
4419 sds range;
39191553 4420
dd88747b 4421 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4422 checkType(c,o,REDIS_STRING)) return;
39191553 4423
dd88747b 4424 o = getDecodedObject(o);
4425 strlen = sdslen(o->ptr);
8fe7fad7 4426
dd88747b 4427 /* convert negative indexes */
4428 if (start < 0) start = strlen+start;
4429 if (end < 0) end = strlen+end;
4430 if (start < 0) start = 0;
4431 if (end < 0) end = 0;
39191553 4432
dd88747b 4433 /* indexes sanity checks */
4434 if (start > end || (size_t)start >= strlen) {
4435 /* Out of range start or start > end result in null reply */
4436 addReply(c,shared.nullbulk);
4437 decrRefCount(o);
4438 return;
39191553 4439 }
dd88747b 4440 if ((size_t)end >= strlen) end = strlen-1;
4441 rangelen = (end-start)+1;
4442
4443 /* Return the result */
4444 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4445 range = sdsnewlen((char*)o->ptr+start,rangelen);
4446 addReplySds(c,range);
4447 addReply(c,shared.crlf);
4448 decrRefCount(o);
39191553 4449}
4450
ed9b544e 4451/* ========================= Type agnostic commands ========================= */
4452
4453static void delCommand(redisClient *c) {
5109cdff 4454 int deleted = 0, j;
4455
4456 for (j = 1; j < c->argc; j++) {
4457 if (deleteKey(c->db,c->argv[j])) {
4458 server.dirty++;
4459 deleted++;
4460 }
4461 }
dd88747b 4462 addReplyLong(c,deleted);
ed9b544e 4463}
4464
4465static void existsCommand(redisClient *c) {
3305306f 4466 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
ed9b544e 4467}
4468
4469static void selectCommand(redisClient *c) {
4470 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4471
ed9b544e 4472 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4473 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4474 } else {
4475 addReply(c,shared.ok);
4476 }
4477}
4478
4479static void randomkeyCommand(redisClient *c) {
4480 dictEntry *de;
dc4be23e 4481 robj *key;
e0a62c7f 4482
3305306f 4483 while(1) {
4484 de = dictGetRandomKey(c->db->dict);
ce7bef07 4485 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4486 }
2b619329 4487
ed9b544e 4488 if (de == NULL) {
dc4be23e 4489 addReply(c,shared.nullbulk);
4490 return;
4491 }
4492
4493 key = dictGetEntryKey(de);
4494 if (server.vm_enabled) {
4495 key = dupStringObject(key);
4496 addReplyBulk(c,key);
4497 decrRefCount(key);
ed9b544e 4498 } else {
dc4be23e 4499 addReplyBulk(c,key);
ed9b544e 4500 }
4501}
4502
4503static void keysCommand(redisClient *c) {
4504 dictIterator *di;
4505 dictEntry *de;
4506 sds pattern = c->argv[1]->ptr;
4507 int plen = sdslen(pattern);
a3f9eec2 4508 unsigned long numkeys = 0;
ed9b544e 4509 robj *lenobj = createObject(REDIS_STRING,NULL);
4510
3305306f 4511 di = dictGetIterator(c->db->dict);
ed9b544e 4512 addReply(c,lenobj);
4513 decrRefCount(lenobj);
4514 while((de = dictNext(di)) != NULL) {
4515 robj *keyobj = dictGetEntryKey(de);
3305306f 4516
ed9b544e 4517 sds key = keyobj->ptr;
4518 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4519 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4520 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4521 addReplyBulk(c,keyobj);
3305306f 4522 numkeys++;
3305306f 4523 }
ed9b544e 4524 }
4525 }
4526 dictReleaseIterator(di);
a3f9eec2 4527 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4528}
4529
4530static void dbsizeCommand(redisClient *c) {
4531 addReplySds(c,
3305306f 4532 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4533}
4534
4535static void lastsaveCommand(redisClient *c) {
4536 addReplySds(c,
c937aa89 4537 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4538}
4539
4540static void typeCommand(redisClient *c) {
3305306f 4541 robj *o;
ed9b544e 4542 char *type;
3305306f 4543
4544 o = lookupKeyRead(c->db,c->argv[1]);
4545 if (o == NULL) {
c937aa89 4546 type = "+none";
ed9b544e 4547 } else {
ed9b544e 4548 switch(o->type) {
c937aa89 4549 case REDIS_STRING: type = "+string"; break;
4550 case REDIS_LIST: type = "+list"; break;
4551 case REDIS_SET: type = "+set"; break;
412a8bce 4552 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4553 case REDIS_HASH: type = "+hash"; break;
4554 default: type = "+unknown"; break;
ed9b544e 4555 }
4556 }
4557 addReplySds(c,sdsnew(type));
4558 addReply(c,shared.crlf);
4559}
4560
4561static void saveCommand(redisClient *c) {
9d65a1bb 4562 if (server.bgsavechildpid != -1) {
05557f6d 4563 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4564 return;
4565 }
f78fd11b 4566 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4567 addReply(c,shared.ok);
4568 } else {
4569 addReply(c,shared.err);
4570 }
4571}
4572
4573static void bgsaveCommand(redisClient *c) {
9d65a1bb 4574 if (server.bgsavechildpid != -1) {
ed9b544e 4575 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4576 return;
4577 }
f78fd11b 4578 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4579 char *status = "+Background saving started\r\n";
4580 addReplySds(c,sdsnew(status));
ed9b544e 4581 } else {
4582 addReply(c,shared.err);
4583 }
4584}
4585
4586static void shutdownCommand(redisClient *c) {
4587 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
a3b21203 4588 /* Kill the saving child if there is a background saving in progress.
4589 We want to avoid race conditions, for instance our saving child may
4590 overwrite the synchronous saving did by SHUTDOWN. */
9d65a1bb 4591 if (server.bgsavechildpid != -1) {
9f3c422c 4592 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4593 kill(server.bgsavechildpid,SIGKILL);
a3b21203 4594 rdbRemoveTempFile(server.bgsavechildpid);
9f3c422c 4595 }
ac945e2d 4596 if (server.appendonly) {
4597 /* Append only file: fsync() the AOF and exit */
4598 fsync(server.appendfd);
054e426d 4599 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4600 exit(0);
ed9b544e 4601 } else {
ac945e2d 4602 /* Snapshotting. Perform a SYNC SAVE and exit */
4603 if (rdbSave(server.dbfilename) == REDIS_OK) {
4604 if (server.daemonize)
4605 unlink(server.pidfile);
4606 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4607 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
4608 exit(0);
4609 } else {
dd88747b 4610 /* Ooops.. error saving! The best we can do is to continue
4611 * operating. Note that if there was a background saving process,
4612 * in the next cron() Redis will be notified that the background
4613 * saving aborted, handling special stuff like slaves pending for
4614 * synchronization... */
e0a62c7f 4615 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
dd88747b 4616 addReplySds(c,
4617 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
ac945e2d 4618 }
ed9b544e 4619 }
4620}
4621
4622static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4623 robj *o;
4624
4625 /* To use the same key as src and dst is probably an error */
4626 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4627 addReply(c,shared.sameobjecterr);
ed9b544e 4628 return;
4629 }
4630
dd88747b 4631 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4632 return;
dd88747b 4633
ed9b544e 4634 incrRefCount(o);
3305306f 4635 deleteIfVolatile(c->db,c->argv[2]);
4636 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4637 if (nx) {
4638 decrRefCount(o);
c937aa89 4639 addReply(c,shared.czero);
ed9b544e 4640 return;
4641 }
3305306f 4642 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4643 } else {
4644 incrRefCount(c->argv[2]);
4645 }
3305306f 4646 deleteKey(c->db,c->argv[1]);
ed9b544e 4647 server.dirty++;
c937aa89 4648 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4649}
4650
4651static void renameCommand(redisClient *c) {
4652 renameGenericCommand(c,0);
4653}
4654
4655static void renamenxCommand(redisClient *c) {
4656 renameGenericCommand(c,1);
4657}
4658
4659static void moveCommand(redisClient *c) {
3305306f 4660 robj *o;
4661 redisDb *src, *dst;
ed9b544e 4662 int srcid;
4663
4664 /* Obtain source and target DB pointers */
3305306f 4665 src = c->db;
4666 srcid = c->db->id;
ed9b544e 4667 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4668 addReply(c,shared.outofrangeerr);
ed9b544e 4669 return;
4670 }
3305306f 4671 dst = c->db;
4672 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4673
4674 /* If the user is moving using as target the same
4675 * DB as the source DB it is probably an error. */
4676 if (src == dst) {
c937aa89 4677 addReply(c,shared.sameobjecterr);
ed9b544e 4678 return;
4679 }
4680
4681 /* Check if the element exists and get a reference */
3305306f 4682 o = lookupKeyWrite(c->db,c->argv[1]);
4683 if (!o) {
c937aa89 4684 addReply(c,shared.czero);
ed9b544e 4685 return;
4686 }
4687
4688 /* Try to add the element to the target DB */
3305306f 4689 deleteIfVolatile(dst,c->argv[1]);
4690 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4691 addReply(c,shared.czero);
ed9b544e 4692 return;
4693 }
3305306f 4694 incrRefCount(c->argv[1]);
ed9b544e 4695 incrRefCount(o);
4696
4697 /* OK! key moved, free the entry in the source DB */
3305306f 4698 deleteKey(src,c->argv[1]);
ed9b544e 4699 server.dirty++;
c937aa89 4700 addReply(c,shared.cone);
ed9b544e 4701}
4702
4703/* =================================== Lists ================================ */
4704static void pushGenericCommand(redisClient *c, int where) {
4705 robj *lobj;
ed9b544e 4706 list *list;
3305306f 4707
4708 lobj = lookupKeyWrite(c->db,c->argv[1]);
4709 if (lobj == NULL) {
95242ab5 4710 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4711 addReply(c,shared.cone);
95242ab5 4712 return;
4713 }
ed9b544e 4714 lobj = createListObject();
4715 list = lobj->ptr;
4716 if (where == REDIS_HEAD) {
6b47e12e 4717 listAddNodeHead(list,c->argv[2]);
ed9b544e 4718 } else {
6b47e12e 4719 listAddNodeTail(list,c->argv[2]);
ed9b544e 4720 }
3305306f 4721 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4722 incrRefCount(c->argv[1]);
4723 incrRefCount(c->argv[2]);
4724 } else {
ed9b544e 4725 if (lobj->type != REDIS_LIST) {
4726 addReply(c,shared.wrongtypeerr);
4727 return;
4728 }
95242ab5 4729 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4730 addReply(c,shared.cone);
95242ab5 4731 return;
4732 }
ed9b544e 4733 list = lobj->ptr;
4734 if (where == REDIS_HEAD) {
6b47e12e 4735 listAddNodeHead(list,c->argv[2]);
ed9b544e 4736 } else {
6b47e12e 4737 listAddNodeTail(list,c->argv[2]);
ed9b544e 4738 }
4739 incrRefCount(c->argv[2]);
4740 }
4741 server.dirty++;
520b5a33 4742 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
ed9b544e 4743}
4744
4745static void lpushCommand(redisClient *c) {
4746 pushGenericCommand(c,REDIS_HEAD);
4747}
4748
4749static void rpushCommand(redisClient *c) {
4750 pushGenericCommand(c,REDIS_TAIL);
4751}
4752
4753static void llenCommand(redisClient *c) {
3305306f 4754 robj *o;
ed9b544e 4755 list *l;
dd88747b 4756
4757 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4758 checkType(c,o,REDIS_LIST)) return;
e0a62c7f 4759
dd88747b 4760 l = o->ptr;
4761 addReplyUlong(c,listLength(l));
ed9b544e 4762}
4763
4764static void lindexCommand(redisClient *c) {
3305306f 4765 robj *o;
ed9b544e 4766 int index = atoi(c->argv[2]->ptr);
dd88747b 4767 list *list;
4768 listNode *ln;
4769
4770 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4771 checkType(c,o,REDIS_LIST)) return;
4772 list = o->ptr;
4773
4774 ln = listIndex(list, index);
4775 if (ln == NULL) {
c937aa89 4776 addReply(c,shared.nullbulk);
ed9b544e 4777 } else {
dd88747b 4778 robj *ele = listNodeValue(ln);
4779 addReplyBulk(c,ele);
ed9b544e 4780 }
4781}
4782
4783static void lsetCommand(redisClient *c) {
3305306f 4784 robj *o;
ed9b544e 4785 int index = atoi(c->argv[2]->ptr);
dd88747b 4786 list *list;
4787 listNode *ln;
4788
4789 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4790 checkType(c,o,REDIS_LIST)) return;
4791 list = o->ptr;
4792
4793 ln = listIndex(list, index);
4794 if (ln == NULL) {
4795 addReply(c,shared.outofrangeerr);
ed9b544e 4796 } else {
dd88747b 4797 robj *ele = listNodeValue(ln);
ed9b544e 4798
dd88747b 4799 decrRefCount(ele);
4800 listNodeValue(ln) = c->argv[3];
4801 incrRefCount(c->argv[3]);
4802 addReply(c,shared.ok);
4803 server.dirty++;
ed9b544e 4804 }
4805}
4806
4807static void popGenericCommand(redisClient *c, int where) {
3305306f 4808 robj *o;
dd88747b 4809 list *list;
4810 listNode *ln;
3305306f 4811
dd88747b 4812 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4813 checkType(c,o,REDIS_LIST)) return;
4814 list = o->ptr;
ed9b544e 4815
dd88747b 4816 if (where == REDIS_HEAD)
4817 ln = listFirst(list);
4818 else
4819 ln = listLast(list);
ed9b544e 4820
dd88747b 4821 if (ln == NULL) {
4822 addReply(c,shared.nullbulk);
4823 } else {
4824 robj *ele = listNodeValue(ln);
4825 addReplyBulk(c,ele);
4826 listDelNode(list,ln);
3ea27d37 4827 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4828 server.dirty++;
ed9b544e 4829 }
4830}
4831
4832static void lpopCommand(redisClient *c) {
4833 popGenericCommand(c,REDIS_HEAD);
4834}
4835
4836static void rpopCommand(redisClient *c) {
4837 popGenericCommand(c,REDIS_TAIL);
4838}
4839
4840static void lrangeCommand(redisClient *c) {
3305306f 4841 robj *o;
ed9b544e 4842 int start = atoi(c->argv[2]->ptr);
4843 int end = atoi(c->argv[3]->ptr);
dd88747b 4844 int llen;
4845 int rangelen, j;
4846 list *list;
4847 listNode *ln;
4848 robj *ele;
4849
4e27f268 4850 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4851 || checkType(c,o,REDIS_LIST)) return;
dd88747b 4852 list = o->ptr;
4853 llen = listLength(list);
4854
4855 /* convert negative indexes */
4856 if (start < 0) start = llen+start;
4857 if (end < 0) end = llen+end;
4858 if (start < 0) start = 0;
4859 if (end < 0) end = 0;
4860
4861 /* indexes sanity checks */
4862 if (start > end || start >= llen) {
4863 /* Out of range start or start > end result in empty list */
4864 addReply(c,shared.emptymultibulk);
4865 return;
4866 }
4867 if (end >= llen) end = llen-1;
4868 rangelen = (end-start)+1;
3305306f 4869
dd88747b 4870 /* Return the result in form of a multi-bulk reply */
4871 ln = listIndex(list, start);
4872 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4873 for (j = 0; j < rangelen; j++) {
4874 ele = listNodeValue(ln);
4875 addReplyBulk(c,ele);
4876 ln = ln->next;
ed9b544e 4877 }
4878}
4879
4880static void ltrimCommand(redisClient *c) {
3305306f 4881 robj *o;
ed9b544e 4882 int start = atoi(c->argv[2]->ptr);
4883 int end = atoi(c->argv[3]->ptr);
dd88747b 4884 int llen;
4885 int j, ltrim, rtrim;
4886 list *list;
4887 listNode *ln;
4888
4889 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4890 checkType(c,o,REDIS_LIST)) return;
4891 list = o->ptr;
4892 llen = listLength(list);
4893
4894 /* convert negative indexes */
4895 if (start < 0) start = llen+start;
4896 if (end < 0) end = llen+end;
4897 if (start < 0) start = 0;
4898 if (end < 0) end = 0;
4899
4900 /* indexes sanity checks */
4901 if (start > end || start >= llen) {
4902 /* Out of range start or start > end result in empty list */
4903 ltrim = llen;
4904 rtrim = 0;
ed9b544e 4905 } else {
dd88747b 4906 if (end >= llen) end = llen-1;
4907 ltrim = start;
4908 rtrim = llen-end-1;
4909 }
ed9b544e 4910
dd88747b 4911 /* Remove list elements to perform the trim */
4912 for (j = 0; j < ltrim; j++) {
4913 ln = listFirst(list);
4914 listDelNode(list,ln);
4915 }
4916 for (j = 0; j < rtrim; j++) {
4917 ln = listLast(list);
4918 listDelNode(list,ln);
ed9b544e 4919 }
3ea27d37 4920 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4921 server.dirty++;
4922 addReply(c,shared.ok);
ed9b544e 4923}
4924
4925static void lremCommand(redisClient *c) {
3305306f 4926 robj *o;
dd88747b 4927 list *list;
4928 listNode *ln, *next;
4929 int toremove = atoi(c->argv[2]->ptr);
4930 int removed = 0;
4931 int fromtail = 0;
a4d1ba9a 4932
dd88747b 4933 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4934 checkType(c,o,REDIS_LIST)) return;
4935 list = o->ptr;
4936
4937 if (toremove < 0) {
4938 toremove = -toremove;
4939 fromtail = 1;
4940 }
4941 ln = fromtail ? list->tail : list->head;
4942 while (ln) {
4943 robj *ele = listNodeValue(ln);
4944
4945 next = fromtail ? ln->prev : ln->next;
bf028098 4946 if (equalStringObjects(ele,c->argv[3])) {
dd88747b 4947 listDelNode(list,ln);
4948 server.dirty++;
4949 removed++;
4950 if (toremove && removed == toremove) break;
ed9b544e 4951 }
dd88747b 4952 ln = next;
ed9b544e 4953 }
3ea27d37 4954 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4955 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 4956}
4957
12f9d551 4958/* This is the semantic of this command:
0f5f7e9a 4959 * RPOPLPUSH srclist dstlist:
12f9d551 4960 * IF LLEN(srclist) > 0
4961 * element = RPOP srclist
4962 * LPUSH dstlist element
4963 * RETURN element
4964 * ELSE
4965 * RETURN nil
4966 * END
4967 * END
4968 *
4969 * The idea is to be able to get an element from a list in a reliable way
4970 * since the element is not just returned but pushed against another list
4971 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4972 */
0f5f7e9a 4973static void rpoplpushcommand(redisClient *c) {
12f9d551 4974 robj *sobj;
dd88747b 4975 list *srclist;
4976 listNode *ln;
4977
4978 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4979 checkType(c,sobj,REDIS_LIST)) return;
4980 srclist = sobj->ptr;
4981 ln = listLast(srclist);
12f9d551 4982
dd88747b 4983 if (ln == NULL) {
12f9d551 4984 addReply(c,shared.nullbulk);
4985 } else {
dd88747b 4986 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4987 robj *ele = listNodeValue(ln);
4988 list *dstlist;
e20fb74f 4989
dd88747b 4990 if (dobj && dobj->type != REDIS_LIST) {
4991 addReply(c,shared.wrongtypeerr);
4992 return;
4993 }
12f9d551 4994
dd88747b 4995 /* Add the element to the target list (unless it's directly
4996 * passed to some BLPOP-ing client */
4997 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4998 if (dobj == NULL) {
4999 /* Create the list if the key does not exist */
5000 dobj = createListObject();
5001 dictAdd(c->db->dict,c->argv[2],dobj);
5002 incrRefCount(c->argv[2]);
12f9d551 5003 }
dd88747b 5004 dstlist = dobj->ptr;
5005 listAddNodeHead(dstlist,ele);
5006 incrRefCount(ele);
12f9d551 5007 }
dd88747b 5008
5009 /* Send the element to the client as reply as well */
5010 addReplyBulk(c,ele);
5011
5012 /* Finally remove the element from the source list */
5013 listDelNode(srclist,ln);
3ea27d37 5014 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5015 server.dirty++;
12f9d551 5016 }
5017}
5018
ed9b544e 5019/* ==================================== Sets ================================ */
5020
5021static void saddCommand(redisClient *c) {
ed9b544e 5022 robj *set;
5023
3305306f 5024 set = lookupKeyWrite(c->db,c->argv[1]);
5025 if (set == NULL) {
ed9b544e 5026 set = createSetObject();
3305306f 5027 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 5028 incrRefCount(c->argv[1]);
5029 } else {
ed9b544e 5030 if (set->type != REDIS_SET) {
c937aa89 5031 addReply(c,shared.wrongtypeerr);
ed9b544e 5032 return;
5033 }
5034 }
5035 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
5036 incrRefCount(c->argv[2]);
5037 server.dirty++;
c937aa89 5038 addReply(c,shared.cone);
ed9b544e 5039 } else {
c937aa89 5040 addReply(c,shared.czero);
ed9b544e 5041 }
5042}
5043
5044static void sremCommand(redisClient *c) {
3305306f 5045 robj *set;
ed9b544e 5046
dd88747b 5047 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5048 checkType(c,set,REDIS_SET)) return;
5049
5050 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
5051 server.dirty++;
5052 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5053 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5054 addReply(c,shared.cone);
ed9b544e 5055 } else {
dd88747b 5056 addReply(c,shared.czero);
ed9b544e 5057 }
5058}
5059
a4460ef4 5060static void smoveCommand(redisClient *c) {
5061 robj *srcset, *dstset;
5062
5063 srcset = lookupKeyWrite(c->db,c->argv[1]);
5064 dstset = lookupKeyWrite(c->db,c->argv[2]);
5065
5066 /* If the source key does not exist return 0, if it's of the wrong type
5067 * raise an error */
5068 if (srcset == NULL || srcset->type != REDIS_SET) {
5069 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
5070 return;
5071 }
5072 /* Error if the destination key is not a set as well */
5073 if (dstset && dstset->type != REDIS_SET) {
5074 addReply(c,shared.wrongtypeerr);
5075 return;
5076 }
5077 /* Remove the element from the source set */
5078 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
5079 /* Key not found in the src set! return zero */
5080 addReply(c,shared.czero);
5081 return;
5082 }
3ea27d37 5083 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
5084 deleteKey(c->db,c->argv[1]);
a4460ef4 5085 server.dirty++;
5086 /* Add the element to the destination set */
5087 if (!dstset) {
5088 dstset = createSetObject();
5089 dictAdd(c->db->dict,c->argv[2],dstset);
5090 incrRefCount(c->argv[2]);
5091 }
5092 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
5093 incrRefCount(c->argv[3]);
5094 addReply(c,shared.cone);
5095}
5096
ed9b544e 5097static void sismemberCommand(redisClient *c) {
3305306f 5098 robj *set;
ed9b544e 5099
dd88747b 5100 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5101 checkType(c,set,REDIS_SET)) return;
5102
5103 if (dictFind(set->ptr,c->argv[2]))
5104 addReply(c,shared.cone);
5105 else
c937aa89 5106 addReply(c,shared.czero);
ed9b544e 5107}
5108
5109static void scardCommand(redisClient *c) {
3305306f 5110 robj *o;
ed9b544e 5111 dict *s;
dd88747b 5112
5113 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5114 checkType(c,o,REDIS_SET)) return;
e0a62c7f 5115
dd88747b 5116 s = o->ptr;
5117 addReplyUlong(c,dictSize(s));
ed9b544e 5118}
5119
12fea928 5120static void spopCommand(redisClient *c) {
5121 robj *set;
5122 dictEntry *de;
5123
dd88747b 5124 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5125 checkType(c,set,REDIS_SET)) return;
5126
5127 de = dictGetRandomKey(set->ptr);
5128 if (de == NULL) {
12fea928 5129 addReply(c,shared.nullbulk);
5130 } else {
dd88747b 5131 robj *ele = dictGetEntryKey(de);
12fea928 5132
dd88747b 5133 addReplyBulk(c,ele);
5134 dictDelete(set->ptr,ele);
5135 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5136 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5137 server.dirty++;
12fea928 5138 }
5139}
5140
2abb95a9 5141static void srandmemberCommand(redisClient *c) {
5142 robj *set;
5143 dictEntry *de;
5144
dd88747b 5145 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5146 checkType(c,set,REDIS_SET)) return;
5147
5148 de = dictGetRandomKey(set->ptr);
5149 if (de == NULL) {
2abb95a9 5150 addReply(c,shared.nullbulk);
5151 } else {
dd88747b 5152 robj *ele = dictGetEntryKey(de);
2abb95a9 5153
dd88747b 5154 addReplyBulk(c,ele);
2abb95a9 5155 }
5156}
5157
ed9b544e 5158static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5159 dict **d1 = (void*) s1, **d2 = (void*) s2;
5160
3305306f 5161 return dictSize(*d1)-dictSize(*d2);
ed9b544e 5162}
5163
682ac724 5164static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 5165 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5166 dictIterator *di;
5167 dictEntry *de;
5168 robj *lenobj = NULL, *dstset = NULL;
682ac724 5169 unsigned long j, cardinality = 0;
ed9b544e 5170
ed9b544e 5171 for (j = 0; j < setsnum; j++) {
5172 robj *setobj;
3305306f 5173
5174 setobj = dstkey ?
5175 lookupKeyWrite(c->db,setskeys[j]) :
5176 lookupKeyRead(c->db,setskeys[j]);
5177 if (!setobj) {
ed9b544e 5178 zfree(dv);
5faa6025 5179 if (dstkey) {
fdcaae84 5180 if (deleteKey(c->db,dstkey))
5181 server.dirty++;
0d36ded0 5182 addReply(c,shared.czero);
5faa6025 5183 } else {
4e27f268 5184 addReply(c,shared.emptymultibulk);
5faa6025 5185 }
ed9b544e 5186 return;
5187 }
ed9b544e 5188 if (setobj->type != REDIS_SET) {
5189 zfree(dv);
c937aa89 5190 addReply(c,shared.wrongtypeerr);
ed9b544e 5191 return;
5192 }
5193 dv[j] = setobj->ptr;
5194 }
5195 /* Sort sets from the smallest to largest, this will improve our
5196 * algorithm's performace */
5197 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5198
5199 /* The first thing we should output is the total number of elements...
5200 * since this is a multi-bulk write, but at this stage we don't know
5201 * the intersection set size, so we use a trick, append an empty object
5202 * to the output list and save the pointer to later modify it with the
5203 * right length */
5204 if (!dstkey) {
5205 lenobj = createObject(REDIS_STRING,NULL);
5206 addReply(c,lenobj);
5207 decrRefCount(lenobj);
5208 } else {
5209 /* If we have a target key where to store the resulting set
5210 * create this key with an empty set inside */
5211 dstset = createSetObject();
ed9b544e 5212 }
5213
5214 /* Iterate all the elements of the first (smallest) set, and test
5215 * the element against all the other sets, if at least one set does
5216 * not include the element it is discarded */
5217 di = dictGetIterator(dv[0]);
ed9b544e 5218
5219 while((de = dictNext(di)) != NULL) {
5220 robj *ele;
5221
5222 for (j = 1; j < setsnum; j++)
5223 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5224 if (j != setsnum)
5225 continue; /* at least one set does not contain the member */
5226 ele = dictGetEntryKey(de);
5227 if (!dstkey) {
dd88747b 5228 addReplyBulk(c,ele);
ed9b544e 5229 cardinality++;
5230 } else {
5231 dictAdd(dstset->ptr,ele,NULL);
5232 incrRefCount(ele);
5233 }
5234 }
5235 dictReleaseIterator(di);
5236
83cdfe18 5237 if (dstkey) {
3ea27d37 5238 /* Store the resulting set into the target, if the intersection
5239 * is not an empty set. */
83cdfe18 5240 deleteKey(c->db,dstkey);
3ea27d37 5241 if (dictSize((dict*)dstset->ptr) > 0) {
5242 dictAdd(c->db->dict,dstkey,dstset);
5243 incrRefCount(dstkey);
d36c4e97 5244 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5245 } else {
5246 decrRefCount(dstset);
d36c4e97 5247 addReply(c,shared.czero);
3ea27d37 5248 }
40d224a9 5249 server.dirty++;
d36c4e97 5250 } else {
5251 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 5252 }
ed9b544e 5253 zfree(dv);
5254}
5255
5256static void sinterCommand(redisClient *c) {
5257 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5258}
5259
5260static void sinterstoreCommand(redisClient *c) {
5261 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5262}
5263
f4f56e1d 5264#define REDIS_OP_UNION 0
5265#define REDIS_OP_DIFF 1
2830ca53 5266#define REDIS_OP_INTER 2
f4f56e1d 5267
5268static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 5269 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5270 dictIterator *di;
5271 dictEntry *de;
f4f56e1d 5272 robj *dstset = NULL;
40d224a9 5273 int j, cardinality = 0;
5274
40d224a9 5275 for (j = 0; j < setsnum; j++) {
5276 robj *setobj;
5277
5278 setobj = dstkey ?
5279 lookupKeyWrite(c->db,setskeys[j]) :
5280 lookupKeyRead(c->db,setskeys[j]);
5281 if (!setobj) {
5282 dv[j] = NULL;
5283 continue;
5284 }
5285 if (setobj->type != REDIS_SET) {
5286 zfree(dv);
5287 addReply(c,shared.wrongtypeerr);
5288 return;
5289 }
5290 dv[j] = setobj->ptr;
5291 }
5292
5293 /* We need a temp set object to store our union. If the dstkey
5294 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5295 * this set object will be the resulting object to set into the target key*/
5296 dstset = createSetObject();
5297
40d224a9 5298 /* Iterate all the elements of all the sets, add every element a single
5299 * time to the result set */
5300 for (j = 0; j < setsnum; j++) {
51829ed3 5301 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 5302 if (!dv[j]) continue; /* non existing keys are like empty sets */
5303
5304 di = dictGetIterator(dv[j]);
40d224a9 5305
5306 while((de = dictNext(di)) != NULL) {
5307 robj *ele;
5308
5309 /* dictAdd will not add the same element multiple times */
5310 ele = dictGetEntryKey(de);
f4f56e1d 5311 if (op == REDIS_OP_UNION || j == 0) {
5312 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5313 incrRefCount(ele);
40d224a9 5314 cardinality++;
5315 }
f4f56e1d 5316 } else if (op == REDIS_OP_DIFF) {
5317 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5318 cardinality--;
5319 }
40d224a9 5320 }
5321 }
5322 dictReleaseIterator(di);
51829ed3 5323
d36c4e97 5324 /* result set is empty? Exit asap. */
5325 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5326 }
5327
f4f56e1d 5328 /* Output the content of the resulting set, if not in STORE mode */
5329 if (!dstkey) {
5330 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5331 di = dictGetIterator(dstset->ptr);
f4f56e1d 5332 while((de = dictNext(di)) != NULL) {
5333 robj *ele;
5334
5335 ele = dictGetEntryKey(de);
dd88747b 5336 addReplyBulk(c,ele);
f4f56e1d 5337 }
5338 dictReleaseIterator(di);
d36c4e97 5339 decrRefCount(dstset);
83cdfe18
AG
5340 } else {
5341 /* If we have a target key where to store the resulting set
5342 * create this key with the result set inside */
5343 deleteKey(c->db,dstkey);
3ea27d37 5344 if (dictSize((dict*)dstset->ptr) > 0) {
5345 dictAdd(c->db->dict,dstkey,dstset);
5346 incrRefCount(dstkey);
d36c4e97 5347 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5348 } else {
5349 decrRefCount(dstset);
d36c4e97 5350 addReply(c,shared.czero);
3ea27d37 5351 }
40d224a9 5352 server.dirty++;
5353 }
5354 zfree(dv);
5355}
5356
5357static void sunionCommand(redisClient *c) {
f4f56e1d 5358 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 5359}
5360
5361static void sunionstoreCommand(redisClient *c) {
f4f56e1d 5362 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5363}
5364
5365static void sdiffCommand(redisClient *c) {
5366 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5367}
5368
5369static void sdiffstoreCommand(redisClient *c) {
5370 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 5371}
5372
6b47e12e 5373/* ==================================== ZSets =============================== */
5374
5375/* ZSETs are ordered sets using two data structures to hold the same elements
5376 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5377 * data structure.
5378 *
5379 * The elements are added to a hash table mapping Redis objects to scores.
5380 * At the same time the elements are added to a skip list mapping scores
5381 * to Redis objects (so objects are sorted by scores in this "view"). */
5382
5383/* This skiplist implementation is almost a C translation of the original
5384 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5385 * Alternative to Balanced Trees", modified in three ways:
5386 * a) this implementation allows for repeated values.
5387 * b) the comparison is not just by key (our 'score') but by satellite data.
5388 * c) there is a back pointer, so it's a doubly linked list with the back
5389 * pointers being only at "level 1". This allows to traverse the list
5390 * from tail to head, useful for ZREVRANGE. */
5391
5392static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5393 zskiplistNode *zn = zmalloc(sizeof(*zn));
5394
5395 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2b37892e
PN
5396 if (level > 0)
5397 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
6b47e12e 5398 zn->score = score;
5399 zn->obj = obj;
5400 return zn;
5401}
5402
5403static zskiplist *zslCreate(void) {
5404 int j;
5405 zskiplist *zsl;
e0a62c7f 5406
6b47e12e 5407 zsl = zmalloc(sizeof(*zsl));
5408 zsl->level = 1;
cc812361 5409 zsl->length = 0;
6b47e12e 5410 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5411 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5412 zsl->header->forward[j] = NULL;
94e543b5 5413
5414 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5415 if (j < ZSKIPLIST_MAXLEVEL-1)
5416 zsl->header->span[j] = 0;
69d95c3e 5417 }
e3870fab 5418 zsl->header->backward = NULL;
5419 zsl->tail = NULL;
6b47e12e 5420 return zsl;
5421}
5422
fd8ccf44 5423static void zslFreeNode(zskiplistNode *node) {
5424 decrRefCount(node->obj);
ad807e6f 5425 zfree(node->forward);
69d95c3e 5426 zfree(node->span);
fd8ccf44 5427 zfree(node);
5428}
5429
5430static void zslFree(zskiplist *zsl) {
ad807e6f 5431 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5432
ad807e6f 5433 zfree(zsl->header->forward);
69d95c3e 5434 zfree(zsl->header->span);
ad807e6f 5435 zfree(zsl->header);
fd8ccf44 5436 while(node) {
599379dd 5437 next = node->forward[0];
fd8ccf44 5438 zslFreeNode(node);
5439 node = next;
5440 }
ad807e6f 5441 zfree(zsl);
fd8ccf44 5442}
5443
6b47e12e 5444static int zslRandomLevel(void) {
5445 int level = 1;
5446 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5447 level += 1;
10c2baa5 5448 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5449}
5450
5451static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5452 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5453 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5454 int i, level;
5455
5456 x = zsl->header;
5457 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5458 /* store rank that is crossed to reach the insert position */
5459 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5460
9d60e6e4 5461 while (x->forward[i] &&
5462 (x->forward[i]->score < score ||
5463 (x->forward[i]->score == score &&
69d95c3e 5464 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5465 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5466 x = x->forward[i];
69d95c3e 5467 }
6b47e12e 5468 update[i] = x;
5469 }
6b47e12e 5470 /* we assume the key is not already inside, since we allow duplicated
5471 * scores, and the re-insertion of score and redis object should never
5472 * happpen since the caller of zslInsert() should test in the hash table
5473 * if the element is already inside or not. */
5474 level = zslRandomLevel();
5475 if (level > zsl->level) {
69d95c3e 5476 for (i = zsl->level; i < level; i++) {
2b37892e 5477 rank[i] = 0;
6b47e12e 5478 update[i] = zsl->header;
2b37892e 5479 update[i]->span[i-1] = zsl->length;
69d95c3e 5480 }
6b47e12e 5481 zsl->level = level;
5482 }
5483 x = zslCreateNode(level,score,obj);
5484 for (i = 0; i < level; i++) {
5485 x->forward[i] = update[i]->forward[i];
5486 update[i]->forward[i] = x;
69d95c3e
PN
5487
5488 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5489 if (i > 0) {
5490 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5491 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5492 }
6b47e12e 5493 }
69d95c3e
PN
5494
5495 /* increment span for untouched levels */
5496 for (i = level; i < zsl->level; i++) {
2b37892e 5497 update[i]->span[i-1]++;
69d95c3e
PN
5498 }
5499
bb975144 5500 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5501 if (x->forward[0])
5502 x->forward[0]->backward = x;
5503 else
5504 zsl->tail = x;
cc812361 5505 zsl->length++;
6b47e12e 5506}
5507
84105336
PN
5508/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5509void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5510 int i;
5511 for (i = 0; i < zsl->level; i++) {
5512 if (update[i]->forward[i] == x) {
5513 if (i > 0) {
5514 update[i]->span[i-1] += x->span[i-1] - 1;
5515 }
5516 update[i]->forward[i] = x->forward[i];
5517 } else {
5518 /* invariant: i > 0, because update[0]->forward[0]
5519 * is always equal to x */
5520 update[i]->span[i-1] -= 1;
5521 }
5522 }
5523 if (x->forward[0]) {
5524 x->forward[0]->backward = x->backward;
5525 } else {
5526 zsl->tail = x->backward;
5527 }
5528 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5529 zsl->level--;
5530 zsl->length--;
5531}
5532
50c55df5 5533/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5534static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5535 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5536 int i;
5537
5538 x = zsl->header;
5539 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5540 while (x->forward[i] &&
5541 (x->forward[i]->score < score ||
5542 (x->forward[i]->score == score &&
5543 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5544 x = x->forward[i];
5545 update[i] = x;
5546 }
5547 /* We may have multiple elements with the same score, what we need
5548 * is to find the element with both the right score and object. */
5549 x = x->forward[0];
bf028098 5550 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
84105336 5551 zslDeleteNode(zsl, x, update);
9d60e6e4 5552 zslFreeNode(x);
9d60e6e4 5553 return 1;
5554 } else {
5555 return 0; /* not found */
e197b441 5556 }
5557 return 0; /* not found */
fd8ccf44 5558}
5559
1807985b 5560/* Delete all the elements with score between min and max from the skiplist.
5561 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5562 * Note that this function takes the reference to the hash table view of the
5563 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5564static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5565 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5566 unsigned long removed = 0;
5567 int i;
5568
5569 x = zsl->header;
5570 for (i = zsl->level-1; i >= 0; i--) {
5571 while (x->forward[i] && x->forward[i]->score < min)
5572 x = x->forward[i];
5573 update[i] = x;
5574 }
5575 /* We may have multiple elements with the same score, what we need
5576 * is to find the element with both the right score and object. */
5577 x = x->forward[0];
5578 while (x && x->score <= max) {
84105336
PN
5579 zskiplistNode *next = x->forward[0];
5580 zslDeleteNode(zsl, x, update);
1807985b 5581 dictDelete(dict,x->obj);
5582 zslFreeNode(x);
1807985b 5583 removed++;
5584 x = next;
5585 }
5586 return removed; /* not found */
5587}
1807985b 5588
9212eafd 5589/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5590 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5591static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5592 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5593 unsigned long traversed = 0, removed = 0;
5594 int i;
5595
9212eafd
PN
5596 x = zsl->header;
5597 for (i = zsl->level-1; i >= 0; i--) {
5598 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5599 traversed += i > 0 ? x->span[i-1] : 1;
5600 x = x->forward[i];
1807985b 5601 }
9212eafd
PN
5602 update[i] = x;
5603 }
5604
5605 traversed++;
5606 x = x->forward[0];
5607 while (x && traversed <= end) {
84105336
PN
5608 zskiplistNode *next = x->forward[0];
5609 zslDeleteNode(zsl, x, update);
1807985b 5610 dictDelete(dict,x->obj);
5611 zslFreeNode(x);
1807985b 5612 removed++;
9212eafd 5613 traversed++;
1807985b 5614 x = next;
5615 }
9212eafd 5616 return removed;
1807985b 5617}
5618
50c55df5 5619/* Find the first node having a score equal or greater than the specified one.
5620 * Returns NULL if there is no match. */
5621static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5622 zskiplistNode *x;
5623 int i;
5624
5625 x = zsl->header;
5626 for (i = zsl->level-1; i >= 0; i--) {
5627 while (x->forward[i] && x->forward[i]->score < score)
5628 x = x->forward[i];
5629 }
5630 /* We may have multiple elements with the same score, what we need
5631 * is to find the element with both the right score and object. */
5632 return x->forward[0];
5633}
5634
27b0ccca
PN
5635/* Find the rank for an element by both score and key.
5636 * Returns 0 when the element cannot be found, rank otherwise.
5637 * Note that the rank is 1-based due to the span of zsl->header to the
5638 * first element. */
5639static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5640 zskiplistNode *x;
5641 unsigned long rank = 0;
5642 int i;
5643
5644 x = zsl->header;
5645 for (i = zsl->level-1; i >= 0; i--) {
5646 while (x->forward[i] &&
5647 (x->forward[i]->score < score ||
5648 (x->forward[i]->score == score &&
5649 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5650 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5651 x = x->forward[i];
5652 }
5653
5654 /* x might be equal to zsl->header, so test if obj is non-NULL */
bf028098 5655 if (x->obj && equalStringObjects(x->obj,o)) {
27b0ccca
PN
5656 return rank;
5657 }
5658 }
5659 return 0;
5660}
5661
e74825c2
PN
5662/* Finds an element by its rank. The rank argument needs to be 1-based. */
5663zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5664 zskiplistNode *x;
5665 unsigned long traversed = 0;
5666 int i;
5667
5668 x = zsl->header;
5669 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 5670 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5671 {
a50ea45c 5672 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5673 x = x->forward[i];
5674 }
e74825c2
PN
5675 if (traversed == rank) {
5676 return x;
5677 }
5678 }
5679 return NULL;
5680}
5681
fd8ccf44 5682/* The actual Z-commands implementations */
5683
7db723ad 5684/* This generic command implements both ZADD and ZINCRBY.
e2665397 5685 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5686 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5687static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5688 robj *zsetobj;
5689 zset *zs;
5690 double *score;
5691
e2665397 5692 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5693 if (zsetobj == NULL) {
5694 zsetobj = createZsetObject();
e2665397 5695 dictAdd(c->db->dict,key,zsetobj);
5696 incrRefCount(key);
fd8ccf44 5697 } else {
5698 if (zsetobj->type != REDIS_ZSET) {
5699 addReply(c,shared.wrongtypeerr);
5700 return;
5701 }
5702 }
fd8ccf44 5703 zs = zsetobj->ptr;
e2665397 5704
7db723ad 5705 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5706 * needs to handle the two different conditions. It's all about setting
5707 * '*score', that is, the new score to set, to the right value. */
5708 score = zmalloc(sizeof(double));
5709 if (doincrement) {
5710 dictEntry *de;
5711
5712 /* Read the old score. If the element was not present starts from 0 */
5713 de = dictFind(zs->dict,ele);
5714 if (de) {
5715 double *oldscore = dictGetEntryVal(de);
5716 *score = *oldscore + scoreval;
5717 } else {
5718 *score = scoreval;
5719 }
5720 } else {
5721 *score = scoreval;
5722 }
5723
5724 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5725 * to both ZADD and ZINCRBY... */
e2665397 5726 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5727 /* case 1: New element */
e2665397 5728 incrRefCount(ele); /* added to hash */
5729 zslInsert(zs->zsl,*score,ele);
5730 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5731 server.dirty++;
e2665397 5732 if (doincrement)
e2665397 5733 addReplyDouble(c,*score);
91d71bfc 5734 else
5735 addReply(c,shared.cone);
fd8ccf44 5736 } else {
5737 dictEntry *de;
5738 double *oldscore;
e0a62c7f 5739
fd8ccf44 5740 /* case 2: Score update operation */
e2665397 5741 de = dictFind(zs->dict,ele);
dfc5e96c 5742 redisAssert(de != NULL);
fd8ccf44 5743 oldscore = dictGetEntryVal(de);
5744 if (*score != *oldscore) {
5745 int deleted;
5746
e2665397 5747 /* Remove and insert the element in the skip list with new score */
5748 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5749 redisAssert(deleted != 0);
e2665397 5750 zslInsert(zs->zsl,*score,ele);
5751 incrRefCount(ele);
5752 /* Update the score in the hash table */
5753 dictReplace(zs->dict,ele,score);
fd8ccf44 5754 server.dirty++;
2161a965 5755 } else {
5756 zfree(score);
fd8ccf44 5757 }
e2665397 5758 if (doincrement)
5759 addReplyDouble(c,*score);
5760 else
5761 addReply(c,shared.czero);
fd8ccf44 5762 }
5763}
5764
e2665397 5765static void zaddCommand(redisClient *c) {
5766 double scoreval;
5767
bd79a6bd 5768 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5769 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5770}
5771
7db723ad 5772static void zincrbyCommand(redisClient *c) {
e2665397 5773 double scoreval;
5774
bd79a6bd 5775 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5776 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5777}
5778
1b7106e7 5779static void zremCommand(redisClient *c) {
5780 robj *zsetobj;
5781 zset *zs;
dd88747b 5782 dictEntry *de;
5783 double *oldscore;
5784 int deleted;
1b7106e7 5785
dd88747b 5786 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5787 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 5788
dd88747b 5789 zs = zsetobj->ptr;
5790 de = dictFind(zs->dict,c->argv[2]);
5791 if (de == NULL) {
5792 addReply(c,shared.czero);
5793 return;
1b7106e7 5794 }
dd88747b 5795 /* Delete from the skiplist */
5796 oldscore = dictGetEntryVal(de);
5797 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5798 redisAssert(deleted != 0);
5799
5800 /* Delete from the hash table */
5801 dictDelete(zs->dict,c->argv[2]);
5802 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5803 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5804 server.dirty++;
5805 addReply(c,shared.cone);
1b7106e7 5806}
5807
1807985b 5808static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
5809 double min;
5810 double max;
dd88747b 5811 long deleted;
1807985b 5812 robj *zsetobj;
5813 zset *zs;
5814
bd79a6bd
PN
5815 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5816 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
bbe025e0 5817
dd88747b 5818 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5819 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 5820
dd88747b 5821 zs = zsetobj->ptr;
5822 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5823 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5824 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5825 server.dirty += deleted;
5826 addReplyLong(c,deleted);
1807985b 5827}
5828
9212eafd 5829static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
5830 long start;
5831 long end;
dd88747b 5832 int llen;
5833 long deleted;
9212eafd
PN
5834 robj *zsetobj;
5835 zset *zs;
5836
bd79a6bd
PN
5837 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5838 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 5839
dd88747b 5840 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5841 checkType(c,zsetobj,REDIS_ZSET)) return;
5842 zs = zsetobj->ptr;
5843 llen = zs->zsl->length;
9212eafd 5844
dd88747b 5845 /* convert negative indexes */
5846 if (start < 0) start = llen+start;
5847 if (end < 0) end = llen+end;
5848 if (start < 0) start = 0;
5849 if (end < 0) end = 0;
9212eafd 5850
dd88747b 5851 /* indexes sanity checks */
5852 if (start > end || start >= llen) {
5853 addReply(c,shared.czero);
5854 return;
9212eafd 5855 }
dd88747b 5856 if (end >= llen) end = llen-1;
5857
5858 /* increment start and end because zsl*Rank functions
5859 * use 1-based rank */
5860 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5861 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5862 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5863 server.dirty += deleted;
5864 addReplyLong(c, deleted);
9212eafd
PN
5865}
5866
8f92e768
PN
5867typedef struct {
5868 dict *dict;
5869 double weight;
5870} zsetopsrc;
5871
5872static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5873 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5874 unsigned long size1, size2;
5875 size1 = d1->dict ? dictSize(d1->dict) : 0;
5876 size2 = d2->dict ? dictSize(d2->dict) : 0;
5877 return size1 - size2;
5878}
5879
/* How scores of the same member coming from different inputs are combined. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Fold 'val' into '*target' according to 'aggregate': add it for SUM, keep
 * the smaller of the two for MIN, the larger for MAX. Any other aggregate
 * value triggers redisPanic(). */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
    }
}
5896
2830ca53 5897static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
8f92e768 5898 int i, j, zsetnum;
d2764cd6 5899 int aggregate = REDIS_AGGR_SUM;
8f92e768 5900 zsetopsrc *src;
2830ca53
PN
5901 robj *dstobj;
5902 zset *dstzset;
b287c9bb
PN
5903 dictIterator *di;
5904 dictEntry *de;
5905
2830ca53
PN
5906 /* expect zsetnum input keys to be given */
5907 zsetnum = atoi(c->argv[2]->ptr);
5908 if (zsetnum < 1) {
5909 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5910 return;
b287c9bb 5911 }
2830ca53
PN
5912
5913 /* test if the expected number of keys would overflow */
5914 if (3+zsetnum > c->argc) {
b287c9bb
PN
5915 addReply(c,shared.syntaxerr);
5916 return;
5917 }
5918
2830ca53 5919 /* read keys to be used for input */
b9eed483 5920 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
2830ca53 5921 for (i = 0, j = 3; i < zsetnum; i++, j++) {
b287c9bb
PN
5922 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5923 if (!zsetobj) {
8f92e768 5924 src[i].dict = NULL;
b287c9bb
PN
5925 } else {
5926 if (zsetobj->type != REDIS_ZSET) {
8f92e768 5927 zfree(src);
b287c9bb
PN
5928 addReply(c,shared.wrongtypeerr);
5929 return;
5930 }
8f92e768 5931 src[i].dict = ((zset*)zsetobj->ptr)->dict;
b287c9bb 5932 }
2830ca53
PN
5933
5934 /* default all weights to 1 */
8f92e768 5935 src[i].weight = 1.0;
b287c9bb
PN
5936 }
5937
2830ca53
PN
5938 /* parse optional extra arguments */
5939 if (j < c->argc) {
d2764cd6 5940 int remaining = c->argc - j;
b287c9bb 5941
2830ca53 5942 while (remaining) {
d2764cd6 5943 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 5944 j++; remaining--;
2830ca53 5945 for (i = 0; i < zsetnum; i++, j++, remaining--) {
bd79a6bd 5946 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
bbe025e0 5947 return;
2830ca53 5948 }
d2764cd6
PN
5949 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5950 j++; remaining--;
5951 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5952 aggregate = REDIS_AGGR_SUM;
5953 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5954 aggregate = REDIS_AGGR_MIN;
5955 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5956 aggregate = REDIS_AGGR_MAX;
5957 } else {
5958 zfree(src);
5959 addReply(c,shared.syntaxerr);
5960 return;
5961 }
5962 j++; remaining--;
2830ca53 5963 } else {
8f92e768 5964 zfree(src);
2830ca53
PN
5965 addReply(c,shared.syntaxerr);
5966 return;
5967 }
5968 }
5969 }
b287c9bb 5970
d2764cd6
PN
5971 /* sort sets from the smallest to largest, this will improve our
5972 * algorithm's performance */
5973 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5974
2830ca53
PN
5975 dstobj = createZsetObject();
5976 dstzset = dstobj->ptr;
5977
5978 if (op == REDIS_OP_INTER) {
8f92e768
PN
5979 /* skip going over all entries if the smallest zset is NULL or empty */
5980 if (src[0].dict && dictSize(src[0].dict) > 0) {
5981 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5982 * from small to large, all src[i > 0].dict are non-empty too */
5983 di = dictGetIterator(src[0].dict);
2830ca53 5984 while((de = dictNext(di)) != NULL) {
d2764cd6
PN
5985 double *score = zmalloc(sizeof(double)), value;
5986 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5987
d2764cd6
PN
5988 for (j = 1; j < zsetnum; j++) {
5989 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5990 if (other) {
d2764cd6
PN
5991 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5992 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5993 } else {
5994 break;
5995 }
5996 }
b287c9bb 5997
2830ca53 5998 /* skip entry when not present in every source dict */
8f92e768 5999 if (j != zsetnum) {
2830ca53
PN
6000 zfree(score);
6001 } else {
6002 robj *o = dictGetEntryKey(de);
6003 dictAdd(dstzset->dict,o,score);
6004 incrRefCount(o); /* added to dictionary */
6005 zslInsert(dstzset->zsl,*score,o);
6006 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
6007 }
6008 }
2830ca53
PN
6009 dictReleaseIterator(di);
6010 }
6011 } else if (op == REDIS_OP_UNION) {
6012 for (i = 0; i < zsetnum; i++) {
8f92e768 6013 if (!src[i].dict) continue;
2830ca53 6014
8f92e768 6015 di = dictGetIterator(src[i].dict);
2830ca53
PN
6016 while((de = dictNext(di)) != NULL) {
6017 /* skip key when already processed */
6018 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6019
d2764cd6
PN
6020 double *score = zmalloc(sizeof(double)), value;
6021 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
2830ca53 6022
d2764cd6
PN
6023 /* because the zsets are sorted by size, its only possible
6024 * for sets at larger indices to hold this entry */
6025 for (j = (i+1); j < zsetnum; j++) {
6026 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6027 if (other) {
d2764cd6
PN
6028 value = src[j].weight * (*(double*)dictGetEntryVal(other));
6029 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6030 }
6031 }
b287c9bb 6032
2830ca53
PN
6033 robj *o = dictGetEntryKey(de);
6034 dictAdd(dstzset->dict,o,score);
6035 incrRefCount(o); /* added to dictionary */
6036 zslInsert(dstzset->zsl,*score,o);
6037 incrRefCount(o); /* added to skiplist */
6038 }
6039 dictReleaseIterator(di);
b287c9bb 6040 }
2830ca53
PN
6041 } else {
6042 /* unknown operator */
6043 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
6044 }
6045
6046 deleteKey(c->db,dstkey);
3ea27d37 6047 if (dstzset->zsl->length) {
6048 dictAdd(c->db->dict,dstkey,dstobj);
6049 incrRefCount(dstkey);
6050 addReplyLong(c, dstzset->zsl->length);
6051 server.dirty++;
6052 } else {
8bca8773 6053 decrRefCount(dstobj);
3ea27d37 6054 addReply(c, shared.czero);
6055 }
8f92e768 6056 zfree(src);
b287c9bb
PN
6057}
6058
2830ca53
PN
6059static void zunionCommand(redisClient *c) {
6060 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
6061}
6062
2830ca53
PN
6063static void zinterCommand(redisClient *c) {
6064 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
6065}
6066
e3870fab 6067static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 6068 robj *o;
bbe025e0
AM
6069 long start;
6070 long end;
752da584 6071 int withscores = 0;
dd88747b 6072 int llen;
6073 int rangelen, j;
6074 zset *zsetobj;
6075 zskiplist *zsl;
6076 zskiplistNode *ln;
6077 robj *ele;
752da584 6078
bd79a6bd
PN
6079 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6080 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6081
752da584 6082 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6083 withscores = 1;
6084 } else if (c->argc >= 5) {
6085 addReply(c,shared.syntaxerr);
6086 return;
6087 }
cc812361 6088
4e27f268 6089 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6090 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 6091 zsetobj = o->ptr;
6092 zsl = zsetobj->zsl;
6093 llen = zsl->length;
cc812361 6094
dd88747b 6095 /* convert negative indexes */
6096 if (start < 0) start = llen+start;
6097 if (end < 0) end = llen+end;
6098 if (start < 0) start = 0;
6099 if (end < 0) end = 0;
cc812361 6100
dd88747b 6101 /* indexes sanity checks */
6102 if (start > end || start >= llen) {
6103 /* Out of range start or start > end result in empty list */
6104 addReply(c,shared.emptymultibulk);
6105 return;
6106 }
6107 if (end >= llen) end = llen-1;
6108 rangelen = (end-start)+1;
cc812361 6109
dd88747b 6110 /* check if starting point is trivial, before searching
6111 * the element in log(N) time */
6112 if (reverse) {
6113 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
6114 } else {
6115 ln = start == 0 ?
6116 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
6117 }
cc812361 6118
dd88747b 6119 /* Return the result in form of a multi-bulk reply */
6120 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6121 withscores ? (rangelen*2) : rangelen));
6122 for (j = 0; j < rangelen; j++) {
6123 ele = ln->obj;
6124 addReplyBulk(c,ele);
6125 if (withscores)
6126 addReplyDouble(c,ln->score);
6127 ln = reverse ? ln->backward : ln->forward[0];
cc812361 6128 }
6129}
6130
e3870fab 6131static void zrangeCommand(redisClient *c) {
6132 zrangeGenericCommand(c,0);
6133}
6134
6135static void zrevrangeCommand(redisClient *c) {
6136 zrangeGenericCommand(c,1);
6137}
6138
f44dd428 6139/* This command implements both ZRANGEBYSCORE and ZCOUNT.
6140 * If justcount is non-zero, just the count is returned. */
6141static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 6142 robj *o;
f44dd428 6143 double min, max;
6144 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 6145 int offset = 0, limit = -1;
0500ef27
SH
6146 int withscores = 0;
6147 int badsyntax = 0;
6148
f44dd428 6149 /* Parse the min-max interval. If one of the values is prefixed
6150 * by the "(" character, it's considered "open". For instance
6151 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6152 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6153 if (((char*)c->argv[2]->ptr)[0] == '(') {
6154 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6155 minex = 1;
6156 } else {
6157 min = strtod(c->argv[2]->ptr,NULL);
6158 }
6159 if (((char*)c->argv[3]->ptr)[0] == '(') {
6160 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6161 maxex = 1;
6162 } else {
6163 max = strtod(c->argv[3]->ptr,NULL);
6164 }
6165
6166 /* Parse "WITHSCORES": note that if the command was called with
6167 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6168 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 6169 if (c->argc == 5 || c->argc == 8) {
3a3978b1 6170 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6171 withscores = 1;
6172 else
6173 badsyntax = 1;
0500ef27 6174 }
3a3978b1 6175 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 6176 badsyntax = 1;
0500ef27 6177 if (badsyntax) {
454d4e43 6178 addReplySds(c,
6179 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 6180 return;
0500ef27
SH
6181 }
6182
f44dd428 6183 /* Parse "LIMIT" */
0500ef27 6184 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 6185 addReply(c,shared.syntaxerr);
6186 return;
0500ef27 6187 } else if (c->argc == (7 + withscores)) {
80181f78 6188 offset = atoi(c->argv[5]->ptr);
6189 limit = atoi(c->argv[6]->ptr);
0b13687c 6190 if (offset < 0) offset = 0;
80181f78 6191 }
50c55df5 6192
f44dd428 6193 /* Ok, lookup the key and get the range */
50c55df5 6194 o = lookupKeyRead(c->db,c->argv[1]);
6195 if (o == NULL) {
4e27f268 6196 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6197 } else {
6198 if (o->type != REDIS_ZSET) {
6199 addReply(c,shared.wrongtypeerr);
6200 } else {
6201 zset *zsetobj = o->ptr;
6202 zskiplist *zsl = zsetobj->zsl;
6203 zskiplistNode *ln;
f44dd428 6204 robj *ele, *lenobj = NULL;
6205 unsigned long rangelen = 0;
50c55df5 6206
f44dd428 6207 /* Get the first node with the score >= min, or with
6208 * score > min if 'minex' is true. */
50c55df5 6209 ln = zslFirstWithScore(zsl,min);
f44dd428 6210 while (minex && ln && ln->score == min) ln = ln->forward[0];
6211
50c55df5 6212 if (ln == NULL) {
6213 /* No element matching the speciifed interval */
f44dd428 6214 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6215 return;
6216 }
6217
6218 /* We don't know in advance how many matching elements there
6219 * are in the list, so we push this object that will represent
6220 * the multi-bulk length in the output buffer, and will "fix"
6221 * it later */
f44dd428 6222 if (!justcount) {
6223 lenobj = createObject(REDIS_STRING,NULL);
6224 addReply(c,lenobj);
6225 decrRefCount(lenobj);
6226 }
50c55df5 6227
f44dd428 6228 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 6229 if (offset) {
6230 offset--;
6231 ln = ln->forward[0];
6232 continue;
6233 }
6234 if (limit == 0) break;
f44dd428 6235 if (!justcount) {
6236 ele = ln->obj;
dd88747b 6237 addReplyBulk(c,ele);
f44dd428 6238 if (withscores)
6239 addReplyDouble(c,ln->score);
6240 }
50c55df5 6241 ln = ln->forward[0];
6242 rangelen++;
80181f78 6243 if (limit > 0) limit--;
50c55df5 6244 }
f44dd428 6245 if (justcount) {
6246 addReplyLong(c,(long)rangelen);
6247 } else {
6248 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6249 withscores ? (rangelen*2) : rangelen);
6250 }
50c55df5 6251 }
6252 }
6253}
6254
f44dd428 6255static void zrangebyscoreCommand(redisClient *c) {
6256 genericZrangebyscoreCommand(c,0);
6257}
6258
6259static void zcountCommand(redisClient *c) {
6260 genericZrangebyscoreCommand(c,1);
6261}
6262
3c41331e 6263static void zcardCommand(redisClient *c) {
e197b441 6264 robj *o;
6265 zset *zs;
dd88747b 6266
6267 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6268 checkType(c,o,REDIS_ZSET)) return;
6269
6270 zs = o->ptr;
6271 addReplyUlong(c,zs->zsl->length);
e197b441 6272}
6273
6e333bbe 6274static void zscoreCommand(redisClient *c) {
6275 robj *o;
6276 zset *zs;
dd88747b 6277 dictEntry *de;
6278
6279 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6280 checkType(c,o,REDIS_ZSET)) return;
6281
6282 zs = o->ptr;
6283 de = dictFind(zs->dict,c->argv[2]);
6284 if (!de) {
96d8b4ee 6285 addReply(c,shared.nullbulk);
6e333bbe 6286 } else {
dd88747b 6287 double *score = dictGetEntryVal(de);
6e333bbe 6288
dd88747b 6289 addReplyDouble(c,*score);
6e333bbe 6290 }
6291}
6292
798d9e55 6293static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 6294 robj *o;
dd88747b 6295 zset *zs;
6296 zskiplist *zsl;
6297 dictEntry *de;
6298 unsigned long rank;
6299 double *score;
6300
6301 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6302 checkType(c,o,REDIS_ZSET)) return;
6303
6304 zs = o->ptr;
6305 zsl = zs->zsl;
6306 de = dictFind(zs->dict,c->argv[2]);
6307 if (!de) {
69d95c3e
PN
6308 addReply(c,shared.nullbulk);
6309 return;
6310 }
69d95c3e 6311
dd88747b 6312 score = dictGetEntryVal(de);
6313 rank = zslGetRank(zsl, *score, c->argv[2]);
6314 if (rank) {
6315 if (reverse) {
6316 addReplyLong(c, zsl->length - rank);
27b0ccca 6317 } else {
dd88747b 6318 addReplyLong(c, rank-1);
69d95c3e 6319 }
dd88747b 6320 } else {
6321 addReply(c,shared.nullbulk);
978c2c94 6322 }
6323}
6324
798d9e55
PN
6325static void zrankCommand(redisClient *c) {
6326 zrankGenericCommand(c, 0);
6327}
6328
6329static void zrevrankCommand(redisClient *c) {
6330 zrankGenericCommand(c, 1);
6331}
6332
7fb16bac
PN
6333/* ========================= Hashes utility functions ======================= */
6334#define REDIS_HASH_KEY 1
6335#define REDIS_HASH_VALUE 2
978c2c94 6336
7fb16bac
PN
6337/* Check the length of a number of objects to see if we need to convert a
6338 * zipmap to a real hash. Note that we only check string encoded objects
6339 * as their string length can be queried in constant time. */
6340static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6341 int i;
6342 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
978c2c94 6343
7fb16bac
PN
6344 for (i = start; i <= end; i++) {
6345 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6346 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6347 {
6348 convertToRealHash(subject);
978c2c94 6349 return;
6350 }
6351 }
7fb16bac 6352}
bae2c7ec 6353
97224de7
PN
6354/* Encode given objects in-place when the hash uses a dict. */
6355static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6356 if (subject->encoding == REDIS_ENCODING_HT) {
3f973463
PN
6357 if (o1) *o1 = tryObjectEncoding(*o1);
6358 if (o2) *o2 = tryObjectEncoding(*o2);
97224de7
PN
6359 }
6360}
6361
7fb16bac 6362/* Get the value from a hash identified by key. Returns either a string
a3f3af86
PN
6363 * object or NULL if the value cannot be found. The refcount of the object
6364 * is always increased by 1 when the value was found. */
7fb16bac
PN
6365static robj *hashGet(robj *o, robj *key) {
6366 robj *value = NULL;
978c2c94 6367 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7fb16bac
PN
6368 unsigned char *v;
6369 unsigned int vlen;
6370 key = getDecodedObject(key);
6371 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6372 value = createStringObject((char*)v,vlen);
6373 }
6374 decrRefCount(key);
6375 } else {
6376 dictEntry *de = dictFind(o->ptr,key);
6377 if (de != NULL) {
6378 value = dictGetEntryVal(de);
a3f3af86 6379 incrRefCount(value);
7fb16bac
PN
6380 }
6381 }
6382 return value;
6383}
978c2c94 6384
7fb16bac
PN
6385/* Test if the key exists in the given hash. Returns 1 if the key
6386 * exists and 0 when it doesn't. */
6387static int hashExists(robj *o, robj *key) {
6388 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6389 key = getDecodedObject(key);
6390 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6391 decrRefCount(key);
6392 return 1;
6393 }
6394 decrRefCount(key);
6395 } else {
6396 if (dictFind(o->ptr,key) != NULL) {
6397 return 1;
6398 }
6399 }
6400 return 0;
6401}
bae2c7ec 6402
7fb16bac
PN
6403/* Add an element, discard the old if the key already exists.
6404 * Return 0 on insert and 1 on update. */
feb8d7e6 6405static int hashSet(robj *o, robj *key, robj *value) {
7fb16bac
PN
6406 int update = 0;
6407 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6408 key = getDecodedObject(key);
6409 value = getDecodedObject(value);
6410 o->ptr = zipmapSet(o->ptr,
6411 key->ptr,sdslen(key->ptr),
6412 value->ptr,sdslen(value->ptr), &update);
6413 decrRefCount(key);
6414 decrRefCount(value);
6415
6416 /* Check if the zipmap needs to be upgraded to a real hash table */
6417 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
bae2c7ec 6418 convertToRealHash(o);
978c2c94 6419 } else {
7fb16bac
PN
6420 if (dictReplace(o->ptr,key,value)) {
6421 /* Insert */
6422 incrRefCount(key);
978c2c94 6423 } else {
7fb16bac 6424 /* Update */
978c2c94 6425 update = 1;
6426 }
7fb16bac 6427 incrRefCount(value);
978c2c94 6428 }
7fb16bac 6429 return update;
978c2c94 6430}
6431
7fb16bac
PN
6432/* Delete an element from a hash.
6433 * Return 1 on deleted and 0 on not found. */
6434static int hashDelete(robj *o, robj *key) {
6435 int deleted = 0;
6436 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6437 key = getDecodedObject(key);
6438 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6439 decrRefCount(key);
6440 } else {
6441 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6442 /* Always check if the dictionary needs a resize after a delete. */
6443 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
d33278d1 6444 }
7fb16bac
PN
6445 return deleted;
6446}
d33278d1 6447
7fb16bac 6448/* Return the number of elements in a hash. */
c811bb38 6449static unsigned long hashLength(robj *o) {
7fb16bac
PN
6450 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6451 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6452}
6453
6454/* Structure to hold hash iteration abstration. Note that iteration over
6455 * hashes involves both fields and values. Because it is possible that
6456 * not both are required, store pointers in the iterator to avoid
6457 * unnecessary memory allocation for fields/values. */
6458typedef struct {
6459 int encoding;
6460 unsigned char *zi;
6461 unsigned char *zk, *zv;
6462 unsigned int zklen, zvlen;
6463
6464 dictIterator *di;
6465 dictEntry *de;
6466} hashIterator;
6467
c44d3b56
PN
6468static hashIterator *hashInitIterator(robj *subject) {
6469 hashIterator *hi = zmalloc(sizeof(hashIterator));
7fb16bac
PN
6470 hi->encoding = subject->encoding;
6471 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6472 hi->zi = zipmapRewind(subject->ptr);
6473 } else if (hi->encoding == REDIS_ENCODING_HT) {
6474 hi->di = dictGetIterator(subject->ptr);
d33278d1 6475 } else {
7fb16bac 6476 redisAssert(NULL);
d33278d1 6477 }
c44d3b56 6478 return hi;
7fb16bac 6479}
d33278d1 6480
7fb16bac
PN
6481static void hashReleaseIterator(hashIterator *hi) {
6482 if (hi->encoding == REDIS_ENCODING_HT) {
6483 dictReleaseIterator(hi->di);
d33278d1 6484 }
c44d3b56 6485 zfree(hi);
7fb16bac 6486}
d33278d1 6487
7fb16bac
PN
6488/* Move to the next entry in the hash. Return REDIS_OK when the next entry
6489 * could be found and REDIS_ERR when the iterator reaches the end. */
c811bb38 6490static int hashNext(hashIterator *hi) {
7fb16bac
PN
6491 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6492 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6493 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6494 } else {
6495 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6496 }
6497 return REDIS_OK;
6498}
d33278d1 6499
0c390abc 6500/* Get key or value object at current iteration position.
a3f3af86 6501 * This increases the refcount of the field object by 1. */
c811bb38 6502static robj *hashCurrent(hashIterator *hi, int what) {
7fb16bac
PN
6503 robj *o;
6504 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6505 if (what & REDIS_HASH_KEY) {
6506 o = createStringObject((char*)hi->zk,hi->zklen);
6507 } else {
6508 o = createStringObject((char*)hi->zv,hi->zvlen);
d33278d1 6509 }
d33278d1 6510 } else {
7fb16bac
PN
6511 if (what & REDIS_HASH_KEY) {
6512 o = dictGetEntryKey(hi->de);
6513 } else {
6514 o = dictGetEntryVal(hi->de);
d33278d1 6515 }
a3f3af86 6516 incrRefCount(o);
d33278d1 6517 }
7fb16bac 6518 return o;
d33278d1
PN
6519}
6520
7fb16bac
PN
6521static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6522 robj *o = lookupKeyWrite(c->db,key);
01426b05
PN
6523 if (o == NULL) {
6524 o = createHashObject();
7fb16bac
PN
6525 dictAdd(c->db->dict,key,o);
6526 incrRefCount(key);
01426b05
PN
6527 } else {
6528 if (o->type != REDIS_HASH) {
6529 addReply(c,shared.wrongtypeerr);
7fb16bac 6530 return NULL;
01426b05
PN
6531 }
6532 }
7fb16bac
PN
6533 return o;
6534}
01426b05 6535
7fb16bac
PN
6536/* ============================= Hash commands ============================== */
6537static void hsetCommand(redisClient *c) {
6e9e463f 6538 int update;
7fb16bac 6539 robj *o;
bbe025e0 6540
7fb16bac
PN
6541 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6542 hashTryConversion(o,c->argv,2,3);
97224de7 6543 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6544 update = hashSet(o,c->argv[2],c->argv[3]);
6e9e463f 6545 addReply(c, update ? shared.czero : shared.cone);
7fb16bac
PN
6546 server.dirty++;
6547}
01426b05 6548
1f1c7695
PN
6549static void hsetnxCommand(redisClient *c) {
6550 robj *o;
6551 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6552 hashTryConversion(o,c->argv,2,3);
6553
6554 if (hashExists(o, c->argv[2])) {
6555 addReply(c, shared.czero);
01426b05 6556 } else {
97224de7 6557 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6558 hashSet(o,c->argv[2],c->argv[3]);
1f1c7695
PN
6559 addReply(c, shared.cone);
6560 server.dirty++;
6561 }
6562}
01426b05 6563
7fb16bac
PN
6564static void hmsetCommand(redisClient *c) {
6565 int i;
6566 robj *o;
01426b05 6567
7fb16bac
PN
6568 if ((c->argc % 2) == 1) {
6569 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6570 return;
6571 }
01426b05 6572
7fb16bac
PN
6573 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6574 hashTryConversion(o,c->argv,2,c->argc-1);
6575 for (i = 2; i < c->argc; i += 2) {
97224de7 6576 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
feb8d7e6 6577 hashSet(o,c->argv[i],c->argv[i+1]);
7fb16bac
PN
6578 }
6579 addReply(c, shared.ok);
edc2f63a 6580 server.dirty++;
7fb16bac
PN
6581}
6582
6583static void hincrbyCommand(redisClient *c) {
6584 long long value, incr;
6585 robj *o, *current, *new;
6586
bd79a6bd 6587 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7fb16bac
PN
6588 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6589 if ((current = hashGet(o,c->argv[2])) != NULL) {
946342c1
PN
6590 if (getLongLongFromObjectOrReply(c,current,&value,
6591 "hash value is not an integer") != REDIS_OK) {
6592 decrRefCount(current);
6593 return;
6594 }
a3f3af86 6595 decrRefCount(current);
7fb16bac
PN
6596 } else {
6597 value = 0;
01426b05
PN
6598 }
6599
7fb16bac 6600 value += incr;
3f973463
PN
6601 new = createStringObjectFromLongLong(value);
6602 hashTryObjectEncoding(o,&c->argv[2],NULL);
feb8d7e6 6603 hashSet(o,c->argv[2],new);
7fb16bac
PN
6604 decrRefCount(new);
6605 addReplyLongLong(c,value);
01426b05 6606 server.dirty++;
01426b05
PN
6607}
6608
978c2c94 6609static void hgetCommand(redisClient *c) {
7fb16bac 6610 robj *o, *value;
dd88747b 6611 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6612 checkType(c,o,REDIS_HASH)) return;
6613
7fb16bac
PN
6614 if ((value = hashGet(o,c->argv[2])) != NULL) {
6615 addReplyBulk(c,value);
a3f3af86 6616 decrRefCount(value);
dd88747b 6617 } else {
7fb16bac 6618 addReply(c,shared.nullbulk);
69d95c3e 6619 }
69d95c3e
PN
6620}
6621
09aeb579
PN
6622static void hmgetCommand(redisClient *c) {
6623 int i;
7fb16bac
PN
6624 robj *o, *value;
6625 o = lookupKeyRead(c->db,c->argv[1]);
6626 if (o != NULL && o->type != REDIS_HASH) {
6627 addReply(c,shared.wrongtypeerr);
09aeb579
PN
6628 }
6629
7fb16bac
PN
6630 /* Note the check for o != NULL happens inside the loop. This is
6631 * done because objects that cannot be found are considered to be
6632 * an empty hash. The reply should then be a series of NULLs. */
09aeb579 6633 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7fb16bac
PN
6634 for (i = 2; i < c->argc; i++) {
6635 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6636 addReplyBulk(c,value);
a3f3af86 6637 decrRefCount(value);
7fb16bac
PN
6638 } else {
6639 addReply(c,shared.nullbulk);
09aeb579
PN
6640 }
6641 }
6642}
6643
07efaf74 6644static void hdelCommand(redisClient *c) {
dd88747b 6645 robj *o;
dd88747b 6646 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6647 checkType(c,o,REDIS_HASH)) return;
07efaf74 6648
7fb16bac
PN
6649 if (hashDelete(o,c->argv[2])) {
6650 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6651 addReply(c,shared.cone);
6652 server.dirty++;
dd88747b 6653 } else {
7fb16bac 6654 addReply(c,shared.czero);
07efaf74 6655 }
6656}
6657
92b27fe9 6658static void hlenCommand(redisClient *c) {
6659 robj *o;
dd88747b 6660 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 6661 checkType(c,o,REDIS_HASH)) return;
6662
7fb16bac 6663 addReplyUlong(c,hashLength(o));
92b27fe9 6664}
6665
78409a0f 6666static void genericHgetallCommand(redisClient *c, int flags) {
7fb16bac 6667 robj *o, *lenobj, *obj;
78409a0f 6668 unsigned long count = 0;
c44d3b56 6669 hashIterator *hi;
78409a0f 6670
4e27f268 6671 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 6672 || checkType(c,o,REDIS_HASH)) return;
6673
6674 lenobj = createObject(REDIS_STRING,NULL);
6675 addReply(c,lenobj);
6676 decrRefCount(lenobj);
6677
c44d3b56
PN
6678 hi = hashInitIterator(o);
6679 while (hashNext(hi) != REDIS_ERR) {
7fb16bac 6680 if (flags & REDIS_HASH_KEY) {
c44d3b56 6681 obj = hashCurrent(hi,REDIS_HASH_KEY);
7fb16bac 6682 addReplyBulk(c,obj);
a3f3af86 6683 decrRefCount(obj);
7fb16bac 6684 count++;
78409a0f 6685 }
7fb16bac 6686 if (flags & REDIS_HASH_VALUE) {
c44d3b56 6687 obj = hashCurrent(hi,REDIS_HASH_VALUE);
7fb16bac 6688 addReplyBulk(c,obj);
a3f3af86 6689 decrRefCount(obj);
7fb16bac 6690 count++;
78409a0f 6691 }
78409a0f 6692 }
c44d3b56 6693 hashReleaseIterator(hi);
7fb16bac 6694
78409a0f 6695 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6696}
6697
6698static void hkeysCommand(redisClient *c) {
7fb16bac 6699 genericHgetallCommand(c,REDIS_HASH_KEY);
78409a0f 6700}
6701
6702static void hvalsCommand(redisClient *c) {
7fb16bac 6703 genericHgetallCommand(c,REDIS_HASH_VALUE);
78409a0f 6704}
6705
6706static void hgetallCommand(redisClient *c) {
7fb16bac 6707 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
78409a0f 6708}
6709
a86f14b1 6710static void hexistsCommand(redisClient *c) {
6711 robj *o;
a86f14b1 6712 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6713 checkType(c,o,REDIS_HASH)) return;
6714
7fb16bac 6715 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
a86f14b1 6716}
6717
ada386b2 6718static void convertToRealHash(robj *o) {
6719 unsigned char *key, *val, *p, *zm = o->ptr;
6720 unsigned int klen, vlen;
6721 dict *dict = dictCreate(&hashDictType,NULL);
6722
6723 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6724 p = zipmapRewind(zm);
6725 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6726 robj *keyobj, *valobj;
6727
6728 keyobj = createStringObject((char*)key,klen);
6729 valobj = createStringObject((char*)val,vlen);
05df7621 6730 keyobj = tryObjectEncoding(keyobj);
6731 valobj = tryObjectEncoding(valobj);
ada386b2 6732 dictAdd(dict,keyobj,valobj);
6733 }
6734 o->encoding = REDIS_ENCODING_HT;
6735 o->ptr = dict;
6736 zfree(zm);
6737}
6738
6b47e12e 6739/* ========================= Non type-specific commands ==================== */
6740
ed9b544e 6741static void flushdbCommand(redisClient *c) {
ca37e9cd 6742 server.dirty += dictSize(c->db->dict);
3305306f 6743 dictEmpty(c->db->dict);
6744 dictEmpty(c->db->expires);
ed9b544e 6745 addReply(c,shared.ok);
ed9b544e 6746}
6747
6748static void flushallCommand(redisClient *c) {
ca37e9cd 6749 server.dirty += emptyDb();
ed9b544e 6750 addReply(c,shared.ok);
500ece7c 6751 if (server.bgsavechildpid != -1) {
6752 kill(server.bgsavechildpid,SIGKILL);
6753 rdbRemoveTempFile(server.bgsavechildpid);
6754 }
f78fd11b 6755 rdbSave(server.dbfilename);
ca37e9cd 6756 server.dirty++;
ed9b544e 6757}
6758
56906eef 6759static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 6760 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 6761 so->type = type;
6762 so->pattern = pattern;
6763 return so;
6764}
6765
6766/* Return the value associated to the key with a name obtained
55017f9d
PN
6767 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6768 * The returned object will always have its refcount increased by 1
6769 * when it is non-NULL. */
56906eef 6770static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6d7d1370 6771 char *p, *f;
ed9b544e 6772 sds spat, ssub;
6d7d1370
PN
6773 robj keyobj, fieldobj, *o;
6774 int prefixlen, sublen, postfixlen, fieldlen;
ed9b544e 6775 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6776 struct {
f1017b3f 6777 long len;
6778 long free;
ed9b544e 6779 char buf[REDIS_SORTKEY_MAX+1];
6d7d1370 6780 } keyname, fieldname;
ed9b544e 6781
28173a49 6782 /* If the pattern is "#" return the substitution object itself in order
6783 * to implement the "SORT ... GET #" feature. */
6784 spat = pattern->ptr;
6785 if (spat[0] == '#' && spat[1] == '\0') {
55017f9d 6786 incrRefCount(subst);
28173a49 6787 return subst;
6788 }
6789
6790 /* The substitution object may be specially encoded. If so we create
9d65a1bb 6791 * a decoded object on the fly. Otherwise getDecodedObject will just
6792 * increment the ref count, that we'll decrement later. */
6793 subst = getDecodedObject(subst);
942a3961 6794
ed9b544e 6795 ssub = subst->ptr;
6796 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6797 p = strchr(spat,'*');
ed5a857a 6798 if (!p) {
6799 decrRefCount(subst);
6800 return NULL;
6801 }
ed9b544e 6802
6d7d1370
PN
6803 /* Find out if we're dealing with a hash dereference. */
6804 if ((f = strstr(p+1, "->")) != NULL) {
6805 fieldlen = sdslen(spat)-(f-spat);
6806 /* this also copies \0 character */
6807 memcpy(fieldname.buf,f+2,fieldlen-1);
6808 fieldname.len = fieldlen-2;
6809 } else {
6810 fieldlen = 0;
6811 }
6812
ed9b544e 6813 prefixlen = p-spat;
6814 sublen = sdslen(ssub);
6d7d1370 6815 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
ed9b544e 6816 memcpy(keyname.buf,spat,prefixlen);
6817 memcpy(keyname.buf+prefixlen,ssub,sublen);
6818 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6819 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6820 keyname.len = prefixlen+sublen+postfixlen;
942a3961 6821 decrRefCount(subst);
6822
6d7d1370
PN
6823 /* Lookup substituted key */
6824 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6825 o = lookupKeyRead(db,&keyobj);
55017f9d
PN
6826 if (o == NULL) return NULL;
6827
6828 if (fieldlen > 0) {
6829 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6d7d1370 6830
705dad38
PN
6831 /* Retrieve value from hash by the field name. This operation
6832 * already increases the refcount of the returned object. */
6d7d1370
PN
6833 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6834 o = hashGet(o, &fieldobj);
705dad38 6835 } else {
55017f9d 6836 if (o->type != REDIS_STRING) return NULL;
b6f07345 6837
705dad38
PN
6838 /* Every object that this function returns needs to have its refcount
6839 * increased. sortCommand decreases it again. */
6840 incrRefCount(o);
6d7d1370
PN
6841 }
6842
6843 return o;
ed9b544e 6844}
6845
6846/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6847 * the additional parameter is not standard but a BSD-specific we have to
6848 * pass sorting parameters via the global 'server' structure */
6849static int sortCompare(const void *s1, const void *s2) {
6850 const redisSortObject *so1 = s1, *so2 = s2;
6851 int cmp;
6852
6853 if (!server.sort_alpha) {
6854 /* Numeric sorting. Here it's trivial as we precomputed scores */
6855 if (so1->u.score > so2->u.score) {
6856 cmp = 1;
6857 } else if (so1->u.score < so2->u.score) {
6858 cmp = -1;
6859 } else {
6860 cmp = 0;
6861 }
6862 } else {
6863 /* Alphanumeric sorting */
6864 if (server.sort_bypattern) {
6865 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6866 /* At least one compare object is NULL */
6867 if (so1->u.cmpobj == so2->u.cmpobj)
6868 cmp = 0;
6869 else if (so1->u.cmpobj == NULL)
6870 cmp = -1;
6871 else
6872 cmp = 1;
6873 } else {
6874 /* We have both the objects, use strcoll */
6875 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6876 }
6877 } else {
08ee9b57 6878 /* Compare elements directly. */
6879 cmp = compareStringObjects(so1->obj,so2->obj);
ed9b544e 6880 }
6881 }
6882 return server.sort_desc ? -cmp : cmp;
6883}
6884
6885/* The SORT command is the most complex command in Redis. Warning: this code
6886 * is optimized for speed and a bit less for readability */
6887static void sortCommand(redisClient *c) {
ed9b544e 6888 list *operations;
6889 int outputlen = 0;
6890 int desc = 0, alpha = 0;
6891 int limit_start = 0, limit_count = -1, start, end;
6892 int j, dontsort = 0, vectorlen;
6893 int getop = 0; /* GET operation counter */
443c6409 6894 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 6895 redisSortObject *vector; /* Resulting vector to sort */
6896
6897 /* Lookup the key to sort. It must be of the right types */
3305306f 6898 sortval = lookupKeyRead(c->db,c->argv[1]);
6899 if (sortval == NULL) {
4e27f268 6900 addReply(c,shared.emptymultibulk);
ed9b544e 6901 return;
6902 }
a5eb649b 6903 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6904 sortval->type != REDIS_ZSET)
6905 {
c937aa89 6906 addReply(c,shared.wrongtypeerr);
ed9b544e 6907 return;
6908 }
6909
6910 /* Create a list of operations to perform for every sorted element.
6911 * Operations can be GET/DEL/INCR/DECR */
6912 operations = listCreate();
092dac2a 6913 listSetFreeMethod(operations,zfree);
ed9b544e 6914 j = 2;
6915
6916 /* Now we need to protect sortval incrementing its count, in the future
6917 * SORT may have options able to overwrite/delete keys during the sorting
6918 * and the sorted key itself may get destroied */
6919 incrRefCount(sortval);
6920
6921 /* The SORT command has an SQL-alike syntax, parse it */
6922 while(j < c->argc) {
6923 int leftargs = c->argc-j-1;
6924 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6925 desc = 0;
6926 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6927 desc = 1;
6928 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6929 alpha = 1;
6930 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6931 limit_start = atoi(c->argv[j+1]->ptr);
6932 limit_count = atoi(c->argv[j+2]->ptr);
6933 j+=2;
443c6409 6934 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6935 storekey = c->argv[j+1];
6936 j++;
ed9b544e 6937 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6938 sortby = c->argv[j+1];
6939 /* If the BY pattern does not contain '*', i.e. it is constant,
6940 * we don't need to sort nor to lookup the weight keys. */
6941 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6942 j++;
6943 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6944 listAddNodeTail(operations,createSortOperation(
6945 REDIS_SORT_GET,c->argv[j+1]));
6946 getop++;
6947 j++;
ed9b544e 6948 } else {
6949 decrRefCount(sortval);
6950 listRelease(operations);
c937aa89 6951 addReply(c,shared.syntaxerr);
ed9b544e 6952 return;
6953 }
6954 j++;
6955 }
6956
6957 /* Load the sorting vector with all the objects to sort */
a5eb649b 6958 switch(sortval->type) {
6959 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6960 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6961 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
f83c6cb5 6962 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
a5eb649b 6963 }
ed9b544e 6964 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 6965 j = 0;
a5eb649b 6966
ed9b544e 6967 if (sortval->type == REDIS_LIST) {
6968 list *list = sortval->ptr;
6208b3a7 6969 listNode *ln;
c7df85a4 6970 listIter li;
6208b3a7 6971
c7df85a4 6972 listRewind(list,&li);
6973 while((ln = listNext(&li))) {
ed9b544e 6974 robj *ele = ln->value;
6975 vector[j].obj = ele;
6976 vector[j].u.score = 0;
6977 vector[j].u.cmpobj = NULL;
ed9b544e 6978 j++;
6979 }
6980 } else {
a5eb649b 6981 dict *set;
ed9b544e 6982 dictIterator *di;
6983 dictEntry *setele;
6984
a5eb649b 6985 if (sortval->type == REDIS_SET) {
6986 set = sortval->ptr;
6987 } else {
6988 zset *zs = sortval->ptr;
6989 set = zs->dict;
6990 }
6991
ed9b544e 6992 di = dictGetIterator(set);
ed9b544e 6993 while((setele = dictNext(di)) != NULL) {
6994 vector[j].obj = dictGetEntryKey(setele);
6995 vector[j].u.score = 0;
6996 vector[j].u.cmpobj = NULL;
6997 j++;
6998 }
6999 dictReleaseIterator(di);
7000 }
dfc5e96c 7001 redisAssert(j == vectorlen);
ed9b544e 7002
7003 /* Now it's time to load the right scores in the sorting vector */
7004 if (dontsort == 0) {
7005 for (j = 0; j < vectorlen; j++) {
6d7d1370 7006 robj *byval;
ed9b544e 7007 if (sortby) {
6d7d1370 7008 /* lookup value to sort by */
3305306f 7009 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
705dad38 7010 if (!byval) continue;
ed9b544e 7011 } else {
6d7d1370
PN
7012 /* use object itself to sort by */
7013 byval = vector[j].obj;
7014 }
7015
7016 if (alpha) {
08ee9b57 7017 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6d7d1370
PN
7018 } else {
7019 if (byval->encoding == REDIS_ENCODING_RAW) {
7020 vector[j].u.score = strtod(byval->ptr,NULL);
16fa22f1 7021 } else if (byval->encoding == REDIS_ENCODING_INT) {
6d7d1370
PN
7022 /* Don't need to decode the object if it's
7023 * integer-encoded (the only encoding supported) so
7024 * far. We can just cast it */
16fa22f1
PN
7025 vector[j].u.score = (long)byval->ptr;
7026 } else {
7027 redisAssert(1 != 1);
942a3961 7028 }
ed9b544e 7029 }
6d7d1370 7030
705dad38
PN
7031 /* when the object was retrieved using lookupKeyByPattern,
7032 * its refcount needs to be decreased. */
7033 if (sortby) {
7034 decrRefCount(byval);
ed9b544e 7035 }
7036 }
7037 }
7038
7039 /* We are ready to sort the vector... perform a bit of sanity check
7040 * on the LIMIT option too. We'll use a partial version of quicksort. */
7041 start = (limit_start < 0) ? 0 : limit_start;
7042 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7043 if (start >= vectorlen) {
7044 start = vectorlen-1;
7045 end = vectorlen-2;
7046 }
7047 if (end >= vectorlen) end = vectorlen-1;
7048
7049 if (dontsort == 0) {
7050 server.sort_desc = desc;
7051 server.sort_alpha = alpha;
7052 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 7053 if (sortby && (start != 0 || end != vectorlen-1))
7054 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7055 else
7056 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 7057 }
7058
7059 /* Send command output to the output buffer, performing the specified
7060 * GET/DEL/INCR/DECR operations if any. */
7061 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 7062 if (storekey == NULL) {
7063 /* STORE option not specified, sent the sorting result to client */
7064 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7065 for (j = start; j <= end; j++) {
7066 listNode *ln;
c7df85a4 7067 listIter li;
7068
dd88747b 7069 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 7070 listRewind(operations,&li);
7071 while((ln = listNext(&li))) {
443c6409 7072 redisSortOperation *sop = ln->value;
7073 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7074 vector[j].obj);
7075
7076 if (sop->type == REDIS_SORT_GET) {
55017f9d 7077 if (!val) {
443c6409 7078 addReply(c,shared.nullbulk);
7079 } else {
dd88747b 7080 addReplyBulk(c,val);
55017f9d 7081 decrRefCount(val);
443c6409 7082 }
7083 } else {
dfc5e96c 7084 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 7085 }
7086 }
ed9b544e 7087 }
443c6409 7088 } else {
7089 robj *listObject = createListObject();
7090 list *listPtr = (list*) listObject->ptr;
7091
7092 /* STORE option specified, set the sorting result as a List object */
7093 for (j = start; j <= end; j++) {
7094 listNode *ln;
c7df85a4 7095 listIter li;
7096
443c6409 7097 if (!getop) {
7098 listAddNodeTail(listPtr,vector[j].obj);
7099 incrRefCount(vector[j].obj);
7100 }
c7df85a4 7101 listRewind(operations,&li);
7102 while((ln = listNext(&li))) {
443c6409 7103 redisSortOperation *sop = ln->value;
7104 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7105 vector[j].obj);
7106
7107 if (sop->type == REDIS_SORT_GET) {
55017f9d 7108 if (!val) {
443c6409 7109 listAddNodeTail(listPtr,createStringObject("",0));
7110 } else {
55017f9d
PN
7111 /* We should do a incrRefCount on val because it is
7112 * added to the list, but also a decrRefCount because
7113 * it is returned by lookupKeyByPattern. This results
7114 * in doing nothing at all. */
443c6409 7115 listAddNodeTail(listPtr,val);
443c6409 7116 }
ed9b544e 7117 } else {
dfc5e96c 7118 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 7119 }
ed9b544e 7120 }
ed9b544e 7121 }
121796f7 7122 if (dictReplace(c->db->dict,storekey,listObject)) {
7123 incrRefCount(storekey);
7124 }
443c6409 7125 /* Note: we add 1 because the DB is dirty anyway since even if the
7126 * SORT result is empty a new key is set and maybe the old content
7127 * replaced. */
7128 server.dirty += 1+outputlen;
7129 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 7130 }
7131
7132 /* Cleanup */
7133 decrRefCount(sortval);
7134 listRelease(operations);
7135 for (j = 0; j < vectorlen; j++) {
16fa22f1 7136 if (alpha && vector[j].u.cmpobj)
ed9b544e 7137 decrRefCount(vector[j].u.cmpobj);
7138 }
7139 zfree(vector);
7140}
7141
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, and so forth.
 *
 * 's' is the destination buffer, 'n' the number of bytes to describe.
 * Values below 1024 are printed as an integer followed by 'B'; larger
 * values are printed with two decimals using 1024-based units (K, M, G,
 * T, P). Every possible value of 'n' falls in one of the branches, so
 * 's' is always written and NUL terminated. */
static void bytesToHuman(char *s, unsigned long long n) {
    const unsigned long long kb = 1024ULL;
    const unsigned long long mb = kb*1024;
    const unsigned long long gb = mb*1024;
    const unsigned long long tb = gb*1024;
    const unsigned long long pb = tb*1024;

    if (n < kb) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < mb) {
        sprintf(s,"%.2fK",(double)n/kb);
    } else if (n < gb) {
        sprintf(s,"%.2fM",(double)n/mb);
    } else if (n < tb) {
        sprintf(s,"%.2fG",(double)n/gb);
    } else if (n < pb) {
        sprintf(s,"%.2fT",(double)n/tb);
    } else {
        /* The largest 64 bit value is below 16384P, so this branch
         * covers everything that is left. */
        sprintf(s,"%.2fP",(double)n/pb);
    }
}
7162
1c85b79f 7163/* Create the string returned by the INFO command. This is decoupled
7164 * by the INFO command itself as we need to report the same information
7165 * on memory corruption problems. */
7166static sds genRedisInfoString(void) {
ed9b544e 7167 sds info;
7168 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 7169 int j;
ec6c7a1d 7170 char hmem[64];
55a8298f 7171
b72f6a4b 7172 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 7173 info = sdscatprintf(sdsempty(),
7174 "redis_version:%s\r\n"
f1017b3f 7175 "arch_bits:%s\r\n"
7a932b74 7176 "multiplexing_api:%s\r\n"
0d7170a4 7177 "process_id:%ld\r\n"
682ac724 7178 "uptime_in_seconds:%ld\r\n"
7179 "uptime_in_days:%ld\r\n"
ed9b544e 7180 "connected_clients:%d\r\n"
7181 "connected_slaves:%d\r\n"
f86a74e9 7182 "blocked_clients:%d\r\n"
5fba9f71 7183 "used_memory:%zu\r\n"
ec6c7a1d 7184 "used_memory_human:%s\r\n"
ed9b544e 7185 "changes_since_last_save:%lld\r\n"
be2bb6b0 7186 "bgsave_in_progress:%d\r\n"
682ac724 7187 "last_save_time:%ld\r\n"
b3fad521 7188 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 7189 "total_connections_received:%lld\r\n"
7190 "total_commands_processed:%lld\r\n"
2a6a2ed1 7191 "expired_keys:%lld\r\n"
55a8298f 7192 "hash_max_zipmap_entries:%ld\r\n"
7193 "hash_max_zipmap_value:%ld\r\n"
ffc6b7f8 7194 "pubsub_channels:%ld\r\n"
7195 "pubsub_patterns:%u\r\n"
7d98e08c 7196 "vm_enabled:%d\r\n"
a0f643ea 7197 "role:%s\r\n"
ed9b544e 7198 ,REDIS_VERSION,
f1017b3f 7199 (sizeof(long) == 8) ? "64" : "32",
7a932b74 7200 aeGetApiName(),
0d7170a4 7201 (long) getpid(),
a0f643ea 7202 uptime,
7203 uptime/(3600*24),
ed9b544e 7204 listLength(server.clients)-listLength(server.slaves),
7205 listLength(server.slaves),
d5d55fc3 7206 server.blpop_blocked_clients,
b72f6a4b 7207 zmalloc_used_memory(),
ec6c7a1d 7208 hmem,
ed9b544e 7209 server.dirty,
9d65a1bb 7210 server.bgsavechildpid != -1,
ed9b544e 7211 server.lastsave,
b3fad521 7212 server.bgrewritechildpid != -1,
ed9b544e 7213 server.stat_numconnections,
7214 server.stat_numcommands,
2a6a2ed1 7215 server.stat_expiredkeys,
55a8298f 7216 server.hash_max_zipmap_entries,
7217 server.hash_max_zipmap_value,
ffc6b7f8 7218 dictSize(server.pubsub_channels),
7219 listLength(server.pubsub_patterns),
7d98e08c 7220 server.vm_enabled != 0,
a0f643ea 7221 server.masterhost == NULL ? "master" : "slave"
ed9b544e 7222 );
a0f643ea 7223 if (server.masterhost) {
7224 info = sdscatprintf(info,
7225 "master_host:%s\r\n"
7226 "master_port:%d\r\n"
7227 "master_link_status:%s\r\n"
7228 "master_last_io_seconds_ago:%d\r\n"
7229 ,server.masterhost,
7230 server.masterport,
7231 (server.replstate == REDIS_REPL_CONNECTED) ?
7232 "up" : "down",
f72b934d 7233 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 7234 );
7235 }
7d98e08c 7236 if (server.vm_enabled) {
1064ef87 7237 lockThreadedIO();
7d98e08c 7238 info = sdscatprintf(info,
7239 "vm_conf_max_memory:%llu\r\n"
7240 "vm_conf_page_size:%llu\r\n"
7241 "vm_conf_pages:%llu\r\n"
7242 "vm_stats_used_pages:%llu\r\n"
7243 "vm_stats_swapped_objects:%llu\r\n"
7244 "vm_stats_swappin_count:%llu\r\n"
7245 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 7246 "vm_stats_io_newjobs_len:%lu\r\n"
7247 "vm_stats_io_processing_len:%lu\r\n"
7248 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 7249 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 7250 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 7251 ,(unsigned long long) server.vm_max_memory,
7252 (unsigned long long) server.vm_page_size,
7253 (unsigned long long) server.vm_pages,
7254 (unsigned long long) server.vm_stats_used_pages,
7255 (unsigned long long) server.vm_stats_swapped_objects,
7256 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 7257 (unsigned long long) server.vm_stats_swapouts,
7258 (unsigned long) listLength(server.io_newjobs),
7259 (unsigned long) listLength(server.io_processing),
7260 (unsigned long) listLength(server.io_processed),
d5d55fc3 7261 (unsigned long) server.io_active_threads,
7262 (unsigned long) server.vm_blocked_clients
7d98e08c 7263 );
1064ef87 7264 unlockThreadedIO();
7d98e08c 7265 }
c3cb078d 7266 for (j = 0; j < server.dbnum; j++) {
7267 long long keys, vkeys;
7268
7269 keys = dictSize(server.db[j].dict);
7270 vkeys = dictSize(server.db[j].expires);
7271 if (keys || vkeys) {
9d65a1bb 7272 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 7273 j, keys, vkeys);
7274 }
7275 }
1c85b79f 7276 return info;
7277}
7278
7279static void infoCommand(redisClient *c) {
7280 sds info = genRedisInfoString();
83c6a618 7281 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7282 (unsigned long)sdslen(info)));
ed9b544e 7283 addReplySds(c,info);
70003d28 7284 addReply(c,shared.crlf);
ed9b544e 7285}
7286
3305306f 7287static void monitorCommand(redisClient *c) {
7288 /* ignore MONITOR if aleady slave or in monitor mode */
7289 if (c->flags & REDIS_SLAVE) return;
7290
7291 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7292 c->slaveseldb = 0;
6b47e12e 7293 listAddNodeTail(server.monitors,c);
3305306f 7294 addReply(c,shared.ok);
7295}
7296
7297/* ================================= Expire ================================= */
7298static int removeExpire(redisDb *db, robj *key) {
7299 if (dictDelete(db->expires,key) == DICT_OK) {
7300 return 1;
7301 } else {
7302 return 0;
7303 }
7304}
7305
7306static int setExpire(redisDb *db, robj *key, time_t when) {
7307 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7308 return 0;
7309 } else {
7310 incrRefCount(key);
7311 return 1;
7312 }
7313}
7314
bb32ede5 7315/* Return the expire time of the specified key, or -1 if no expire
7316 * is associated with this key (i.e. the key is non volatile) */
7317static time_t getExpire(redisDb *db, robj *key) {
7318 dictEntry *de;
7319
7320 /* No expire? return ASAP */
7321 if (dictSize(db->expires) == 0 ||
7322 (de = dictFind(db->expires,key)) == NULL) return -1;
7323
7324 return (time_t) dictGetEntryVal(de);
7325}
7326
3305306f 7327static int expireIfNeeded(redisDb *db, robj *key) {
7328 time_t when;
7329 dictEntry *de;
7330
7331 /* No expire? return ASAP */
7332 if (dictSize(db->expires) == 0 ||
7333 (de = dictFind(db->expires,key)) == NULL) return 0;
7334
7335 /* Lookup the expire */
7336 when = (time_t) dictGetEntryVal(de);
7337 if (time(NULL) <= when) return 0;
7338
7339 /* Delete the key */
7340 dictDelete(db->expires,key);
2a6a2ed1 7341 server.stat_expiredkeys++;
3305306f 7342 return dictDelete(db->dict,key) == DICT_OK;
7343}
7344
7345static int deleteIfVolatile(redisDb *db, robj *key) {
7346 dictEntry *de;
7347
7348 /* No expire? return ASAP */
7349 if (dictSize(db->expires) == 0 ||
7350 (de = dictFind(db->expires,key)) == NULL) return 0;
7351
7352 /* Delete the key */
0c66a471 7353 server.dirty++;
2a6a2ed1 7354 server.stat_expiredkeys++;
3305306f 7355 dictDelete(db->expires,key);
7356 return dictDelete(db->dict,key) == DICT_OK;
7357}
7358
bbe025e0 7359static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 7360 dictEntry *de;
bbe025e0
AM
7361 time_t seconds;
7362
bd79a6bd 7363 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
bbe025e0
AM
7364
7365 seconds -= offset;
3305306f 7366
802e8373 7367 de = dictFind(c->db->dict,key);
3305306f 7368 if (de == NULL) {
7369 addReply(c,shared.czero);
7370 return;
7371 }
d4dd6556 7372 if (seconds <= 0) {
43e5ccdf 7373 if (deleteKey(c->db,key)) server.dirty++;
7374 addReply(c, shared.cone);
3305306f 7375 return;
7376 } else {
7377 time_t when = time(NULL)+seconds;
802e8373 7378 if (setExpire(c->db,key,when)) {
3305306f 7379 addReply(c,shared.cone);
77423026 7380 server.dirty++;
7381 } else {
3305306f 7382 addReply(c,shared.czero);
77423026 7383 }
3305306f 7384 return;
7385 }
7386}
7387
802e8373 7388static void expireCommand(redisClient *c) {
bbe025e0 7389 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 7390}
7391
7392static void expireatCommand(redisClient *c) {
bbe025e0 7393 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 7394}
7395
fd88489a 7396static void ttlCommand(redisClient *c) {
7397 time_t expire;
7398 int ttl = -1;
7399
7400 expire = getExpire(c->db,c->argv[1]);
7401 if (expire != -1) {
7402 ttl = (int) (expire-time(NULL));
7403 if (ttl < 0) ttl = -1;
7404 }
7405 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7406}
7407
6e469882 7408/* ================================ MULTI/EXEC ============================== */
7409
7410/* Client state initialization for MULTI/EXEC */
7411static void initClientMultiState(redisClient *c) {
7412 c->mstate.commands = NULL;
7413 c->mstate.count = 0;
7414}
7415
7416/* Release all the resources associated with MULTI/EXEC state */
7417static void freeClientMultiState(redisClient *c) {
7418 int j;
7419
7420 for (j = 0; j < c->mstate.count; j++) {
7421 int i;
7422 multiCmd *mc = c->mstate.commands+j;
7423
7424 for (i = 0; i < mc->argc; i++)
7425 decrRefCount(mc->argv[i]);
7426 zfree(mc->argv);
7427 }
7428 zfree(c->mstate.commands);
7429}
7430
7431/* Add a new command into the MULTI commands queue */
7432static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7433 multiCmd *mc;
7434 int j;
7435
7436 c->mstate.commands = zrealloc(c->mstate.commands,
7437 sizeof(multiCmd)*(c->mstate.count+1));
7438 mc = c->mstate.commands+c->mstate.count;
7439 mc->cmd = cmd;
7440 mc->argc = c->argc;
7441 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7442 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7443 for (j = 0; j < c->argc; j++)
7444 incrRefCount(mc->argv[j]);
7445 c->mstate.count++;
7446}
7447
7448static void multiCommand(redisClient *c) {
7449 c->flags |= REDIS_MULTI;
36c548f0 7450 addReply(c,shared.ok);
6e469882 7451}
7452
18b6cb76
DJ
7453static void discardCommand(redisClient *c) {
7454 if (!(c->flags & REDIS_MULTI)) {
7455 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7456 return;
7457 }
7458
7459 freeClientMultiState(c);
7460 initClientMultiState(c);
7461 c->flags &= (~REDIS_MULTI);
7462 addReply(c,shared.ok);
7463}
7464
66c8853f 7465/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7466 * implememntation for more information. */
7467static void execCommandReplicateMulti(redisClient *c) {
7468 struct redisCommand *cmd;
7469 robj *multistring = createStringObject("MULTI",5);
7470
7471 cmd = lookupCommand("multi");
7472 if (server.appendonly)
7473 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7474 if (listLength(server.slaves))
7475 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7476 decrRefCount(multistring);
7477}
7478
6e469882 7479static void execCommand(redisClient *c) {
7480 int j;
7481 robj **orig_argv;
7482 int orig_argc;
7483
7484 if (!(c->flags & REDIS_MULTI)) {
7485 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7486 return;
7487 }
7488
66c8853f 7489 /* Replicate a MULTI request now that we are sure the block is executed.
7490 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7491 * both the AOF and the replication link will have the same consistency
7492 * and atomicity guarantees. */
7493 execCommandReplicateMulti(c);
7494
7495 /* Exec all the queued commands */
6e469882 7496 orig_argv = c->argv;
7497 orig_argc = c->argc;
7498 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7499 for (j = 0; j < c->mstate.count; j++) {
7500 c->argc = c->mstate.commands[j].argc;
7501 c->argv = c->mstate.commands[j].argv;
7502 call(c,c->mstate.commands[j].cmd);
7503 }
7504 c->argv = orig_argv;
7505 c->argc = orig_argc;
7506 freeClientMultiState(c);
7507 initClientMultiState(c);
7508 c->flags &= (~REDIS_MULTI);
66c8853f 7509 /* Make sure the EXEC command is always replicated / AOF, since we
7510 * always send the MULTI command (we can't know beforehand if the
7511 * next operations will contain at least a modification to the DB). */
7512 server.dirty++;
6e469882 7513}
7514
4409877e 7515/* =========================== Blocking Operations ========================= */
7516
7517/* Currently Redis blocking operations support is limited to list POP ops,
7518 * so the current implementation is not fully generic, but it is also not
7519 * completely specific so it will not require a rewrite to support new
7520 * kind of blocking operations in the future.
7521 *
7522 * Still it's important to note that list blocking operations can be already
7523 * used as a notification mechanism in order to implement other blocking
7524 * operations at application level, so there must be a very strong evidence
7525 * of usefulness and generality before new blocking operations are implemented.
7526 *
7527 * This is how the current blocking POP works, we use BLPOP as example:
7528 * - If the user calls BLPOP and the key exists and contains a non empty list
7529 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7530 * if there is not to block.
7531 * - If instead BLPOP is called and the key does not exists or the list is
7532 * empty we need to block. In order to do so we remove the notification for
7533 * new data to read in the client socket (so that we'll not serve new
7534 * requests if the blocking request is not served). Also we put the client
95242ab5 7535 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
4409877e 7536 * blocking for this keys.
7537 * - If a PUSH operation against a key with blocked clients waiting is
7538 * performed, we serve the first in the list: basically instead to push
7539 * the new element inside the list we return it to the (first / oldest)
7540 * blocking client, unblock the client, and remove it form the list.
7541 *
7542 * The above comment and the source code should be enough in order to understand
7543 * the implementation and modify / fix it later.
7544 */
7545
7546/* Set a client in blocking mode for the specified key, with the specified
7547 * timeout */
b177fd30 7548static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 7549 dictEntry *de;
7550 list *l;
b177fd30 7551 int j;
4409877e 7552
b177fd30 7553 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7554 c->blockingkeysnum = numkeys;
4409877e 7555 c->blockingto = timeout;
b177fd30 7556 for (j = 0; j < numkeys; j++) {
7557 /* Add the key in the client structure, to map clients -> keys */
7558 c->blockingkeys[j] = keys[j];
7559 incrRefCount(keys[j]);
4409877e 7560
b177fd30 7561 /* And in the other "side", to map keys -> clients */
7562 de = dictFind(c->db->blockingkeys,keys[j]);
7563 if (de == NULL) {
7564 int retval;
7565
7566 /* For every key we take a list of clients blocked for it */
7567 l = listCreate();
7568 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7569 incrRefCount(keys[j]);
7570 assert(retval == DICT_OK);
7571 } else {
7572 l = dictGetEntryVal(de);
7573 }
7574 listAddNodeTail(l,c);
4409877e 7575 }
b177fd30 7576 /* Mark the client as a blocked client */
4409877e 7577 c->flags |= REDIS_BLOCKED;
d5d55fc3 7578 server.blpop_blocked_clients++;
4409877e 7579}
7580
7581/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 7582static void unblockClientWaitingData(redisClient *c) {
4409877e 7583 dictEntry *de;
7584 list *l;
b177fd30 7585 int j;
4409877e 7586
b177fd30 7587 assert(c->blockingkeys != NULL);
7588 /* The client may wait for multiple keys, so unblock it for every key. */
7589 for (j = 0; j < c->blockingkeysnum; j++) {
7590 /* Remove this client from the list of clients waiting for this key. */
7591 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7592 assert(de != NULL);
7593 l = dictGetEntryVal(de);
7594 listDelNode(l,listSearchKey(l,c));
7595 /* If the list is empty we need to remove it to avoid wasting memory */
7596 if (listLength(l) == 0)
7597 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7598 decrRefCount(c->blockingkeys[j]);
7599 }
7600 /* Cleanup the client structure */
7601 zfree(c->blockingkeys);
7602 c->blockingkeys = NULL;
4409877e 7603 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 7604 server.blpop_blocked_clients--;
5921aa36 7605 /* We want to process data if there is some command waiting
b0d8747d 7606 * in the input buffer. Note that this is safe even if
7607 * unblockClientWaitingData() gets called from freeClient() because
7608 * freeClient() will be smart enough to call this function
7609 * *after* c->querybuf was set to NULL. */
4409877e 7610 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7611}
7612
7613/* This should be called from any function PUSHing into lists.
7614 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7615 * 'ele' is the element pushed.
7616 *
7617 * If the function returns 0 there was no client waiting for a list push
7618 * against this key.
7619 *
7620 * If the function returns 1 there was a client waiting for a list push
7621 * against this key, the element was passed to this client thus it's not
7622 * needed to actually add it to the list and the caller should return asap. */
7623static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7624 struct dictEntry *de;
7625 redisClient *receiver;
7626 list *l;
7627 listNode *ln;
7628
7629 de = dictFind(c->db->blockingkeys,key);
7630 if (de == NULL) return 0;
7631 l = dictGetEntryVal(de);
7632 ln = listFirst(l);
7633 assert(ln != NULL);
7634 receiver = ln->value;
4409877e 7635
b177fd30 7636 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 7637 addReplyBulk(receiver,key);
7638 addReplyBulk(receiver,ele);
b0d8747d 7639 unblockClientWaitingData(receiver);
4409877e 7640 return 1;
7641}
7642
7643/* Blocking RPOP/LPOP */
7644static void blockingPopGenericCommand(redisClient *c, int where) {
7645 robj *o;
7646 time_t timeout;
b177fd30 7647 int j;
4409877e 7648
b177fd30 7649 for (j = 1; j < c->argc-1; j++) {
7650 o = lookupKeyWrite(c->db,c->argv[j]);
7651 if (o != NULL) {
7652 if (o->type != REDIS_LIST) {
7653 addReply(c,shared.wrongtypeerr);
4409877e 7654 return;
b177fd30 7655 } else {
7656 list *list = o->ptr;
7657 if (listLength(list) != 0) {
7658 /* If the list contains elements fall back to the usual
7659 * non-blocking POP operation */
7660 robj *argv[2], **orig_argv;
7661 int orig_argc;
e0a62c7f 7662
b177fd30 7663 /* We need to alter the command arguments before to call
7664 * popGenericCommand() as the command takes a single key. */
7665 orig_argv = c->argv;
7666 orig_argc = c->argc;
7667 argv[1] = c->argv[j];
7668 c->argv = argv;
7669 c->argc = 2;
7670
7671 /* Also the return value is different, we need to output
7672 * the multi bulk reply header and the key name. The
7673 * "real" command will add the last element (the value)
7674 * for us. If this souds like an hack to you it's just
7675 * because it is... */
7676 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 7677 addReplyBulk(c,argv[1]);
b177fd30 7678 popGenericCommand(c,where);
7679
7680 /* Fix the client structure with the original stuff */
7681 c->argv = orig_argv;
7682 c->argc = orig_argc;
7683 return;
7684 }
4409877e 7685 }
7686 }
7687 }
7688 /* If the list is empty or the key does not exists we must block */
b177fd30 7689 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 7690 if (timeout > 0) timeout += time(NULL);
b177fd30 7691 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 7692}
7693
7694static void blpopCommand(redisClient *c) {
7695 blockingPopGenericCommand(c,REDIS_HEAD);
7696}
7697
7698static void brpopCommand(redisClient *c) {
7699 blockingPopGenericCommand(c,REDIS_TAIL);
7700}
7701
ed9b544e 7702/* =============================== Replication ============================= */
7703
a4d1ba9a 7704static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7705 ssize_t nwritten, ret = size;
7706 time_t start = time(NULL);
7707
7708 timeout++;
7709 while(size) {
7710 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7711 nwritten = write(fd,ptr,size);
7712 if (nwritten == -1) return -1;
7713 ptr += nwritten;
7714 size -= nwritten;
7715 }
7716 if ((time(NULL)-start) > timeout) {
7717 errno = ETIMEDOUT;
7718 return -1;
7719 }
7720 }
7721 return ret;
7722}
7723
a4d1ba9a 7724static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7725 ssize_t nread, totread = 0;
7726 time_t start = time(NULL);
7727
7728 timeout++;
7729 while(size) {
7730 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7731 nread = read(fd,ptr,size);
7732 if (nread == -1) return -1;
7733 ptr += nread;
7734 size -= nread;
7735 totread += nread;
7736 }
7737 if ((time(NULL)-start) > timeout) {
7738 errno = ETIMEDOUT;
7739 return -1;
7740 }
7741 }
7742 return totread;
7743}
7744
/* Read a line from 'fd', one byte at a time, into the buffer 'ptr' of
 * 'size' bytes. 'timeout' is applied by syncRead() to every single byte.
 *
 * The line terminator ("\n" or "\r\n") is not stored. At most size-1
 * characters are stored, followed by a NUL terminator; a longer line is
 * returned truncated, with the rest left unread.
 *
 * Returns the number of characters read before the newline or before
 * the buffer filled up (a trailing '\r' is stripped from the buffer but
 * still counted), or -1 if syncRead() fails or 'size' is not positive. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return -1;
    *ptr = '\0';

    /* Reserve one byte for the terminator. */
    size--;
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* One byte of room consumed: without this the loop would
             * write past the end of the buffer on long lines. */
            size--;
        }
    }
    return nread;
}
7765
7766static void syncCommand(redisClient *c) {
40d224a9 7767 /* ignore SYNC if aleady slave or in monitor mode */
7768 if (c->flags & REDIS_SLAVE) return;
7769
7770 /* SYNC can't be issued when the server has pending data to send to
7771 * the client about already issued commands. We need a fresh reply
7772 * buffer registering the differences between the BGSAVE and the current
7773 * dataset, so that we can copy to other slaves if needed. */
7774 if (listLength(c->reply) != 0) {
7775 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7776 return;
7777 }
7778
7779 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7780 /* Here we need to check if there is a background saving operation
7781 * in progress, or if it is required to start one */
9d65a1bb 7782 if (server.bgsavechildpid != -1) {
40d224a9 7783 /* Ok a background save is in progress. Let's check if it is a good
7784 * one for replication, i.e. if there is another slave that is
7785 * registering differences since the server forked to save */
7786 redisClient *slave;
7787 listNode *ln;
c7df85a4 7788 listIter li;
40d224a9 7789
c7df85a4 7790 listRewind(server.slaves,&li);
7791 while((ln = listNext(&li))) {
40d224a9 7792 slave = ln->value;
7793 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 7794 }
7795 if (ln) {
7796 /* Perfect, the server is already registering differences for
7797 * another slave. Set the right state, and copy the buffer. */
7798 listRelease(c->reply);
7799 c->reply = listDup(slave->reply);
40d224a9 7800 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7801 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7802 } else {
7803 /* No way, we need to wait for the next BGSAVE in order to
7804 * register differences */
7805 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7806 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7807 }
7808 } else {
7809 /* Ok we don't have a BGSAVE in progress, let's start one */
7810 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7811 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7812 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7813 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7814 return;
7815 }
7816 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7817 }
6208b3a7 7818 c->repldbfd = -1;
40d224a9 7819 c->flags |= REDIS_SLAVE;
7820 c->slaveseldb = 0;
6b47e12e 7821 listAddNodeTail(server.slaves,c);
40d224a9 7822 return;
7823}
7824
6208b3a7 7825static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7826 redisClient *slave = privdata;
7827 REDIS_NOTUSED(el);
7828 REDIS_NOTUSED(mask);
7829 char buf[REDIS_IOBUF_LEN];
7830 ssize_t nwritten, buflen;
7831
7832 if (slave->repldboff == 0) {
7833 /* Write the bulk write count before to transfer the DB. In theory here
7834 * we don't know how much room there is in the output buffer of the
7835 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7836 * operations) will never be smaller than the few bytes we need. */
7837 sds bulkcount;
7838
7839 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7840 slave->repldbsize);
7841 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7842 {
7843 sdsfree(bulkcount);
7844 freeClient(slave);
7845 return;
7846 }
7847 sdsfree(bulkcount);
7848 }
7849 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7850 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7851 if (buflen <= 0) {
7852 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7853 (buflen == 0) ? "premature EOF" : strerror(errno));
7854 freeClient(slave);
7855 return;
7856 }
7857 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 7858 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 7859 strerror(errno));
7860 freeClient(slave);
7861 return;
7862 }
7863 slave->repldboff += nwritten;
7864 if (slave->repldboff == slave->repldbsize) {
7865 close(slave->repldbfd);
7866 slave->repldbfd = -1;
7867 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7868 slave->replstate = REDIS_REPL_ONLINE;
7869 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 7870 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 7871 freeClient(slave);
7872 return;
7873 }
7874 addReplySds(slave,sdsempty());
7875 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7876 }
7877}
ed9b544e 7878
a3b21203 7879/* This function is called at the end of every backgrond saving.
7880 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7881 * otherwise REDIS_ERR is passed to the function.
7882 *
7883 * The goal of this function is to handle slaves waiting for a successful
7884 * background saving in order to perform non-blocking synchronization. */
7885static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 7886 listNode *ln;
7887 int startbgsave = 0;
c7df85a4 7888 listIter li;
ed9b544e 7889
c7df85a4 7890 listRewind(server.slaves,&li);
7891 while((ln = listNext(&li))) {
6208b3a7 7892 redisClient *slave = ln->value;
ed9b544e 7893
6208b3a7 7894 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7895 startbgsave = 1;
7896 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7897 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 7898 struct redis_stat buf;
e0a62c7f 7899
6208b3a7 7900 if (bgsaveerr != REDIS_OK) {
7901 freeClient(slave);
7902 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7903 continue;
7904 }
7905 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 7906 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 7907 freeClient(slave);
7908 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7909 continue;
7910 }
7911 slave->repldboff = 0;
7912 slave->repldbsize = buf.st_size;
7913 slave->replstate = REDIS_REPL_SEND_BULK;
7914 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 7915 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 7916 freeClient(slave);
7917 continue;
7918 }
7919 }
ed9b544e 7920 }
6208b3a7 7921 if (startbgsave) {
7922 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 7923 listIter li;
7924
7925 listRewind(server.slaves,&li);
6208b3a7 7926 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 7927 while((ln = listNext(&li))) {
6208b3a7 7928 redisClient *slave = ln->value;
ed9b544e 7929
6208b3a7 7930 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7931 freeClient(slave);
7932 }
7933 }
7934 }
ed9b544e 7935}
7936
7937static int syncWithMaster(void) {
d0ccebcf 7938 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 7939 long dumpsize;
ed9b544e 7940 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 7941 int dfd, maxtries = 5;
ed9b544e 7942
7943 if (fd == -1) {
7944 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7945 strerror(errno));
7946 return REDIS_ERR;
7947 }
d0ccebcf 7948
7949 /* AUTH with the master if required. */
7950 if(server.masterauth) {
7951 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7952 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7953 close(fd);
7954 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7955 strerror(errno));
7956 return REDIS_ERR;
7957 }
7958 /* Read the AUTH result. */
7959 if (syncReadLine(fd,buf,1024,3600) == -1) {
7960 close(fd);
7961 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7962 strerror(errno));
7963 return REDIS_ERR;
7964 }
7965 if (buf[0] != '+') {
7966 close(fd);
7967 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7968 return REDIS_ERR;
7969 }
7970 }
7971
ed9b544e 7972 /* Issue the SYNC command */
7973 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7974 close(fd);
7975 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7976 strerror(errno));
7977 return REDIS_ERR;
7978 }
7979 /* Read the bulk write count */
8c4d91fc 7980 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 7981 close(fd);
7982 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7983 strerror(errno));
7984 return REDIS_ERR;
7985 }
4aa701c1 7986 if (buf[0] != '$') {
7987 close(fd);
7988 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7989 return REDIS_ERR;
7990 }
18e61fa2 7991 dumpsize = strtol(buf+1,NULL,10);
7992 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 7993 /* Read the bulk write data on a temp file */
8c5abee8 7994 while(maxtries--) {
7995 snprintf(tmpfile,256,
7996 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7997 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7998 if (dfd != -1) break;
5de9ad7c 7999 sleep(1);
8c5abee8 8000 }
ed9b544e 8001 if (dfd == -1) {
8002 close(fd);
8003 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8004 return REDIS_ERR;
8005 }
8006 while(dumpsize) {
8007 int nread, nwritten;
8008
8009 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8010 if (nread == -1) {
8011 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8012 strerror(errno));
8013 close(fd);
8014 close(dfd);
8015 return REDIS_ERR;
8016 }
8017 nwritten = write(dfd,buf,nread);
8018 if (nwritten == -1) {
8019 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8020 close(fd);
8021 close(dfd);
8022 return REDIS_ERR;
8023 }
8024 dumpsize -= nread;
8025 }
8026 close(dfd);
8027 if (rename(tmpfile,server.dbfilename) == -1) {
8028 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8029 unlink(tmpfile);
8030 close(fd);
8031 return REDIS_ERR;
8032 }
8033 emptyDb();
f78fd11b 8034 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 8035 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8036 close(fd);
8037 return REDIS_ERR;
8038 }
8039 server.master = createClient(fd);
8040 server.master->flags |= REDIS_MASTER;
179b3952 8041 server.master->authenticated = 1;
ed9b544e 8042 server.replstate = REDIS_REPL_CONNECTED;
8043 return REDIS_OK;
8044}
8045
321b0e13 8046static void slaveofCommand(redisClient *c) {
8047 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8048 !strcasecmp(c->argv[2]->ptr,"one")) {
8049 if (server.masterhost) {
8050 sdsfree(server.masterhost);
8051 server.masterhost = NULL;
8052 if (server.master) freeClient(server.master);
8053 server.replstate = REDIS_REPL_NONE;
8054 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8055 }
8056 } else {
8057 sdsfree(server.masterhost);
8058 server.masterhost = sdsdup(c->argv[1]->ptr);
8059 server.masterport = atoi(c->argv[2]->ptr);
8060 if (server.master) freeClient(server.master);
8061 server.replstate = REDIS_REPL_CONNECT;
8062 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8063 server.masterhost, server.masterport);
8064 }
8065 addReply(c,shared.ok);
8066}
8067
3fd78bcd 8068/* ============================ Maxmemory directive ======================== */
8069
a5819310 8070/* Try to free one object form the pre-allocated objects free list.
8071 * This is useful under low mem conditions as by default we take 1 million
8072 * free objects allocated. On success REDIS_OK is returned, otherwise
8073 * REDIS_ERR. */
8074static int tryFreeOneObjectFromFreelist(void) {
f870935d 8075 robj *o;
8076
a5819310 8077 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8078 if (listLength(server.objfreelist)) {
8079 listNode *head = listFirst(server.objfreelist);
8080 o = listNodeValue(head);
8081 listDelNode(server.objfreelist,head);
8082 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8083 zfree(o);
8084 return REDIS_OK;
8085 } else {
8086 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8087 return REDIS_ERR;
8088 }
f870935d 8089}
8090
3fd78bcd 8091/* This function gets called when 'maxmemory' is set on the config file to limit
8092 * the max memory used by the server, and we are out of memory.
8093 * This function will try to, in order:
8094 *
8095 * - Free objects from the free list
8096 * - Try to remove keys with an EXPIRE set
8097 *
8098 * It is not possible to free enough memory to reach used-memory < maxmemory
8099 * the server will start refusing commands that will enlarge even more the
8100 * memory usage.
8101 */
8102static void freeMemoryIfNeeded(void) {
8103 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 8104 int j, k, freed = 0;
8105
8106 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8107 for (j = 0; j < server.dbnum; j++) {
8108 int minttl = -1;
8109 robj *minkey = NULL;
8110 struct dictEntry *de;
8111
8112 if (dictSize(server.db[j].expires)) {
8113 freed = 1;
8114 /* From a sample of three keys drop the one nearest to
8115 * the natural expire */
8116 for (k = 0; k < 3; k++) {
8117 time_t t;
8118
8119 de = dictGetRandomKey(server.db[j].expires);
8120 t = (time_t) dictGetEntryVal(de);
8121 if (minttl == -1 || t < minttl) {
8122 minkey = dictGetEntryKey(de);
8123 minttl = t;
3fd78bcd 8124 }
3fd78bcd 8125 }
a5819310 8126 deleteKey(server.db+j,minkey);
3fd78bcd 8127 }
3fd78bcd 8128 }
a5819310 8129 if (!freed) return; /* nothing to free... */
3fd78bcd 8130 }
8131}
8132
f80dff62 8133/* ============================== Append Only file ========================== */
8134
28ed1f33 8135/* Write the append only file buffer on disk.
8136 *
8137 * Since we are required to write the AOF before replying to the client,
8138 * and the only way the client socket can get a write is entering when the
8139 * the event loop, we accumulate all the AOF writes in a memory
8140 * buffer and write it on disk using this function just before entering
8141 * the event loop again. */
8142static void flushAppendOnlyFile(void) {
8143 time_t now;
8144 ssize_t nwritten;
8145
8146 if (sdslen(server.aofbuf) == 0) return;
8147
8148 /* We want to perform a single write. This should be guaranteed atomic
8149 * at least if the filesystem we are writing is a real physical one.
8150 * While this will save us against the server being killed I don't think
8151 * there is much to do about the whole server stopping for power problems
8152 * or alike */
8153 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8154 if (nwritten != (signed)sdslen(server.aofbuf)) {
8155 /* Ooops, we are in troubles. The best thing to do for now is
8156 * aborting instead of giving the illusion that everything is
8157 * working as expected. */
8158 if (nwritten == -1) {
8159 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8160 } else {
8161 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8162 }
8163 exit(1);
8164 }
8165 sdsfree(server.aofbuf);
8166 server.aofbuf = sdsempty();
8167
8168 /* Fsync if needed */
8169 now = time(NULL);
8170 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8171 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8172 now-server.lastfsync > 1))
8173 {
8174 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8175 * flushing metadata. */
8176 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8177 server.lastfsync = now;
8178 }
8179}
8180
f80dff62 8181static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8182 sds buf = sdsempty();
8183 int j;
f80dff62 8184 robj *tmpargv[3];
8185
8186 /* The DB this command was targetting is not the same as the last command
8187 * we appendend. To issue a SELECT command is needed. */
8188 if (dictid != server.appendseldb) {
8189 char seldb[64];
8190
8191 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 8192 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 8193 (unsigned long)strlen(seldb),seldb);
f80dff62 8194 server.appendseldb = dictid;
8195 }
8196
8197 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
8198 * EXPIREs into EXPIREATs calls */
8199 if (cmd->proc == expireCommand) {
8200 long when;
8201
8202 tmpargv[0] = createStringObject("EXPIREAT",8);
8203 tmpargv[1] = argv[1];
8204 incrRefCount(argv[1]);
8205 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
8206 tmpargv[2] = createObject(REDIS_STRING,
8207 sdscatprintf(sdsempty(),"%ld",when));
8208 argv = tmpargv;
8209 }
8210
8211 /* Append the actual command */
8212 buf = sdscatprintf(buf,"*%d\r\n",argc);
8213 for (j = 0; j < argc; j++) {
8214 robj *o = argv[j];
8215
9d65a1bb 8216 o = getDecodedObject(o);
83c6a618 8217 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
f80dff62 8218 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8219 buf = sdscatlen(buf,"\r\n",2);
9d65a1bb 8220 decrRefCount(o);
f80dff62 8221 }
8222
8223 /* Free the objects from the modified argv for EXPIREAT */
8224 if (cmd->proc == expireCommand) {
8225 for (j = 0; j < 3; j++)
8226 decrRefCount(argv[j]);
8227 }
8228
28ed1f33 8229 /* Append to the AOF buffer. This will be flushed on disk just before
8230 * of re-entering the event loop, so before the client will get a
8231 * positive reply about the operation performed. */
8232 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8233
85a83172 8234 /* If a background append only file rewriting is in progress we want to
8235 * accumulate the differences between the child DB and the current one
8236 * in a buffer, so that when the child process will do its work we
8237 * can append the differences to the new append only file. */
8238 if (server.bgrewritechildpid != -1)
8239 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8240
8241 sdsfree(buf);
f80dff62 8242}
8243
8244/* In Redis commands are always executed in the context of a client, so in
8245 * order to load the append only file we need to create a fake client. */
8246static struct redisClient *createFakeClient(void) {
8247 struct redisClient *c = zmalloc(sizeof(*c));
8248
8249 selectDb(c,0);
8250 c->fd = -1;
8251 c->querybuf = sdsempty();
8252 c->argc = 0;
8253 c->argv = NULL;
8254 c->flags = 0;
9387d17d 8255 /* We set the fake client as a slave waiting for the synchronization
8256 * so that Redis will not try to send replies to this client. */
8257 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 8258 c->reply = listCreate();
8259 listSetFreeMethod(c->reply,decrRefCount);
8260 listSetDupMethod(c->reply,dupClientReplyValue);
4132ad8d 8261 initClientMultiState(c);
f80dff62 8262 return c;
8263}
8264
8265static void freeFakeClient(struct redisClient *c) {
8266 sdsfree(c->querybuf);
8267 listRelease(c->reply);
4132ad8d 8268 freeClientMultiState(c);
f80dff62 8269 zfree(c);
8270}
8271
8272/* Replay the append log file. On error REDIS_OK is returned. On non fatal
8273 * error (the append only file is zero-length) REDIS_ERR is returned. On
8274 * fatal error an error message is logged and the program exists. */
8275int loadAppendOnlyFile(char *filename) {
8276 struct redisClient *fakeClient;
8277 FILE *fp = fopen(filename,"r");
8278 struct redis_stat sb;
b492cf00 8279 unsigned long long loadedkeys = 0;
4132ad8d 8280 int appendonly = server.appendonly;
f80dff62 8281
8282 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8283 return REDIS_ERR;
8284
8285 if (fp == NULL) {
8286 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8287 exit(1);
8288 }
8289
4132ad8d
PN
8290 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8291 * to the same file we're about to read. */
8292 server.appendonly = 0;
8293
f80dff62 8294 fakeClient = createFakeClient();
8295 while(1) {
8296 int argc, j;
8297 unsigned long len;
8298 robj **argv;
8299 char buf[128];
8300 sds argsds;
8301 struct redisCommand *cmd;
8302
8303 if (fgets(buf,sizeof(buf),fp) == NULL) {
8304 if (feof(fp))
8305 break;
8306 else
8307 goto readerr;
8308 }
8309 if (buf[0] != '*') goto fmterr;
8310 argc = atoi(buf+1);
8311 argv = zmalloc(sizeof(robj*)*argc);
8312 for (j = 0; j < argc; j++) {
8313 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8314 if (buf[0] != '$') goto fmterr;
8315 len = strtol(buf+1,NULL,10);
8316 argsds = sdsnewlen(NULL,len);
0f151ef1 8317 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 8318 argv[j] = createObject(REDIS_STRING,argsds);
8319 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8320 }
8321
8322 /* Command lookup */
8323 cmd = lookupCommand(argv[0]->ptr);
8324 if (!cmd) {
8325 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8326 exit(1);
8327 }
bdcb92f2 8328 /* Try object encoding */
f80dff62 8329 if (cmd->flags & REDIS_CMD_BULK)
05df7621 8330 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 8331 /* Run the command in the context of a fake client */
8332 fakeClient->argc = argc;
8333 fakeClient->argv = argv;
8334 cmd->proc(fakeClient);
8335 /* Discard the reply objects list from the fake client */
8336 while(listLength(fakeClient->reply))
8337 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8338 /* Clean up, ready for the next command */
8339 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8340 zfree(argv);
b492cf00 8341 /* Handle swapping while loading big datasets when VM is on */
8342 loadedkeys++;
8343 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8344 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 8345 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 8346 }
8347 }
f80dff62 8348 }
4132ad8d
PN
8349
8350 /* This point can only be reached when EOF is reached without errors.
8351 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8352 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8353
f80dff62 8354 fclose(fp);
8355 freeFakeClient(fakeClient);
4132ad8d 8356 server.appendonly = appendonly;
f80dff62 8357 return REDIS_OK;
8358
8359readerr:
8360 if (feof(fp)) {
8361 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8362 } else {
8363 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8364 }
8365 exit(1);
8366fmterr:
8367 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8368 exit(1);
8369}
8370
9d65a1bb 8371/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
9c8e3cee 8372static int fwriteBulkObject(FILE *fp, robj *obj) {
9d65a1bb 8373 char buf[128];
b9bc0eef 8374 int decrrc = 0;
8375
f2d9f50f 8376 /* Avoid the incr/decr ref count business if possible to help
8377 * copy-on-write (we are often in a child process when this function
8378 * is called).
8379 * Also makes sure that key objects don't get incrRefCount-ed when VM
8380 * is enabled */
8381 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 8382 obj = getDecodedObject(obj);
8383 decrrc = 1;
8384 }
9d65a1bb 8385 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8386 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 8387 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8388 goto err;
9d65a1bb 8389 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 8390 if (decrrc) decrRefCount(obj);
9d65a1bb 8391 return 1;
8392err:
b9bc0eef 8393 if (decrrc) decrRefCount(obj);
9d65a1bb 8394 return 0;
8395}
8396
/* Write a binary-safe string of 'len' bytes into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: the conversion must be %lu, not %ld. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() of zero bytes returns 0, so skip empty payloads. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8408
/* Write a double value, formatted with "%.17g", in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t plen;

    /* The payload is formatted together with its trailing CRLF, that is
     * not part of the announced count. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,plen,1,fp) != 0;
}
8419
/* Write a long value, as a decimal string, in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t plen;

    /* The payload is formatted together with its trailing CRLF, that is
     * not part of the announced count. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);

    return fwrite(header,strlen(header),1,fp) != 0 &&
           fwrite(payload,plen,1,fp) != 0;
}
8430
8431/* Write a sequence of commands able to fully rebuild the dataset into
8432 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8433static int rewriteAppendOnlyFile(char *filename) {
8434 dictIterator *di = NULL;
8435 dictEntry *de;
8436 FILE *fp;
8437 char tmpfile[256];
8438 int j;
8439 time_t now = time(NULL);
8440
8441 /* Note that we have to use a different temp name here compared to the
8442 * one used by rewriteAppendOnlyFileBackground() function. */
8443 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8444 fp = fopen(tmpfile,"w");
8445 if (!fp) {
8446 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8447 return REDIS_ERR;
8448 }
8449 for (j = 0; j < server.dbnum; j++) {
8450 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8451 redisDb *db = server.db+j;
8452 dict *d = db->dict;
8453 if (dictSize(d) == 0) continue;
8454 di = dictGetIterator(d);
8455 if (!di) {
8456 fclose(fp);
8457 return REDIS_ERR;
8458 }
8459
8460 /* SELECT the new DB */
8461 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 8462 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 8463
8464 /* Iterate this DB writing every entry */
8465 while((de = dictNext(di)) != NULL) {
e7546c63 8466 robj *key, *o;
8467 time_t expiretime;
8468 int swapped;
8469
8470 key = dictGetEntryKey(de);
b9bc0eef 8471 /* If the value for this key is swapped, load a preview in memory.
8472 * We use a "swapped" flag to remember if we need to free the
8473 * value object instead to just increment the ref count anyway
8474 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 8475 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8476 key->storage == REDIS_VM_SWAPPING) {
e7546c63 8477 o = dictGetEntryVal(de);
8478 swapped = 0;
8479 } else {
8480 o = vmPreviewObject(key);
e7546c63 8481 swapped = 1;
8482 }
8483 expiretime = getExpire(db,key);
9d65a1bb 8484
8485 /* Save the key and associated value */
9d65a1bb 8486 if (o->type == REDIS_STRING) {
8487 /* Emit a SET command */
8488 char cmd[]="*3\r\n$3\r\nSET\r\n";
8489 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8490 /* Key and value */
9c8e3cee 8491 if (fwriteBulkObject(fp,key) == 0) goto werr;
8492 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 8493 } else if (o->type == REDIS_LIST) {
8494 /* Emit the RPUSHes needed to rebuild the list */
8495 list *list = o->ptr;
8496 listNode *ln;
c7df85a4 8497 listIter li;
9d65a1bb 8498
c7df85a4 8499 listRewind(list,&li);
8500 while((ln = listNext(&li))) {
9d65a1bb 8501 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8502 robj *eleobj = listNodeValue(ln);
8503
8504 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8505 if (fwriteBulkObject(fp,key) == 0) goto werr;
8506 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8507 }
8508 } else if (o->type == REDIS_SET) {
8509 /* Emit the SADDs needed to rebuild the set */
8510 dict *set = o->ptr;
8511 dictIterator *di = dictGetIterator(set);
8512 dictEntry *de;
8513
8514 while((de = dictNext(di)) != NULL) {
8515 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8516 robj *eleobj = dictGetEntryKey(de);
8517
8518 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8519 if (fwriteBulkObject(fp,key) == 0) goto werr;
8520 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8521 }
8522 dictReleaseIterator(di);
8523 } else if (o->type == REDIS_ZSET) {
8524 /* Emit the ZADDs needed to rebuild the sorted set */
8525 zset *zs = o->ptr;
8526 dictIterator *di = dictGetIterator(zs->dict);
8527 dictEntry *de;
8528
8529 while((de = dictNext(di)) != NULL) {
8530 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8531 robj *eleobj = dictGetEntryKey(de);
8532 double *score = dictGetEntryVal(de);
8533
8534 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8535 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8536 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 8537 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8538 }
8539 dictReleaseIterator(di);
9c8e3cee 8540 } else if (o->type == REDIS_HASH) {
8541 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8542
8543 /* Emit the HSETs needed to rebuild the hash */
8544 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8545 unsigned char *p = zipmapRewind(o->ptr);
8546 unsigned char *field, *val;
8547 unsigned int flen, vlen;
8548
8549 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8550 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8551 if (fwriteBulkObject(fp,key) == 0) goto werr;
8552 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8553 return -1;
8554 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8555 return -1;
8556 }
8557 } else {
8558 dictIterator *di = dictGetIterator(o->ptr);
8559 dictEntry *de;
8560
8561 while((de = dictNext(di)) != NULL) {
8562 robj *field = dictGetEntryKey(de);
8563 robj *val = dictGetEntryVal(de);
8564
8565 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8566 if (fwriteBulkObject(fp,key) == 0) goto werr;
8567 if (fwriteBulkObject(fp,field) == -1) return -1;
8568 if (fwriteBulkObject(fp,val) == -1) return -1;
8569 }
8570 dictReleaseIterator(di);
8571 }
9d65a1bb 8572 } else {
f83c6cb5 8573 redisPanic("Unknown object type");
9d65a1bb 8574 }
8575 /* Save the expire time */
8576 if (expiretime != -1) {
e96e4fbf 8577 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 8578 /* If this key is already expired skip it */
8579 if (expiretime < now) continue;
8580 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8581 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8582 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8583 }
b9bc0eef 8584 if (swapped) decrRefCount(o);
9d65a1bb 8585 }
8586 dictReleaseIterator(di);
8587 }
8588
8589 /* Make sure data will not remain on the OS's output buffers */
8590 fflush(fp);
8591 fsync(fileno(fp));
8592 fclose(fp);
e0a62c7f 8593
9d65a1bb 8594 /* Use RENAME to make sure the DB file is changed atomically only
8595 * if the generate DB file is ok. */
8596 if (rename(tmpfile,filename) == -1) {
8597 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8598 unlink(tmpfile);
8599 return REDIS_ERR;
8600 }
8601 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8602 return REDIS_OK;
8603
8604werr:
8605 fclose(fp);
8606 unlink(tmpfile);
e96e4fbf 8607 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 8608 if (di) dictReleaseIterator(di);
8609 return REDIS_ERR;
8610}
8611
8612/* This is how rewriting of the append only file in background works:
8613 *
8614 * 1) The user calls BGREWRITEAOF
8615 * 2) Redis calls this function, that forks():
8616 * 2a) the child rewrite the append only file in a temp file.
8617 * 2b) the parent accumulates differences in server.bgrewritebuf.
8618 * 3) When the child finished '2a' exists.
8619 * 4) The parent will trap the exit code, if it's OK, will append the
8620 * data accumulated into server.bgrewritebuf into the temp file, and
8621 * finally will rename(2) the temp file in the actual file name.
8622 * The the new file is reopened as the new append only file. Profit!
8623 */
8624static int rewriteAppendOnlyFileBackground(void) {
8625 pid_t childpid;
8626
8627 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 8628 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 8629 if ((childpid = fork()) == 0) {
8630 /* Child */
8631 char tmpfile[256];
9d65a1bb 8632
054e426d 8633 if (server.vm_enabled) vmReopenSwapFile();
8634 close(server.fd);
9d65a1bb 8635 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8636 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 8637 _exit(0);
9d65a1bb 8638 } else {
478c2c6f 8639 _exit(1);
9d65a1bb 8640 }
8641 } else {
8642 /* Parent */
8643 if (childpid == -1) {
8644 redisLog(REDIS_WARNING,
8645 "Can't rewrite append only file in background: fork: %s",
8646 strerror(errno));
8647 return REDIS_ERR;
8648 }
8649 redisLog(REDIS_NOTICE,
8650 "Background append only file rewriting started by pid %d",childpid);
8651 server.bgrewritechildpid = childpid;
884d4b39 8652 updateDictResizePolicy();
85a83172 8653 /* We set appendseldb to -1 in order to force the next call to the
8654 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8655 * accumulated by the parent into server.bgrewritebuf will start
8656 * with a SELECT statement and it will be safe to merge. */
8657 server.appendseldb = -1;
9d65a1bb 8658 return REDIS_OK;
8659 }
8660 return REDIS_OK; /* unreached */
8661}
8662
8663static void bgrewriteaofCommand(redisClient *c) {
8664 if (server.bgrewritechildpid != -1) {
8665 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8666 return;
8667 }
8668 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 8669 char *status = "+Background append only file rewriting started\r\n";
8670 addReplySds(c,sdsnew(status));
9d65a1bb 8671 } else {
8672 addReply(c,shared.err);
8673 }
8674}
8675
/* Remove the temp file used by the background AOF rewriting child with
 * the specified pid. The result of unlink(2) is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
8682
996cb5f7 8683/* Virtual Memory is composed mainly of two subsystems:
8684 * - Blocking Virtual Memory
8685 * - Threaded Virtual Memory I/O
8686 * The two parts are not fully decoupled, but functions are split among two
8687 * different sections of the source code (delimited by comments) in order to
8688 * make more clear what functionality is about the blocking VM and what about
8689 * the threaded (not blocking) VM.
8690 *
8691 * Redis VM design:
8692 *
8693 * Redis VM is a blocking VM (one that blocks reading swapped values from
8694 * disk into memory when a value swapped out is needed in memory) that is made
8695 * unblocking by trying to examine the command argument vector in order to
8696 * load in background values that will likely be needed in order to exec
8697 * the command. The command is executed only once all the relevant keys
8698 * are loaded into memory.
8699 *
8700 * This is basically almost as simple as a blocking VM, but almost as parallel
8701 * as a fully non-blocking VM.
8702 */
8703
8704/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 8705
75680a3c 8706static void vmInit(void) {
8707 off_t totsize;
996cb5f7 8708 int pipefds[2];
bcaa7a4f 8709 size_t stacksize;
8b5bb414 8710 struct flock fl;
75680a3c 8711
4ad37480 8712 if (server.vm_max_threads != 0)
8713 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8714
054e426d 8715 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8b5bb414 8716 /* Try to open the old swap file, otherwise create it */
6fa987e3 8717 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8718 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8719 }
75680a3c 8720 if (server.vm_fp == NULL) {
6fa987e3 8721 redisLog(REDIS_WARNING,
8b5bb414 8722 "Can't open the swap file: %s. Exiting.",
6fa987e3 8723 strerror(errno));
75680a3c 8724 exit(1);
8725 }
8726 server.vm_fd = fileno(server.vm_fp);
8b5bb414 8727 /* Lock the swap file for writing, this is useful in order to avoid
8728 * another instance to use the same swap file for a config error. */
8729 fl.l_type = F_WRLCK;
8730 fl.l_whence = SEEK_SET;
8731 fl.l_start = fl.l_len = 0;
8732 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
8733 redisLog(REDIS_WARNING,
8734 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
8735 exit(1);
8736 }
8737 /* Initialize */
75680a3c 8738 server.vm_next_page = 0;
8739 server.vm_near_pages = 0;
7d98e08c 8740 server.vm_stats_used_pages = 0;
8741 server.vm_stats_swapped_objects = 0;
8742 server.vm_stats_swapouts = 0;
8743 server.vm_stats_swapins = 0;
75680a3c 8744 totsize = server.vm_pages*server.vm_page_size;
8745 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8746 if (ftruncate(server.vm_fd,totsize) == -1) {
8747 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8748 strerror(errno));
8749 exit(1);
8750 } else {
8751 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8752 }
7d30035d 8753 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 8754 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 8755 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 8756 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 8757
996cb5f7 8758 /* Initialize threaded I/O (used by Virtual Memory) */
8759 server.io_newjobs = listCreate();
8760 server.io_processing = listCreate();
8761 server.io_processed = listCreate();
d5d55fc3 8762 server.io_ready_clients = listCreate();
92f8e882 8763 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 8764 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8765 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 8766 server.io_active_threads = 0;
996cb5f7 8767 if (pipe(pipefds) == -1) {
8768 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8769 ,strerror(errno));
8770 exit(1);
8771 }
8772 server.io_ready_pipe_read = pipefds[0];
8773 server.io_ready_pipe_write = pipefds[1];
8774 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 8775 /* LZF requires a lot of stack */
8776 pthread_attr_init(&server.io_threads_attr);
8777 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8778 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8779 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 8780 /* Listen for events in the threaded I/O pipe */
8781 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8782 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8783 oom("creating file event");
75680a3c 8784}
8785
06224fec 8786/* Mark the page as used */
8787static void vmMarkPageUsed(off_t page) {
8788 off_t byte = page/8;
8789 int bit = page&7;
970e10bb 8790 redisAssert(vmFreePage(page) == 1);
06224fec 8791 server.vm_bitmap[byte] |= 1<<bit;
8792}
8793
8794/* Mark N contiguous pages as used, with 'page' being the first. */
8795static void vmMarkPagesUsed(off_t page, off_t count) {
8796 off_t j;
8797
8798 for (j = 0; j < count; j++)
7d30035d 8799 vmMarkPageUsed(page+j);
7d98e08c 8800 server.vm_stats_used_pages += count;
7c775e09 8801 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8802 (long long)count, (long long)page);
06224fec 8803}
8804
8805/* Mark the page as free */
8806static void vmMarkPageFree(off_t page) {
8807 off_t byte = page/8;
8808 int bit = page&7;
970e10bb 8809 redisAssert(vmFreePage(page) == 0);
06224fec 8810 server.vm_bitmap[byte] &= ~(1<<bit);
8811}
8812
8813/* Mark N contiguous pages as free, with 'page' being the first. */
8814static void vmMarkPagesFree(off_t page, off_t count) {
8815 off_t j;
8816
8817 for (j = 0; j < count; j++)
7d30035d 8818 vmMarkPageFree(page+j);
7d98e08c 8819 server.vm_stats_used_pages -= count;
7c775e09 8820 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8821 (long long)count, (long long)page);
06224fec 8822}
8823
8824/* Test if the page is free */
8825static int vmFreePage(off_t page) {
8826 off_t byte = page/8;
8827 int bit = page&7;
7d30035d 8828 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 8829}
8830
8831/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 8832 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 8833 * REDIS_ERR is returned.
06224fec 8834 *
8835 * This function uses a simple algorithm: we try to allocate
8836 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8837 * again from the start of the swap file searching for free spaces.
8838 *
8839 * If it looks pretty clear that there are no free pages near our offset
8840 * we try to find less populated places doing a forward jump of
8841 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8842 * without hurry, and then we jump again and so forth...
e0a62c7f 8843 *
06224fec 8844 * This function can be improved using a free list to avoid to guess
8845 * too much, since we could collect data about freed pages.
8846 *
8847 * note: I implemented this function just after watching an episode of
8848 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8849 */
c7df85a4 8850static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 8851 off_t base, offset = 0, since_jump = 0, numfree = 0;
8852
8853 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8854 server.vm_near_pages = 0;
8855 server.vm_next_page = 0;
8856 }
8857 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8858 base = server.vm_next_page;
8859
8860 while(offset < server.vm_pages) {
8861 off_t this = base+offset;
8862
8863 /* If we overflow, restart from page zero */
8864 if (this >= server.vm_pages) {
8865 this -= server.vm_pages;
8866 if (this == 0) {
8867 /* Just overflowed, what we found on tail is no longer
8868 * interesting, as it's no longer contiguous. */
8869 numfree = 0;
8870 }
8871 }
8872 if (vmFreePage(this)) {
8873 /* This is a free page */
8874 numfree++;
8875 /* Already got N free pages? Return to the caller, with success */
8876 if (numfree == n) {
7d30035d 8877 *first = this-(n-1);
8878 server.vm_next_page = this+1;
7c775e09 8879 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 8880 return REDIS_OK;
06224fec 8881 }
8882 } else {
8883 /* The current one is not a free page */
8884 numfree = 0;
8885 }
8886
8887 /* Fast-forward if the current page is not free and we already
8888 * searched enough near this place. */
8889 since_jump++;
8890 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8891 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8892 since_jump = 0;
8893 /* Note that even if we rewind after the jump, we are don't need
8894 * to make sure numfree is set to zero as we only jump *if* it
8895 * is set to zero. */
8896 } else {
8897 /* Otherwise just check the next page */
8898 offset++;
8899 }
8900 }
3a66edc7 8901 return REDIS_ERR;
8902}
8903
a5819310 8904/* Write the specified object at the specified page of the swap file */
8905static int vmWriteObjectOnSwap(robj *o, off_t page) {
8906 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8907 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8908 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8909 redisLog(REDIS_WARNING,
9ebed7cf 8910 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 8911 strerror(errno));
8912 return REDIS_ERR;
8913 }
8914 rdbSaveObject(server.vm_fp,o);
ba76a8f9 8915 fflush(server.vm_fp);
a5819310 8916 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8917 return REDIS_OK;
8918}
8919
3a66edc7 8920/* Swap the 'val' object relative to 'key' into disk. Store all the information
8921 * needed to later retrieve the object into the key object.
8922 * If we can't find enough contiguous empty pages to swap the object on disk
8923 * REDIS_ERR is returned. */
a69a0c9c 8924static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 8925 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 8926 off_t page;
8927
8928 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 8929 assert(key->refcount == 1);
3a66edc7 8930 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 8931 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 8932 key->vm.page = page;
8933 key->vm.usedpages = pages;
8934 key->storage = REDIS_VM_SWAPPED;
d894161b 8935 key->vtype = val->type;
3a66edc7 8936 decrRefCount(val); /* Deallocate the object from memory. */
8937 vmMarkPagesUsed(page,pages);
7d30035d 8938 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8939 (unsigned char*) key->ptr,
8940 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 8941 server.vm_stats_swapped_objects++;
8942 server.vm_stats_swapouts++;
3a66edc7 8943 return REDIS_OK;
8944}
8945
a5819310 8946static robj *vmReadObjectFromSwap(off_t page, int type) {
8947 robj *o;
3a66edc7 8948
a5819310 8949 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8950 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 8951 redisLog(REDIS_WARNING,
d5d55fc3 8952 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 8953 strerror(errno));
478c2c6f 8954 _exit(1);
3a66edc7 8955 }
a5819310 8956 o = rdbLoadObject(type,server.vm_fp);
8957 if (o == NULL) {
d5d55fc3 8958 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 8959 _exit(1);
3a66edc7 8960 }
a5819310 8961 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8962 return o;
8963}
8964
8965/* Load the value object relative to the 'key' object from swap to memory.
8966 * The newly allocated object is returned.
8967 *
8968 * If preview is true the unserialized object is returned to the caller but
8969 * no changes are made to the key object, nor the pages are marked as freed */
8970static robj *vmGenericLoadObject(robj *key, int preview) {
8971 robj *val;
8972
d5d55fc3 8973 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 8974 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 8975 if (!preview) {
8976 key->storage = REDIS_VM_MEMORY;
8977 key->vm.atime = server.unixtime;
8978 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8979 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8980 (unsigned char*) key->ptr);
7d98e08c 8981 server.vm_stats_swapped_objects--;
38aba9a1 8982 } else {
8983 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8984 (unsigned char*) key->ptr);
7e69548d 8985 }
7d98e08c 8986 server.vm_stats_swapins++;
3a66edc7 8987 return val;
06224fec 8988}
8989
7e69548d 8990/* Plain object loading, from swap to memory */
8991static robj *vmLoadObject(robj *key) {
996cb5f7 8992 /* If we are loading the object in background, stop it, we
8993 * need to load this object synchronously ASAP. */
8994 if (key->storage == REDIS_VM_LOADING)
8995 vmCancelThreadedIOJob(key);
7e69548d 8996 return vmGenericLoadObject(key,0);
8997}
8998
8999/* Just load the value on disk, without to modify the key.
9000 * This is useful when we want to perform some operation on the value
9001 * without to really bring it from swap to memory, like while saving the
9002 * dataset or rewriting the append only log. */
9003static robj *vmPreviewObject(robj *key) {
9004 return vmGenericLoadObject(key,1);
9005}
9006
4ef8de8a 9007/* How a good candidate is this object for swapping?
9008 * The better candidate it is, the greater the returned value.
9009 *
9010 * Currently we try to perform a fast estimation of the object size in
9011 * memory, and combine it with aging informations.
9012 *
9013 * Basically swappability = idle-time * log(estimated size)
9014 *
9015 * Bigger objects are preferred over smaller objects, but not
9016 * proportionally, this is why we use the logarithm. This algorithm is
9017 * just a first try and will probably be tuned later. */
9018static double computeObjectSwappability(robj *o) {
9019 time_t age = server.unixtime - o->vm.atime;
9020 long asize = 0;
9021 list *l;
9022 dict *d;
9023 struct dictEntry *de;
9024 int z;
9025
9026 if (age <= 0) return 0;
9027 switch(o->type) {
9028 case REDIS_STRING:
9029 if (o->encoding != REDIS_ENCODING_RAW) {
9030 asize = sizeof(*o);
9031 } else {
9032 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9033 }
9034 break;
9035 case REDIS_LIST:
9036 l = o->ptr;
9037 listNode *ln = listFirst(l);
9038
9039 asize = sizeof(list);
9040 if (ln) {
9041 robj *ele = ln->value;
9042 long elesize;
9043
9044 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9045 (sizeof(*o)+sdslen(ele->ptr)) :
9046 sizeof(*o);
9047 asize += (sizeof(listNode)+elesize)*listLength(l);
9048 }
9049 break;
9050 case REDIS_SET:
9051 case REDIS_ZSET:
9052 z = (o->type == REDIS_ZSET);
9053 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9054
9055 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9056 if (z) asize += sizeof(zset)-sizeof(dict);
9057 if (dictSize(d)) {
9058 long elesize;
9059 robj *ele;
9060
9061 de = dictGetRandomKey(d);
9062 ele = dictGetEntryKey(de);
9063 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9064 (sizeof(*o)+sdslen(ele->ptr)) :
9065 sizeof(*o);
9066 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9067 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9068 }
9069 break;
a97b9060 9070 case REDIS_HASH:
9071 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9072 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9073 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9074 unsigned int klen, vlen;
9075 unsigned char *key, *val;
9076
9077 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9078 klen = 0;
9079 vlen = 0;
9080 }
9081 asize = len*(klen+vlen+3);
9082 } else if (o->encoding == REDIS_ENCODING_HT) {
9083 d = o->ptr;
9084 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9085 if (dictSize(d)) {
9086 long elesize;
9087 robj *ele;
9088
9089 de = dictGetRandomKey(d);
9090 ele = dictGetEntryKey(de);
9091 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9092 (sizeof(*o)+sdslen(ele->ptr)) :
9093 sizeof(*o);
9094 ele = dictGetEntryVal(de);
9095 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
9096 (sizeof(*o)+sdslen(ele->ptr)) :
9097 sizeof(*o);
9098 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9099 }
9100 }
9101 break;
4ef8de8a 9102 }
c8c72447 9103 return (double)age*log(1+asize);
4ef8de8a 9104}
9105
9106/* Try to swap an object that's a good candidate for swapping.
9107 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 9108 * to swap any object at all.
9109 *
9110 * If 'usethreaded' is true, Redis will try to swap the object in background
9111 * using I/O threads. */
9112static int vmSwapOneObject(int usethreads) {
4ef8de8a 9113 int j, i;
9114 struct dictEntry *best = NULL;
9115 double best_swappability = 0;
b9bc0eef 9116 redisDb *best_db = NULL;
4ef8de8a 9117 robj *key, *val;
9118
9119 for (j = 0; j < server.dbnum; j++) {
9120 redisDb *db = server.db+j;
b72f6a4b 9121 /* Why maxtries is set to 100?
9122 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9123 * are swappable objects */
b0d8747d 9124 int maxtries = 100;
4ef8de8a 9125
9126 if (dictSize(db->dict) == 0) continue;
9127 for (i = 0; i < 5; i++) {
9128 dictEntry *de;
9129 double swappability;
9130
e3cadb8a 9131 if (maxtries) maxtries--;
4ef8de8a 9132 de = dictGetRandomKey(db->dict);
9133 key = dictGetEntryKey(de);
9134 val = dictGetEntryVal(de);
1064ef87 9135 /* Only swap objects that are currently in memory.
9136 *
9137 * Also don't swap shared objects if threaded VM is on, as we
9138 * try to ensure that the main thread does not touch the
9139 * object while the I/O thread is using it, but we can't
9140 * control other keys without adding additional mutex. */
9141 if (key->storage != REDIS_VM_MEMORY ||
9142 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 9143 if (maxtries) i--; /* don't count this try */
9144 continue;
9145 }
4ef8de8a 9146 swappability = computeObjectSwappability(val);
9147 if (!best || swappability > best_swappability) {
9148 best = de;
9149 best_swappability = swappability;
b9bc0eef 9150 best_db = db;
4ef8de8a 9151 }
9152 }
9153 }
7c775e09 9154 if (best == NULL) return REDIS_ERR;
4ef8de8a 9155 key = dictGetEntryKey(best);
9156 val = dictGetEntryVal(best);
9157
e3cadb8a 9158 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 9159 key->ptr, best_swappability);
9160
9161 /* Unshare the key if needed */
9162 if (key->refcount > 1) {
9163 robj *newkey = dupStringObject(key);
9164 decrRefCount(key);
9165 key = dictGetEntryKey(best) = newkey;
9166 }
9167 /* Swap it */
a69a0c9c 9168 if (usethreads) {
b9bc0eef 9169 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 9170 return REDIS_OK;
9171 } else {
a69a0c9c 9172 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9173 dictGetEntryVal(best) = NULL;
9174 return REDIS_OK;
9175 } else {
9176 return REDIS_ERR;
9177 }
4ef8de8a 9178 }
9179}
9180
/* Swap one object out in a blocking way. See vmSwapOneObject() for the
 * return value. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9184
/* Swap one object out using the I/O threads. See vmSwapOneObject() for the
 * return value. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9188
7e69548d 9189/* Return true if it's safe to swap out objects in a given moment.
9190 * Basically we don't want to swap objects out while there is a BGSAVE
9191 * or a BGAEOREWRITE running in backgroud. */
9192static int vmCanSwapOut(void) {
9193 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9194}
9195
1b03836c 9196/* Delete a key if swapped. Returns 1 if the key was found, was swapped
9197 * and was deleted. Otherwise 0 is returned. */
9198static int deleteIfSwapped(redisDb *db, robj *key) {
9199 dictEntry *de;
9200 robj *foundkey;
9201
9202 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9203 foundkey = dictGetEntryKey(de);
9204 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9205 deleteKey(db,key);
9206 return 1;
9207}
9208
996cb5f7 9209/* =================== Virtual Memory - Threaded I/O ======================= */
9210
b9bc0eef 9211static void freeIOJob(iojob *j) {
d5d55fc3 9212 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9213 j->type == REDIS_IOJOB_DO_SWAP ||
9214 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 9215 decrRefCount(j->val);
78ebe4c8 9216 /* We don't decrRefCount the j->key field as we did't incremented
9217 * the count creating IO Jobs. This is because the key field here is
9218 * just used as an indentifier and if a key is removed the Job should
9219 * never be touched again. */
b9bc0eef 9220 zfree(j);
9221}
9222
996cb5f7 9223/* Every time a thread finished a Job, it writes a byte into the write side
9224 * of an unix pipe in order to "awake" the main thread, and this function
9225 * is called. */
9226static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9227 int mask)
9228{
9229 char buf[1];
b0d8747d 9230 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 9231 REDIS_NOTUSED(el);
9232 REDIS_NOTUSED(mask);
9233 REDIS_NOTUSED(privdata);
9234
9235 /* For every byte we read in the read side of the pipe, there is one
9236 * I/O job completed to process. */
9237 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 9238 iojob *j;
9239 listNode *ln;
9240 robj *key;
9241 struct dictEntry *de;
9242
996cb5f7 9243 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 9244
9245 /* Get the processed element (the oldest one) */
9246 lockThreadedIO();
1064ef87 9247 assert(listLength(server.io_processed) != 0);
f6c0bba8 9248 if (toprocess == -1) {
9249 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9250 if (toprocess <= 0) toprocess = 1;
9251 }
b9bc0eef 9252 ln = listFirst(server.io_processed);
9253 j = ln->value;
9254 listDelNode(server.io_processed,ln);
9255 unlockThreadedIO();
9256 /* If this job is marked as canceled, just ignore it */
9257 if (j->canceled) {
9258 freeIOJob(j);
9259 continue;
9260 }
9261 /* Post process it in the main thread, as there are things we
9262 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 9263 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 9264 de = dictFind(j->db->dict,j->key);
9265 assert(de != NULL);
9266 key = dictGetEntryKey(de);
9267 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9268 redisDb *db;
9269
b9bc0eef 9270 /* Key loaded, bring it at home */
9271 key->storage = REDIS_VM_MEMORY;
9272 key->vm.atime = server.unixtime;
9273 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9274 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9275 (unsigned char*) key->ptr);
9276 server.vm_stats_swapped_objects--;
9277 server.vm_stats_swapins++;
d5d55fc3 9278 dictGetEntryVal(de) = j->val;
9279 incrRefCount(j->val);
9280 db = j->db;
b9bc0eef 9281 freeIOJob(j);
d5d55fc3 9282 /* Handle clients waiting for this key to be loaded. */
9283 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 9284 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9285 /* Now we know the amount of pages required to swap this object.
9286 * Let's find some space for it, and queue this task again
9287 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 9288 if (!vmCanSwapOut() ||
9289 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9290 {
9291 /* Ooops... no space or we can't swap as there is
9292 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 9293 freeIOJob(j);
054e426d 9294 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 9295 } else {
c7df85a4 9296 /* Note that we need to mark this pages as used now,
9297 * if the job will be canceled, we'll mark them as freed
9298 * again. */
9299 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 9300 j->type = REDIS_IOJOB_DO_SWAP;
9301 lockThreadedIO();
9302 queueIOJob(j);
9303 unlockThreadedIO();
9304 }
9305 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9306 robj *val;
9307
9308 /* Key swapped. We can finally free some memory. */
6c96ba7d 9309 if (key->storage != REDIS_VM_SWAPPING) {
9310 printf("key->storage: %d\n",key->storage);
9311 printf("key->name: %s\n",(char*)key->ptr);
9312 printf("key->refcount: %d\n",key->refcount);
9313 printf("val: %p\n",(void*)j->val);
9314 printf("val->type: %d\n",j->val->type);
9315 printf("val->ptr: %s\n",(char*)j->val->ptr);
9316 }
9317 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 9318 val = dictGetEntryVal(de);
9319 key->vm.page = j->page;
9320 key->vm.usedpages = j->pages;
9321 key->storage = REDIS_VM_SWAPPED;
9322 key->vtype = j->val->type;
9323 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 9324 dictGetEntryVal(de) = NULL;
b9bc0eef 9325 redisLog(REDIS_DEBUG,
9326 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9327 (unsigned char*) key->ptr,
9328 (unsigned long long) j->page, (unsigned long long) j->pages);
9329 server.vm_stats_swapped_objects++;
9330 server.vm_stats_swapouts++;
9331 freeIOJob(j);
f11b8647 9332 /* Put a few more swap requests in queue if we are still
9333 * out of memory */
b0d8747d 9334 if (trytoswap && vmCanSwapOut() &&
9335 zmalloc_used_memory() > server.vm_max_memory)
9336 {
f11b8647 9337 int more = 1;
9338 while(more) {
9339 lockThreadedIO();
9340 more = listLength(server.io_newjobs) <
9341 (unsigned) server.vm_max_threads;
9342 unlockThreadedIO();
9343 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 9344 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9345 trytoswap = 0;
9346 break;
9347 }
f11b8647 9348 }
9349 }
b9bc0eef 9350 }
c953f24b 9351 processed++;
f6c0bba8 9352 if (processed == toprocess) return;
996cb5f7 9353 }
9354 if (retval < 0 && errno != EAGAIN) {
9355 redisLog(REDIS_WARNING,
9356 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9357 strerror(errno));
9358 }
9359}
9360
9361static void lockThreadedIO(void) {
9362 pthread_mutex_lock(&server.io_mutex);
9363}
9364
9365static void unlockThreadedIO(void) {
9366 pthread_mutex_unlock(&server.io_mutex);
9367}
9368
9369/* Remove the specified object from the threaded I/O queue if still not
9370 * processed, otherwise make sure to flag it as canceled. */
9371static void vmCancelThreadedIOJob(robj *o) {
9372 list *lists[3] = {
6c96ba7d 9373 server.io_newjobs, /* 0 */
9374 server.io_processing, /* 1 */
9375 server.io_processed /* 2 */
996cb5f7 9376 };
9377 int i;
9378
9379 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 9380again:
996cb5f7 9381 lockThreadedIO();
9382 /* Search for a matching key in one of the queues */
9383 for (i = 0; i < 3; i++) {
9384 listNode *ln;
c7df85a4 9385 listIter li;
996cb5f7 9386
c7df85a4 9387 listRewind(lists[i],&li);
9388 while ((ln = listNext(&li)) != NULL) {
996cb5f7 9389 iojob *job = ln->value;
9390
6c96ba7d 9391 if (job->canceled) continue; /* Skip this, already canceled. */
78ebe4c8 9392 if (job->key == o) {
970e10bb 9393 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9394 (void*)job, (char*)o->ptr, job->type, i);
427a2153 9395 /* Mark the pages as free since the swap didn't happened
9396 * or happened but is now discarded. */
970e10bb 9397 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 9398 vmMarkPagesFree(job->page,job->pages);
9399 /* Cancel the job. It depends on the list the job is
9400 * living in. */
996cb5f7 9401 switch(i) {
9402 case 0: /* io_newjobs */
6c96ba7d 9403 /* If the job was yet not processed the best thing to do
996cb5f7 9404 * is to remove it from the queue at all */
6c96ba7d 9405 freeIOJob(job);
996cb5f7 9406 listDelNode(lists[i],ln);
9407 break;
9408 case 1: /* io_processing */
d5d55fc3 9409 /* Oh Shi- the thread is messing with the Job:
9410 *
9411 * Probably it's accessing the object if this is a
9412 * PREPARE_SWAP or DO_SWAP job.
9413 * If it's a LOAD job it may be reading from disk and
9414 * if we don't wait for the job to terminate before to
9415 * cancel it, maybe in a few microseconds data can be
9416 * corrupted in this pages. So the short story is:
9417 *
9418 * Better to wait for the job to move into the
9419 * next queue (processed)... */
9420
9421 /* We try again and again until the job is completed. */
9422 unlockThreadedIO();
9423 /* But let's wait some time for the I/O thread
9424 * to finish with this job. After all this condition
9425 * should be very rare. */
9426 usleep(1);
9427 goto again;
996cb5f7 9428 case 2: /* io_processed */
2e111efe 9429 /* The job was already processed, that's easy...
9430 * just mark it as canceled so that we'll ignore it
9431 * when processing completed jobs. */
996cb5f7 9432 job->canceled = 1;
9433 break;
9434 }
c7df85a4 9435 /* Finally we have to adjust the storage type of the object
9436 * in order to "UNDO" the operaiton. */
996cb5f7 9437 if (o->storage == REDIS_VM_LOADING)
9438 o->storage = REDIS_VM_SWAPPED;
9439 else if (o->storage == REDIS_VM_SWAPPING)
9440 o->storage = REDIS_VM_MEMORY;
9441 unlockThreadedIO();
9442 return;
9443 }
9444 }
9445 }
9446 unlockThreadedIO();
9447 assert(1 != 1); /* We should never reach this */
9448}
9449
b9bc0eef 9450static void *IOThreadEntryPoint(void *arg) {
9451 iojob *j;
9452 listNode *ln;
9453 REDIS_NOTUSED(arg);
9454
9455 pthread_detach(pthread_self());
9456 while(1) {
9457 /* Get a new job to process */
9458 lockThreadedIO();
9459 if (listLength(server.io_newjobs) == 0) {
9460 /* No new jobs in queue, exit. */
9ebed7cf 9461 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9462 (long) pthread_self());
b9bc0eef 9463 server.io_active_threads--;
9464 unlockThreadedIO();
9465 return NULL;
9466 }
9467 ln = listFirst(server.io_newjobs);
9468 j = ln->value;
9469 listDelNode(server.io_newjobs,ln);
9470 /* Add the job in the processing queue */
9471 j->thread = pthread_self();
9472 listAddNodeTail(server.io_processing,j);
9473 ln = listLast(server.io_processing); /* We use ln later to remove it */
9474 unlockThreadedIO();
9ebed7cf 9475 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9476 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 9477
9478 /* Process the Job */
9479 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9480 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 9481 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9482 FILE *fp = fopen("/dev/null","w+");
9483 j->pages = rdbSavedObjectPages(j->val,fp);
9484 fclose(fp);
9485 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 9486 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9487 j->canceled = 1;
b9bc0eef 9488 }
9489
9490 /* Done: insert the job into the processed queue */
9ebed7cf 9491 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9492 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 9493 lockThreadedIO();
9494 listDelNode(server.io_processing,ln);
9495 listAddNodeTail(server.io_processed,j);
9496 unlockThreadedIO();
e0a62c7f 9497
b9bc0eef 9498 /* Signal the main thread there is new stuff to process */
9499 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9500 }
9501 return NULL; /* never reached */
9502}
9503
9504static void spawnIOThread(void) {
9505 pthread_t thread;
478c2c6f 9506 sigset_t mask, omask;
a97b9060 9507 int err;
b9bc0eef 9508
478c2c6f 9509 sigemptyset(&mask);
9510 sigaddset(&mask,SIGCHLD);
9511 sigaddset(&mask,SIGHUP);
9512 sigaddset(&mask,SIGPIPE);
9513 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 9514 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9515 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9516 strerror(err));
9517 usleep(1000000);
9518 }
478c2c6f 9519 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 9520 server.io_active_threads++;
9521}
9522
4ee9488d 9523/* We need to wait for the last thread to exit before we are able to
9524 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 9525static void waitEmptyIOJobsQueue(void) {
4ee9488d 9526 while(1) {
76b7233a 9527 int io_processed_len;
9528
4ee9488d 9529 lockThreadedIO();
054e426d 9530 if (listLength(server.io_newjobs) == 0 &&
9531 listLength(server.io_processing) == 0 &&
9532 server.io_active_threads == 0)
9533 {
4ee9488d 9534 unlockThreadedIO();
9535 return;
9536 }
76b7233a 9537 /* While waiting for empty jobs queue condition we post-process some
9538 * finshed job, as I/O threads may be hanging trying to write against
9539 * the io_ready_pipe_write FD but there are so much pending jobs that
9540 * it's blocking. */
9541 io_processed_len = listLength(server.io_processed);
4ee9488d 9542 unlockThreadedIO();
76b7233a 9543 if (io_processed_len) {
9544 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9545 usleep(1000); /* 1 millisecond */
9546 } else {
9547 usleep(10000); /* 10 milliseconds */
9548 }
4ee9488d 9549 }
9550}
9551
054e426d 9552static void vmReopenSwapFile(void) {
478c2c6f 9553 /* Note: we don't close the old one as we are in the child process
9554 * and don't want to mess at all with the original file object. */
054e426d 9555 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9556 if (server.vm_fp == NULL) {
9557 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9558 server.vm_swap_file);
478c2c6f 9559 _exit(1);
054e426d 9560 }
9561 server.vm_fd = fileno(server.vm_fp);
9562}
9563
b9bc0eef 9564/* This function must be called while with threaded IO locked */
9565static void queueIOJob(iojob *j) {
6c96ba7d 9566 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9567 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 9568 listAddNodeTail(server.io_newjobs,j);
9569 if (server.io_active_threads < server.vm_max_threads)
9570 spawnIOThread();
9571}
9572
9573static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9574 iojob *j;
e0a62c7f 9575
b9bc0eef 9576 assert(key->storage == REDIS_VM_MEMORY);
9577 assert(key->refcount == 1);
9578
9579 j = zmalloc(sizeof(*j));
9580 j->type = REDIS_IOJOB_PREPARE_SWAP;
9581 j->db = db;
78ebe4c8 9582 j->key = key;
b9bc0eef 9583 j->val = val;
9584 incrRefCount(val);
9585 j->canceled = 0;
9586 j->thread = (pthread_t) -1;
f11b8647 9587 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 9588
9589 lockThreadedIO();
9590 queueIOJob(j);
9591 unlockThreadedIO();
9592 return REDIS_OK;
9593}
9594
b0d8747d 9595/* ============ Virtual Memory - Blocking clients on missing keys =========== */
9596
d5d55fc3 9597/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9598 * If there is not already a job loading the key, it is craeted.
9599 * The key is added to the io_keys list in the client structure, and also
9600 * in the hash table mapping swapped keys to waiting clients, that is,
9601 * server.io_waited_keys. */
9602static int waitForSwappedKey(redisClient *c, robj *key) {
9603 struct dictEntry *de;
9604 robj *o;
9605 list *l;
9606
9607 /* If the key does not exist or is already in RAM we don't need to
9608 * block the client at all. */
9609 de = dictFind(c->db->dict,key);
9610 if (de == NULL) return 0;
9611 o = dictGetEntryKey(de);
9612 if (o->storage == REDIS_VM_MEMORY) {
9613 return 0;
9614 } else if (o->storage == REDIS_VM_SWAPPING) {
9615 /* We were swapping the key, undo it! */
9616 vmCancelThreadedIOJob(o);
9617 return 0;
9618 }
e0a62c7f 9619
d5d55fc3 9620 /* OK: the key is either swapped, or being loaded just now. */
9621
9622 /* Add the key to the list of keys this client is waiting for.
9623 * This maps clients to keys they are waiting for. */
9624 listAddNodeTail(c->io_keys,key);
9625 incrRefCount(key);
9626
9627 /* Add the client to the swapped keys => clients waiting map. */
9628 de = dictFind(c->db->io_keys,key);
9629 if (de == NULL) {
9630 int retval;
9631
9632 /* For every key we take a list of clients blocked for it */
9633 l = listCreate();
9634 retval = dictAdd(c->db->io_keys,key,l);
9635 incrRefCount(key);
9636 assert(retval == DICT_OK);
9637 } else {
9638 l = dictGetEntryVal(de);
9639 }
9640 listAddNodeTail(l,c);
9641
9642 /* Are we already loading the key from disk? If not create a job */
9643 if (o->storage == REDIS_VM_SWAPPED) {
9644 iojob *j;
9645
9646 o->storage = REDIS_VM_LOADING;
9647 j = zmalloc(sizeof(*j));
9648 j->type = REDIS_IOJOB_LOAD;
9649 j->db = c->db;
78ebe4c8 9650 j->key = o;
d5d55fc3 9651 j->key->vtype = o->vtype;
9652 j->page = o->vm.page;
9653 j->val = NULL;
9654 j->canceled = 0;
9655 j->thread = (pthread_t) -1;
9656 lockThreadedIO();
9657 queueIOJob(j);
9658 unlockThreadedIO();
9659 }
9660 return 1;
9661}
9662
76583ea4
PN
9663/* Preload keys needed for the ZUNION and ZINTER commands. */
9664static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9665 int i, num;
9666 num = atoi(c->argv[2]->ptr);
9667 for (i = 0; i < num; i++) {
9668 waitForSwappedKey(c,c->argv[3+i]);
9669 }
9670}
9671
b0d8747d 9672/* Is this client attempting to run a command against swapped keys?
d5d55fc3 9673 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 9674 *
d5d55fc3 9675 * The important idea about this function is that it can fail! If keys will
9676 * still be swapped when the client is resumed, this key lookups will
9677 * just block loading keys from disk. In practical terms this should only
9678 * happen with SORT BY command or if there is a bug in this function.
9679 *
9680 * Return 1 if the client is marked as blocked, 0 if the client can
9681 * continue as the keys it is going to access appear to be in memory. */
9682static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
7c775e09 9683 int j, last;
9684
76583ea4
PN
9685 if (cmd->vm_preload_proc != NULL) {
9686 cmd->vm_preload_proc(c);
9687 } else {
9688 if (cmd->vm_firstkey == 0) return 0;
9689 last = cmd->vm_lastkey;
9690 if (last < 0) last = c->argc+last;
9691 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9692 waitForSwappedKey(c,c->argv[j]);
9693 }
9694
d5d55fc3 9695 /* If the client was blocked for at least one key, mark it as blocked. */
9696 if (listLength(c->io_keys)) {
9697 c->flags |= REDIS_IO_WAIT;
9698 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9699 server.vm_blocked_clients++;
9700 return 1;
9701 } else {
9702 return 0;
9703 }
9704}
9705
9706/* Remove the 'key' from the list of blocked keys for a given client.
9707 *
9708 * The function returns 1 when there are no longer blocking keys after
9709 * the current one was removed (and the client can be unblocked). */
9710static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9711 list *l;
9712 listNode *ln;
9713 listIter li;
9714 struct dictEntry *de;
9715
9716 /* Remove the key from the list of keys this client is waiting for. */
9717 listRewind(c->io_keys,&li);
9718 while ((ln = listNext(&li)) != NULL) {
bf028098 9719 if (equalStringObjects(ln->value,key)) {
d5d55fc3 9720 listDelNode(c->io_keys,ln);
9721 break;
9722 }
9723 }
9724 assert(ln != NULL);
9725
9726 /* Remove the client form the key => waiting clients map. */
9727 de = dictFind(c->db->io_keys,key);
9728 assert(de != NULL);
9729 l = dictGetEntryVal(de);
9730 ln = listSearchKey(l,c);
9731 assert(ln != NULL);
9732 listDelNode(l,ln);
9733 if (listLength(l) == 0)
9734 dictDelete(c->db->io_keys,key);
9735
9736 return listLength(c->io_keys) == 0;
9737}
9738
9739static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9740 struct dictEntry *de;
9741 list *l;
9742 listNode *ln;
9743 int len;
9744
9745 de = dictFind(db->io_keys,key);
9746 if (!de) return;
9747
9748 l = dictGetEntryVal(de);
9749 len = listLength(l);
9750 /* Note: we can't use something like while(listLength(l)) as the list
9751 * can be freed by the calling function when we remove the last element. */
9752 while (len--) {
9753 ln = listFirst(l);
9754 redisClient *c = ln->value;
9755
9756 if (dontWaitForSwappedKey(c,key)) {
9757 /* Put the client in the list of clients ready to go as we
9758 * loaded all the keys about it. */
9759 listAddNodeTail(server.io_ready_clients,c);
9760 }
9761 }
b0d8747d 9762}
b0d8747d 9763
500ece7c 9764/* =========================== Remote Configuration ========================= */
9765
9766static void configSetCommand(redisClient *c) {
9767 robj *o = getDecodedObject(c->argv[3]);
9768 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9769 zfree(server.dbfilename);
9770 server.dbfilename = zstrdup(o->ptr);
9771 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9772 zfree(server.requirepass);
9773 server.requirepass = zstrdup(o->ptr);
9774 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9775 zfree(server.masterauth);
9776 server.masterauth = zstrdup(o->ptr);
9777 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9778 server.maxmemory = strtoll(o->ptr, NULL, 10);
1b677732 9779 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
9780 if (!strcasecmp(o->ptr,"no")) {
9781 server.appendfsync = APPENDFSYNC_NO;
9782 } else if (!strcasecmp(o->ptr,"everysec")) {
9783 server.appendfsync = APPENDFSYNC_EVERYSEC;
9784 } else if (!strcasecmp(o->ptr,"always")) {
9785 server.appendfsync = APPENDFSYNC_ALWAYS;
9786 } else {
9787 goto badfmt;
9788 }
a34e0a25 9789 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
9790 int vlen, j;
9791 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
9792
9793 /* Perform sanity check before setting the new config:
9794 * - Even number of args
9795 * - Seconds >= 1, changes >= 0 */
9796 if (vlen & 1) {
9797 sdsfreesplitres(v,vlen);
9798 goto badfmt;
9799 }
9800 for (j = 0; j < vlen; j++) {
9801 char *eptr;
9802 long val;
9803
9804 val = strtoll(v[j], &eptr, 10);
9805 if (eptr[0] != '\0' ||
9806 ((j & 1) == 0 && val < 1) ||
9807 ((j & 1) == 1 && val < 0)) {
9808 sdsfreesplitres(v,vlen);
9809 goto badfmt;
9810 }
9811 }
9812 /* Finally set the new config */
9813 resetServerSaveParams();
9814 for (j = 0; j < vlen; j += 2) {
9815 time_t seconds;
9816 int changes;
9817
9818 seconds = strtoll(v[j],NULL,10);
9819 changes = strtoll(v[j+1],NULL,10);
9820 appendServerSaveParams(seconds, changes);
9821 }
9822 sdsfreesplitres(v,vlen);
500ece7c 9823 } else {
9824 addReplySds(c,sdscatprintf(sdsempty(),
9825 "-ERR not supported CONFIG parameter %s\r\n",
9826 (char*)c->argv[2]->ptr));
9827 decrRefCount(o);
9828 return;
9829 }
9830 decrRefCount(o);
9831 addReply(c,shared.ok);
a34e0a25 9832 return;
9833
9834badfmt: /* Bad format errors */
9835 addReplySds(c,sdscatprintf(sdsempty(),
9836 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
9837 (char*)o->ptr,
9838 (char*)c->argv[2]->ptr));
9839 decrRefCount(o);
500ece7c 9840}
9841
9842static void configGetCommand(redisClient *c) {
9843 robj *o = getDecodedObject(c->argv[2]);
9844 robj *lenobj = createObject(REDIS_STRING,NULL);
9845 char *pattern = o->ptr;
9846 int matches = 0;
9847
9848 addReply(c,lenobj);
9849 decrRefCount(lenobj);
9850
9851 if (stringmatch(pattern,"dbfilename",0)) {
9852 addReplyBulkCString(c,"dbfilename");
9853 addReplyBulkCString(c,server.dbfilename);
9854 matches++;
9855 }
9856 if (stringmatch(pattern,"requirepass",0)) {
9857 addReplyBulkCString(c,"requirepass");
9858 addReplyBulkCString(c,server.requirepass);
9859 matches++;
9860 }
9861 if (stringmatch(pattern,"masterauth",0)) {
9862 addReplyBulkCString(c,"masterauth");
9863 addReplyBulkCString(c,server.masterauth);
9864 matches++;
9865 }
9866 if (stringmatch(pattern,"maxmemory",0)) {
9867 char buf[128];
9868
9869 snprintf(buf,128,"%llu\n",server.maxmemory);
9870 addReplyBulkCString(c,"maxmemory");
9871 addReplyBulkCString(c,buf);
9872 matches++;
9873 }
1b677732 9874 if (stringmatch(pattern,"appendfsync",0)) {
9875 char *policy;
9876
9877 switch(server.appendfsync) {
9878 case APPENDFSYNC_NO: policy = "no"; break;
9879 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
9880 case APPENDFSYNC_ALWAYS: policy = "always"; break;
9881 default: policy = "unknown"; break; /* too harmless to panic */
9882 }
9883 addReplyBulkCString(c,"appendfsync");
9884 addReplyBulkCString(c,policy);
9885 matches++;
9886 }
a34e0a25 9887 if (stringmatch(pattern,"save",0)) {
9888 sds buf = sdsempty();
9889 int j;
9890
9891 for (j = 0; j < server.saveparamslen; j++) {
9892 buf = sdscatprintf(buf,"%ld %d",
9893 server.saveparams[j].seconds,
9894 server.saveparams[j].changes);
9895 if (j != server.saveparamslen-1)
9896 buf = sdscatlen(buf," ",1);
9897 }
9898 addReplyBulkCString(c,"save");
9899 addReplyBulkCString(c,buf);
9900 sdsfree(buf);
9901 matches++;
9902 }
500ece7c 9903 decrRefCount(o);
9904 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9905}
9906
9907static void configCommand(redisClient *c) {
9908 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9909 if (c->argc != 4) goto badarity;
9910 configSetCommand(c);
9911 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9912 if (c->argc != 3) goto badarity;
9913 configGetCommand(c);
9914 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9915 if (c->argc != 2) goto badarity;
9916 server.stat_numcommands = 0;
9917 server.stat_numconnections = 0;
9918 server.stat_expiredkeys = 0;
9919 server.stat_starttime = time(NULL);
9920 addReply(c,shared.ok);
9921 } else {
9922 addReplySds(c,sdscatprintf(sdsempty(),
9923 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9924 }
9925 return;
9926
9927badarity:
9928 addReplySds(c,sdscatprintf(sdsempty(),
9929 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9930 (char*) c->argv[1]->ptr));
9931}
9932
befec3cd 9933/* =========================== Pubsub implementation ======================== */
9934
ffc6b7f8 9935static void freePubsubPattern(void *p) {
9936 pubsubPattern *pat = p;
9937
9938 decrRefCount(pat->pattern);
9939 zfree(pat);
9940}
9941
9942static int listMatchPubsubPattern(void *a, void *b) {
9943 pubsubPattern *pa = a, *pb = b;
9944
9945 return (pa->client == pb->client) &&
bf028098 9946 (equalStringObjects(pa->pattern,pb->pattern));
ffc6b7f8 9947}
9948
9949/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9950 * 0 if the client was already subscribed to that channel. */
9951static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 9952 struct dictEntry *de;
9953 list *clients = NULL;
9954 int retval = 0;
9955
ffc6b7f8 9956 /* Add the channel to the client -> channels hash table */
9957 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 9958 retval = 1;
ffc6b7f8 9959 incrRefCount(channel);
9960 /* Add the client to the channel -> list of clients hash table */
9961 de = dictFind(server.pubsub_channels,channel);
befec3cd 9962 if (de == NULL) {
9963 clients = listCreate();
ffc6b7f8 9964 dictAdd(server.pubsub_channels,channel,clients);
9965 incrRefCount(channel);
befec3cd 9966 } else {
9967 clients = dictGetEntryVal(de);
9968 }
9969 listAddNodeTail(clients,c);
9970 }
9971 /* Notify the client */
9972 addReply(c,shared.mbulk3);
9973 addReply(c,shared.subscribebulk);
ffc6b7f8 9974 addReplyBulk(c,channel);
9975 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 9976 return retval;
9977}
9978
ffc6b7f8 9979/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9980 * 0 if the client was not subscribed to the specified channel. */
9981static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 9982 struct dictEntry *de;
9983 list *clients;
9984 listNode *ln;
9985 int retval = 0;
9986
ffc6b7f8 9987 /* Remove the channel from the client -> channels hash table */
9988 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 9989 we have in the hash tables. Protect it... */
ffc6b7f8 9990 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 9991 retval = 1;
ffc6b7f8 9992 /* Remove the client from the channel -> clients list hash table */
9993 de = dictFind(server.pubsub_channels,channel);
befec3cd 9994 assert(de != NULL);
9995 clients = dictGetEntryVal(de);
9996 ln = listSearchKey(clients,c);
9997 assert(ln != NULL);
9998 listDelNode(clients,ln);
ff767a75 9999 if (listLength(clients) == 0) {
10000 /* Free the list and associated hash entry at all if this was
10001 * the latest client, so that it will be possible to abuse
ffc6b7f8 10002 * Redis PUBSUB creating millions of channels. */
10003 dictDelete(server.pubsub_channels,channel);
ff767a75 10004 }
befec3cd 10005 }
10006 /* Notify the client */
10007 if (notify) {
10008 addReply(c,shared.mbulk3);
10009 addReply(c,shared.unsubscribebulk);
ffc6b7f8 10010 addReplyBulk(c,channel);
10011 addReplyLong(c,dictSize(c->pubsub_channels)+
10012 listLength(c->pubsub_patterns));
10013
10014 }
10015 decrRefCount(channel); /* it is finally safe to release it */
10016 return retval;
10017}
10018
10019/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10020static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10021 int retval = 0;
10022
10023 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10024 retval = 1;
10025 pubsubPattern *pat;
10026 listAddNodeTail(c->pubsub_patterns,pattern);
10027 incrRefCount(pattern);
10028 pat = zmalloc(sizeof(*pat));
10029 pat->pattern = getDecodedObject(pattern);
10030 pat->client = c;
10031 listAddNodeTail(server.pubsub_patterns,pat);
10032 }
10033 /* Notify the client */
10034 addReply(c,shared.mbulk3);
10035 addReply(c,shared.psubscribebulk);
10036 addReplyBulk(c,pattern);
10037 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
10038 return retval;
10039}
10040
10041/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10042 * 0 if the client was not subscribed to the specified channel. */
10043static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10044 listNode *ln;
10045 pubsubPattern pat;
10046 int retval = 0;
10047
10048 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10049 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10050 retval = 1;
10051 listDelNode(c->pubsub_patterns,ln);
10052 pat.client = c;
10053 pat.pattern = pattern;
10054 ln = listSearchKey(server.pubsub_patterns,&pat);
10055 listDelNode(server.pubsub_patterns,ln);
10056 }
10057 /* Notify the client */
10058 if (notify) {
10059 addReply(c,shared.mbulk3);
10060 addReply(c,shared.punsubscribebulk);
10061 addReplyBulk(c,pattern);
10062 addReplyLong(c,dictSize(c->pubsub_channels)+
10063 listLength(c->pubsub_patterns));
befec3cd 10064 }
ffc6b7f8 10065 decrRefCount(pattern);
befec3cd 10066 return retval;
10067}
10068
ffc6b7f8 10069/* Unsubscribe from all the channels. Return the number of channels the
10070 * client was subscribed from. */
10071static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10072 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 10073 dictEntry *de;
10074 int count = 0;
10075
10076 while((de = dictNext(di)) != NULL) {
ffc6b7f8 10077 robj *channel = dictGetEntryKey(de);
befec3cd 10078
ffc6b7f8 10079 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 10080 }
10081 dictReleaseIterator(di);
10082 return count;
10083}
10084
ffc6b7f8 10085/* Unsubscribe from all the patterns. Return the number of patterns the
10086 * client was subscribed from. */
10087static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10088 listNode *ln;
10089 listIter li;
10090 int count = 0;
10091
10092 listRewind(c->pubsub_patterns,&li);
10093 while ((ln = listNext(&li)) != NULL) {
10094 robj *pattern = ln->value;
10095
10096 count += pubsubUnsubscribePattern(c,pattern,notify);
10097 }
10098 return count;
10099}
10100
befec3cd 10101/* Publish a message */
ffc6b7f8 10102static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 10103 int receivers = 0;
10104 struct dictEntry *de;
ffc6b7f8 10105 listNode *ln;
10106 listIter li;
befec3cd 10107
ffc6b7f8 10108 /* Send to clients listening for that channel */
10109 de = dictFind(server.pubsub_channels,channel);
befec3cd 10110 if (de) {
10111 list *list = dictGetEntryVal(de);
10112 listNode *ln;
10113 listIter li;
10114
10115 listRewind(list,&li);
10116 while ((ln = listNext(&li)) != NULL) {
10117 redisClient *c = ln->value;
10118
10119 addReply(c,shared.mbulk3);
10120 addReply(c,shared.messagebulk);
ffc6b7f8 10121 addReplyBulk(c,channel);
befec3cd 10122 addReplyBulk(c,message);
10123 receivers++;
10124 }
10125 }
ffc6b7f8 10126 /* Send to clients listening to matching channels */
10127 if (listLength(server.pubsub_patterns)) {
10128 listRewind(server.pubsub_patterns,&li);
10129 channel = getDecodedObject(channel);
10130 while ((ln = listNext(&li)) != NULL) {
10131 pubsubPattern *pat = ln->value;
10132
10133 if (stringmatchlen((char*)pat->pattern->ptr,
10134 sdslen(pat->pattern->ptr),
10135 (char*)channel->ptr,
10136 sdslen(channel->ptr),0)) {
c8d0ea0e 10137 addReply(pat->client,shared.mbulk4);
10138 addReply(pat->client,shared.pmessagebulk);
10139 addReplyBulk(pat->client,pat->pattern);
ffc6b7f8 10140 addReplyBulk(pat->client,channel);
10141 addReplyBulk(pat->client,message);
10142 receivers++;
10143 }
10144 }
10145 decrRefCount(channel);
10146 }
befec3cd 10147 return receivers;
10148}
10149
10150static void subscribeCommand(redisClient *c) {
10151 int j;
10152
10153 for (j = 1; j < c->argc; j++)
ffc6b7f8 10154 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 10155}
10156
10157static void unsubscribeCommand(redisClient *c) {
10158 if (c->argc == 1) {
ffc6b7f8 10159 pubsubUnsubscribeAllChannels(c,1);
10160 return;
10161 } else {
10162 int j;
10163
10164 for (j = 1; j < c->argc; j++)
10165 pubsubUnsubscribeChannel(c,c->argv[j],1);
10166 }
10167}
10168
10169static void psubscribeCommand(redisClient *c) {
10170 int j;
10171
10172 for (j = 1; j < c->argc; j++)
10173 pubsubSubscribePattern(c,c->argv[j]);
10174}
10175
10176static void punsubscribeCommand(redisClient *c) {
10177 if (c->argc == 1) {
10178 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 10179 return;
10180 } else {
10181 int j;
10182
10183 for (j = 1; j < c->argc; j++)
ffc6b7f8 10184 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 10185 }
10186}
10187
10188static void publishCommand(redisClient *c) {
10189 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
10190 addReplyLong(c,receivers);
10191}
10192
7f957c92 10193/* ================================= Debugging ============================== */
10194
10195static void debugCommand(redisClient *c) {
10196 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
10197 *((char*)-1) = 'x';
210e29f7 10198 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
10199 if (rdbSave(server.dbfilename) != REDIS_OK) {
10200 addReply(c,shared.err);
10201 return;
10202 }
10203 emptyDb();
10204 if (rdbLoad(server.dbfilename) != REDIS_OK) {
10205 addReply(c,shared.err);
10206 return;
10207 }
10208 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
10209 addReply(c,shared.ok);
71c2b467 10210 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
10211 emptyDb();
10212 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
10213 addReply(c,shared.err);
10214 return;
10215 }
10216 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
10217 addReply(c,shared.ok);
333298da 10218 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
10219 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10220 robj *key, *val;
10221
10222 if (!de) {
10223 addReply(c,shared.nokeyerr);
10224 return;
10225 }
10226 key = dictGetEntryKey(de);
10227 val = dictGetEntryVal(de);
59146ef3 10228 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10229 key->storage == REDIS_VM_SWAPPING)) {
07efaf74 10230 char *strenc;
10231 char buf[128];
10232
10233 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10234 strenc = strencoding[val->encoding];
10235 } else {
10236 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10237 strenc = buf;
10238 }
ace06542 10239 addReplySds(c,sdscatprintf(sdsempty(),
10240 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 10241 "encoding:%s serializedlength:%lld\r\n",
682ac724 10242 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 10243 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 10244 } else {
10245 addReplySds(c,sdscatprintf(sdsempty(),
10246 "+Key at:%p refcount:%d, value swapped at: page %llu "
10247 "using %llu pages\r\n",
10248 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10249 (unsigned long long) key->vm.usedpages));
10250 }
78ebe4c8 10251 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10252 lookupKeyRead(c->db,c->argv[2]);
10253 addReply(c,shared.ok);
7d30035d 10254 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10255 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10256 robj *key, *val;
10257
10258 if (!server.vm_enabled) {
10259 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10260 return;
10261 }
10262 if (!de) {
10263 addReply(c,shared.nokeyerr);
10264 return;
10265 }
10266 key = dictGetEntryKey(de);
10267 val = dictGetEntryVal(de);
4ef8de8a 10268 /* If the key is shared we want to create a copy */
10269 if (key->refcount > 1) {
10270 robj *newkey = dupStringObject(key);
10271 decrRefCount(key);
10272 key = dictGetEntryKey(de) = newkey;
10273 }
10274 /* Swap it */
7d30035d 10275 if (key->storage != REDIS_VM_MEMORY) {
10276 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 10277 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 10278 dictGetEntryVal(de) = NULL;
10279 addReply(c,shared.ok);
10280 } else {
10281 addReply(c,shared.err);
10282 }
59305dc7 10283 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
10284 long keys, j;
10285 robj *key, *val;
10286 char buf[128];
10287
10288 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
10289 return;
10290 for (j = 0; j < keys; j++) {
10291 snprintf(buf,sizeof(buf),"key:%lu",j);
10292 key = createStringObject(buf,strlen(buf));
10293 if (lookupKeyRead(c->db,key) != NULL) {
10294 decrRefCount(key);
10295 continue;
10296 }
10297 snprintf(buf,sizeof(buf),"value:%lu",j);
10298 val = createStringObject(buf,strlen(buf));
10299 dictAdd(c->db->dict,key,val);
10300 }
10301 addReply(c,shared.ok);
7f957c92 10302 } else {
333298da 10303 addReplySds(c,sdsnew(
bdcb92f2 10304 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 10305 }
10306}
56906eef 10307
6c96ba7d 10308static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 10309 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
6c96ba7d 10310 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
dfc5e96c 10311#ifdef HAVE_BACKTRACE
10312 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10313 *((char*)-1) = 'x';
10314#endif
10315}
10316
c651fd9e 10317static void _redisPanic(char *msg, char *file, int line) {
10318 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
17772754 10319 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
c651fd9e 10320#ifdef HAVE_BACKTRACE
10321 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10322 *((char*)-1) = 'x';
10323#endif
10324}
10325
bcfc686d 10326/* =================================== Main! ================================ */
56906eef 10327
bcfc686d 10328#ifdef __linux__
/* Read /proc/sys/vm/overcommit_memory and return its content converted
 * with atoi(), or -1 if the file cannot be opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    if (fgets(line,sizeof(line),fp) != NULL) value = atoi(line);
    fclose(fp);
    return value;
}
10342
10343void linuxOvercommitMemoryWarning(void) {
10344 if (linuxOvercommitMemoryValue() == 0) {
7ccd2d0a 10345 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
bcfc686d 10346 }
10347}
10348#endif /* __linux__ */
10349
10350static void daemonize(void) {
10351 int fd;
10352 FILE *fp;
10353
10354 if (fork() != 0) exit(0); /* parent exits */
10355 setsid(); /* create a new session */
10356
10357 /* Every output goes to /dev/null. If Redis is daemonized but
10358 * the 'logfile' is set to 'stdout' in the configuration file
10359 * it will not log at all. */
10360 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10361 dup2(fd, STDIN_FILENO);
10362 dup2(fd, STDOUT_FILENO);
10363 dup2(fd, STDERR_FILENO);
10364 if (fd > STDERR_FILENO) close(fd);
10365 }
10366 /* Try to write the pid file */
10367 fp = fopen(server.pidfile,"w");
10368 if (fp) {
10369 fprintf(fp,"%d\n",getpid());
10370 fclose(fp);
56906eef 10371 }
56906eef 10372}
10373
42ab0172
AO
10374static void version() {
10375 printf("Redis server version %s\n", REDIS_VERSION);
10376 exit(0);
10377}
10378
723fb69b
AO
/* Print the command line synopsis on standard error and terminate the
 * process with exit status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
10384
bcfc686d 10385int main(int argc, char **argv) {
9651a787 10386 time_t start;
10387
bcfc686d 10388 initServerConfig();
10389 if (argc == 2) {
44efe66e 10390 if (strcmp(argv[1], "-v") == 0 ||
10391 strcmp(argv[1], "--version") == 0) version();
10392 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 10393 resetServerSaveParams();
10394 loadServerConfig(argv[1]);
723fb69b
AO
10395 } else if ((argc > 2)) {
10396 usage();
bcfc686d 10397 } else {
10398 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10399 }
bcfc686d 10400 if (server.daemonize) daemonize();
71c54b21 10401 initServer();
bcfc686d 10402 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10403#ifdef __linux__
10404 linuxOvercommitMemoryWarning();
10405#endif
9651a787 10406 start = time(NULL);
bcfc686d 10407 if (server.appendonly) {
10408 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 10409 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 10410 } else {
10411 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 10412 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 10413 }
bcfc686d 10414 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 10415 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 10416 aeMain(server.el);
10417 aeDeleteEventLoop(server.el);
10418 return 0;
10419}
10420
10421/* ============================= Backtrace support ========================= */
10422
10423#ifdef HAVE_BACKTRACE
10424static char *findFuncName(void *pointer, unsigned long *offset);
10425
56906eef 10426static void *getMcontextEip(ucontext_t *uc) {
10427#if defined(__FreeBSD__)
10428 return (void*) uc->uc_mcontext.mc_eip;
10429#elif defined(__dietlibc__)
10430 return (void*) uc->uc_mcontext.eip;
06db1f50 10431#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 10432 #if __x86_64__
10433 return (void*) uc->uc_mcontext->__ss.__rip;
10434 #else
56906eef 10435 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 10436 #endif
06db1f50 10437#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 10438 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 10439 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 10440 #else
10441 return (void*) uc->uc_mcontext->__ss.__eip;
e0a62c7f 10442 #endif
54bac49d 10443#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 10444 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 10445#elif defined(__ia64__) /* Linux IA64 */
10446 return (void*) uc->uc_mcontext.sc_ip;
10447#else
10448 return NULL;
56906eef 10449#endif
10450}
10451
10452static void segvHandler(int sig, siginfo_t *info, void *secret) {
10453 void *trace[100];
10454 char **messages = NULL;
10455 int i, trace_size = 0;
10456 unsigned long offset=0;
56906eef 10457 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 10458 sds infostring;
56906eef 10459 REDIS_NOTUSED(info);
10460
10461 redisLog(REDIS_WARNING,
10462 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 10463 infostring = genRedisInfoString();
10464 redisLog(REDIS_WARNING, "%s",infostring);
10465 /* It's not safe to sdsfree() the returned string under memory
10466 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 10467
56906eef 10468 trace_size = backtrace(trace, 100);
de96dbfe 10469 /* overwrite sigaction with caller's address */
b91cf5ef 10470 if (getMcontextEip(uc) != NULL) {
10471 trace[1] = getMcontextEip(uc);
10472 }
56906eef 10473 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 10474
d76412d1 10475 for (i=1; i<trace_size; ++i) {
56906eef 10476 char *fn = findFuncName(trace[i], &offset), *p;
10477
10478 p = strchr(messages[i],'+');
10479 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
10480 redisLog(REDIS_WARNING,"%s", messages[i]);
10481 } else {
10482 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10483 }
10484 }
b177fd30 10485 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 10486 _exit(0);
fe3bbfbe 10487}
56906eef 10488
10489static void setupSigSegvAction(void) {
10490 struct sigaction act;
10491
10492 sigemptyset (&act.sa_mask);
10493 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10494 * is used. Otherwise, sa_handler is used */
10495 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10496 act.sa_sigaction = segvHandler;
10497 sigaction (SIGSEGV, &act, NULL);
10498 sigaction (SIGBUS, &act, NULL);
12fea928 10499 sigaction (SIGFPE, &act, NULL);
10500 sigaction (SIGILL, &act, NULL);
10501 sigaction (SIGBUS, &act, NULL);
e65fdc78 10502 return;
56906eef 10503}
e65fdc78 10504
bcfc686d 10505#include "staticsymbols.h"
10506/* This function try to convert a pointer into a function name. It's used in
10507 * oreder to provide a backtrace under segmentation fault that's able to
10508 * display functions declared as static (otherwise the backtrace is useless). */
10509static char *findFuncName(void *pointer, unsigned long *offset){
10510 int i, ret = -1;
10511 unsigned long off, minoff = 0;
ed9b544e 10512
bcfc686d 10513 /* Try to match against the Symbol with the smallest offset */
10514 for (i=0; symsTable[i].pointer; i++) {
10515 unsigned long lp = (unsigned long) pointer;
0bc03378 10516
bcfc686d 10517 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10518 off=lp-symsTable[i].pointer;
10519 if (ret < 0 || off < minoff) {
10520 minoff=off;
10521 ret=i;
10522 }
10523 }
0bc03378 10524 }
bcfc686d 10525 if (ret == -1) return NULL;
10526 *offset = minoff;
10527 return symsTable[ret].name;
0bc03378 10528}
bcfc686d 10529#else /* HAVE_BACKTRACE */
/* Variant used when HAVE_BACKTRACE is not defined: no crash handler is
 * installed, so this is a no-op. */
static void setupSigSegvAction(void) {
    /* nothing to do */
}
bcfc686d 10532#endif /* HAVE_BACKTRACE */
0bc03378 10533
ed9b544e 10534
ed9b544e 10535
bcfc686d 10536/* The End */
10537
10538
ed9b544e 10539