]> git.saurik.com Git - redis.git/blame - redis.c
configure maximum number of entries in an intset
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
9005896c 30#define REDIS_VERSION "2.1.1"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
40#include <signal.h>
fbf9bcdb 41
42#ifdef HAVE_BACKTRACE
c9468bcf 43#include <execinfo.h>
44#include <ucontext.h>
fbf9bcdb 45#endif /* HAVE_BACKTRACE */
46
ed9b544e 47#include <sys/wait.h>
48#include <errno.h>
49#include <assert.h>
50#include <ctype.h>
51#include <stdarg.h>
52#include <inttypes.h>
53#include <arpa/inet.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#include <sys/time.h>
57#include <sys/resource.h>
2895e862 58#include <sys/uio.h>
f78fd11b 59#include <limits.h>
fb82e75c 60#include <float.h>
a7866db6 61#include <math.h>
92f8e882 62#include <pthread.h>
0bc1b2f6 63
64#if defined(__sun)
5043dff3 65#include "solarisfixes.h"
66#endif
ed9b544e 67
c9468bcf 68#include "redis.h"
ed9b544e 69#include "ae.h" /* Event driven programming library */
70#include "sds.h" /* Dynamic safe strings */
71#include "anet.h" /* Networking the easy way */
72#include "dict.h" /* Hash tables */
73#include "adlist.h" /* Linked lists */
74#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 75#include "lzf.h" /* LZF compression library */
76#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
ba798261 77#include "zipmap.h" /* Compact dictionary-alike data structure */
c7d9d662 78#include "ziplist.h" /* Compact list data structure */
d0b58d53 79#include "intset.h" /* Compact integer set structure */
ba798261 80#include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
5436146c 81#include "release.h" /* Release and/or git repository information */
ed9b544e 82
83/* Error codes */
84#define REDIS_OK 0
85#define REDIS_ERR -1
86
87/* Static server configuration */
88#define REDIS_SERVERPORT 6379 /* TCP port */
89#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 90#define REDIS_IOBUF_LEN 1024
ed9b544e 91#define REDIS_LOADBUF_LEN 1024
248ea310 92#define REDIS_STATIC_ARGS 8
ed9b544e 93#define REDIS_DEFAULT_DBNUM 16
94#define REDIS_CONFIGLINE_MAX 1024
95#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
96#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
8ca3e9d1 97#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
6f376729 98#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 99#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
100
101/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
102#define REDIS_WRITEV_THRESHOLD 3
103/* Max number of iovecs used for each writev call */
104#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 105
106/* Hash table parameters */
107#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 108
109/* Command flags */
3fd78bcd 110#define REDIS_CMD_BULK 1 /* Bulk write command */
111#define REDIS_CMD_INLINE 2 /* Inline command */
112/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
113 this flag will return an error when the 'maxmemory' option is set in the
114 config file and the server is using more than maxmemory bytes of memory.
115 In short these commands are denied on low memory conditions. */
116#define REDIS_CMD_DENYOOM 4
4005fef1 117#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 118
119/* Object types */
120#define REDIS_STRING 0
121#define REDIS_LIST 1
122#define REDIS_SET 2
1812e024 123#define REDIS_ZSET 3
124#define REDIS_HASH 4
560db612 125#define REDIS_VMPOINTER 8
f78fd11b 126
5234952b 127/* Objects encoding. Some kinds of objects like Strings and Hashes can be
128 * internally represented in multiple ways. The 'encoding' field of the object
129 * is set to one of these values for this object. */
c7d9d662
PN
130#define REDIS_ENCODING_RAW 0 /* Raw representation */
131#define REDIS_ENCODING_INT 1 /* Encoded as integer */
132#define REDIS_ENCODING_HT 2 /* Encoded as hash table */
133#define REDIS_ENCODING_ZIPMAP 3 /* Encoded as zipmap */
134#define REDIS_ENCODING_LIST 4 /* Encoded as linked list */
135#define REDIS_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
d0b58d53 136#define REDIS_ENCODING_INTSET 6 /* Encoded as intset */
942a3961 137
07efaf74 138static char* strencoding[] = {
d0b58d53 139 "raw", "int", "hashtable", "zipmap", "list", "ziplist", "intset"
07efaf74 140};
141
f78fd11b 142/* Object types only used for dumping to disk */
bb32ede5 143#define REDIS_EXPIRETIME 253
ed9b544e 144#define REDIS_SELECTDB 254
145#define REDIS_EOF 255
146
f78fd11b 147/* Defines related to the dump file format. To store 32 bits lengths for short
148 * keys requires a lot of space, so we check the most significant 2 bits of
149 * the first byte to interpret the length:
150 *
151 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
152 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
153 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 154 * 11|000000 this means: specially encoded object will follow. The six-bit
155 * number specifies the kind of object that follows.
156 * See the REDIS_RDB_ENC_* defines.
f78fd11b 157 *
10c43610 158 * Lengths up to 63 are stored using a single byte, most DB keys, and many
159 * values, will fit inside. */
f78fd11b 160#define REDIS_RDB_6BITLEN 0
161#define REDIS_RDB_14BITLEN 1
162#define REDIS_RDB_32BITLEN 2
17be1a4a 163#define REDIS_RDB_ENCVAL 3
f78fd11b 164#define REDIS_RDB_LENERR UINT_MAX
165
a4d1ba9a 166/* When a length of a string object stored on disk has the first two bits
167 * set, the remaining six bits specify a special encoding for the object
168 * according to the following defines: */
169#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
170#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
171#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 172#define REDIS_RDB_ENC_LZF 3 /* string compressed with LZF */
a4d1ba9a 173
75680a3c 174/* Virtual memory object->where field. */
175#define REDIS_VM_MEMORY 0 /* The object is on memory */
176#define REDIS_VM_SWAPPED 1 /* The object is on disk */
177#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
178#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
179
06224fec 180/* Virtual memory static configuration stuff.
181 * Check vmFindContiguousPages() to know more about these magic numbers. */
182#define REDIS_VM_MAX_NEAR_PAGES 65536
183#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 184#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 185#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
f6c0bba8 186/* The following is the *percentage* of completed I/O jobs to process when the
187 * handler is called. While Virtual Memory I/O operations are performed by
188 * threads, these operations must be processed by the main thread when completed
189 * in order to take effect. */
c953f24b 190#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 191
ed9b544e 192/* Client flags */
d5d55fc3 193#define REDIS_SLAVE 1 /* This client is a slave server */
194#define REDIS_MASTER 2 /* This client is a master server */
195#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
196#define REDIS_MULTI 8 /* This client is in a MULTI context */
197#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
198#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
37ab76c9 199#define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
ed9b544e 200
40d224a9 201/* Slave replication state - slave side */
ed9b544e 202#define REDIS_REPL_NONE 0 /* No active replication */
203#define REDIS_REPL_CONNECT 1 /* Must connect to master */
204#define REDIS_REPL_CONNECTED 2 /* Connected to master */
205
40d224a9 206/* Slave replication state - from the point of view of master
207 * Note that in SEND_BULK and ONLINE state the slave receives new updates
208 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
209 * to start the next background saving in order to send updates to it. */
210#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
211#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
212#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
213#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
214
ed9b544e 215/* List related stuff */
216#define REDIS_HEAD 0
217#define REDIS_TAIL 1
218
219/* Sort operations */
220#define REDIS_SORT_GET 0
443c6409 221#define REDIS_SORT_ASC 1
222#define REDIS_SORT_DESC 2
ed9b544e 223#define REDIS_SORTKEY_MAX 1024
224
225/* Log levels */
226#define REDIS_DEBUG 0
f870935d 227#define REDIS_VERBOSE 1
228#define REDIS_NOTICE 2
229#define REDIS_WARNING 3
ed9b544e 230
231/* Anti-warning macro... */
232#define REDIS_NOTUSED(V) ((void) V)
233
6b47e12e 234#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
235#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 236
48f0308a 237/* Append only defines */
238#define APPENDFSYNC_NO 0
239#define APPENDFSYNC_ALWAYS 1
240#define APPENDFSYNC_EVERYSEC 2
241
d0686e07 242/* Zip structure related defaults */
cbba7dd7 243#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
244#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
d0686e07
PN
245#define REDIS_LIST_MAX_ZIPLIST_ENTRIES 1024
246#define REDIS_LIST_MAX_ZIPLIST_VALUE 32
70ff3511 247#define REDIS_SET_MAX_INTSET_ENTRIES 4096
cbba7dd7 248
dfc5e96c 249/* We can print the stacktrace, so our assert is defined this way: */
478c2c6f 250#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
c651fd9e 251#define redisPanic(_e) _redisPanic(#_e,__FILE__,__LINE__),_exit(1)
6c96ba7d 252static void _redisAssert(char *estr, char *file, int line);
c651fd9e 253static void _redisPanic(char *msg, char *file, int line);
dfc5e96c 254
ed9b544e 255/*================================= Data types ============================== */
256
257/* A redis object, that is a type able to hold a string / list / set */
75680a3c 258
75680a3c 259/* The actual Redis Object */
ed9b544e 260typedef struct redisObject {
560db612 261 unsigned type:4;
262 unsigned storage:2; /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
263 unsigned encoding:4;
264 unsigned lru:22; /* lru time (relative to server.lruclock) */
ed9b544e 265 int refcount;
560db612 266 void *ptr;
75680a3c 267 /* VM fields, these are only allocated if VM is active, otherwise the
268 * object allocation function will just allocate
269 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
270 * Redis without VM active will not have any overhead. */
ed9b544e 271} robj;
272
560db612 273/* The VM pointer structure - identifies an object in the swap file.
274 *
275 * This object is stored in place of the value
276 * object in the main key->value hash table representing a database.
277 * Note that the first fields (type, storage) are the same as the redisObject
278 * structure so that vmPointer structures can be accessed even when cast
279 * to redisObject structures.
280 *
281 * This is useful as we don't know if a value object is or not on disk, but we
169dd6b7 282 * are always able to read obj->storage to check this. For vmPointer
560db612 283 * structures "type" is set to REDIS_VMPOINTER (even if without this field
284 * it is still possible to check the kind of object from the value of 'storage').*/
285typedef struct vmPointer {
286 unsigned type:4;
287 unsigned storage:2; /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
288 unsigned notused:26;
289 unsigned int vtype; /* type of the object stored in the swap file */
290 off_t page; /* the page at which the object is stored on disk */
291 off_t usedpages; /* number of pages used on disk */
292} vmpointer;
293
dfc5e96c 294/* Macro used to initialize a Redis object allocated on the stack.
295 * Note that this macro is kept near the structure definition to make sure
296 * we'll update it when the structure is changed, to avoid bugs like
297 * bug #85 introduced exactly in this way. NOTE(review): the expansion ends in `while(0);` so a call site's own `;` adds an empty statement, which breaks an unbraced if/else around the call. */
298#define initStaticStringObject(_var,_ptr) do { \
299 _var.refcount = 1; \
300 _var.type = REDIS_STRING; \
301 _var.encoding = REDIS_ENCODING_RAW; \
302 _var.ptr = _ptr; \
560db612 303 _var.storage = REDIS_VM_MEMORY; \
dfc5e96c 304} while(0);
305
3305306f 306typedef struct redisDb {
4409877e 307 dict *dict; /* The keyspace for this DB */
308 dict *expires; /* Timeout of keys with a timeout set */
37ab76c9 309 dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 310 dict *io_keys; /* Keys with clients waiting for VM I/O */
37ab76c9 311 dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
3305306f 312 int id;
313} redisDb;
314
6e469882 315/* Client MULTI/EXEC state */
316typedef struct multiCmd {
317 robj **argv;
318 int argc;
319 struct redisCommand *cmd;
320} multiCmd;
321
322typedef struct multiState {
323 multiCmd *commands; /* Array of MULTI commands */
324 int count; /* Total number of MULTI commands */
325} multiState;
326
ed9b544e 327/* With multiplexing we need to take per-client state.
328 * Clients are taken in a linked list. */
329typedef struct redisClient {
330 int fd;
3305306f 331 redisDb *db;
ed9b544e 332 int dictid;
333 sds querybuf;
e8a74421 334 robj **argv, **mbargv;
335 int argc, mbargc;
40d224a9 336 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 337 int multibulk; /* multi bulk command format active */
ed9b544e 338 list *reply;
339 int sentlen;
340 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 341 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 342 int slaveseldb; /* slave selected db, if this client is a slave */
343 int authenticated; /* when requirepass is non-NULL */
344 int replstate; /* replication state if this is a slave */
345 int repldbfd; /* replication DB file descriptor */
6e469882 346 long repldboff; /* replication DB file offset */
40d224a9 347 off_t repldbsize; /* replication DB file size */
6e469882 348 multiState mstate; /* MULTI/EXEC state */
37ab76c9 349 robj **blocking_keys; /* The key we are waiting to terminate a blocking
4409877e 350 * operation such as BLPOP. Otherwise NULL. */
37ab76c9 351 int blocking_keys_num; /* Number of blocking keys */
4409877e 352 time_t blockingto; /* Blocking operation timeout. If UNIX current time
353 * is >= blockingto then the operation timed out. */
92f8e882 354 list *io_keys; /* Keys this client is waiting to be loaded from the
355 * swap file in order to continue. */
37ab76c9 356 list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
ffc6b7f8 357 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
358 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 359} redisClient;
360
361struct saveparam {
362 time_t seconds;
363 int changes;
364};
365
366/* Global server state structure */
367struct redisServer {
368 int port;
369 int fd;
3305306f 370 redisDb *db;
ed9b544e 371 long long dirty; /* changes to DB from the last save */
372 list *clients;
87eca727 373 list *slaves, *monitors;
ed9b544e 374 char neterr[ANET_ERR_LEN];
375 aeEventLoop *el;
376 int cronloops; /* number of times the cron function run */
377 list *objfreelist; /* A list of freed objects to avoid malloc() */
378 time_t lastsave; /* Unix time of last successful save */
ed9b544e 379 /* Fields used only for stats */
380 time_t stat_starttime; /* server start time */
381 long long stat_numcommands; /* number of processed commands */
382 long long stat_numconnections; /* number of connections received */
2a6a2ed1 383 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 384 /* Configuration */
385 int verbosity;
386 int glueoutputbuf;
387 int maxidletime;
388 int dbnum;
389 int daemonize;
44b38ef4 390 int appendonly;
48f0308a 391 int appendfsync;
38db9171 392 int no_appendfsync_on_rewrite;
fab43727 393 int shutdown_asap;
48f0308a 394 time_t lastfsync;
44b38ef4 395 int appendfd;
396 int appendseldb;
ed329fcf 397 char *pidfile;
9f3c422c 398 pid_t bgsavechildpid;
9d65a1bb 399 pid_t bgrewritechildpid;
400 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
28ed1f33 401 sds aofbuf; /* AOF buffer, written before entering the event loop */
ed9b544e 402 struct saveparam *saveparams;
403 int saveparamslen;
404 char *logfile;
405 char *bindaddr;
406 char *dbfilename;
44b38ef4 407 char *appendfilename;
abcb223e 408 char *requirepass;
121f70cf 409 int rdbcompression;
8ca3e9d1 410 int activerehashing;
ed9b544e 411 /* Replication related */
412 int isslave;
d0ccebcf 413 char *masterauth;
ed9b544e 414 char *masterhost;
415 int masterport;
40d224a9 416 redisClient *master; /* client that is master for this slave */
ed9b544e 417 int replstate;
285add55 418 unsigned int maxclients;
4ef8de8a 419 unsigned long long maxmemory;
d5d55fc3 420 unsigned int blpop_blocked_clients;
421 unsigned int vm_blocked_clients;
ed9b544e 422 /* Sort parameters - qsort_r() is only available under BSD so we
423 * have to take this state global, in order to pass it to sortCompare() */
424 int sort_desc;
425 int sort_alpha;
426 int sort_bypattern;
75680a3c 427 /* Virtual memory configuration */
428 int vm_enabled;
054e426d 429 char *vm_swap_file;
75680a3c 430 off_t vm_page_size;
431 off_t vm_pages;
4ef8de8a 432 unsigned long long vm_max_memory;
d0686e07 433 /* Zip structure config */
cbba7dd7 434 size_t hash_max_zipmap_entries;
435 size_t hash_max_zipmap_value;
d0686e07
PN
436 size_t list_max_ziplist_entries;
437 size_t list_max_ziplist_value;
70ff3511 438 size_t set_max_intset_entries;
75680a3c 439 /* Virtual memory state */
440 FILE *vm_fp;
441 int vm_fd;
442 off_t vm_next_page; /* Next probably empty page */
443 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 444 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 445 time_t unixtime; /* Unix time sampled every second. */
92f8e882 446 /* Virtual memory I/O threads stuff */
92f8e882 447 /* An I/O thread process an element taken from the io_jobs queue and
996cb5f7 448 * put the result of the operation in the io_done list. While the
449 * job is being processed, it's put on io_processing queue. */
450 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
451 list *io_processing; /* List of VM I/O jobs being processed */
452 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 453 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 454 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 455 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
456 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 457 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 458 int io_active_threads; /* Number of running I/O threads */
459 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 460 /* Our main thread is blocked on the event loop, looking for sockets ready
461 * to be read or written, so when a threaded I/O operation is ready to be
462 * processed by the main thread, the I/O thread will use a unix pipe to
463 * awake the main thread. The followings are the two pipe FDs. */
464 int io_ready_pipe_read;
465 int io_ready_pipe_write;
7d98e08c 466 /* Virtual memory stats */
467 unsigned long long vm_stats_used_pages;
468 unsigned long long vm_stats_swapped_objects;
469 unsigned long long vm_stats_swapouts;
470 unsigned long long vm_stats_swapins;
befec3cd 471 /* Pubsub */
ffc6b7f8 472 dict *pubsub_channels; /* Map channels to list of subscribed clients */
473 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 474 /* Misc */
b9bc0eef 475 FILE *devnull;
560db612 476 unsigned lruclock:22; /* clock incrementing every minute, for LRU */
477 unsigned lruclock_padding:10;
ed9b544e 478};
479
ffc6b7f8 480typedef struct pubsubPattern {
481 redisClient *client;
482 robj *pattern;
483} pubsubPattern;
484
ed9b544e 485typedef void redisCommandProc(redisClient *c);
ca1788b5 486typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
ed9b544e 487struct redisCommand {
488 char *name;
489 redisCommandProc *proc;
490 int arity;
491 int flags;
76583ea4
PN
492 /* Use a function to determine which keys need to be loaded
493 * in the background prior to executing this command. Takes precedence
494 * over vm_firstkey and others, ignored when NULL */
ca1788b5 495 redisVmPreloadProc *vm_preload_proc;
7c775e09 496 /* What keys should be loaded in background when calling this command? */
497 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
498 int vm_lastkey; /* The last argument that's a key */
499 int vm_keystep; /* The step between first and last key */
ed9b544e 500};
501
de96dbfe 502struct redisFunctionSym {
503 char *name;
56906eef 504 unsigned long pointer;
de96dbfe 505};
506
ed9b544e 507typedef struct _redisSortObject {
508 robj *obj;
509 union {
510 double score;
511 robj *cmpobj;
512 } u;
513} redisSortObject;
514
515typedef struct _redisSortOperation {
516 int type;
517 robj *pattern;
518} redisSortOperation;
519
6b47e12e 520/* ZSETs use a specialized version of Skiplists */
521
522typedef struct zskiplistNode {
523 struct zskiplistNode **forward;
e3870fab 524 struct zskiplistNode *backward;
912b9165 525 unsigned int *span;
6b47e12e 526 double score;
527 robj *obj;
528} zskiplistNode;
529
530typedef struct zskiplist {
e3870fab 531 struct zskiplistNode *header, *tail;
d13f767c 532 unsigned long length;
6b47e12e 533 int level;
534} zskiplist;
535
1812e024 536typedef struct zset {
537 dict *dict;
6b47e12e 538 zskiplist *zsl;
1812e024 539} zset;
540
6b47e12e 541/* Our shared "common" objects */
542
05df7621 543#define REDIS_SHARED_INTEGERS 10000
ed9b544e 544struct sharedObjectsStruct {
c937aa89 545 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 546 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 547 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
548 *outofrangeerr, *plus,
ed9b544e 549 *select0, *select1, *select2, *select3, *select4,
befec3cd 550 *select5, *select6, *select7, *select8, *select9,
c8d0ea0e 551 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
552 *mbulk4, *psubscribebulk, *punsubscribebulk,
553 *integers[REDIS_SHARED_INTEGERS];
ed9b544e 554} shared;
555
a7866db6 556/* Global vars that are actually used as constants. The following double
557 * values are used for double on-disk serialization, and are initialized
558 * at runtime to avoid strange compiler optimizations. */
559
560static double R_Zero, R_PosInf, R_NegInf, R_Nan;
561
92f8e882 562/* VM threaded I/O request message */
b9bc0eef 563#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
564#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
565#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 566typedef struct iojob {
996cb5f7 567 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 568 redisDb *db;/* Redis database */
92f8e882 569 robj *key; /* This I/O request is about swapping this key */
560db612 570 robj *id; /* Unique identifier of this job:
571 this is the object to swap for REDIS_IOJOB_*_SWAP, or the
572 vmpointer object for REDIS_IOJOB_LOAD. */
b9bc0eef 573 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
92f8e882 574 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
575 off_t page; /* Swap page where to read/write the object */
248ea310 576 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 577 int canceled; /* True if this command was canceled by blocking side of VM */
578 pthread_t thread; /* ID of the thread processing this entry */
579} iojob;
92f8e882 580
ed9b544e 581/*================================ Prototypes =============================== */
582
583static void freeStringObject(robj *o);
584static void freeListObject(robj *o);
585static void freeSetObject(robj *o);
586static void decrRefCount(void *o);
587static robj *createObject(int type, void *ptr);
588static void freeClient(redisClient *c);
f78fd11b 589static int rdbLoad(char *filename);
ed9b544e 590static void addReply(redisClient *c, robj *obj);
591static void addReplySds(redisClient *c, sds s);
592static void incrRefCount(robj *o);
f78fd11b 593static int rdbSaveBackground(char *filename);
ed9b544e 594static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 595static robj *dupStringObject(robj *o);
248ea310 596static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
dd142b9c 597static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
28ed1f33 598static void flushAppendOnlyFile(void);
44b38ef4 599static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 600static int syncWithMaster(void);
05df7621 601static robj *tryObjectEncoding(robj *o);
9d65a1bb 602static robj *getDecodedObject(robj *o);
3305306f 603static int removeExpire(redisDb *db, robj *key);
604static int expireIfNeeded(redisDb *db, robj *key);
605static int deleteIfVolatile(redisDb *db, robj *key);
09241813 606static int dbDelete(redisDb *db, robj *key);
bb32ede5 607static time_t getExpire(redisDb *db, robj *key);
608static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 609static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 610static void freeMemoryIfNeeded(void);
de96dbfe 611static int processCommand(redisClient *c);
56906eef 612static void setupSigSegvAction(void);
a3b21203 613static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 614static void aofRemoveTempFile(pid_t childpid);
0ea663ea 615static size_t stringObjectLen(robj *o);
638e42ac 616static void processInputBuffer(redisClient *c);
6b47e12e 617static zskiplist *zslCreate(void);
fd8ccf44 618static void zslFree(zskiplist *zsl);
2b59cfdf 619static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 620static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 621static void initClientMultiState(redisClient *c);
622static void freeClientMultiState(redisClient *c);
623static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 624static void unblockClientWaitingData(redisClient *c);
4409877e 625static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 626static void vmInit(void);
a35ddf12 627static void vmMarkPagesFree(off_t page, off_t count);
560db612 628static robj *vmLoadObject(robj *o);
629static robj *vmPreviewObject(robj *o);
a69a0c9c 630static int vmSwapOneObjectBlocking(void);
631static int vmSwapOneObjectThreaded(void);
7e69548d 632static int vmCanSwapOut(void);
a5819310 633static int tryFreeOneObjectFromFreelist(void);
996cb5f7 634static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
635static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
636static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 637static void lockThreadedIO(void);
638static void unlockThreadedIO(void);
639static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
640static void freeIOJob(iojob *j);
641static void queueIOJob(iojob *j);
a5819310 642static int vmWriteObjectOnSwap(robj *o, off_t page);
643static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 644static void waitEmptyIOJobsQueue(void);
645static void vmReopenSwapFile(void);
970e10bb 646static int vmFreePage(off_t page);
ca1788b5 647static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
3805e04f 648static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
0a6f3f0f 649static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
d5d55fc3 650static int dontWaitForSwappedKey(redisClient *c, robj *key);
651static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
652static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
653static struct redisCommand *lookupCommand(char *name);
654static void call(redisClient *c, struct redisCommand *cmd);
655static void resetClient(redisClient *c);
ada386b2 656static void convertToRealHash(robj *o);
003f0840 657static void listTypeConvert(robj *o, int enc);
d0b58d53 658static void setTypeConvert(robj *o, int enc);
ffc6b7f8 659static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
660static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
661static void freePubsubPattern(void *p);
662static int listMatchPubsubPattern(void *a, void *b);
663static int compareStringObjects(robj *a, robj *b);
bf028098 664static int equalStringObjects(robj *a, robj *b);
befec3cd 665static void usage();
8f63ddca 666static int rewriteAppendOnlyFileBackground(void);
560db612 667static vmpointer *vmSwapObjectBlocking(robj *val);
fab43727 668static int prepareForShutdown();
37ab76c9 669static void touchWatchedKey(redisDb *db, robj *key);
9b30e1a2 670static void touchWatchedKeysOnFlush(int dbid);
37ab76c9 671static void unwatchAllKeys(redisClient *c);
ed9b544e 672
abcb223e 673static void authCommand(redisClient *c);
ed9b544e 674static void pingCommand(redisClient *c);
675static void echoCommand(redisClient *c);
676static void setCommand(redisClient *c);
677static void setnxCommand(redisClient *c);
526d00a5 678static void setexCommand(redisClient *c);
ed9b544e 679static void getCommand(redisClient *c);
680static void delCommand(redisClient *c);
681static void existsCommand(redisClient *c);
682static void incrCommand(redisClient *c);
683static void decrCommand(redisClient *c);
684static void incrbyCommand(redisClient *c);
685static void decrbyCommand(redisClient *c);
686static void selectCommand(redisClient *c);
687static void randomkeyCommand(redisClient *c);
688static void keysCommand(redisClient *c);
689static void dbsizeCommand(redisClient *c);
690static void lastsaveCommand(redisClient *c);
691static void saveCommand(redisClient *c);
692static void bgsaveCommand(redisClient *c);
9d65a1bb 693static void bgrewriteaofCommand(redisClient *c);
ed9b544e 694static void shutdownCommand(redisClient *c);
695static void moveCommand(redisClient *c);
696static void renameCommand(redisClient *c);
697static void renamenxCommand(redisClient *c);
698static void lpushCommand(redisClient *c);
699static void rpushCommand(redisClient *c);
700static void lpopCommand(redisClient *c);
701static void rpopCommand(redisClient *c);
702static void llenCommand(redisClient *c);
703static void lindexCommand(redisClient *c);
704static void lrangeCommand(redisClient *c);
705static void ltrimCommand(redisClient *c);
706static void typeCommand(redisClient *c);
707static void lsetCommand(redisClient *c);
708static void saddCommand(redisClient *c);
709static void sremCommand(redisClient *c);
a4460ef4 710static void smoveCommand(redisClient *c);
ed9b544e 711static void sismemberCommand(redisClient *c);
712static void scardCommand(redisClient *c);
12fea928 713static void spopCommand(redisClient *c);
2abb95a9 714static void srandmemberCommand(redisClient *c);
ed9b544e 715static void sinterCommand(redisClient *c);
716static void sinterstoreCommand(redisClient *c);
40d224a9 717static void sunionCommand(redisClient *c);
718static void sunionstoreCommand(redisClient *c);
f4f56e1d 719static void sdiffCommand(redisClient *c);
720static void sdiffstoreCommand(redisClient *c);
ed9b544e 721static void syncCommand(redisClient *c);
722static void flushdbCommand(redisClient *c);
723static void flushallCommand(redisClient *c);
724static void sortCommand(redisClient *c);
725static void lremCommand(redisClient *c);
0f5f7e9a 726static void rpoplpushcommand(redisClient *c);
ed9b544e 727static void infoCommand(redisClient *c);
70003d28 728static void mgetCommand(redisClient *c);
87eca727 729static void monitorCommand(redisClient *c);
3305306f 730static void expireCommand(redisClient *c);
802e8373 731static void expireatCommand(redisClient *c);
f6b141c5 732static void getsetCommand(redisClient *c);
fd88489a 733static void ttlCommand(redisClient *c);
321b0e13 734static void slaveofCommand(redisClient *c);
7f957c92 735static void debugCommand(redisClient *c);
f6b141c5 736static void msetCommand(redisClient *c);
737static void msetnxCommand(redisClient *c);
fd8ccf44 738static void zaddCommand(redisClient *c);
7db723ad 739static void zincrbyCommand(redisClient *c);
cc812361 740static void zrangeCommand(redisClient *c);
50c55df5 741static void zrangebyscoreCommand(redisClient *c);
f44dd428 742static void zcountCommand(redisClient *c);
e3870fab 743static void zrevrangeCommand(redisClient *c);
3c41331e 744static void zcardCommand(redisClient *c);
1b7106e7 745static void zremCommand(redisClient *c);
6e333bbe 746static void zscoreCommand(redisClient *c);
1807985b 747static void zremrangebyscoreCommand(redisClient *c);
6e469882 748static void multiCommand(redisClient *c);
749static void execCommand(redisClient *c);
18b6cb76 750static void discardCommand(redisClient *c);
4409877e 751static void blpopCommand(redisClient *c);
752static void brpopCommand(redisClient *c);
4b00bebd 753static void appendCommand(redisClient *c);
39191553 754static void substrCommand(redisClient *c);
69d95c3e 755static void zrankCommand(redisClient *c);
798d9e55 756static void zrevrankCommand(redisClient *c);
978c2c94 757static void hsetCommand(redisClient *c);
1f1c7695 758static void hsetnxCommand(redisClient *c);
978c2c94 759static void hgetCommand(redisClient *c);
09aeb579
PN
760static void hmsetCommand(redisClient *c);
761static void hmgetCommand(redisClient *c);
07efaf74 762static void hdelCommand(redisClient *c);
92b27fe9 763static void hlenCommand(redisClient *c);
9212eafd 764static void zremrangebyrankCommand(redisClient *c);
5d373da9 765static void zunionstoreCommand(redisClient *c);
766static void zinterstoreCommand(redisClient *c);
78409a0f 767static void hkeysCommand(redisClient *c);
768static void hvalsCommand(redisClient *c);
769static void hgetallCommand(redisClient *c);
a86f14b1 770static void hexistsCommand(redisClient *c);
500ece7c 771static void configCommand(redisClient *c);
01426b05 772static void hincrbyCommand(redisClient *c);
befec3cd 773static void subscribeCommand(redisClient *c);
774static void unsubscribeCommand(redisClient *c);
ffc6b7f8 775static void psubscribeCommand(redisClient *c);
776static void punsubscribeCommand(redisClient *c);
befec3cd 777static void publishCommand(redisClient *c);
37ab76c9 778static void watchCommand(redisClient *c);
779static void unwatchCommand(redisClient *c);
f6b141c5 780
ed9b544e 781/*================================= Globals ================================= */
782
783/* Global vars */
784static struct redisServer server; /* server global state */
1a132bbc 785static struct redisCommand *commandTable;
1a132bbc 786static struct redisCommand readonlyCommandTable[] = {
76583ea4
PN
787 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
789 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
526d00a5 790 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
791 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
792 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
793 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
794 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
795 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
798 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
799 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
800 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
802 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
806 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
807 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
808 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
809 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
810 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
811 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
812 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
813 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
814 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
815 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
816 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
817 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
818 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
819 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
820 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
821 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
822 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
823 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
824 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
825 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
826 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
827 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
828 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
829 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
5d373da9 830 {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
831 {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
832 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
833 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
834 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
835 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
836 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
837 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
838 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
839 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
840 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
1f1c7695 841 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 842 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
d33278d1 843 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 844 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
01426b05 845 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
846 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
847 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
848 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
849 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
850 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 851 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
852 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
853 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
854 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
855 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
856 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
857 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
858 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
859 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
860 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
861 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
862 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
863 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
864 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
865 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
866 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
867 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
868 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
869 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
870 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
871 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
872 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
873 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
874 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
875 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
3805e04f 876 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
76583ea4
PN
877 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
878 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
879 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
880 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
881 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
882 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
883 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
884 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
885 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
886 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 887 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 888 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
889 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 890 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
891 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 892 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
37ab76c9 893 {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
d55d5c5d 894 {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
ed9b544e 895};
bcfc686d 896
ed9b544e 897/*============================ Utility functions ============================ */
898
/* Glob-style pattern matching.
 *
 * Matches the first stringLen bytes of 'string' against the first patternLen
 * bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; [^abc] negates the set, [a-z] is a
 *           range and \x inside the brackets stands for x itself
 *   \x      the byte x taken literally
 *
 * When 'nocase' is non zero bytes are compared after tolower().
 *
 * The buffers do not need to be nul terminated: bytes at or past the given
 * lengths are never inspected, and the lengths never become negative.
 *
 * Returns 1 if the string matches the pattern, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen > 0) {
        switch (pattern[0]) {
        case '*':
            /* Collapse runs of '*', staying inside the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing '*' matches everything left */
            /* Try to match the rest of the pattern at every position. */
            while (stringLen > 0) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A bracket expression always consumes one byte. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while (1) {
                if (patternLen == 0) {
                    /* Unterminated set: step back so that the common
                     * advance after the switch ends the pattern. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (patternLen >= 2 && pattern[0] == '\\') {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        /* tolower() needs an unsigned char value. */
                        start = tolower((unsigned char)start);
                        end = tolower((unsigned char)end);
                        c = tolower((unsigned char)c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((unsigned char)pattern[0]) ==
                            tolower((unsigned char)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((unsigned char)pattern[0]) !=
                    tolower((unsigned char)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String consumed: only trailing '*' may still match. */
            while (patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    return patternLen == 0 && stringLen == 0;
}
1021
/* Convenience wrapper around stringmatchlen() for nul terminated C strings:
 * both lengths are taken with strlen(). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
1025
/* Convert a string representing an amount of memory into the number of
 * bytes. The string is an optional '-', a run of decimal digits and an
 * optional, case insensitive, unit:
 *
 *   (none) or b    bytes
 *   k  / m  / g    1000, 1000*1000, 1000*1000*1000 bytes
 *   kb / mb / gb   1024, 1024*1024, 1024*1024*1024 bytes
 *
 * So for instance memtoll("1gb",NULL) returns 1073741824.
 *
 * If 'err' is not NULL, *err is set to 1 on parsing error (unknown unit,
 * too many digits, or a result that does not fit in a long long) and to 0
 * otherwise. With an unknown unit the number is returned as if it was
 * expressed in bytes. Out of range results are clamped to LLONG_MAX or
 * LLONG_MIN instead of overflowing. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* isdigit() is only defined for unsigned char values (and EOF). */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';

    /* strtoll() clamps and sets errno to ERANGE when the digits alone do
     * not fit in a long long. */
    errno = 0;
    val = strtoll(buf,NULL,10);
    if (errno == ERANGE) {
        if (err) *err = 1;
        return val;
    }
    /* mul is always positive: check the product before computing it, as
     * signed overflow is undefined behavior. */
    if (val > LLONG_MAX/mul) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    if (val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return LLONG_MIN;
    }
    return val*mul;
}
1072
/* Convert a long long into its decimal string representation.
 *
 * At most len-1 characters are stored into 's', followed by a nul
 * terminator. If the buffer is too small the output is truncated, keeping
 * the leading (most significant) characters.
 *
 * Returns the number of characters stored, not counting the terminator.
 * If len is 0 nothing is written and 0 is returned. */
static int ll2string(char *s, size_t len, long long value) {
    char buf[32], *p;
    unsigned long long v;
    size_t l;

    if (len == 0) return 0;
    /* Negate in unsigned arithmetic: -value overflows for LLONG_MIN. */
    v = (value < 0) ? 0ULL-(unsigned long long)value
                    : (unsigned long long)value;
    p = buf+sizeof(buf)-1; /* point to the last character */
    do {
        *p-- = '0'+(v%10);
        v /= 10;
    } while(v);
    if (value < 0) *p-- = '-';
    p++;
    l = (size_t)(buf+sizeof(buf)-p);
    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
    memcpy(s,p,l);
    s[l] = '\0';
    return (int)l;
}
1096
56906eef 1097static void redisLog(int level, const char *fmt, ...) {
ed9b544e 1098 va_list ap;
1099 FILE *fp;
1100
1101 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1102 if (!fp) return;
1103
1104 va_start(ap, fmt);
1105 if (level >= server.verbosity) {
6766f45e 1106 char *c = ".-*#";
1904ecc1 1107 char buf[64];
1108 time_t now;
1109
1110 now = time(NULL);
6c9385e0 1111 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 1112 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 1113 vfprintf(fp, fmt, ap);
1114 fprintf(fp,"\n");
1115 fflush(fp);
1116 }
1117 va_end(ap);
1118
1119 if (server.logfile) fclose(fp);
1120}
1121
1122/*====================== Hash table type implementation ==================== */
1123
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and Redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1127
/* Dict destructor for values that are plain zmalloc-style allocations:
 * the pointer is simply handed to zfree(). */
static void dictVanillaFree(void *privdata, void *val)
{
    zfree(val);
    DICT_NOTUSED(privdata);
}
1133
4409877e 1134static void dictListDestructor(void *privdata, void *val)
1135{
1136 DICT_NOTUSED(privdata);
1137 listRelease((list*)val);
1138}
1139
09241813 1140static int dictSdsKeyCompare(void *privdata, const void *key1,
ed9b544e 1141 const void *key2)
1142{
1143 int l1,l2;
1144 DICT_NOTUSED(privdata);
1145
1146 l1 = sdslen((sds)key1);
1147 l2 = sdslen((sds)key2);
1148 if (l1 != l2) return 0;
1149 return memcmp(key1, key2, l1) == 0;
1150}
1151
/* Dict destructor for Redis objects: drops one reference. A NULL value is
 * ignored, since values of swapped out keys are set to NULL. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    if (val != NULL)
        decrRefCount(val);
}
1159
/* Dict destructor for sds strings: frees the string with sdsfree(). */
static void dictSdsDestructor(void *privdata, void *val)
{
    sdsfree(val);
    DICT_NOTUSED(privdata);
}
1166
942a3961 1167static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1168 const void *key2)
1169{
1170 const robj *o1 = key1, *o2 = key2;
09241813 1171 return dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
ed9b544e 1172}
1173
942a3961 1174static unsigned int dictObjHash(const void *key) {
ed9b544e 1175 const robj *o = key;
1176 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1177}
1178
/* Dict hash function for keys that are sds strings: hashes the bytes of
 * the string. */
static unsigned int dictSdsHash(const void *key) {
    int len = sdslen((char*)key);

    return dictGenHashFunction((unsigned char*)key, len);
}
1182
942a3961 1183static int dictEncObjKeyCompare(void *privdata, const void *key1,
1184 const void *key2)
1185{
9d65a1bb 1186 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1187 int cmp;
942a3961 1188
2a1198b4 1189 if (o1->encoding == REDIS_ENCODING_INT &&
dc05abde 1190 o2->encoding == REDIS_ENCODING_INT)
1191 return o1->ptr == o2->ptr;
2a1198b4 1192
9d65a1bb 1193 o1 = getDecodedObject(o1);
1194 o2 = getDecodedObject(o2);
09241813 1195 cmp = dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
9d65a1bb 1196 decrRefCount(o1);
1197 decrRefCount(o2);
1198 return cmp;
942a3961 1199}
1200
1201static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1202 robj *o = (robj*) key;
942a3961 1203
ed9e4966 1204 if (o->encoding == REDIS_ENCODING_RAW) {
1205 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1206 } else {
1207 if (o->encoding == REDIS_ENCODING_INT) {
1208 char buf[32];
1209 int len;
1210
ee14da56 1211 len = ll2string(buf,32,(long)o->ptr);
ed9e4966 1212 return dictGenHashFunction((unsigned char*)buf, len);
1213 } else {
1214 unsigned int hash;
1215
1216 o = getDecodedObject(o);
1217 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1218 decrRefCount(o);
1219 return hash;
1220 }
1221 }
942a3961 1222}
1223
09241813 1224/* Sets type */
ed9b544e 1225static dictType setDictType = {
942a3961 1226 dictEncObjHash, /* hash function */
ed9b544e 1227 NULL, /* key dup */
1228 NULL, /* val dup */
942a3961 1229 dictEncObjKeyCompare, /* key compare */
ed9b544e 1230 dictRedisObjectDestructor, /* key destructor */
1231 NULL /* val destructor */
1232};
1233
f2d9f50f 1234/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1812e024 1235static dictType zsetDictType = {
1236 dictEncObjHash, /* hash function */
1237 NULL, /* key dup */
1238 NULL, /* val dup */
1239 dictEncObjKeyCompare, /* key compare */
1240 dictRedisObjectDestructor, /* key destructor */
da0a1620 1241 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1242};
1243
09241813 1244/* Db->dict, keys are sds strings, vals are Redis objects. */
5234952b 1245static dictType dbDictType = {
09241813 1246 dictSdsHash, /* hash function */
ed9b544e 1247 NULL, /* key dup */
1248 NULL, /* val dup */
09241813 1249 dictSdsKeyCompare, /* key compare */
1250 dictSdsDestructor, /* key destructor */
ed9b544e 1251 dictRedisObjectDestructor /* val destructor */
1252};
1253
f2d9f50f 1254/* Db->expires */
1255static dictType keyptrDictType = {
09241813 1256 dictSdsHash, /* hash function */
f2d9f50f 1257 NULL, /* key dup */
1258 NULL, /* val dup */
09241813 1259 dictSdsKeyCompare, /* key compare */
1260 dictSdsDestructor, /* key destructor */
f2d9f50f 1261 NULL /* val destructor */
1262};
1263
5234952b 1264/* Hash type hash table (note that small hashes are represented with zimpaps) */
1265static dictType hashDictType = {
1266 dictEncObjHash, /* hash function */
1267 NULL, /* key dup */
1268 NULL, /* val dup */
1269 dictEncObjKeyCompare, /* key compare */
1270 dictRedisObjectDestructor, /* key destructor */
1271 dictRedisObjectDestructor /* val destructor */
1272};
1273
4409877e 1274/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1275 * lists as values. It's used for blocking operations (BLPOP) and to
1276 * map swapped keys to a list of clients waiting for this keys to be loaded. */
4409877e 1277static dictType keylistDictType = {
1278 dictObjHash, /* hash function */
1279 NULL, /* key dup */
1280 NULL, /* val dup */
1281 dictObjKeyCompare, /* key compare */
1282 dictRedisObjectDestructor, /* key destructor */
1283 dictListDestructor /* val destructor */
1284};
1285
42ab0172
AO
/* Forward declaration. (void) makes it a real prototype, so that a call
 * passing arguments is rejected at compile time. */
static void version(void);
1287
ed9b544e 1288/* ========================= Random utility functions ======================= */
1289
1290/* Redis generally does not try to recover from out of memory conditions
1291 * when allocating objects or strings, it is not clear if it will be possible
1292 * to report this condition to the client since the networking layer itself
1293 * is based on heap allocation for send buffers, so we simply abort.
1294 * At least the code will be simpler to read... */
1295static void oom(const char *msg) {
71c54b21 1296 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1297 sleep(1);
1298 abort();
1299}
1300
1301/* ====================== Redis server networking stuff ===================== */
56906eef 1302static void closeTimedoutClients(void) {
ed9b544e 1303 redisClient *c;
ed9b544e 1304 listNode *ln;
1305 time_t now = time(NULL);
c7df85a4 1306 listIter li;
ed9b544e 1307
c7df85a4 1308 listRewind(server.clients,&li);
1309 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1310 c = listNodeValue(ln);
f86a74e9 1311 if (server.maxidletime &&
1312 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1313 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1314 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1315 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1316 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1317 {
f870935d 1318 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1319 freeClient(c);
f86a74e9 1320 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1321 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1322 addReply(c,shared.nullmultibulk);
b0d8747d 1323 unblockClientWaitingData(c);
f86a74e9 1324 }
ed9b544e 1325 }
1326 }
ed9b544e 1327}
1328
12fea928 1329static int htNeedsResize(dict *dict) {
1330 long long size, used;
1331
1332 size = dictSlots(dict);
1333 used = dictSize(dict);
1334 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1335 (used*100/size < REDIS_HT_MINFILL));
1336}
1337
0bc03378 1338/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1339 * we resize the hash table to save memory */
56906eef 1340static void tryResizeHashTables(void) {
0bc03378 1341 int j;
1342
1343 for (j = 0; j < server.dbnum; j++) {
5413c40d 1344 if (htNeedsResize(server.db[j].dict))
0bc03378 1345 dictResize(server.db[j].dict);
12fea928 1346 if (htNeedsResize(server.db[j].expires))
1347 dictResize(server.db[j].expires);
0bc03378 1348 }
1349}
1350
8ca3e9d1 1351/* Our hash table implementation performs rehashing incrementally while
1352 * we write/read from the hash table. Still if the server is idle, the hash
1353 * table will use two tables for a long time. So we try to use 1 millisecond
1354 * of CPU time at every serverCron() loop in order to rehash some key. */
1355static void incrementallyRehash(void) {
1356 int j;
1357
1358 for (j = 0; j < server.dbnum; j++) {
1359 if (dictIsRehashing(server.db[j].dict)) {
1360 dictRehashMilliseconds(server.db[j].dict,1);
1361 break; /* already used our millisecond for this loop... */
1362 }
1363 }
1364}
1365
9d65a1bb 1366/* A background saving child (BGSAVE) terminated its work. Handle this. */
1367void backgroundSaveDoneHandler(int statloc) {
1368 int exitcode = WEXITSTATUS(statloc);
1369 int bysignal = WIFSIGNALED(statloc);
1370
1371 if (!bysignal && exitcode == 0) {
1372 redisLog(REDIS_NOTICE,
1373 "Background saving terminated with success");
1374 server.dirty = 0;
1375 server.lastsave = time(NULL);
1376 } else if (!bysignal && exitcode != 0) {
1377 redisLog(REDIS_WARNING, "Background saving error");
1378 } else {
1379 redisLog(REDIS_WARNING,
454eea7c 1380 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1381 rdbRemoveTempFile(server.bgsavechildpid);
1382 }
1383 server.bgsavechildpid = -1;
1384 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1385 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1386 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1387}
1388
1389/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1390 * Handle this. */
1391void backgroundRewriteDoneHandler(int statloc) {
1392 int exitcode = WEXITSTATUS(statloc);
1393 int bysignal = WIFSIGNALED(statloc);
1394
1395 if (!bysignal && exitcode == 0) {
1396 int fd;
1397 char tmpfile[256];
1398
1399 redisLog(REDIS_NOTICE,
1400 "Background append only file rewriting terminated with success");
1401 /* Now it's time to flush the differences accumulated by the parent */
1402 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1403 fd = open(tmpfile,O_WRONLY|O_APPEND);
1404 if (fd == -1) {
1405 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1406 goto cleanup;
1407 }
1408 /* Flush our data... */
1409 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1410 (signed) sdslen(server.bgrewritebuf)) {
1411 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1412 close(fd);
1413 goto cleanup;
1414 }
b32627cd 1415 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1416 /* Now our work is to rename the temp file into the stable file. And
1417 * switch the file descriptor used by the server for append only. */
1418 if (rename(tmpfile,server.appendfilename) == -1) {
1419 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1420 close(fd);
1421 goto cleanup;
1422 }
1423 /* Mission completed... almost */
1424 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1425 if (server.appendfd != -1) {
1426 /* If append only is actually enabled... */
1427 close(server.appendfd);
1428 server.appendfd = fd;
d5d23dab 1429 if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
85a83172 1430 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1431 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1432 } else {
1433 /* If append only is disabled we just generate a dump in this
1434 * format. Why not? */
1435 close(fd);
1436 }
1437 } else if (!bysignal && exitcode != 0) {
1438 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1439 } else {
1440 redisLog(REDIS_WARNING,
454eea7c 1441 "Background append only file rewriting terminated by signal %d",
1442 WTERMSIG(statloc));
9d65a1bb 1443 }
1444cleanup:
1445 sdsfree(server.bgrewritebuf);
1446 server.bgrewritebuf = sdsempty();
1447 aofRemoveTempFile(server.bgrewritechildpid);
1448 server.bgrewritechildpid = -1;
1449}
1450
884d4b39 1451/* This function is called once a background process of some kind terminates,
1452 * as we want to avoid resizing the hash tables when there is a child in order
1453 * to play well with copy-on-write (otherwise when a resize happens lots of
1454 * memory pages are copied). The goal of this function is to update the ability
1455 * for dict.c to resize the hash tables accordingly to the fact we have o not
1456 * running childs. */
1457static void updateDictResizePolicy(void) {
1458 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1459 dictEnableResize();
1460 else
1461 dictDisableResize();
1462}
1463
56906eef 1464static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1465 int j, loops = server.cronloops++;
ed9b544e 1466 REDIS_NOTUSED(eventLoop);
1467 REDIS_NOTUSED(id);
1468 REDIS_NOTUSED(clientData);
1469
3a66edc7 1470 /* We take a cached value of the unix time in the global state because
1471 * with virtual memory and aging there is to store the current time
1472 * in objects at every object access, and accuracy is not needed.
1473 * To access a global var is faster than calling time(NULL) */
1474 server.unixtime = time(NULL);
560db612 1475 /* We have just 21 bits per object for LRU information.
1476 * So we use an (eventually wrapping) LRU clock with minutes resolution.
1477 *
1478 * When we need to select what object to swap, we compute the minimum
1479 * time distance between the current lruclock and the object last access
1480 * lruclock info. Even if clocks will wrap on overflow, there is
1481 * the interesting property that we are sure that at least
1482 * ABS(A-B) minutes passed between current time and timestamp B.
1483 *
1484 * This is not precise but we don't need at all precision, but just
1485 * something statistically reasonable.
1486 */
1487 server.lruclock = (time(NULL)/60)&((1<<21)-1);
3a66edc7 1488
fab43727 1489 /* We received a SIGTERM, shutting down here in a safe way, as it is
1490 * not ok doing so inside the signal handler. */
1491 if (server.shutdown_asap) {
1492 if (prepareForShutdown() == REDIS_OK) exit(0);
1493 redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
1494 }
1495
0bc03378 1496 /* Show some info about non-empty databases */
ed9b544e 1497 for (j = 0; j < server.dbnum; j++) {
dec423d9 1498 long long size, used, vkeys;
94754ccc 1499
3305306f 1500 size = dictSlots(server.db[j].dict);
1501 used = dictSize(server.db[j].dict);
94754ccc 1502 vkeys = dictSize(server.db[j].expires);
1763929f 1503 if (!(loops % 50) && (used || vkeys)) {
f870935d 1504 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1505 /* dictPrintStats(server.dict); */
ed9b544e 1506 }
ed9b544e 1507 }
1508
0bc03378 1509 /* We don't want to resize the hash tables while a bacground saving
1510 * is in progress: the saving child is created using fork() that is
1511 * implemented with a copy-on-write semantic in most modern systems, so
1512 * if we resize the HT while there is the saving child at work actually
1513 * a lot of memory movements in the parent will cause a lot of pages
1514 * copied. */
8ca3e9d1 1515 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1516 if (!(loops % 10)) tryResizeHashTables();
1517 if (server.activerehashing) incrementallyRehash();
884d4b39 1518 }
0bc03378 1519
ed9b544e 1520 /* Show information about connected clients */
1763929f 1521 if (!(loops % 50)) {
bdcb92f2 1522 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1523 listLength(server.clients)-listLength(server.slaves),
1524 listLength(server.slaves),
bdcb92f2 1525 zmalloc_used_memory());
ed9b544e 1526 }
1527
1528 /* Close connections of timedout clients */
1763929f 1529 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1530 closeTimedoutClients();
1531
9d65a1bb 1532 /* Check if a background saving or AOF rewrite in progress terminated */
1533 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1534 int statloc;
9d65a1bb 1535 pid_t pid;
1536
1537 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1538 if (pid == server.bgsavechildpid) {
1539 backgroundSaveDoneHandler(statloc);
ed9b544e 1540 } else {
9d65a1bb 1541 backgroundRewriteDoneHandler(statloc);
ed9b544e 1542 }
884d4b39 1543 updateDictResizePolicy();
ed9b544e 1544 }
1545 } else {
1546 /* If there is not a background saving in progress check if
1547 * we have to save now */
1548 time_t now = time(NULL);
1549 for (j = 0; j < server.saveparamslen; j++) {
1550 struct saveparam *sp = server.saveparams+j;
1551
1552 if (server.dirty >= sp->changes &&
1553 now-server.lastsave > sp->seconds) {
1554 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1555 sp->changes, sp->seconds);
f78fd11b 1556 rdbSaveBackground(server.dbfilename);
ed9b544e 1557 break;
1558 }
1559 }
1560 }
94754ccc 1561
f2324293 1562 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1563 * will use few CPU cycles if there are few expiring keys, otherwise
1564 * it will get more aggressive to avoid that too much memory is used by
1565 * keys that can be removed from the keyspace. */
94754ccc 1566 for (j = 0; j < server.dbnum; j++) {
f2324293 1567 int expired;
94754ccc 1568 redisDb *db = server.db+j;
94754ccc 1569
f2324293 1570 /* Continue to expire if at the end of the cycle more than 25%
1571 * of the keys were expired. */
1572 do {
4ef8de8a 1573 long num = dictSize(db->expires);
94754ccc 1574 time_t now = time(NULL);
1575
f2324293 1576 expired = 0;
94754ccc 1577 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1578 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1579 while (num--) {
1580 dictEntry *de;
1581 time_t t;
1582
1583 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1584 t = (time_t) dictGetEntryVal(de);
1585 if (now > t) {
09241813 1586 sds key = dictGetEntryKey(de);
1587 robj *keyobj = createStringObject(key,sdslen(key));
1588
1589 dbDelete(db,keyobj);
1590 decrRefCount(keyobj);
f2324293 1591 expired++;
2a6a2ed1 1592 server.stat_expiredkeys++;
94754ccc 1593 }
1594 }
f2324293 1595 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1596 }
1597
4ef8de8a 1598 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1599 * is enbled. Try to free objects from the free list first. */
7e69548d 1600 if (vmCanSwapOut()) {
1601 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1602 server.vm_max_memory)
1603 {
72e9fd40 1604 int retval;
1605
a5819310 1606 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1607 retval = (server.vm_max_threads == 0) ?
1608 vmSwapOneObjectBlocking() :
1609 vmSwapOneObjectThreaded();
1763929f 1610 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1611 zmalloc_used_memory() >
1612 (server.vm_max_memory+server.vm_max_memory/10))
1613 {
1614 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1615 }
72e9fd40 1616 /* Note that when using threade I/O we free just one object,
1617 * because anyway when the I/O thread in charge to swap this
1618 * object out will finish, the handler of completed jobs
1619 * will try to swap more objects if we are still out of memory. */
1620 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1621 }
1622 }
1623
ed9b544e 1624 /* Check if we should connect to a MASTER */
1763929f 1625 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1626 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1627 if (syncWithMaster() == REDIS_OK) {
1628 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
8f63ddca 1629 if (server.appendonly) rewriteAppendOnlyFileBackground();
ed9b544e 1630 }
1631 }
1763929f 1632 return 100;
ed9b544e 1633}
1634
d5d55fc3 1635/* This function gets called every time Redis is entering the
1636 * main loop of the event driven library, that is, before to sleep
1637 * for ready file descriptors. */
1638static void beforeSleep(struct aeEventLoop *eventLoop) {
1639 REDIS_NOTUSED(eventLoop);
1640
28ed1f33 1641 /* Awake clients that got all the swapped keys they requested */
d5d55fc3 1642 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1643 listIter li;
1644 listNode *ln;
1645
1646 listRewind(server.io_ready_clients,&li);
1647 while((ln = listNext(&li))) {
1648 redisClient *c = ln->value;
1649 struct redisCommand *cmd;
1650
1651 /* Resume the client. */
1652 listDelNode(server.io_ready_clients,ln);
1653 c->flags &= (~REDIS_IO_WAIT);
1654 server.vm_blocked_clients--;
1655 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1656 readQueryFromClient, c);
1657 cmd = lookupCommand(c->argv[0]->ptr);
1658 assert(cmd != NULL);
1659 call(c,cmd);
1660 resetClient(c);
1661 /* There may be more data to process in the input buffer. */
1662 if (c->querybuf && sdslen(c->querybuf) > 0)
1663 processInputBuffer(c);
1664 }
1665 }
28ed1f33 1666 /* Write the AOF buffer on disk */
1667 flushAppendOnlyFile();
d5d55fc3 1668}
1669
ed9b544e 1670static void createSharedObjects(void) {
05df7621 1671 int j;
1672
ed9b544e 1673 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1674 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1675 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1676 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1677 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1678 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1679 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1680 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1681 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1682 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1683 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1684 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1685 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1686 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1687 "-ERR no such key\r\n"));
ed9b544e 1688 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1689 "-ERR syntax error\r\n"));
c937aa89 1690 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1691 "-ERR source and destination objects are the same\r\n"));
1692 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1693 "-ERR index out of range\r\n"));
ed9b544e 1694 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1695 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1696 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1697 shared.select0 = createStringObject("select 0\r\n",10);
1698 shared.select1 = createStringObject("select 1\r\n",10);
1699 shared.select2 = createStringObject("select 2\r\n",10);
1700 shared.select3 = createStringObject("select 3\r\n",10);
1701 shared.select4 = createStringObject("select 4\r\n",10);
1702 shared.select5 = createStringObject("select 5\r\n",10);
1703 shared.select6 = createStringObject("select 6\r\n",10);
1704 shared.select7 = createStringObject("select 7\r\n",10);
1705 shared.select8 = createStringObject("select 8\r\n",10);
1706 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1707 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
c8d0ea0e 1708 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
befec3cd 1709 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1710 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1711 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1712 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1713 shared.mbulk3 = createStringObject("*3\r\n",4);
c8d0ea0e 1714 shared.mbulk4 = createStringObject("*4\r\n",4);
05df7621 1715 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1716 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1717 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1718 }
ed9b544e 1719}
1720
1721static void appendServerSaveParams(time_t seconds, int changes) {
1722 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1723 server.saveparams[server.saveparamslen].seconds = seconds;
1724 server.saveparams[server.saveparamslen].changes = changes;
1725 server.saveparamslen++;
1726}
1727
bcfc686d 1728static void resetServerSaveParams() {
ed9b544e 1729 zfree(server.saveparams);
1730 server.saveparams = NULL;
1731 server.saveparamslen = 0;
1732}
1733
1734static void initServerConfig() {
1735 server.dbnum = REDIS_DEFAULT_DBNUM;
1736 server.port = REDIS_SERVERPORT;
f870935d 1737 server.verbosity = REDIS_VERBOSE;
ed9b544e 1738 server.maxidletime = REDIS_MAXIDLETIME;
1739 server.saveparams = NULL;
1740 server.logfile = NULL; /* NULL = log on standard output */
1741 server.bindaddr = NULL;
1742 server.glueoutputbuf = 1;
1743 server.daemonize = 0;
44b38ef4 1744 server.appendonly = 0;
1b677732 1745 server.appendfsync = APPENDFSYNC_EVERYSEC;
38db9171 1746 server.no_appendfsync_on_rewrite = 0;
48f0308a 1747 server.lastfsync = time(NULL);
44b38ef4 1748 server.appendfd = -1;
1749 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1750 server.pidfile = zstrdup("/var/run/redis.pid");
1751 server.dbfilename = zstrdup("dump.rdb");
1752 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1753 server.requirepass = NULL;
b0553789 1754 server.rdbcompression = 1;
8ca3e9d1 1755 server.activerehashing = 1;
285add55 1756 server.maxclients = 0;
d5d55fc3 1757 server.blpop_blocked_clients = 0;
3fd78bcd 1758 server.maxmemory = 0;
75680a3c 1759 server.vm_enabled = 0;
054e426d 1760 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1761 server.vm_page_size = 256; /* 256 bytes per page */
1762 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1763 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1764 server.vm_max_threads = 4;
d5d55fc3 1765 server.vm_blocked_clients = 0;
cbba7dd7 1766 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1767 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
d0686e07
PN
1768 server.list_max_ziplist_entries = REDIS_LIST_MAX_ZIPLIST_ENTRIES;
1769 server.list_max_ziplist_value = REDIS_LIST_MAX_ZIPLIST_VALUE;
70ff3511 1770 server.set_max_intset_entries = REDIS_SET_MAX_INTSET_ENTRIES;
fab43727 1771 server.shutdown_asap = 0;
75680a3c 1772
bcfc686d 1773 resetServerSaveParams();
ed9b544e 1774
1775 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1776 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1777 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1778 /* Replication related */
1779 server.isslave = 0;
d0ccebcf 1780 server.masterauth = NULL;
ed9b544e 1781 server.masterhost = NULL;
1782 server.masterport = 6379;
1783 server.master = NULL;
1784 server.replstate = REDIS_REPL_NONE;
a7866db6 1785
1786 /* Double constants initialization */
1787 R_Zero = 0.0;
1788 R_PosInf = 1.0/R_Zero;
1789 R_NegInf = -1.0/R_Zero;
1790 R_Nan = R_Zero/R_Zero;
ed9b544e 1791}
1792
1793static void initServer() {
1794 int j;
1795
1796 signal(SIGHUP, SIG_IGN);
1797 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1798 setupSigSegvAction();
ed9b544e 1799
b9bc0eef 1800 server.devnull = fopen("/dev/null","w");
1801 if (server.devnull == NULL) {
1802 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1803 exit(1);
1804 }
ed9b544e 1805 server.clients = listCreate();
1806 server.slaves = listCreate();
87eca727 1807 server.monitors = listCreate();
ed9b544e 1808 server.objfreelist = listCreate();
1809 createSharedObjects();
1810 server.el = aeCreateEventLoop();
3305306f 1811 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1812 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1813 if (server.fd == -1) {
1814 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1815 exit(1);
1816 }
3305306f 1817 for (j = 0; j < server.dbnum; j++) {
5234952b 1818 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1819 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
37ab76c9 1820 server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
1821 server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1822 if (server.vm_enabled)
1823 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1824 server.db[j].id = j;
1825 }
ffc6b7f8 1826 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1827 server.pubsub_patterns = listCreate();
1828 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1829 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1830 server.cronloops = 0;
9f3c422c 1831 server.bgsavechildpid = -1;
9d65a1bb 1832 server.bgrewritechildpid = -1;
1833 server.bgrewritebuf = sdsempty();
28ed1f33 1834 server.aofbuf = sdsempty();
ed9b544e 1835 server.lastsave = time(NULL);
1836 server.dirty = 0;
ed9b544e 1837 server.stat_numcommands = 0;
1838 server.stat_numconnections = 0;
2a6a2ed1 1839 server.stat_expiredkeys = 0;
ed9b544e 1840 server.stat_starttime = time(NULL);
3a66edc7 1841 server.unixtime = time(NULL);
d8f8b666 1842 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1843 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1844 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1845
1846 if (server.appendonly) {
3bb225d6 1847 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1848 if (server.appendfd == -1) {
1849 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1850 strerror(errno));
1851 exit(1);
1852 }
1853 }
75680a3c 1854
1855 if (server.vm_enabled) vmInit();
ed9b544e 1856}
1857
1858/* Empty the whole database */
ca37e9cd 1859static long long emptyDb() {
ed9b544e 1860 int j;
ca37e9cd 1861 long long removed = 0;
ed9b544e 1862
3305306f 1863 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1864 removed += dictSize(server.db[j].dict);
3305306f 1865 dictEmpty(server.db[j].dict);
1866 dictEmpty(server.db[j].expires);
1867 }
ca37e9cd 1868 return removed;
ed9b544e 1869}
1870
/* Convert a "yes" / "no" string (compared case insensitively) to an integer.
 *
 * Returns 1 for "yes", 0 for "no" and -1 for anything else, including a
 * NULL pointer (passing NULL to strcasecmp() would be undefined). */
static int yesnotoi(char *s) {
    if (s == NULL) return -1;
    if (!strcasecmp(s,"yes")) return 1;
    if (!strcasecmp(s,"no")) return 0;
    return -1;
}
1876
ed9b544e 1877/* I agree, this is a very rudimental way to load a configuration...
1878 will improve later if the config gets more complex */
1879static void loadServerConfig(char *filename) {
c9a111ac 1880 FILE *fp;
ed9b544e 1881 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1882 int linenum = 0;
1883 sds line = NULL;
c9a111ac 1884
1885 if (filename[0] == '-' && filename[1] == '\0')
1886 fp = stdin;
1887 else {
1888 if ((fp = fopen(filename,"r")) == NULL) {
9a22de82 1889 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
c9a111ac 1890 exit(1);
1891 }
ed9b544e 1892 }
c9a111ac 1893
ed9b544e 1894 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1895 sds *argv;
1896 int argc, j;
1897
1898 linenum++;
1899 line = sdsnew(buf);
1900 line = sdstrim(line," \t\r\n");
1901
1902 /* Skip comments and blank lines*/
1903 if (line[0] == '#' || line[0] == '\0') {
1904 sdsfree(line);
1905 continue;
1906 }
1907
1908 /* Split into arguments */
1909 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1910 sdstolower(argv[0]);
1911
1912 /* Execute config directives */
bb0b03a3 1913 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1914 server.maxidletime = atoi(argv[1]);
0150db36 1915 if (server.maxidletime < 0) {
ed9b544e 1916 err = "Invalid timeout value"; goto loaderr;
1917 }
bb0b03a3 1918 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1919 server.port = atoi(argv[1]);
1920 if (server.port < 1 || server.port > 65535) {
1921 err = "Invalid port"; goto loaderr;
1922 }
bb0b03a3 1923 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1924 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1925 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1926 int seconds = atoi(argv[1]);
1927 int changes = atoi(argv[2]);
1928 if (seconds < 1 || changes < 0) {
1929 err = "Invalid save parameters"; goto loaderr;
1930 }
1931 appendServerSaveParams(seconds,changes);
bb0b03a3 1932 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1933 if (chdir(argv[1]) == -1) {
1934 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1935 argv[1], strerror(errno));
1936 exit(1);
1937 }
bb0b03a3 1938 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1939 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1940 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1941 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1942 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1943 else {
1944 err = "Invalid log level. Must be one of debug, notice, warning";
1945 goto loaderr;
1946 }
bb0b03a3 1947 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1948 FILE *logfp;
ed9b544e 1949
1950 server.logfile = zstrdup(argv[1]);
bb0b03a3 1951 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1952 zfree(server.logfile);
1953 server.logfile = NULL;
1954 }
1955 if (server.logfile) {
1956 /* Test if we are able to open the file. The server will not
1957 * be able to abort just for this problem later... */
c9a111ac 1958 logfp = fopen(server.logfile,"a");
1959 if (logfp == NULL) {
ed9b544e 1960 err = sdscatprintf(sdsempty(),
1961 "Can't open the log file: %s", strerror(errno));
1962 goto loaderr;
1963 }
c9a111ac 1964 fclose(logfp);
ed9b544e 1965 }
bb0b03a3 1966 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1967 server.dbnum = atoi(argv[1]);
1968 if (server.dbnum < 1) {
1969 err = "Invalid number of databases"; goto loaderr;
1970 }
b3f83f12
JZ
1971 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1972 loadServerConfig(argv[1]);
285add55 1973 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1974 server.maxclients = atoi(argv[1]);
3fd78bcd 1975 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
2b619329 1976 server.maxmemory = memtoll(argv[1],NULL);
bb0b03a3 1977 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1978 server.masterhost = sdsnew(argv[1]);
1979 server.masterport = atoi(argv[2]);
1980 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1981 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1982 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1983 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1984 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1985 err = "argument must be 'yes' or 'no'"; goto loaderr;
1986 }
121f70cf 1987 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1988 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
8ca3e9d1 1989 err = "argument must be 'yes' or 'no'"; goto loaderr;
1990 }
1991 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1992 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
121f70cf 1993 err = "argument must be 'yes' or 'no'"; goto loaderr;
1994 }
bb0b03a3 1995 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1996 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1997 err = "argument must be 'yes' or 'no'"; goto loaderr;
1998 }
44b38ef4 1999 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
2000 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
2001 err = "argument must be 'yes' or 'no'"; goto loaderr;
2002 }
f3b52411
PN
2003 } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
2004 zfree(server.appendfilename);
2005 server.appendfilename = zstrdup(argv[1]);
38db9171 2006 } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
2007 && argc == 2) {
2008 if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
2009 err = "argument must be 'yes' or 'no'"; goto loaderr;
2010 }
48f0308a 2011 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 2012 if (!strcasecmp(argv[1],"no")) {
48f0308a 2013 server.appendfsync = APPENDFSYNC_NO;
1766c6da 2014 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 2015 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 2016 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 2017 server.appendfsync = APPENDFSYNC_EVERYSEC;
2018 } else {
2019 err = "argument must be 'no', 'always' or 'everysec'";
2020 goto loaderr;
2021 }
bb0b03a3 2022 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 2023 server.requirepass = zstrdup(argv[1]);
bb0b03a3 2024 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 2025 zfree(server.pidfile);
054e426d 2026 server.pidfile = zstrdup(argv[1]);
bb0b03a3 2027 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 2028 zfree(server.dbfilename);
054e426d 2029 server.dbfilename = zstrdup(argv[1]);
75680a3c 2030 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
2031 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
2032 err = "argument must be 'yes' or 'no'"; goto loaderr;
2033 }
054e426d 2034 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 2035 zfree(server.vm_swap_file);
054e426d 2036 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 2037 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2b619329 2038 server.vm_max_memory = memtoll(argv[1],NULL);
4ef8de8a 2039 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2b619329 2040 server.vm_page_size = memtoll(argv[1], NULL);
4ef8de8a 2041 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2b619329 2042 server.vm_pages = memtoll(argv[1], NULL);
92f8e882 2043 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
2044 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 2045 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2b619329 2046 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
cbba7dd7 2047 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2b619329 2048 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
d0686e07
PN
2049 } else if (!strcasecmp(argv[0],"list-max-ziplist-entries") && argc == 2){
2050 server.list_max_ziplist_entries = memtoll(argv[1], NULL);
2051 } else if (!strcasecmp(argv[0],"list-max-ziplist-value") && argc == 2){
2052 server.list_max_ziplist_value = memtoll(argv[1], NULL);
70ff3511
PN
2053 } else if (!strcasecmp(argv[0],"set-max-intset-entries") && argc == 2){
2054 server.set_max_intset_entries = memtoll(argv[1], NULL);
ed9b544e 2055 } else {
2056 err = "Bad directive or wrong number of arguments"; goto loaderr;
2057 }
2058 for (j = 0; j < argc; j++)
2059 sdsfree(argv[j]);
2060 zfree(argv);
2061 sdsfree(line);
2062 }
c9a111ac 2063 if (fp != stdin) fclose(fp);
ed9b544e 2064 return;
2065
2066loaderr:
2067 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
2068 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
2069 fprintf(stderr, ">>> '%s'\n", line);
2070 fprintf(stderr, "%s\n", err);
2071 exit(1);
2072}
2073
2074static void freeClientArgv(redisClient *c) {
2075 int j;
2076
2077 for (j = 0; j < c->argc; j++)
2078 decrRefCount(c->argv[j]);
e8a74421 2079 for (j = 0; j < c->mbargc; j++)
2080 decrRefCount(c->mbargv[j]);
ed9b544e 2081 c->argc = 0;
e8a74421 2082 c->mbargc = 0;
ed9b544e 2083}
2084
2085static void freeClient(redisClient *c) {
2086 listNode *ln;
2087
4409877e 2088 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 2089 * call, we have to set querybuf to NULL *before* to call
2090 * unblockClientWaitingData() to avoid processInputBuffer() will get
2091 * called. Also it is important to remove the file events after
2092 * this, because this call adds the READABLE event. */
4409877e 2093 sdsfree(c->querybuf);
2094 c->querybuf = NULL;
2095 if (c->flags & REDIS_BLOCKED)
b0d8747d 2096 unblockClientWaitingData(c);
4409877e 2097
37ab76c9 2098 /* UNWATCH all the keys */
2099 unwatchAllKeys(c);
2100 listRelease(c->watched_keys);
ffc6b7f8 2101 /* Unsubscribe from all the pubsub channels */
2102 pubsubUnsubscribeAllChannels(c,0);
2103 pubsubUnsubscribeAllPatterns(c,0);
2104 dictRelease(c->pubsub_channels);
2105 listRelease(c->pubsub_patterns);
befec3cd 2106 /* Obvious cleanup */
ed9b544e 2107 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
2108 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 2109 listRelease(c->reply);
2110 freeClientArgv(c);
2111 close(c->fd);
92f8e882 2112 /* Remove from the list of clients */
ed9b544e 2113 ln = listSearchKey(server.clients,c);
dfc5e96c 2114 redisAssert(ln != NULL);
ed9b544e 2115 listDelNode(server.clients,ln);
37ab76c9 2116 /* Remove from the list of clients that are now ready to be restarted
2117 * after waiting for swapped keys */
d5d55fc3 2118 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
2119 ln = listSearchKey(server.io_ready_clients,c);
2120 if (ln) {
2121 listDelNode(server.io_ready_clients,ln);
2122 server.vm_blocked_clients--;
2123 }
2124 }
37ab76c9 2125 /* Remove from the list of clients waiting for swapped keys */
d5d55fc3 2126 while (server.vm_enabled && listLength(c->io_keys)) {
2127 ln = listFirst(c->io_keys);
2128 dontWaitForSwappedKey(c,ln->value);
92f8e882 2129 }
b3e3d0d7 2130 listRelease(c->io_keys);
befec3cd 2131 /* Master/slave cleanup */
ed9b544e 2132 if (c->flags & REDIS_SLAVE) {
6208b3a7 2133 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
2134 close(c->repldbfd);
87eca727 2135 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
2136 ln = listSearchKey(l,c);
dfc5e96c 2137 redisAssert(ln != NULL);
87eca727 2138 listDelNode(l,ln);
ed9b544e 2139 }
2140 if (c->flags & REDIS_MASTER) {
2141 server.master = NULL;
2142 server.replstate = REDIS_REPL_CONNECT;
2143 }
befec3cd 2144 /* Release memory */
93ea3759 2145 zfree(c->argv);
e8a74421 2146 zfree(c->mbargv);
6e469882 2147 freeClientMultiState(c);
ed9b544e 2148 zfree(c);
2149}
2150
cc30e368 2151#define GLUEREPLY_UP_TO (1024)
ed9b544e 2152static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 2153 int copylen = 0;
2154 char buf[GLUEREPLY_UP_TO];
6208b3a7 2155 listNode *ln;
c7df85a4 2156 listIter li;
ed9b544e 2157 robj *o;
2158
c7df85a4 2159 listRewind(c->reply,&li);
2160 while((ln = listNext(&li))) {
c28b42ac 2161 int objlen;
2162
ed9b544e 2163 o = ln->value;
c28b42ac 2164 objlen = sdslen(o->ptr);
2165 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2166 memcpy(buf+copylen,o->ptr,objlen);
2167 copylen += objlen;
ed9b544e 2168 listDelNode(c->reply,ln);
c28b42ac 2169 } else {
2170 if (copylen == 0) return;
2171 break;
ed9b544e 2172 }
ed9b544e 2173 }
c28b42ac 2174 /* Now the output buffer is empty, add the new single element */
2175 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2176 listAddNodeHead(c->reply,o);
ed9b544e 2177}
2178
2179static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2180 redisClient *c = privdata;
2181 int nwritten = 0, totwritten = 0, objlen;
2182 robj *o;
2183 REDIS_NOTUSED(el);
2184 REDIS_NOTUSED(mask);
2185
2895e862 2186 /* Use writev() if we have enough buffers to send */
7ea870c0 2187 if (!server.glueoutputbuf &&
e0a62c7f 2188 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 2189 !(c->flags & REDIS_MASTER))
2895e862 2190 {
2191 sendReplyToClientWritev(el, fd, privdata, mask);
2192 return;
2193 }
2895e862 2194
ed9b544e 2195 while(listLength(c->reply)) {
c28b42ac 2196 if (server.glueoutputbuf && listLength(c->reply) > 1)
2197 glueReplyBuffersIfNeeded(c);
2198
ed9b544e 2199 o = listNodeValue(listFirst(c->reply));
2200 objlen = sdslen(o->ptr);
2201
2202 if (objlen == 0) {
2203 listDelNode(c->reply,listFirst(c->reply));
2204 continue;
2205 }
2206
2207 if (c->flags & REDIS_MASTER) {
6f376729 2208 /* Don't reply to a master */
ed9b544e 2209 nwritten = objlen - c->sentlen;
2210 } else {
a4d1ba9a 2211 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2212 if (nwritten <= 0) break;
2213 }
2214 c->sentlen += nwritten;
2215 totwritten += nwritten;
2216 /* If we fully sent the object on head go to the next one */
2217 if (c->sentlen == objlen) {
2218 listDelNode(c->reply,listFirst(c->reply));
2219 c->sentlen = 0;
2220 }
6f376729 2221 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2222 * bytes, in a single threaded server it's a good idea to serve
6f376729 2223 * other clients as well, even if a very large request comes from
2224 * super fast link that is always able to accept data (in real world
12f9d551 2225 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2226 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2227 }
2228 if (nwritten == -1) {
2229 if (errno == EAGAIN) {
2230 nwritten = 0;
2231 } else {
f870935d 2232 redisLog(REDIS_VERBOSE,
ed9b544e 2233 "Error writing to client: %s", strerror(errno));
2234 freeClient(c);
2235 return;
2236 }
2237 }
2238 if (totwritten > 0) c->lastinteraction = time(NULL);
2239 if (listLength(c->reply) == 0) {
2240 c->sentlen = 0;
2241 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2242 }
2243}
2244
2895e862 2245static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2246{
2247 redisClient *c = privdata;
2248 int nwritten = 0, totwritten = 0, objlen, willwrite;
2249 robj *o;
2250 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2251 int offset, ion = 0;
2252 REDIS_NOTUSED(el);
2253 REDIS_NOTUSED(mask);
2254
2255 listNode *node;
2256 while (listLength(c->reply)) {
2257 offset = c->sentlen;
2258 ion = 0;
2259 willwrite = 0;
2260
2261 /* fill-in the iov[] array */
2262 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2263 o = listNodeValue(node);
2264 objlen = sdslen(o->ptr);
2265
e0a62c7f 2266 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2267 break;
2268
2269 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2270 break; /* no more iovecs */
2271
2272 iov[ion].iov_base = ((char*)o->ptr) + offset;
2273 iov[ion].iov_len = objlen - offset;
2274 willwrite += objlen - offset;
2275 offset = 0; /* just for the first item */
2276 ion++;
2277 }
2278
2279 if(willwrite == 0)
2280 break;
2281
2282 /* write all collected blocks at once */
2283 if((nwritten = writev(fd, iov, ion)) < 0) {
2284 if (errno != EAGAIN) {
f870935d 2285 redisLog(REDIS_VERBOSE,
2895e862 2286 "Error writing to client: %s", strerror(errno));
2287 freeClient(c);
2288 return;
2289 }
2290 break;
2291 }
2292
2293 totwritten += nwritten;
2294 offset = c->sentlen;
2295
2296 /* remove written robjs from c->reply */
2297 while (nwritten && listLength(c->reply)) {
2298 o = listNodeValue(listFirst(c->reply));
2299 objlen = sdslen(o->ptr);
2300
2301 if(nwritten >= objlen - offset) {
2302 listDelNode(c->reply, listFirst(c->reply));
2303 nwritten -= objlen - offset;
2304 c->sentlen = 0;
2305 } else {
2306 /* partial write */
2307 c->sentlen += nwritten;
2308 break;
2309 }
2310 offset = 0;
2311 }
2312 }
2313
e0a62c7f 2314 if (totwritten > 0)
2895e862 2315 c->lastinteraction = time(NULL);
2316
2317 if (listLength(c->reply) == 0) {
2318 c->sentlen = 0;
2319 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2320 }
2321}
2322
1a132bbc
PN
2323static int qsortRedisCommands(const void *r1, const void *r2) {
2324 return strcasecmp(
2325 ((struct redisCommand*)r1)->name,
2326 ((struct redisCommand*)r2)->name);
2327}
2328
2329static void sortCommandTable() {
1a132bbc
PN
2330 /* Copy and sort the read-only version of the command table */
2331 commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
2332 memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
d55d5c5d 2333 qsort(commandTable,
2334 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
2335 sizeof(struct redisCommand),qsortRedisCommands);
1a132bbc
PN
2336}
2337
ed9b544e 2338static struct redisCommand *lookupCommand(char *name) {
1a132bbc
PN
2339 struct redisCommand tmp = {name,NULL,0,0,NULL,0,0,0};
2340 return bsearch(
2341 &tmp,
2342 commandTable,
d55d5c5d 2343 sizeof(readonlyCommandTable)/sizeof(struct redisCommand),
1a132bbc
PN
2344 sizeof(struct redisCommand),
2345 qsortRedisCommands);
ed9b544e 2346}
2347
2348/* resetClient prepare the client to process the next command */
2349static void resetClient(redisClient *c) {
2350 freeClientArgv(c);
2351 c->bulklen = -1;
e8a74421 2352 c->multibulk = 0;
ed9b544e 2353}
2354
6e469882 2355/* Call() is the core of Redis execution of a command */
2356static void call(redisClient *c, struct redisCommand *cmd) {
2357 long long dirty;
2358
2359 dirty = server.dirty;
2360 cmd->proc(c);
4005fef1 2361 dirty = server.dirty-dirty;
2362
2363 if (server.appendonly && dirty)
6e469882 2364 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2365 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2366 listLength(server.slaves))
248ea310 2367 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2368 if (listLength(server.monitors))
dd142b9c 2369 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2370 server.stat_numcommands++;
2371}
2372
ed9b544e 2373/* If this function gets called we already read a whole
2374 * command, arguments are in the client argv/argc fields.
2375 * processCommand() executes the command or prepares the
2376 * server for a bulk read from the client.
2377 *
2378 * If 1 is returned the client is still alive and valid and
2379 * other operations can be performed by the caller. Otherwise
2380 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2381static int processCommand(redisClient *c) {
2382 struct redisCommand *cmd;
ed9b544e 2383
3fd78bcd 2384 /* Free some memory if needed (maxmemory setting) */
2385 if (server.maxmemory) freeMemoryIfNeeded();
2386
e8a74421 2387 /* Handle the multi bulk command type. This is an alternative protocol
2388 * supported by Redis in order to receive commands that are composed of
2389 * multiple binary-safe "bulk" arguments. The latency of processing is
2390 * a bit higher but this allows things like multi-sets, so if this
2391 * protocol is used only for MSET and similar commands this is a big win. */
/* A line made of a single token starting with '*' opens a multi bulk
 * request: the number after '*' is the count of bulk arguments to read. */
2392 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2393 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2394 if (c->multibulk <= 0) {
2395 resetClient(c);
2396 return 1;
2397 } else {
2398 decrRefCount(c->argv[c->argc-1]);
2399 c->argc--;
2400 return 1;
2401 }
2402 } else if (c->multibulk) {
/* bulklen == -1 means we are expecting the "$<len>" header of the next
 * bulk argument; otherwise argv[0] holds the argument payload itself. */
2403 if (c->bulklen == -1) {
2404 if (((char*)c->argv[0]->ptr)[0] != '$') {
2405 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2406 resetClient(c);
2407 return 1;
2408 } else {
2409 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2410 decrRefCount(c->argv[0]);
2411 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2412 c->argc--;
2413 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2414 resetClient(c);
2415 return 1;
2416 }
2417 c->argc--;
2418 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2419 return 1;
2420 }
2421 } else {
/* Append the argument just read to the multi bulk vector. */
2422 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2423 c->mbargv[c->mbargc] = c->argv[0];
2424 c->mbargc++;
2425 c->argc--;
2426 c->multibulk--;
2427 if (c->multibulk == 0) {
2428 robj **auxargv;
2429 int auxargc;
2430
2431 /* Here we need to swap the multi-bulk argc/argv with the
2432 * normal argc/argv of the client structure. */
2433 auxargv = c->argv;
2434 c->argv = c->mbargv;
2435 c->mbargv = auxargv;
2436
2437 auxargc = c->argc;
2438 c->argc = c->mbargc;
2439 c->mbargc = auxargc;
2440
2441 /* We need to set bulklen to something different than -1
2442 * in order for the code below to process the command without
2443 * trying to read the last argument of a bulk command as
2444 * a special argument. */
2445 c->bulklen = 0;
2446 /* continue below and process the command */
2447 } else {
/* More arguments to come: expect a new "$<len>" header next. */
2448 c->bulklen = -1;
2449 return 1;
2450 }
2451 }
2452 }
2453 /* -- end of multi bulk commands processing -- */
2454
ed9b544e 2455 /* The QUIT command is handled as a special case. Normal command
2456 * procs are unable to close the client connection safely */
bb0b03a3 2457 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2458 freeClient(c);
2459 return 0;
2460 }
d5d55fc3 2461
2462 /* Now lookup the command and check ASAP about trivial error conditions
2463 * such as wrong arity, bad command name and so forth. */
ed9b544e 2464 cmd = lookupCommand(c->argv[0]->ptr);
2465 if (!cmd) {
2c14807b 2466 addReplySds(c,
2467 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2468 (char*)c->argv[0]->ptr));
ed9b544e 2469 resetClient(c);
2470 return 1;
/* A positive arity must match argc exactly; a negative arity -N is
 * accepted as long as argc is at least N. */
2471 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2472 (c->argc < -cmd->arity)) {
454d4e43 2473 addReplySds(c,
2474 sdscatprintf(sdsempty(),
2475 "-ERR wrong number of arguments for '%s' command\r\n",
2476 cmd->name));
ed9b544e 2477 resetClient(c);
2478 return 1;
2479 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2480 /* This is a bulk command, we have to read the last argument yet. */
/* The last token of the line is the byte count of the bulk payload. */
ed9b544e 2481 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2482
2483 decrRefCount(c->argv[c->argc-1]);
2484 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2485 c->argc--;
2486 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2487 resetClient(c);
2488 return 1;
2489 }
2490 c->argc--;
2491 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2492 /* It is possible that the bulk read is already in the
8d0490e7 2493 * buffer. Check this condition and handle it accordingly.
2494 * This is just a fast path, alternative to call processInputBuffer().
2495 * It's a good idea since the code is small and this condition
2496 * happens most of the times. */
ed9b544e 2497 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2498 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2499 c->argc++;
2500 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2501 } else {
d5d55fc3 2502 /* Otherwise return... the last argument still has to be read
2503 * from the socket. */
ed9b544e 2504 return 1;
2505 }
2506 }
942a3961 2507 /* Let's try to encode the bulk object to save space. */
2508 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2509 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2510
e63943a4 2511 /* Check if the user is authenticated */
2512 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2513 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2514 resetClient(c);
2515 return 1;
2516 }
2517
b61a28fe 2518 /* Handle the maxmemory directive */
2519 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2520 zmalloc_used_memory() > server.maxmemory)
2521 {
2522 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2523 resetClient(c);
2524 return 1;
2525 }
2526
d6cc8867 2527 /* Only allow (P)SUBSCRIBE and (P)UNSUBSCRIBE in the context of Pub/Sub */
e6cca5db 2528 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2529 &&
ffc6b7f8 2530 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2531 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2532 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2533 resetClient(c);
2534 return 1;
2535 }
2536
ed9b544e 2537 /* Exec the command */
/* Inside a MULTI context every command except EXEC, DISCARD, MULTI and
 * WATCH is queued instead of being executed. */
6531c94d 2538 if (c->flags & REDIS_MULTI &&
2539 cmd->proc != execCommand && cmd->proc != discardCommand &&
2540 cmd->proc != multiCommand && cmd->proc != watchCommand)
2541 {
6e469882 2542 queueMultiCommand(c,cmd);
2543 addReply(c,shared.queued);
2544 } else {
/* NOTE(review): when blockClientOnSwappedKeys() returns true we return
 * without calling resetClient(); presumably the command is resumed later
 * once the keys are loaded -- confirm against that function. */
d5d55fc3 2545 if (server.vm_enabled && server.vm_max_threads > 0 &&
0a6f3f0f 2546 blockClientOnSwappedKeys(c,cmd)) return 1;
6e469882 2547 call(c,cmd);
2548 }
ed9b544e 2549
2550 /* Prepare the client for the next command */
ed9b544e 2551 resetClient(c);
2552 return 1;
2553}
2554
248ea310 2555static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2556 listNode *ln;
c7df85a4 2557 listIter li;
ed9b544e 2558 int outc = 0, j;
93ea3759 2559 robj **outv;
248ea310 2560 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2561 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2562 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2563 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2564 robj *lenobj;
93ea3759 2565
2566 if (argc <= REDIS_STATIC_ARGS) {
2567 outv = static_outv;
2568 } else {
248ea310 2569 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2570 }
248ea310 2571
2572 lenobj = createObject(REDIS_STRING,
2573 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2574 lenobj->refcount = 0;
2575 outv[outc++] = lenobj;
ed9b544e 2576 for (j = 0; j < argc; j++) {
248ea310 2577 lenobj = createObject(REDIS_STRING,
2578 sdscatprintf(sdsempty(),"$%lu\r\n",
2579 (unsigned long) stringObjectLen(argv[j])));
2580 lenobj->refcount = 0;
2581 outv[outc++] = lenobj;
ed9b544e 2582 outv[outc++] = argv[j];
248ea310 2583 outv[outc++] = shared.crlf;
ed9b544e 2584 }
ed9b544e 2585
40d224a9 2586 /* Increment all the refcounts at start and decrement at end in order to
2587 * be sure to free objects if there is no slave in a replication state
2588 * able to be feed with commands */
2589 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2590 listRewind(slaves,&li);
2591 while((ln = listNext(&li))) {
ed9b544e 2592 redisClient *slave = ln->value;
40d224a9 2593
2594 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2595 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2596
2597 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2598 if (slave->slaveseldb != dictid) {
2599 robj *selectcmd;
2600
2601 switch(dictid) {
2602 case 0: selectcmd = shared.select0; break;
2603 case 1: selectcmd = shared.select1; break;
2604 case 2: selectcmd = shared.select2; break;
2605 case 3: selectcmd = shared.select3; break;
2606 case 4: selectcmd = shared.select4; break;
2607 case 5: selectcmd = shared.select5; break;
2608 case 6: selectcmd = shared.select6; break;
2609 case 7: selectcmd = shared.select7; break;
2610 case 8: selectcmd = shared.select8; break;
2611 case 9: selectcmd = shared.select9; break;
2612 default:
2613 selectcmd = createObject(REDIS_STRING,
2614 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2615 selectcmd->refcount = 0;
2616 break;
2617 }
2618 addReply(slave,selectcmd);
2619 slave->slaveseldb = dictid;
2620 }
2621 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2622 }
40d224a9 2623 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2624 if (outv != static_outv) zfree(outv);
ed9b544e 2625}
2626
dd142b9c 2627static sds sdscatrepr(sds s, char *p, size_t len) {
2628 s = sdscatlen(s,"\"",1);
2629 while(len--) {
2630 switch(*p) {
2631 case '\\':
2632 case '"':
2633 s = sdscatprintf(s,"\\%c",*p);
2634 break;
2635 case '\n': s = sdscatlen(s,"\\n",1); break;
2636 case '\r': s = sdscatlen(s,"\\r",1); break;
2637 case '\t': s = sdscatlen(s,"\\t",1); break;
2638 case '\a': s = sdscatlen(s,"\\a",1); break;
2639 case '\b': s = sdscatlen(s,"\\b",1); break;
2640 default:
2641 if (isprint(*p))
2642 s = sdscatprintf(s,"%c",*p);
2643 else
2644 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2645 break;
2646 }
2647 p++;
2648 }
2649 return sdscatlen(s,"\"",1);
2650}
2651
2652static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2653 listNode *ln;
2654 listIter li;
2655 int j;
2656 sds cmdrepr = sdsnew("+");
2657 robj *cmdobj;
2658 struct timeval tv;
2659
2660 gettimeofday(&tv,NULL);
2661 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2662 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2663
2664 for (j = 0; j < argc; j++) {
2665 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2666 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2667 } else {
2668 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2669 sdslen(argv[j]->ptr));
2670 }
2671 if (j != argc-1)
2672 cmdrepr = sdscatlen(cmdrepr," ",1);
2673 }
2674 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2675 cmdobj = createObject(REDIS_STRING,cmdrepr);
2676
2677 listRewind(monitors,&li);
2678 while((ln = listNext(&li))) {
2679 redisClient *monitor = ln->value;
2680 addReply(monitor,cmdobj);
2681 }
2682 decrRefCount(cmdobj);
2683}
2684
638e42ac 2685static void processInputBuffer(redisClient *c) {
ed9b544e 2686again:
4409877e 2687 /* Before to process the input buffer, make sure the client is not
2688 * waitig for a blocking operation such as BLPOP. Note that the first
2689 * iteration the client is never blocked, otherwise the processInputBuffer
2690 * would not be called at all, but after the execution of the first commands
2691 * in the input buffer the client may be blocked, and the "goto again"
2692 * will try to reiterate. The following line will make it return asap. */
92f8e882 2693 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2694 if (c->bulklen == -1) {
2695 /* Read the first line of the query */
2696 char *p = strchr(c->querybuf,'\n');
2697 size_t querylen;
644fafa3 2698
ed9b544e 2699 if (p) {
2700 sds query, *argv;
2701 int argc, j;
e0a62c7f 2702
ed9b544e 2703 query = c->querybuf;
2704 c->querybuf = sdsempty();
2705 querylen = 1+(p-(query));
2706 if (sdslen(query) > querylen) {
2707 /* leave data after the first line of the query in the buffer */
2708 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2709 }
2710 *p = '\0'; /* remove "\n" */
2711 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2712 sdsupdatelen(query);
2713
2714 /* Now we can split the query in arguments */
ed9b544e 2715 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2716 sdsfree(query);
2717
2718 if (c->argv) zfree(c->argv);
2719 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2720
2721 for (j = 0; j < argc; j++) {
ed9b544e 2722 if (sdslen(argv[j])) {
2723 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2724 c->argc++;
2725 } else {
2726 sdsfree(argv[j]);
2727 }
2728 }
2729 zfree(argv);
7c49733c 2730 if (c->argc) {
2731 /* Execute the command. If the client is still valid
2732 * after processCommand() return and there is something
2733 * on the query buffer try to process the next command. */
2734 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2735 } else {
2736 /* Nothing to process, argc == 0. Just process the query
2737 * buffer if it's not empty or return to the caller */
2738 if (sdslen(c->querybuf)) goto again;
2739 }
ed9b544e 2740 return;
644fafa3 2741 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2742 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2743 freeClient(c);
2744 return;
2745 }
2746 } else {
2747 /* Bulk read handling. Note that if we are at this point
2748 the client already sent a command terminated with a newline,
2749 we are reading the bulk data that is actually the last
2750 argument of the command. */
2751 int qbl = sdslen(c->querybuf);
2752
2753 if (c->bulklen <= qbl) {
2754 /* Copy everything but the final CRLF as final argument */
2755 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2756 c->argc++;
2757 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2758 /* Process the command. If the client is still valid after
2759 * the processing and there is more data in the buffer
2760 * try to parse it. */
2761 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2762 return;
2763 }
2764 }
2765}
2766
638e42ac 2767static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2768 redisClient *c = (redisClient*) privdata;
2769 char buf[REDIS_IOBUF_LEN];
2770 int nread;
2771 REDIS_NOTUSED(el);
2772 REDIS_NOTUSED(mask);
2773
2774 nread = read(fd, buf, REDIS_IOBUF_LEN);
2775 if (nread == -1) {
2776 if (errno == EAGAIN) {
2777 nread = 0;
2778 } else {
f870935d 2779 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2780 freeClient(c);
2781 return;
2782 }
2783 } else if (nread == 0) {
f870935d 2784 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2785 freeClient(c);
2786 return;
2787 }
2788 if (nread) {
2789 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2790 c->lastinteraction = time(NULL);
2791 } else {
2792 return;
2793 }
168ac5c6 2794 processInputBuffer(c);
638e42ac 2795}
2796
ed9b544e 2797static int selectDb(redisClient *c, int id) {
2798 if (id < 0 || id >= server.dbnum)
2799 return REDIS_ERR;
3305306f 2800 c->db = &server.db[id];
ed9b544e 2801 return REDIS_OK;
2802}
2803
40d224a9 2804static void *dupClientReplyValue(void *o) {
2805 incrRefCount((robj*)o);
12d090d2 2806 return o;
40d224a9 2807}
2808
/* Match method for lists of string objects: returns the result of
 * equalStringObjects() on the two values. */
static int listMatchObjects(void *a, void *b) {
    int same = equalStringObjects(a,b);

    return same;
}
2812
ed9b544e 2813static redisClient *createClient(int fd) {
2814 redisClient *c = zmalloc(sizeof(*c));
2815
2816 anetNonBlock(NULL,fd);
2817 anetTcpNoDelay(NULL,fd);
2818 if (!c) return NULL;
2819 selectDb(c,0);
2820 c->fd = fd;
2821 c->querybuf = sdsempty();
2822 c->argc = 0;
93ea3759 2823 c->argv = NULL;
ed9b544e 2824 c->bulklen = -1;
e8a74421 2825 c->multibulk = 0;
2826 c->mbargc = 0;
2827 c->mbargv = NULL;
ed9b544e 2828 c->sentlen = 0;
2829 c->flags = 0;
2830 c->lastinteraction = time(NULL);
abcb223e 2831 c->authenticated = 0;
40d224a9 2832 c->replstate = REDIS_REPL_NONE;
6b47e12e 2833 c->reply = listCreate();
ed9b544e 2834 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2835 listSetDupMethod(c->reply,dupClientReplyValue);
37ab76c9 2836 c->blocking_keys = NULL;
2837 c->blocking_keys_num = 0;
92f8e882 2838 c->io_keys = listCreate();
87c68815 2839 c->watched_keys = listCreate();
92f8e882 2840 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2841 c->pubsub_channels = dictCreate(&setDictType,NULL);
2842 c->pubsub_patterns = listCreate();
2843 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2844 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2845 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2846 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2847 freeClient(c);
2848 return NULL;
2849 }
6b47e12e 2850 listAddNodeTail(server.clients,c);
6e469882 2851 initClientMultiState(c);
ed9b544e 2852 return c;
2853}
2854
2855static void addReply(redisClient *c, robj *obj) {
2856 if (listLength(c->reply) == 0 &&
6208b3a7 2857 (c->replstate == REDIS_REPL_NONE ||
2858 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2859 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2860 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2861
2862 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2863 obj = dupStringObject(obj);
2864 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2865 }
9d65a1bb 2866 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2867}
2868
2869static void addReplySds(redisClient *c, sds s) {
2870 robj *o = createObject(REDIS_STRING,s);
2871 addReply(c,o);
2872 decrRefCount(o);
2873}
2874
e2665397 2875static void addReplyDouble(redisClient *c, double d) {
2876 char buf[128];
2877
2878 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2879 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2880 (unsigned long) strlen(buf),buf));
e2665397 2881}
2882
aa7c2934
PN
2883static void addReplyLongLong(redisClient *c, long long ll) {
2884 char buf[128];
2885 size_t len;
2886
2887 if (ll == 0) {
2888 addReply(c,shared.czero);
2889 return;
2890 } else if (ll == 1) {
2891 addReply(c,shared.cone);
2892 return;
2893 }
482b672d 2894 buf[0] = ':';
2895 len = ll2string(buf+1,sizeof(buf)-1,ll);
2896 buf[len+1] = '\r';
2897 buf[len+2] = '\n';
2898 addReplySds(c,sdsnewlen(buf,len+3));
aa7c2934
PN
2899}
2900
92b27fe9 2901static void addReplyUlong(redisClient *c, unsigned long ul) {
2902 char buf[128];
2903 size_t len;
2904
dd88747b 2905 if (ul == 0) {
2906 addReply(c,shared.czero);
2907 return;
2908 } else if (ul == 1) {
2909 addReply(c,shared.cone);
2910 return;
2911 }
92b27fe9 2912 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2913 addReplySds(c,sdsnewlen(buf,len));
2914}
2915
942a3961 2916static void addReplyBulkLen(redisClient *c, robj *obj) {
482b672d 2917 size_t len, intlen;
2918 char buf[128];
942a3961 2919
2920 if (obj->encoding == REDIS_ENCODING_RAW) {
2921 len = sdslen(obj->ptr);
2922 } else {
2923 long n = (long)obj->ptr;
2924
e054afda 2925 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2926 len = 1;
2927 if (n < 0) {
2928 len++;
2929 n = -n;
2930 }
2931 while((n = n/10) != 0) {
2932 len++;
2933 }
2934 }
482b672d 2935 buf[0] = '$';
2936 intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
2937 buf[intlen+1] = '\r';
2938 buf[intlen+2] = '\n';
2939 addReplySds(c,sdsnewlen(buf,intlen+3));
942a3961 2940}
2941
dd88747b 2942static void addReplyBulk(redisClient *c, robj *obj) {
2943 addReplyBulkLen(c,obj);
2944 addReply(c,obj);
2945 addReply(c,shared.crlf);
2946}
2947
09241813 2948static void addReplyBulkSds(redisClient *c, sds s) {
2949 robj *o = createStringObject(s, sdslen(s));
2950 addReplyBulk(c,o);
2951 decrRefCount(o);
2952}
2953
500ece7c 2954/* In the CONFIG command we need to add vanilla C string as bulk replies */
2955static void addReplyBulkCString(redisClient *c, char *s) {
2956 if (s == NULL) {
2957 addReply(c,shared.nullbulk);
2958 } else {
2959 robj *o = createStringObject(s,strlen(s));
2960 addReplyBulk(c,o);
2961 decrRefCount(o);
2962 }
2963}
2964
ed9b544e 2965static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2966 int cport, cfd;
2967 char cip[128];
285add55 2968 redisClient *c;
ed9b544e 2969 REDIS_NOTUSED(el);
2970 REDIS_NOTUSED(mask);
2971 REDIS_NOTUSED(privdata);
2972
2973 cfd = anetAccept(server.neterr, fd, cip, &cport);
2974 if (cfd == AE_ERR) {
f870935d 2975 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2976 return;
2977 }
f870935d 2978 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2979 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2980 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2981 close(cfd); /* May be already closed, just ingore errors */
2982 return;
2983 }
285add55 2984 /* If maxclient directive is set and this is one client more... close the
2985 * connection. Note that we create the client instead to check before
2986 * for this condition, since now the socket is already set in nonblocking
2987 * mode and we can send an error for free using the Kernel I/O */
2988 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2989 char *err = "-ERR max number of clients reached\r\n";
2990
2991 /* That's a best effort error message, don't check write errors */
fee803ba 2992 if (write(c->fd,err,strlen(err)) == -1) {
2993 /* Nothing to do, Just to avoid the warning... */
2994 }
285add55 2995 freeClient(c);
2996 return;
2997 }
ed9b544e 2998 server.stat_numconnections++;
2999}
3000
3001/* ======================= Redis objects implementation ===================== */
3002
3003static robj *createObject(int type, void *ptr) {
3004 robj *o;
3005
a5819310 3006 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 3007 if (listLength(server.objfreelist)) {
3008 listNode *head = listFirst(server.objfreelist);
3009 o = listNodeValue(head);
3010 listDelNode(server.objfreelist,head);
a5819310 3011 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3012 } else {
560db612 3013 if (server.vm_enabled)
a5819310 3014 pthread_mutex_unlock(&server.obj_freelist_mutex);
560db612 3015 o = zmalloc(sizeof(*o));
ed9b544e 3016 }
ed9b544e 3017 o->type = type;
942a3961 3018 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 3019 o->ptr = ptr;
3020 o->refcount = 1;
3a66edc7 3021 if (server.vm_enabled) {
1064ef87 3022 /* Note that this code may run in the context of an I/O thread
560db612 3023 * and accessing server.lruclock in theory is an error
1064ef87 3024 * (no locks). But in practice this is safe, and even if we read
560db612 3025 * garbage Redis will not fail. */
3026 o->lru = server.lruclock;
3a66edc7 3027 o->storage = REDIS_VM_MEMORY;
3028 }
ed9b544e 3029 return o;
3030}
3031
3032static robj *createStringObject(char *ptr, size_t len) {
3033 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
3034}
3035
3f973463
PN
3036static robj *createStringObjectFromLongLong(long long value) {
3037 robj *o;
3038 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3039 incrRefCount(shared.integers[value]);
3040 o = shared.integers[value];
3041 } else {
3f973463 3042 if (value >= LONG_MIN && value <= LONG_MAX) {
10dea8dc 3043 o = createObject(REDIS_STRING, NULL);
3f973463
PN
3044 o->encoding = REDIS_ENCODING_INT;
3045 o->ptr = (void*)((long)value);
3046 } else {
ee14da56 3047 o = createObject(REDIS_STRING,sdsfromlonglong(value));
3f973463
PN
3048 }
3049 }
3050 return o;
3051}
3052
4ef8de8a 3053static robj *dupStringObject(robj *o) {
b9bc0eef 3054 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 3055 return createStringObject(o->ptr,sdslen(o->ptr));
3056}
3057
ed9b544e 3058static robj *createListObject(void) {
3059 list *l = listCreate();
1cd92e7f 3060 robj *o = createObject(REDIS_LIST,l);
ed9b544e 3061 listSetFreeMethod(l,decrRefCount);
1cd92e7f
PN
3062 o->encoding = REDIS_ENCODING_LIST;
3063 return o;
3064}
3065
3066static robj *createZiplistObject(void) {
3067 unsigned char *zl = ziplistNew();
3068 robj *o = createObject(REDIS_LIST,zl);
3069 o->encoding = REDIS_ENCODING_ZIPLIST;
3070 return o;
ed9b544e 3071}
3072
3073static robj *createSetObject(void) {
3074 dict *d = dictCreate(&setDictType,NULL);
35cabcb5
PN
3075 robj *o = createObject(REDIS_SET,d);
3076 o->encoding = REDIS_ENCODING_HT;
3077 return o;
ed9b544e 3078}
3079
d0b58d53
PN
3080static robj *createIntsetObject(void) {
3081 intset *is = intsetNew();
3082 robj *o = createObject(REDIS_SET,is);
3083 o->encoding = REDIS_ENCODING_INTSET;
3084 return o;
3085}
3086
5234952b 3087static robj *createHashObject(void) {
3088 /* All the Hashes start as zipmaps. Will be automatically converted
3089 * into hash tables if there are enough elements or big elements
3090 * inside. */
3091 unsigned char *zm = zipmapNew();
3092 robj *o = createObject(REDIS_HASH,zm);
3093 o->encoding = REDIS_ENCODING_ZIPMAP;
3094 return o;
3095}
3096
1812e024 3097static robj *createZsetObject(void) {
6b47e12e 3098 zset *zs = zmalloc(sizeof(*zs));
3099
3100 zs->dict = dictCreate(&zsetDictType,NULL);
3101 zs->zsl = zslCreate();
3102 return createObject(REDIS_ZSET,zs);
1812e024 3103}
3104
ed9b544e 3105static void freeStringObject(robj *o) {
942a3961 3106 if (o->encoding == REDIS_ENCODING_RAW) {
3107 sdsfree(o->ptr);
3108 }
ed9b544e 3109}
3110
3111static void freeListObject(robj *o) {
c7d9d662
PN
3112 switch (o->encoding) {
3113 case REDIS_ENCODING_LIST:
3114 listRelease((list*) o->ptr);
3115 break;
3116 case REDIS_ENCODING_ZIPLIST:
3117 zfree(o->ptr);
3118 break;
3119 default:
3120 redisPanic("Unknown list encoding type");
3121 }
ed9b544e 3122}
3123
3124static void freeSetObject(robj *o) {
d0b58d53
PN
3125 switch (o->encoding) {
3126 case REDIS_ENCODING_HT:
3127 dictRelease((dict*) o->ptr);
3128 break;
3129 case REDIS_ENCODING_INTSET:
3130 zfree(o->ptr);
3131 break;
3132 default:
3133 redisPanic("Unknown set encoding type");
3134 }
ed9b544e 3135}
3136
fd8ccf44 3137static void freeZsetObject(robj *o) {
3138 zset *zs = o->ptr;
3139
3140 dictRelease(zs->dict);
3141 zslFree(zs->zsl);
3142 zfree(zs);
3143}
3144
ed9b544e 3145static void freeHashObject(robj *o) {
cbba7dd7 3146 switch (o->encoding) {
3147 case REDIS_ENCODING_HT:
3148 dictRelease((dict*) o->ptr);
3149 break;
3150 case REDIS_ENCODING_ZIPMAP:
3151 zfree(o->ptr);
3152 break;
3153 default:
f83c6cb5 3154 redisPanic("Unknown hash encoding type");
cbba7dd7 3155 break;
3156 }
ed9b544e 3157}
3158
3159static void incrRefCount(robj *o) {
3160 o->refcount++;
3161}
3162
3163static void decrRefCount(void *obj) {
3164 robj *o = obj;
94754ccc 3165
560db612 3166 /* Object is a swapped out value, or in the process of being loaded. */
996cb5f7 3167 if (server.vm_enabled &&
3168 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
3169 {
560db612 3170 vmpointer *vp = obj;
3171 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o);
3172 vmMarkPagesFree(vp->page,vp->usedpages);
7d98e08c 3173 server.vm_stats_swapped_objects--;
560db612 3174 zfree(vp);
a35ddf12 3175 return;
3176 }
560db612 3177
3178 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
e4ed181d 3179 /* Object is in memory, or in the process of being swapped out.
3180 *
3181 * If the object is being swapped out, abort the operation on
3182 * decrRefCount even if the refcount does not drop to 0: the object
3183 * is referenced at least two times, as value of the key AND as
3184 * job->val in the iojob. So if we don't invalidate the iojob, when it is
3185 * done but the relevant key was removed in the meantime, the
3186 * complete jobs handler will not find the key about the job and the
3187 * assert will fail. */
3188 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
3189 vmCancelThreadedIOJob(o);
ed9b544e 3190 if (--(o->refcount) == 0) {
3191 switch(o->type) {
3192 case REDIS_STRING: freeStringObject(o); break;
3193 case REDIS_LIST: freeListObject(o); break;
3194 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 3195 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 3196 case REDIS_HASH: freeHashObject(o); break;
f83c6cb5 3197 default: redisPanic("Unknown object type"); break;
ed9b544e 3198 }
a5819310 3199 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 3200 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3201 !listAddNodeHead(server.objfreelist,o))
3202 zfree(o);
a5819310 3203 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3204 }
3205}
3206
92b27fe9 3207static int checkType(redisClient *c, robj *o, int type) {
3208 if (o->type != type) {
3209 addReply(c,shared.wrongtypeerr);
3210 return 1;
3211 }
3212 return 0;
3213}
3214
724a51b1 3215/* Check if the nul-terminated string 's' can be represented by a long
3216 * (that is, is a number that fits into long without any other space or
3217 * character before or after the digits).
3218 *
3219 * If so, the function returns REDIS_OK and *longval is set to the value
3220 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 3221static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 3222 char buf[32], *endptr;
3223 long value;
3224 int slen;
e0a62c7f 3225
724a51b1 3226 value = strtol(s, &endptr, 10);
3227 if (endptr[0] != '\0') return REDIS_ERR;
ee14da56 3228 slen = ll2string(buf,32,value);
724a51b1 3229
3230 /* If the number converted back into a string is not identical
3231 * then it's not possible to encode the string as integer */
f69f2cba 3232 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 3233 if (longval) *longval = value;
3234 return REDIS_OK;
3235}
3236
942a3961 3237/* Try to encode a string object in order to save space */
05df7621 3238static robj *tryObjectEncoding(robj *o) {
942a3961 3239 long value;
942a3961 3240 sds s = o->ptr;
3305306f 3241
942a3961 3242 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 3243 return o; /* Already encoded */
3305306f 3244
05df7621 3245 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 3246 * everywhere in the "object space" of Redis. Encoded objects can only
3247 * appear as "values" (and not, for instance, as keys) */
05df7621 3248 if (o->refcount > 1) return o;
3305306f 3249
942a3961 3250 /* Currently we try to encode only strings */
dfc5e96c 3251 redisAssert(o->type == REDIS_STRING);
94754ccc 3252
724a51b1 3253 /* Check if we can represent this string as a long integer */
05df7621 3254 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 3255
3256 /* Ok, this object can be encoded */
05df7621 3257 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3258 decrRefCount(o);
3259 incrRefCount(shared.integers[value]);
3260 return shared.integers[value];
3261 } else {
3262 o->encoding = REDIS_ENCODING_INT;
3263 sdsfree(o->ptr);
3264 o->ptr = (void*) value;
3265 return o;
3266 }
942a3961 3267}
3268
9d65a1bb 3269/* Get a decoded version of an encoded object (returned as a new object).
3270 * If the object is already raw-encoded just increment the ref count. */
3271static robj *getDecodedObject(robj *o) {
942a3961 3272 robj *dec;
e0a62c7f 3273
9d65a1bb 3274 if (o->encoding == REDIS_ENCODING_RAW) {
3275 incrRefCount(o);
3276 return o;
3277 }
942a3961 3278 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3279 char buf[32];
3280
ee14da56 3281 ll2string(buf,32,(long)o->ptr);
942a3961 3282 dec = createStringObject(buf,strlen(buf));
3283 return dec;
3284 } else {
08ee9b57 3285 redisPanic("Unknown encoding type");
942a3961 3286 }
3305306f 3287}
3288
d7f43c08 3289/* Compare two string objects via strcmp() or alike.
3290 * Note that the objects may be integer-encoded. In such a case we
ee14da56 3291 * use ll2string() to get a string representation of the numbers on the stack
1fd9bc8a 3292 * and compare the strings, it's much faster than calling getDecodedObject().
3293 *
3294 * Important note: if objects are not integer encoded, but binary-safe strings,
3295 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3296 * binary safe. */
724a51b1 3297static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3298 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3299 char bufa[128], bufb[128], *astr, *bstr;
3300 int bothsds = 1;
724a51b1 3301
e197b441 3302 if (a == b) return 0;
d7f43c08 3303 if (a->encoding != REDIS_ENCODING_RAW) {
ee14da56 3304 ll2string(bufa,sizeof(bufa),(long) a->ptr);
d7f43c08 3305 astr = bufa;
3306 bothsds = 0;
724a51b1 3307 } else {
d7f43c08 3308 astr = a->ptr;
724a51b1 3309 }
d7f43c08 3310 if (b->encoding != REDIS_ENCODING_RAW) {
ee14da56 3311 ll2string(bufb,sizeof(bufb),(long) b->ptr);
d7f43c08 3312 bstr = bufb;
3313 bothsds = 0;
3314 } else {
3315 bstr = b->ptr;
3316 }
3317 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3318}
3319
bf028098 3320/* Equal string objects return 1 if the two objects are the same from the
3321 * point of view of a string comparison, otherwise 0 is returned. Note that
3322 * this function is faster then checking for (compareStringObject(a,b) == 0)
3323 * because it can perform some more optimization. */
3324static int equalStringObjects(robj *a, robj *b) {
3325 if (a->encoding != REDIS_ENCODING_RAW && b->encoding != REDIS_ENCODING_RAW){
3326 return a->ptr == b->ptr;
3327 } else {
3328 return compareStringObjects(a,b) == 0;
3329 }
3330}
3331
0ea663ea 3332static size_t stringObjectLen(robj *o) {
dfc5e96c 3333 redisAssert(o->type == REDIS_STRING);
0ea663ea 3334 if (o->encoding == REDIS_ENCODING_RAW) {
3335 return sdslen(o->ptr);
3336 } else {
3337 char buf[32];
3338
ee14da56 3339 return ll2string(buf,32,(long)o->ptr);
0ea663ea 3340 }
3341}
3342
bd79a6bd
PN
3343static int getDoubleFromObject(robj *o, double *target) {
3344 double value;
682c73e8 3345 char *eptr;
bbe025e0 3346
bd79a6bd
PN
3347 if (o == NULL) {
3348 value = 0;
3349 } else {
3350 redisAssert(o->type == REDIS_STRING);
3351 if (o->encoding == REDIS_ENCODING_RAW) {
3352 value = strtod(o->ptr, &eptr);
682c73e8 3353 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3354 } else if (o->encoding == REDIS_ENCODING_INT) {
3355 value = (long)o->ptr;
3356 } else {
946342c1 3357 redisPanic("Unknown string encoding");
bd79a6bd
PN
3358 }
3359 }
3360
bd79a6bd
PN
3361 *target = value;
3362 return REDIS_OK;
3363}
bbe025e0 3364
bd79a6bd
PN
3365static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3366 double value;
3367 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3368 if (msg != NULL) {
3369 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3370 } else {
3371 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3372 }
bbe025e0
AM
3373 return REDIS_ERR;
3374 }
3375
bd79a6bd 3376 *target = value;
bbe025e0
AM
3377 return REDIS_OK;
3378}
3379
bd79a6bd
PN
3380static int getLongLongFromObject(robj *o, long long *target) {
3381 long long value;
682c73e8 3382 char *eptr;
bbe025e0 3383
bd79a6bd
PN
3384 if (o == NULL) {
3385 value = 0;
3386 } else {
3387 redisAssert(o->type == REDIS_STRING);
3388 if (o->encoding == REDIS_ENCODING_RAW) {
3389 value = strtoll(o->ptr, &eptr, 10);
682c73e8 3390 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3391 } else if (o->encoding == REDIS_ENCODING_INT) {
3392 value = (long)o->ptr;
3393 } else {
946342c1 3394 redisPanic("Unknown string encoding");
bd79a6bd
PN
3395 }
3396 }
3397
d0b58d53 3398 if (target) *target = value;
bd79a6bd
PN
3399 return REDIS_OK;
3400}
bbe025e0 3401
bd79a6bd
PN
3402static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3403 long long value;
3404 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3405 if (msg != NULL) {
3406 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3407 } else {
3408 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3409 }
bbe025e0
AM
3410 return REDIS_ERR;
3411 }
3412
bd79a6bd 3413 *target = value;
bbe025e0
AM
3414 return REDIS_OK;
3415}
3416
bd79a6bd
PN
3417static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3418 long long value;
bbe025e0 3419
bd79a6bd
PN
3420 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3421 if (value < LONG_MIN || value > LONG_MAX) {
3422 if (msg != NULL) {
3423 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3424 } else {
3425 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3426 }
bbe025e0
AM
3427 return REDIS_ERR;
3428 }
3429
bd79a6bd 3430 *target = value;
bbe025e0
AM
3431 return REDIS_OK;
3432}
3433
612e4de8 3434/* =========================== Keyspace access API ========================== */
3435
3436static robj *lookupKey(redisDb *db, robj *key) {
09241813 3437 dictEntry *de = dictFind(db->dict,key->ptr);
612e4de8 3438 if (de) {
612e4de8 3439 robj *val = dictGetEntryVal(de);
3440
3441 if (server.vm_enabled) {
3442 if (val->storage == REDIS_VM_MEMORY ||
3443 val->storage == REDIS_VM_SWAPPING)
3444 {
3445 /* If we were swapping the object out, cancel the operation */
3446 if (val->storage == REDIS_VM_SWAPPING)
3447 vmCancelThreadedIOJob(val);
09241813 3448 /* Update the access time for the aging algorithm. */
612e4de8 3449 val->lru = server.lruclock;
3450 } else {
3451 int notify = (val->storage == REDIS_VM_LOADING);
3452
3453 /* Our value was swapped on disk. Bring it at home. */
3454 redisAssert(val->type == REDIS_VMPOINTER);
3455 val = vmLoadObject(val);
3456 dictGetEntryVal(de) = val;
3457
3458 /* Clients blocked by the VM subsystem may be waiting for
3459 * this key... */
3460 if (notify) handleClientsBlockedOnSwappedKey(db,key);
3461 }
3462 }
3463 return val;
3464 } else {
3465 return NULL;
3466 }
3467}
3468
3469static robj *lookupKeyRead(redisDb *db, robj *key) {
3470 expireIfNeeded(db,key);
3471 return lookupKey(db,key);
3472}
3473
3474static robj *lookupKeyWrite(redisDb *db, robj *key) {
3475 deleteIfVolatile(db,key);
3476 touchWatchedKey(db,key);
3477 return lookupKey(db,key);
3478}
3479
3480static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3481 robj *o = lookupKeyRead(c->db, key);
3482 if (!o) addReply(c,reply);
3483 return o;
3484}
3485
3486static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3487 robj *o = lookupKeyWrite(c->db, key);
3488 if (!o) addReply(c,reply);
3489 return o;
3490}
3491
09241813 3492/* Add the key to the DB. If the key already exists REDIS_ERR is returned,
3493 * otherwise REDIS_OK is returned, and the caller should increment the
3494 * refcount of 'val'. */
3495static int dbAdd(redisDb *db, robj *key, robj *val) {
3496 /* Perform a lookup before adding the key, as we need to copy the
3497 * key value. */
3498 if (dictFind(db->dict, key->ptr) != NULL) {
3499 return REDIS_ERR;
3500 } else {
3501 sds copy = sdsdup(key->ptr);
3502 dictAdd(db->dict, copy, val);
3503 return REDIS_OK;
3504 }
3505}
3506
3507/* If the key does not exist, this is just like dbAdd(). Otherwise
3508 * the value associated to the key is replaced with the new one.
3509 *
3510 * On update (key already existed) 0 is returned. Otherwise 1. */
3511static int dbReplace(redisDb *db, robj *key, robj *val) {
3512 if (dictFind(db->dict,key->ptr) == NULL) {
3513 sds copy = sdsdup(key->ptr);
3514 dictAdd(db->dict, copy, val);
3515 return 1;
3516 } else {
3517 dictReplace(db->dict, key->ptr, val);
3518 return 0;
3519 }
3520}
3521
3522static int dbExists(redisDb *db, robj *key) {
3523 return dictFind(db->dict,key->ptr) != NULL;
3524}
3525
3526/* Return a random key, in form of a Redis object.
3527 * If there are no keys, NULL is returned.
3528 *
3529 * The function makes sure to return keys not already expired. */
3530static robj *dbRandomKey(redisDb *db) {
3531 struct dictEntry *de;
3532
3533 while(1) {
3534 sds key;
3535 robj *keyobj;
3536
3537 de = dictGetRandomKey(db->dict);
3538 if (de == NULL) return NULL;
3539
3540 key = dictGetEntryKey(de);
3541 keyobj = createStringObject(key,sdslen(key));
3542 if (dictFind(db->expires,key)) {
3543 if (expireIfNeeded(db,keyobj)) {
3544 decrRefCount(keyobj);
3545 continue; /* search for another key. This expired. */
3546 }
3547 }
3548 return keyobj;
3549 }
3550}
3551
3552/* Delete a key, value, and associated expiration entry if any, from the DB */
3553static int dbDelete(redisDb *db, robj *key) {
612e4de8 3554 int retval;
3555
09241813 3556 if (dictSize(db->expires)) dictDelete(db->expires,key->ptr);
3557 retval = dictDelete(db->dict,key->ptr);
612e4de8 3558
3559 return retval == DICT_OK;
3560}
3561
06233c45 3562/*============================ RDB saving/loading =========================== */
ed9b544e 3563
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 on error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3568
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte order
 * of the host. Returns 0 on success, -1 on error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t t32 = (int32_t) t;

    return (fwrite(&t32,4,1,fp) == 0) ? -1 : 0;
}
3574
e3566d4b 3575/* check rdbLoadLen() comments for more info */
f78fd11b 3576static int rdbSaveLen(FILE *fp, uint32_t len) {
3577 unsigned char buf[2];
3578
3579 if (len < (1<<6)) {
3580 /* Save a 6 bit len */
10c43610 3581 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3582 if (fwrite(buf,1,1,fp) == 0) return -1;
3583 } else if (len < (1<<14)) {
3584 /* Save a 14 bit len */
10c43610 3585 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3586 buf[1] = len&0xFF;
17be1a4a 3587 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3588 } else {
3589 /* Save a 32 bit len */
10c43610 3590 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3591 if (fwrite(buf,1,1,fp) == 0) return -1;
3592 len = htonl(len);
3593 if (fwrite(&len,4,1,fp) == 0) return -1;
3594 }
3595 return 0;
3596}
3597
32a66513 3598/* Encode 'value' as an integer if possible (if integer will fit the
3599 * supported range). If the function sucessful encoded the integer
3600 * then the (up to 5 bytes) encoded representation is written in the
3601 * string pointed by 'enc' and the length is returned. Otherwise
3602 * 0 is returned. */
3603static int rdbEncodeInteger(long long value, unsigned char *enc) {
e3566d4b 3604 /* Finally check if it fits in our ranges */
3605 if (value >= -(1<<7) && value <= (1<<7)-1) {
3606 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3607 enc[1] = value&0xFF;
3608 return 2;
3609 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3610 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3611 enc[1] = value&0xFF;
3612 enc[2] = (value>>8)&0xFF;
3613 return 3;
3614 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3615 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3616 enc[1] = value&0xFF;
3617 enc[2] = (value>>8)&0xFF;
3618 enc[3] = (value>>16)&0xFF;
3619 enc[4] = (value>>24)&0xFF;
3620 return 5;
3621 } else {
3622 return 0;
3623 }
3624}
3625
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes that are NOT required to be nul-terminated
 * (rdbSaveRawString() passes buffers coming from ziplists and zipmaps), so
 * the bytes are first copied into a local, terminated buffer before being
 * handed to strtoll().
 *
 * Returns the length of the encoding written at 'enc', or 0 if the string
 * can't be encoded as an integer. */
static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, buf[32], copy[32];

    /* An empty string is not a number, and a string that does not fit in
     * 'copy' is longer than any printed long long: neither can match. */
    if (len == 0 || len >= sizeof(copy)) return 0;
    memcpy(copy,s,len);
    copy[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(copy, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
3644
b1befe6a 3645static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3646 size_t comprlen, outlen;
774e3047 3647 unsigned char byte;
3648 void *out;
3649
3650 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3651 if (len <= 4) return 0;
3652 outlen = len-4;
3a2694c4 3653 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3654 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3655 if (comprlen == 0) {
88e85998 3656 zfree(out);
774e3047 3657 return 0;
3658 }
3659 /* Data compressed! Let's save it on disk */
3660 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3661 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3662 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3663 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3664 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3665 zfree(out);
774e3047 3666 return comprlen;
3667
3668writeerr:
88e85998 3669 zfree(out);
774e3047 3670 return -1;
3671}
3672
e3566d4b 3673/* Save a string objet as [len][data] on disk. If the object is a string
3674 * representation of an integer value we try to safe it in a special form */
b1befe6a 3675static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3676 int enclen;
10c43610 3677
774e3047 3678 /* Try integer encoding */
e3566d4b 3679 if (len <= 11) {
3680 unsigned char buf[5];
b1befe6a 3681 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3682 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3683 return 0;
3684 }
3685 }
774e3047 3686
3687 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3688 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3689 if (server.rdbcompression && len > 20) {
774e3047 3690 int retval;
3691
b1befe6a 3692 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3693 if (retval == -1) return -1;
3694 if (retval > 0) return 0;
3695 /* retval == 0 means data can't be compressed, save the old way */
3696 }
3697
3698 /* Store verbatim */
10c43610 3699 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3700 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3701 return 0;
3702}
3703
/* Save a long long value as either an encoded integer, when it fits one of
 * the ranges handled by rdbEncodeInteger(), or as a length-prefixed string
 * of digits. Returns 0 on success, -1 on write error. */
static int rdbSaveLongLongAsStringObject(FILE *fp, long long value) {
    unsigned char buf[32];
    int enclen = rdbEncodeInteger(value,buf);

    if (enclen <= 0) {
        /* Encode as string: the payload is preceded by its length */
        enclen = ll2string((char*)buf,32,value);
        redisAssert(enclen < 32);
        if (rdbSaveLen(fp,enclen) == -1) return -1;
    }

    /* In both cases 'buf' now holds 'enclen' bytes to write */
    if (fwrite(buf,enclen,1,fp) == 0) return -1;
    return 0;
}
3719
942a3961 3720/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3721static int rdbSaveStringObject(FILE *fp, robj *obj) {
32a66513 3722 /* Avoid to decode the object, then encode it again, if the
3723 * object is alrady integer encoded. */
3724 if (obj->encoding == REDIS_ENCODING_INT) {
2796f6da 3725 return rdbSaveLongLongAsStringObject(fp,(long)obj->ptr);
996cb5f7 3726 } else {
2796f6da
PN
3727 redisAssert(obj->encoding == REDIS_ENCODING_RAW);
3728 return rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3729 }
942a3961 3730}
3731
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error.
 */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len;

    if (isnan(val)) {
        buf[0] = 253;
        len = 1;
    } else if (!isfinite(val)) {
        len = 1;
        buf[0] = (val < 0) ? 255 : 254;
    } else {
#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
        /* Check if the float is in a safe range to be casted into a
         * long long. We are assuming that long long is 64 bit here.
         * Also we are assuming that there are no implementations around where
         * double has precision < 52 bit.
         *
         * Under this assumptions we test if a double is inside an interval
         * where casting to long long is safe. Then using two castings we
         * make sure the decimal part is zero. If all this is true we use
         * integer printing function that is much faster. */
        double min = -4503599627370495; /* -((2^52)-1) */
        double max = 4503599627370496; /* 2^52 */
        if (val > min && val < max && val == ((double)((long long)val)))
            /* The digits start at buf+1, so one byte less is available */
            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
        else
#endif
            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        /* buf[0] is the length prefix of the string that follows it */
        buf[0] = strlen((char*)buf+1);
        len = buf[0]+1;
    }
    if (fwrite(buf,len,1,fp) == 0) return -1;
    return 0;
}
3774
06233c45 3775/* Save a Redis object. */
3776static int rdbSaveObject(FILE *fp, robj *o) {
3777 if (o->type == REDIS_STRING) {
3778 /* Save a string value */
3779 if (rdbSaveStringObject(fp,o) == -1) return -1;
3780 } else if (o->type == REDIS_LIST) {
3781 /* Save a list value */
23f96494
PN
3782 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
3783 unsigned char *p;
3784 unsigned char *vstr;
3785 unsigned int vlen;
3786 long long vlong;
3787
3788 if (rdbSaveLen(fp,ziplistLen(o->ptr)) == -1) return -1;
3789 p = ziplistIndex(o->ptr,0);
3790 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
3791 if (vstr) {
3792 if (rdbSaveRawString(fp,vstr,vlen) == -1)
3793 return -1;
3794 } else {
3795 if (rdbSaveLongLongAsStringObject(fp,vlong) == -1)
3796 return -1;
3797 }
3798 p = ziplistNext(o->ptr,p);
3799 }
3800 } else if (o->encoding == REDIS_ENCODING_LIST) {
3801 list *list = o->ptr;
3802 listIter li;
3803 listNode *ln;
3804
3805 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
3806 listRewind(list,&li);
3807 while((ln = listNext(&li))) {
3808 robj *eleobj = listNodeValue(ln);
3809 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3810 }
3811 } else {
3812 redisPanic("Unknown list encoding");
06233c45 3813 }
3814 } else if (o->type == REDIS_SET) {
3815 /* Save a set value */
d0b58d53
PN
3816 if (o->encoding == REDIS_ENCODING_HT) {
3817 dict *set = o->ptr;
3818 dictIterator *di = dictGetIterator(set);
3819 dictEntry *de;
06233c45 3820
d0b58d53
PN
3821 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3822 while((de = dictNext(di)) != NULL) {
3823 robj *eleobj = dictGetEntryKey(de);
3824 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3825 }
3826 dictReleaseIterator(di);
3827 } else if (o->encoding == REDIS_ENCODING_INTSET) {
3828 intset *is = o->ptr;
3829 long long llval;
3830 int i = 0;
3831
3832 if (rdbSaveLen(fp,intsetLen(is)) == -1) return -1;
3833 while(intsetGet(is,i++,&llval)) {
3834 if (rdbSaveLongLongAsStringObject(fp,llval) == -1) return -1;
3835 }
3836 } else {
3837 redisPanic("Unknown set encoding");
06233c45 3838 }
06233c45 3839 } else if (o->type == REDIS_ZSET) {
3840 /* Save a set value */
3841 zset *zs = o->ptr;
3842 dictIterator *di = dictGetIterator(zs->dict);
3843 dictEntry *de;
3844
3845 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3846 while((de = dictNext(di)) != NULL) {
3847 robj *eleobj = dictGetEntryKey(de);
3848 double *score = dictGetEntryVal(de);
3849
3850 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3851 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3852 }
3853 dictReleaseIterator(di);
b1befe6a 3854 } else if (o->type == REDIS_HASH) {
3855 /* Save a hash value */
3856 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3857 unsigned char *p = zipmapRewind(o->ptr);
3858 unsigned int count = zipmapLen(o->ptr);
3859 unsigned char *key, *val;
3860 unsigned int klen, vlen;
3861
3862 if (rdbSaveLen(fp,count) == -1) return -1;
3863 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3864 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3865 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3866 }
3867 } else {
3868 dictIterator *di = dictGetIterator(o->ptr);
3869 dictEntry *de;
3870
3871 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3872 while((de = dictNext(di)) != NULL) {
3873 robj *key = dictGetEntryKey(de);
3874 robj *val = dictGetEntryVal(de);
3875
3876 if (rdbSaveStringObject(fp,key) == -1) return -1;
3877 if (rdbSaveStringObject(fp,val) == -1) return -1;
3878 }
3879 dictReleaseIterator(di);
3880 }
06233c45 3881 } else {
f83c6cb5 3882 redisPanic("Unknown object type");
06233c45 3883 }
3884 return 0;
3885}
3886
3887/* Return the length the object will have on disk if saved with
3888 * the rdbSaveObject() function. Currently we use a trick to get
3889 * this length with very little changes to the code. In the future
3890 * we could switch to a faster solution. */
b9bc0eef 3891static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3892 if (fp == NULL) fp = server.devnull;
06233c45 3893 rewind(fp);
3894 assert(rdbSaveObject(fp,o) != 1);
3895 return ftello(fp);
3896}
3897
06224fec 3898/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3899static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3900 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3901
06224fec 3902 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3903}
3904
ed9b544e 3905/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3906static int rdbSave(char *filename) {
ed9b544e 3907 dictIterator *di = NULL;
3908 dictEntry *de;
ed9b544e 3909 FILE *fp;
3910 char tmpfile[256];
3911 int j;
bb32ede5 3912 time_t now = time(NULL);
ed9b544e 3913
2316bb3b 3914 /* Wait for I/O therads to terminate, just in case this is a
3915 * foreground-saving, to avoid seeking the swap file descriptor at the
3916 * same time. */
3917 if (server.vm_enabled)
3918 waitEmptyIOJobsQueue();
3919
a3b21203 3920 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3921 fp = fopen(tmpfile,"w");
3922 if (!fp) {
3923 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3924 return REDIS_ERR;
3925 }
f78fd11b 3926 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3927 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3928 redisDb *db = server.db+j;
3929 dict *d = db->dict;
3305306f 3930 if (dictSize(d) == 0) continue;
ed9b544e 3931 di = dictGetIterator(d);
3932 if (!di) {
3933 fclose(fp);
3934 return REDIS_ERR;
3935 }
3936
3937 /* Write the SELECT DB opcode */
f78fd11b 3938 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3939 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3940
3941 /* Iterate this DB writing every entry */
3942 while((de = dictNext(di)) != NULL) {
09241813 3943 sds keystr = dictGetEntryKey(de);
3944 robj key, *o = dictGetEntryVal(de);
3945 time_t expiretime;
3946
3947 initStaticStringObject(key,keystr);
3948 expiretime = getExpire(db,&key);
bb32ede5 3949
3950 /* Save the expire time */
3951 if (expiretime != -1) {
3952 /* If this key is already expired skip it */
3953 if (expiretime < now) continue;
3954 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3955 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3956 }
7e69548d 3957 /* Save the key and associated value. This requires special
3958 * handling if the value is swapped out. */
560db612 3959 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
3960 o->storage == REDIS_VM_SWAPPING) {
7e69548d 3961 /* Save type, key, value */
3962 if (rdbSaveType(fp,o->type) == -1) goto werr;
09241813 3963 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
7e69548d 3964 if (rdbSaveObject(fp,o) == -1) goto werr;
3965 } else {
996cb5f7 3966 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3967 robj *po;
7e69548d 3968 /* Get a preview of the object in memory */
560db612 3969 po = vmPreviewObject(o);
7e69548d 3970 /* Save type, key, value */
560db612 3971 if (rdbSaveType(fp,po->type) == -1) goto werr;
09241813 3972 if (rdbSaveStringObject(fp,&key) == -1) goto werr;
7e69548d 3973 if (rdbSaveObject(fp,po) == -1) goto werr;
3974 /* Remove the loaded object from memory */
3975 decrRefCount(po);
7e69548d 3976 }
ed9b544e 3977 }
3978 dictReleaseIterator(di);
3979 }
3980 /* EOF opcode */
f78fd11b 3981 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3982
3983 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3984 fflush(fp);
3985 fsync(fileno(fp));
3986 fclose(fp);
e0a62c7f 3987
ed9b544e 3988 /* Use RENAME to make sure the DB file is changed atomically only
3989 * if the generate DB file is ok. */
3990 if (rename(tmpfile,filename) == -1) {
325d1eb4 3991 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3992 unlink(tmpfile);
3993 return REDIS_ERR;
3994 }
3995 redisLog(REDIS_NOTICE,"DB saved on disk");
3996 server.dirty = 0;
3997 server.lastsave = time(NULL);
3998 return REDIS_OK;
3999
4000werr:
4001 fclose(fp);
4002 unlink(tmpfile);
4003 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
4004 if (di) dictReleaseIterator(di);
4005 return REDIS_ERR;
4006}
4007
f78fd11b 4008static int rdbSaveBackground(char *filename) {
ed9b544e 4009 pid_t childpid;
4010
9d65a1bb 4011 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 4012 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 4013 if ((childpid = fork()) == 0) {
4014 /* Child */
054e426d 4015 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 4016 close(server.fd);
f78fd11b 4017 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 4018 _exit(0);
ed9b544e 4019 } else {
478c2c6f 4020 _exit(1);
ed9b544e 4021 }
4022 } else {
4023 /* Parent */
5a7c647e 4024 if (childpid == -1) {
4025 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
4026 strerror(errno));
4027 return REDIS_ERR;
4028 }
ed9b544e 4029 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 4030 server.bgsavechildpid = childpid;
884d4b39 4031 updateDictResizePolicy();
ed9b544e 4032 return REDIS_OK;
4033 }
4034 return REDIS_OK; /* unreached */
4035}
4036
/* Remove the temporary RDB file ("temp-<pid>.rdb", in the current
 * directory) used by the saving process with pid 'childpid'. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,256,"temp-%d.rdb", (int) childpid);
    unlink(path);
}
4043
/* Read a one byte type from 'fp'. Returns the byte (0-255), or -1 if it
 * can't be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    return (fread(&type,1,1,fp) == 0) ? -1 : type;
}
4049
/* Read a time saved by rdbSaveTime(): a 32 bit signed integer in the byte
 * order of the host. Returns -1 if it can't be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t t32;

    if (fread(&t32,4,1,fp) == 0)
        return -1;
    return (time_t) t32;
}
4055
e3566d4b 4056/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
4057 * of this file for a description of how this are stored on disk.
4058 *
4059 * isencoded is set to 1 if the readed length is not actually a length but
4060 * an "encoding type", check the above comments for more info */
c78a8ccc 4061static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 4062 unsigned char buf[2];
4063 uint32_t len;
c78a8ccc 4064 int type;
f78fd11b 4065
e3566d4b 4066 if (isencoded) *isencoded = 0;
c78a8ccc 4067 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
4068 type = (buf[0]&0xC0)>>6;
4069 if (type == REDIS_RDB_6BITLEN) {
4070 /* Read a 6 bit len */
4071 return buf[0]&0x3F;
4072 } else if (type == REDIS_RDB_ENCVAL) {
4073 /* Read a 6 bit len encoding type */
4074 if (isencoded) *isencoded = 1;
4075 return buf[0]&0x3F;
4076 } else if (type == REDIS_RDB_14BITLEN) {
4077 /* Read a 14 bit len */
4078 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
4079 return ((buf[0]&0x3F)<<8)|buf[1];
4080 } else {
4081 /* Read a 32 bit len */
f78fd11b 4082 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
4083 return ntohl(len);
f78fd11b 4084 }
f78fd11b 4085}
4086
ad30aa60 4087/* Load an integer-encoded object from file 'fp', with the specified
4088 * encoding type 'enctype'. If encode is true the function may return
4089 * an integer-encoded object as reply, otherwise the returned object
4090 * will always be encoded as a raw string. */
4091static robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
e3566d4b 4092 unsigned char enc[4];
4093 long long val;
4094
4095 if (enctype == REDIS_RDB_ENC_INT8) {
4096 if (fread(enc,1,1,fp) == 0) return NULL;
4097 val = (signed char)enc[0];
4098 } else if (enctype == REDIS_RDB_ENC_INT16) {
4099 uint16_t v;
4100 if (fread(enc,2,1,fp) == 0) return NULL;
4101 v = enc[0]|(enc[1]<<8);
4102 val = (int16_t)v;
4103 } else if (enctype == REDIS_RDB_ENC_INT32) {
4104 uint32_t v;
4105 if (fread(enc,4,1,fp) == 0) return NULL;
4106 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
4107 val = (int32_t)v;
4108 } else {
4109 val = 0; /* anti-warning */
f83c6cb5 4110 redisPanic("Unknown RDB integer encoding type");
e3566d4b 4111 }
ad30aa60 4112 if (encode)
4113 return createStringObjectFromLongLong(val);
4114 else
4115 return createObject(REDIS_STRING,sdsfromlonglong(val));
e3566d4b 4116}
4117
c78a8ccc 4118static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 4119 unsigned int len, clen;
4120 unsigned char *c = NULL;
4121 sds val = NULL;
4122
c78a8ccc 4123 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4124 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 4125 if ((c = zmalloc(clen)) == NULL) goto err;
4126 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
4127 if (fread(c,clen,1,fp) == 0) goto err;
4128 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 4129 zfree(c);
88e85998 4130 return createObject(REDIS_STRING,val);
4131err:
4132 zfree(c);
4133 sdsfree(val);
4134 return NULL;
4135}
4136
ad30aa60 4137static robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
e3566d4b 4138 int isencoded;
4139 uint32_t len;
f78fd11b 4140 sds val;
4141
c78a8ccc 4142 len = rdbLoadLen(fp,&isencoded);
e3566d4b 4143 if (isencoded) {
4144 switch(len) {
4145 case REDIS_RDB_ENC_INT8:
4146 case REDIS_RDB_ENC_INT16:
4147 case REDIS_RDB_ENC_INT32:
ad30aa60 4148 return rdbLoadIntegerObject(fp,len,encode);
88e85998 4149 case REDIS_RDB_ENC_LZF:
bdcb92f2 4150 return rdbLoadLzfStringObject(fp);
e3566d4b 4151 default:
f83c6cb5 4152 redisPanic("Unknown RDB encoding type");
e3566d4b 4153 }
4154 }
4155
f78fd11b 4156 if (len == REDIS_RDB_LENERR) return NULL;
4157 val = sdsnewlen(NULL,len);
4158 if (len && fread(val,len,1,fp) == 0) {
4159 sdsfree(val);
4160 return NULL;
4161 }
bdcb92f2 4162 return createObject(REDIS_STRING,val);
f78fd11b 4163}
4164
ad30aa60 4165static robj *rdbLoadStringObject(FILE *fp) {
4166 return rdbGenericLoadStringObject(fp,0);
4167}
4168
4169static robj *rdbLoadEncodedStringObject(FILE *fp) {
4170 return rdbGenericLoadStringObject(fp,1);
4171}
4172
a7866db6 4173/* For information about double serialization check rdbSaveDoubleValue() */
4174static int rdbLoadDoubleValue(FILE *fp, double *val) {
4175 char buf[128];
4176 unsigned char len;
4177
4178 if (fread(&len,1,1,fp) == 0) return -1;
4179 switch(len) {
4180 case 255: *val = R_NegInf; return 0;
4181 case 254: *val = R_PosInf; return 0;
4182 case 253: *val = R_Nan; return 0;
4183 default:
4184 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 4185 buf[len] = '\0';
a7866db6 4186 sscanf(buf, "%lg", val);
4187 return 0;
4188 }
4189}
4190
c78a8ccc 4191/* Load a Redis object of the specified type from the specified file.
4192 * On success a newly allocated object is returned, otherwise NULL. */
4193static robj *rdbLoadObject(int type, FILE *fp) {
23f96494
PN
4194 robj *o, *ele, *dec;
4195 size_t len;
c78a8ccc 4196
bcd11906 4197 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 4198 if (type == REDIS_STRING) {
4199 /* Read string value */
ad30aa60 4200 if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4201 o = tryObjectEncoding(o);
23f96494
PN
4202 } else if (type == REDIS_LIST) {
4203 /* Read list value */
4204 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4205
d0686e07
PN
4206 /* Use a real list when there are too many entries */
4207 if (len > server.list_max_ziplist_entries) {
4208 o = createListObject();
4209 } else {
4210 o = createZiplistObject();
4211 }
c78a8ccc 4212
23f96494
PN
4213 /* Load every single element of the list */
4214 while(len--) {
4215 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4216
d0686e07
PN
4217 /* If we are using a ziplist and the value is too big, convert
4218 * the object to a real list. */
4219 if (o->encoding == REDIS_ENCODING_ZIPLIST &&
4220 ele->encoding == REDIS_ENCODING_RAW &&
4221 sdslen(ele->ptr) > server.list_max_ziplist_value)
003f0840 4222 listTypeConvert(o,REDIS_ENCODING_LIST);
d0686e07 4223
23f96494
PN
4224 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
4225 dec = getDecodedObject(ele);
4226 o->ptr = ziplistPush(o->ptr,dec->ptr,sdslen(dec->ptr),REDIS_TAIL);
4227 decrRefCount(dec);
4228 decrRefCount(ele);
4229 } else {
4230 ele = tryObjectEncoding(ele);
4231 listAddNodeTail(o->ptr,ele);
23f96494
PN
4232 }
4233 }
4234 } else if (type == REDIS_SET) {
4235 /* Read list/set value */
4236 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4237 o = createSetObject();
3c68de9b 4238 /* It's faster to expand the dict to the right size asap in order
4239 * to avoid rehashing */
23f96494
PN
4240 if (len > DICT_HT_INITIAL_SIZE)
4241 dictExpand(o->ptr,len);
c78a8ccc 4242 /* Load every single element of the list/set */
23f96494 4243 while(len--) {
ad30aa60 4244 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4245 ele = tryObjectEncoding(ele);
23f96494 4246 dictAdd((dict*)o->ptr,ele,NULL);
c78a8ccc 4247 }
4248 } else if (type == REDIS_ZSET) {
4249 /* Read list/set value */
ada386b2 4250 size_t zsetlen;
c78a8ccc 4251 zset *zs;
4252
4253 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4254 o = createZsetObject();
4255 zs = o->ptr;
4256 /* Load every single element of the list/set */
4257 while(zsetlen--) {
4258 robj *ele;
4259 double *score = zmalloc(sizeof(double));
4260
ad30aa60 4261 if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
05df7621 4262 ele = tryObjectEncoding(ele);
c78a8ccc 4263 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
4264 dictAdd(zs->dict,ele,score);
4265 zslInsert(zs->zsl,*score,ele);
4266 incrRefCount(ele); /* added to skiplist */
4267 }
ada386b2 4268 } else if (type == REDIS_HASH) {
4269 size_t hashlen;
4270
4271 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
4272 o = createHashObject();
4273 /* Too many entries? Use an hash table. */
4274 if (hashlen > server.hash_max_zipmap_entries)
4275 convertToRealHash(o);
4276 /* Load every key/value, then set it into the zipmap or hash
4277 * table, as needed. */
4278 while(hashlen--) {
4279 robj *key, *val;
4280
b785b2bf 4281 if ((key = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
4282 if ((val = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
ada386b2 4283 /* If we are using a zipmap and there are too big values
4284 * the object is converted to real hash table encoding. */
4285 if (o->encoding != REDIS_ENCODING_HT &&
4286 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
4287 sdslen(val->ptr) > server.hash_max_zipmap_value))
4288 {
4289 convertToRealHash(o);
4290 }
4291
4292 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
4293 unsigned char *zm = o->ptr;
4294
4295 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
4296 val->ptr,sdslen(val->ptr),NULL);
4297 o->ptr = zm;
4298 decrRefCount(key);
4299 decrRefCount(val);
4300 } else {
05df7621 4301 key = tryObjectEncoding(key);
4302 val = tryObjectEncoding(val);
ada386b2 4303 dictAdd((dict*)o->ptr,key,val);
ada386b2 4304 }
4305 }
c78a8ccc 4306 } else {
f83c6cb5 4307 redisPanic("Unknown object type");
c78a8ccc 4308 }
4309 return o;
4310}
4311
f78fd11b 4312static int rdbLoad(char *filename) {
ed9b544e 4313 FILE *fp;
f78fd11b 4314 uint32_t dbid;
bb32ede5 4315 int type, retval, rdbver;
585af7e2 4316 int swap_all_values = 0;
bb32ede5 4317 redisDb *db = server.db+0;
f78fd11b 4318 char buf[1024];
242a64f3 4319 time_t expiretime, now = time(NULL);
bb32ede5 4320
ed9b544e 4321 fp = fopen(filename,"r");
4322 if (!fp) return REDIS_ERR;
4323 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 4324 buf[9] = '\0';
4325 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 4326 fclose(fp);
4327 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
4328 return REDIS_ERR;
4329 }
f78fd11b 4330 rdbver = atoi(buf+5);
c78a8ccc 4331 if (rdbver != 1) {
f78fd11b 4332 fclose(fp);
4333 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
4334 return REDIS_ERR;
4335 }
ed9b544e 4336 while(1) {
585af7e2 4337 robj *key, *val;
7e02fe32 4338 int force_swapout;
ed9b544e 4339
585af7e2 4340 expiretime = -1;
ed9b544e 4341 /* Read type. */
f78fd11b 4342 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 4343 if (type == REDIS_EXPIRETIME) {
4344 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
4345 /* We read the time so we need to read the object type again */
4346 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
4347 }
ed9b544e 4348 if (type == REDIS_EOF) break;
4349 /* Handle SELECT DB opcode as a special case */
4350 if (type == REDIS_SELECTDB) {
c78a8ccc 4351 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 4352 goto eoferr;
ed9b544e 4353 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 4354 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 4355 exit(1);
4356 }
bb32ede5 4357 db = server.db+dbid;
ed9b544e 4358 continue;
4359 }
4360 /* Read key */
585af7e2 4361 if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
c78a8ccc 4362 /* Read value */
585af7e2 4363 if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
89e689c5 4364 /* Check if the key already expired */
4365 if (expiretime != -1 && expiretime < now) {
4366 decrRefCount(key);
4367 decrRefCount(val);
4368 continue;
4369 }
ed9b544e 4370 /* Add the new object in the hash table */
09241813 4371 retval = dbAdd(db,key,val);
4372 if (retval == REDIS_ERR) {
585af7e2 4373 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
ed9b544e 4374 exit(1);
4375 }
bb32ede5 4376 /* Set the expire time if needed */
89e689c5 4377 if (expiretime != -1) setExpire(db,key,expiretime);
242a64f3 4378
b492cf00 4379 /* Handle swapping while loading big datasets when VM is on */
242a64f3 4380
4381 /* If we detecter we are hopeless about fitting something in memory
4382 * we just swap every new key on disk. Directly...
4383 * Note that's important to check for this condition before resorting
4384 * to random sampling, otherwise we may try to swap already
4385 * swapped keys. */
585af7e2 4386 if (swap_all_values) {
09241813 4387 dictEntry *de = dictFind(db->dict,key->ptr);
242a64f3 4388
4389 /* de may be NULL since the key already expired */
4390 if (de) {
560db612 4391 vmpointer *vp;
585af7e2 4392 val = dictGetEntryVal(de);
242a64f3 4393
560db612 4394 if (val->refcount == 1 &&
4395 (vp = vmSwapObjectBlocking(val)) != NULL)
4396 dictGetEntryVal(de) = vp;
242a64f3 4397 }
09241813 4398 decrRefCount(key);
242a64f3 4399 continue;
4400 }
09241813 4401 decrRefCount(key);
242a64f3 4402
a89b7013 4403 /* Flush data on disk once 32 MB of additional RAM are used... */
7e02fe32 4404 force_swapout = 0;
4405 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
4406 force_swapout = 1;
242a64f3 4407
4408 /* If we have still some hope of having some value fitting memory
4409 * then we try random sampling. */
7e02fe32 4410 if (!swap_all_values && server.vm_enabled && force_swapout) {
b492cf00 4411 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 4412 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 4413 }
242a64f3 4414 if (zmalloc_used_memory() > server.vm_max_memory)
585af7e2 4415 swap_all_values = 1; /* We are already using too much mem */
b492cf00 4416 }
ed9b544e 4417 }
4418 fclose(fp);
4419 return REDIS_OK;
4420
4421eoferr: /* unexpected end of file is handled here with a fatal exit */
f80dff62 4422 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 4423 exit(1);
4424 return REDIS_ERR; /* Just to avoid warning */
4425}
4426
b58ba105 4427/*================================== Shutdown =============================== */
fab43727 4428static int prepareForShutdown() {
b58ba105
AM
4429 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
4430 /* Kill the saving child if there is a background saving in progress.
4431 We want to avoid race conditions, for instance our saving child may
4432 overwrite the synchronous saving did by SHUTDOWN. */
4433 if (server.bgsavechildpid != -1) {
4434 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4435 kill(server.bgsavechildpid,SIGKILL);
4436 rdbRemoveTempFile(server.bgsavechildpid);
4437 }
4438 if (server.appendonly) {
4439 /* Append only file: fsync() the AOF and exit */
b0bd87f6 4440 aof_fsync(server.appendfd);
b58ba105 4441 if (server.vm_enabled) unlink(server.vm_swap_file);
b58ba105
AM
4442 } else {
4443 /* Snapshotting. Perform a SYNC SAVE and exit */
4444 if (rdbSave(server.dbfilename) == REDIS_OK) {
4445 if (server.daemonize)
4446 unlink(server.pidfile);
4447 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
b58ba105
AM
4448 } else {
4449 /* Ooops.. error saving! The best we can do is to continue
4450 * operating. Note that if there was a background saving process,
4451 * in the next cron() Redis will be notified that the background
4452 * saving aborted, handling special stuff like slaves pending for
4453 * synchronization... */
4454 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
fab43727 4455 return REDIS_ERR;
b58ba105
AM
4456 }
4457 }
8513a757 4458 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
fab43727 4459 return REDIS_OK;
b58ba105
AM
4460}
4461
ed9b544e 4462/*================================== Commands =============================== */
4463
abcb223e 4464static void authCommand(redisClient *c) {
2e77c2ee 4465 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
4466 c->authenticated = 1;
4467 addReply(c,shared.ok);
4468 } else {
4469 c->authenticated = 0;
fa4c0aba 4470 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
4471 }
4472}
4473
ed9b544e 4474static void pingCommand(redisClient *c) {
4475 addReply(c,shared.pong);
4476}
4477
4478static void echoCommand(redisClient *c) {
dd88747b 4479 addReplyBulk(c,c->argv[1]);
ed9b544e 4480}
4481
4482/*=================================== Strings =============================== */
4483
526d00a5 4484static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
ed9b544e 4485 int retval;
10ce1276 4486 long seconds = 0; /* initialized to avoid an harmness warning */
ed9b544e 4487
526d00a5 4488 if (expire) {
4489 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4490 return;
4491 if (seconds <= 0) {
4492 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4493 return;
4494 }
4495 }
4496
37ab76c9 4497 touchWatchedKey(c->db,key);
526d00a5 4498 if (nx) deleteIfVolatile(c->db,key);
09241813 4499 retval = dbAdd(c->db,key,val);
4500 if (retval == REDIS_ERR) {
ed9b544e 4501 if (!nx) {
09241813 4502 dbReplace(c->db,key,val);
526d00a5 4503 incrRefCount(val);
ed9b544e 4504 } else {
c937aa89 4505 addReply(c,shared.czero);
ed9b544e 4506 return;
4507 }
4508 } else {
526d00a5 4509 incrRefCount(val);
ed9b544e 4510 }
4511 server.dirty++;
526d00a5 4512 removeExpire(c->db,key);
4513 if (expire) setExpire(c->db,key,time(NULL)+seconds);
c937aa89 4514 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 4515}
4516
4517static void setCommand(redisClient *c) {
526d00a5 4518 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
ed9b544e 4519}
4520
4521static void setnxCommand(redisClient *c) {
526d00a5 4522 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4523}
4524
4525static void setexCommand(redisClient *c) {
4526 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
ed9b544e 4527}
4528
322fc7d8 4529static int getGenericCommand(redisClient *c) {
dd88747b 4530 robj *o;
e0a62c7f 4531
dd88747b 4532 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 4533 return REDIS_OK;
dd88747b 4534
4535 if (o->type != REDIS_STRING) {
4536 addReply(c,shared.wrongtypeerr);
4537 return REDIS_ERR;
ed9b544e 4538 } else {
dd88747b 4539 addReplyBulk(c,o);
4540 return REDIS_OK;
ed9b544e 4541 }
4542}
4543
322fc7d8 4544static void getCommand(redisClient *c) {
4545 getGenericCommand(c);
4546}
4547
f6b141c5 4548static void getsetCommand(redisClient *c) {
322fc7d8 4549 if (getGenericCommand(c) == REDIS_ERR) return;
09241813 4550 dbReplace(c->db,c->argv[1],c->argv[2]);
a431eb74 4551 incrRefCount(c->argv[2]);
4552 server.dirty++;
4553 removeExpire(c->db,c->argv[1]);
4554}
4555
70003d28 4556static void mgetCommand(redisClient *c) {
70003d28 4557 int j;
e0a62c7f 4558
c937aa89 4559 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 4560 for (j = 1; j < c->argc; j++) {
3305306f 4561 robj *o = lookupKeyRead(c->db,c->argv[j]);
4562 if (o == NULL) {
c937aa89 4563 addReply(c,shared.nullbulk);
70003d28 4564 } else {
70003d28 4565 if (o->type != REDIS_STRING) {
c937aa89 4566 addReply(c,shared.nullbulk);
70003d28 4567 } else {
dd88747b 4568 addReplyBulk(c,o);
70003d28 4569 }
4570 }
4571 }
4572}
4573
6c446631 4574static void msetGenericCommand(redisClient *c, int nx) {
906573e7 4575 int j, busykeys = 0;
6c446631 4576
4577 if ((c->argc % 2) == 0) {
454d4e43 4578 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 4579 return;
4580 }
4581 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4582 * set nothing at all if at least one already key exists. */
4583 if (nx) {
4584 for (j = 1; j < c->argc; j += 2) {
906573e7 4585 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4586 busykeys++;
6c446631 4587 }
4588 }
4589 }
906573e7 4590 if (busykeys) {
4591 addReply(c, shared.czero);
4592 return;
4593 }
6c446631 4594
4595 for (j = 1; j < c->argc; j += 2) {
05df7621 4596 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
09241813 4597 dbReplace(c->db,c->argv[j],c->argv[j+1]);
4598 incrRefCount(c->argv[j+1]);
6c446631 4599 removeExpire(c->db,c->argv[j]);
4600 }
4601 server.dirty += (c->argc-1)/2;
4602 addReply(c, nx ? shared.cone : shared.ok);
4603}
4604
4605static void msetCommand(redisClient *c) {
4606 msetGenericCommand(c,0);
4607}
4608
4609static void msetnxCommand(redisClient *c) {
4610 msetGenericCommand(c,1);
4611}
4612
d68ed120 4613static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4614 long long value;
ed9b544e 4615 robj *o;
e0a62c7f 4616
3305306f 4617 o = lookupKeyWrite(c->db,c->argv[1]);
6485f293
PN
4618 if (o != NULL && checkType(c,o,REDIS_STRING)) return;
4619 if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
ed9b544e 4620
4621 value += incr;
d6f4c262 4622 o = createStringObjectFromLongLong(value);
09241813 4623 dbReplace(c->db,c->argv[1],o);
ed9b544e 4624 server.dirty++;
c937aa89 4625 addReply(c,shared.colon);
ed9b544e 4626 addReply(c,o);
4627 addReply(c,shared.crlf);
4628}
4629
4630static void incrCommand(redisClient *c) {
a4d1ba9a 4631 incrDecrCommand(c,1);
ed9b544e 4632}
4633
4634static void decrCommand(redisClient *c) {
a4d1ba9a 4635 incrDecrCommand(c,-1);
ed9b544e 4636}
4637
4638static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4639 long long incr;
4640
bd79a6bd 4641 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4642 incrDecrCommand(c,incr);
ed9b544e 4643}
4644
4645static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4646 long long incr;
4647
bd79a6bd 4648 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4649 incrDecrCommand(c,-incr);
ed9b544e 4650}
4651
4b00bebd 4652static void appendCommand(redisClient *c) {
4653 int retval;
4654 size_t totlen;
4655 robj *o;
4656
4657 o = lookupKeyWrite(c->db,c->argv[1]);
4658 if (o == NULL) {
4659 /* Create the key */
09241813 4660 retval = dbAdd(c->db,c->argv[1],c->argv[2]);
4b00bebd 4661 incrRefCount(c->argv[2]);
4662 totlen = stringObjectLen(c->argv[2]);
4663 } else {
4b00bebd 4664 if (o->type != REDIS_STRING) {
4665 addReply(c,shared.wrongtypeerr);
4666 return;
4667 }
4668 /* If the object is specially encoded or shared we have to make
4669 * a copy */
4670 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4671 robj *decoded = getDecodedObject(o);
4672
4673 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4674 decrRefCount(decoded);
09241813 4675 dbReplace(c->db,c->argv[1],o);
4b00bebd 4676 }
4677 /* APPEND! */
4678 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4679 o->ptr = sdscatlen(o->ptr,
4680 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4681 } else {
4682 o->ptr = sdscatprintf(o->ptr, "%ld",
4683 (unsigned long) c->argv[2]->ptr);
4684 }
4685 totlen = sdslen(o->ptr);
4686 }
4687 server.dirty++;
4688 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4689}
4690
39191553 4691static void substrCommand(redisClient *c) {
4692 robj *o;
4693 long start = atoi(c->argv[2]->ptr);
4694 long end = atoi(c->argv[3]->ptr);
dd88747b 4695 size_t rangelen, strlen;
4696 sds range;
39191553 4697
dd88747b 4698 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4699 checkType(c,o,REDIS_STRING)) return;
39191553 4700
dd88747b 4701 o = getDecodedObject(o);
4702 strlen = sdslen(o->ptr);
8fe7fad7 4703
dd88747b 4704 /* convert negative indexes */
4705 if (start < 0) start = strlen+start;
4706 if (end < 0) end = strlen+end;
4707 if (start < 0) start = 0;
4708 if (end < 0) end = 0;
39191553 4709
dd88747b 4710 /* indexes sanity checks */
4711 if (start > end || (size_t)start >= strlen) {
4712 /* Out of range start or start > end result in null reply */
4713 addReply(c,shared.nullbulk);
4714 decrRefCount(o);
4715 return;
39191553 4716 }
dd88747b 4717 if ((size_t)end >= strlen) end = strlen-1;
4718 rangelen = (end-start)+1;
4719
4720 /* Return the result */
4721 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4722 range = sdsnewlen((char*)o->ptr+start,rangelen);
4723 addReplySds(c,range);
4724 addReply(c,shared.crlf);
4725 decrRefCount(o);
39191553 4726}
4727
ed9b544e 4728/* ========================= Type agnostic commands ========================= */
4729
4730static void delCommand(redisClient *c) {
5109cdff 4731 int deleted = 0, j;
4732
4733 for (j = 1; j < c->argc; j++) {
09241813 4734 if (dbDelete(c->db,c->argv[j])) {
37ab76c9 4735 touchWatchedKey(c->db,c->argv[j]);
5109cdff 4736 server.dirty++;
4737 deleted++;
4738 }
4739 }
482b672d 4740 addReplyLongLong(c,deleted);
ed9b544e 4741}
4742
4743static void existsCommand(redisClient *c) {
f4f06efc 4744 expireIfNeeded(c->db,c->argv[1]);
09241813 4745 if (dbExists(c->db,c->argv[1])) {
f4f06efc
PN
4746 addReply(c, shared.cone);
4747 } else {
4748 addReply(c, shared.czero);
4749 }
ed9b544e 4750}
4751
4752static void selectCommand(redisClient *c) {
4753 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4754
ed9b544e 4755 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4756 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4757 } else {
4758 addReply(c,shared.ok);
4759 }
4760}
4761
4762static void randomkeyCommand(redisClient *c) {
dc4be23e 4763 robj *key;
e0a62c7f 4764
09241813 4765 if ((key = dbRandomKey(c->db)) == NULL) {
dc4be23e 4766 addReply(c,shared.nullbulk);
4767 return;
4768 }
4769
09241813 4770 addReplyBulk(c,key);
4771 decrRefCount(key);
ed9b544e 4772}
4773
4774static void keysCommand(redisClient *c) {
4775 dictIterator *di;
4776 dictEntry *de;
4777 sds pattern = c->argv[1]->ptr;
4778 int plen = sdslen(pattern);
a3f9eec2 4779 unsigned long numkeys = 0;
ed9b544e 4780 robj *lenobj = createObject(REDIS_STRING,NULL);
4781
3305306f 4782 di = dictGetIterator(c->db->dict);
ed9b544e 4783 addReply(c,lenobj);
4784 decrRefCount(lenobj);
4785 while((de = dictNext(di)) != NULL) {
09241813 4786 sds key = dictGetEntryKey(de);
4787 robj *keyobj;
3305306f 4788
ed9b544e 4789 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4790 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
09241813 4791 keyobj = createStringObject(key,sdslen(key));
3305306f 4792 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4793 addReplyBulk(c,keyobj);
3305306f 4794 numkeys++;
3305306f 4795 }
09241813 4796 decrRefCount(keyobj);
ed9b544e 4797 }
4798 }
4799 dictReleaseIterator(di);
a3f9eec2 4800 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4801}
4802
4803static void dbsizeCommand(redisClient *c) {
4804 addReplySds(c,
3305306f 4805 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4806}
4807
4808static void lastsaveCommand(redisClient *c) {
4809 addReplySds(c,
c937aa89 4810 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4811}
4812
4813static void typeCommand(redisClient *c) {
3305306f 4814 robj *o;
ed9b544e 4815 char *type;
3305306f 4816
4817 o = lookupKeyRead(c->db,c->argv[1]);
4818 if (o == NULL) {
c937aa89 4819 type = "+none";
ed9b544e 4820 } else {
ed9b544e 4821 switch(o->type) {
c937aa89 4822 case REDIS_STRING: type = "+string"; break;
4823 case REDIS_LIST: type = "+list"; break;
4824 case REDIS_SET: type = "+set"; break;
412a8bce 4825 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4826 case REDIS_HASH: type = "+hash"; break;
4827 default: type = "+unknown"; break;
ed9b544e 4828 }
4829 }
4830 addReplySds(c,sdsnew(type));
4831 addReply(c,shared.crlf);
4832}
4833
4834static void saveCommand(redisClient *c) {
9d65a1bb 4835 if (server.bgsavechildpid != -1) {
05557f6d 4836 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4837 return;
4838 }
f78fd11b 4839 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4840 addReply(c,shared.ok);
4841 } else {
4842 addReply(c,shared.err);
4843 }
4844}
4845
4846static void bgsaveCommand(redisClient *c) {
9d65a1bb 4847 if (server.bgsavechildpid != -1) {
ed9b544e 4848 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4849 return;
4850 }
f78fd11b 4851 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4852 char *status = "+Background saving started\r\n";
4853 addReplySds(c,sdsnew(status));
ed9b544e 4854 } else {
4855 addReply(c,shared.err);
4856 }
4857}
4858
4859static void shutdownCommand(redisClient *c) {
fab43727 4860 if (prepareForShutdown() == REDIS_OK)
4861 exit(0);
4862 addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
ed9b544e 4863}
4864
4865static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4866 robj *o;
4867
4868 /* To use the same key as src and dst is probably an error */
4869 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4870 addReply(c,shared.sameobjecterr);
ed9b544e 4871 return;
4872 }
4873
dd88747b 4874 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4875 return;
dd88747b 4876
ed9b544e 4877 incrRefCount(o);
3305306f 4878 deleteIfVolatile(c->db,c->argv[2]);
09241813 4879 if (dbAdd(c->db,c->argv[2],o) == REDIS_ERR) {
ed9b544e 4880 if (nx) {
4881 decrRefCount(o);
c937aa89 4882 addReply(c,shared.czero);
ed9b544e 4883 return;
4884 }
09241813 4885 dbReplace(c->db,c->argv[2],o);
ed9b544e 4886 }
09241813 4887 dbDelete(c->db,c->argv[1]);
b167f877 4888 touchWatchedKey(c->db,c->argv[2]);
ed9b544e 4889 server.dirty++;
c937aa89 4890 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4891}
4892
4893static void renameCommand(redisClient *c) {
4894 renameGenericCommand(c,0);
4895}
4896
4897static void renamenxCommand(redisClient *c) {
4898 renameGenericCommand(c,1);
4899}
4900
4901static void moveCommand(redisClient *c) {
3305306f 4902 robj *o;
4903 redisDb *src, *dst;
ed9b544e 4904 int srcid;
4905
4906 /* Obtain source and target DB pointers */
3305306f 4907 src = c->db;
4908 srcid = c->db->id;
ed9b544e 4909 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4910 addReply(c,shared.outofrangeerr);
ed9b544e 4911 return;
4912 }
3305306f 4913 dst = c->db;
4914 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4915
4916 /* If the user is moving using as target the same
4917 * DB as the source DB it is probably an error. */
4918 if (src == dst) {
c937aa89 4919 addReply(c,shared.sameobjecterr);
ed9b544e 4920 return;
4921 }
4922
4923 /* Check if the element exists and get a reference */
3305306f 4924 o = lookupKeyWrite(c->db,c->argv[1]);
4925 if (!o) {
c937aa89 4926 addReply(c,shared.czero);
ed9b544e 4927 return;
4928 }
4929
4930 /* Try to add the element to the target DB */
3305306f 4931 deleteIfVolatile(dst,c->argv[1]);
09241813 4932 if (dbAdd(dst,c->argv[1],o) == REDIS_ERR) {
c937aa89 4933 addReply(c,shared.czero);
ed9b544e 4934 return;
4935 }
ed9b544e 4936 incrRefCount(o);
4937
4938 /* OK! key moved, free the entry in the source DB */
09241813 4939 dbDelete(src,c->argv[1]);
ed9b544e 4940 server.dirty++;
c937aa89 4941 addReply(c,shared.cone);
ed9b544e 4942}
4943
4944/* =================================== Lists ================================ */
d0686e07
PN
4945
4946
4947/* Check the argument length to see if it requires us to convert the ziplist
4948 * to a real list. Only check raw-encoded objects because integer encoded
4949 * objects are never too long. */
003f0840 4950static void listTypeTryConversion(robj *subject, robj *value) {
d0686e07
PN
4951 if (subject->encoding != REDIS_ENCODING_ZIPLIST) return;
4952 if (value->encoding == REDIS_ENCODING_RAW &&
4953 sdslen(value->ptr) > server.list_max_ziplist_value)
003f0840 4954 listTypeConvert(subject,REDIS_ENCODING_LIST);
d0686e07
PN
4955}
4956
003f0840 4957static void listTypePush(robj *subject, robj *value, int where) {
d0686e07 4958 /* Check if we need to convert the ziplist */
003f0840 4959 listTypeTryConversion(subject,value);
d0686e07
PN
4960 if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
4961 ziplistLen(subject->ptr) > server.list_max_ziplist_entries)
003f0840 4962 listTypeConvert(subject,REDIS_ENCODING_LIST);
d0686e07 4963
c7d9d662
PN
4964 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4965 int pos = (where == REDIS_HEAD) ? ZIPLIST_HEAD : ZIPLIST_TAIL;
4966 value = getDecodedObject(value);
4967 subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),pos);
4968 decrRefCount(value);
4969 } else if (subject->encoding == REDIS_ENCODING_LIST) {
4970 if (where == REDIS_HEAD) {
4971 listAddNodeHead(subject->ptr,value);
4972 } else {
4973 listAddNodeTail(subject->ptr,value);
4974 }
4975 incrRefCount(value);
4976 } else {
4977 redisPanic("Unknown list encoding");
4978 }
4979}
4980
003f0840 4981static robj *listTypePop(robj *subject, int where) {
d72562f7
PN
4982 robj *value = NULL;
4983 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
4984 unsigned char *p;
b6eb9703 4985 unsigned char *vstr;
d72562f7 4986 unsigned int vlen;
b6eb9703 4987 long long vlong;
d72562f7
PN
4988 int pos = (where == REDIS_HEAD) ? 0 : -1;
4989 p = ziplistIndex(subject->ptr,pos);
b6eb9703
PN
4990 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
4991 if (vstr) {
4992 value = createStringObject((char*)vstr,vlen);
d72562f7 4993 } else {
b6eb9703 4994 value = createStringObjectFromLongLong(vlong);
d72562f7 4995 }
0f62e177
PN
4996 /* We only need to delete an element when it exists */
4997 subject->ptr = ziplistDelete(subject->ptr,&p);
d72562f7 4998 }
d72562f7
PN
4999 } else if (subject->encoding == REDIS_ENCODING_LIST) {
5000 list *list = subject->ptr;
5001 listNode *ln;
5002 if (where == REDIS_HEAD) {
5003 ln = listFirst(list);
5004 } else {
5005 ln = listLast(list);
5006 }
5007 if (ln != NULL) {
5008 value = listNodeValue(ln);
5009 incrRefCount(value);
5010 listDelNode(list,ln);
5011 }
5012 } else {
5013 redisPanic("Unknown list encoding");
5014 }
5015 return value;
5016}
5017
003f0840 5018static unsigned long listTypeLength(robj *subject) {
d72562f7
PN
5019 if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
5020 return ziplistLen(subject->ptr);
5021 } else if (subject->encoding == REDIS_ENCODING_LIST) {
5022 return listLength((list*)subject->ptr);
5023 } else {
5024 redisPanic("Unknown list encoding");
5025 }
5026}
5027
a6dd455b
PN
5028/* Structure to hold set iteration abstraction. */
5029typedef struct {
5030 robj *subject;
5031 unsigned char encoding;
be02a7c0 5032 unsigned char direction; /* Iteration direction */
a6dd455b
PN
5033 unsigned char *zi;
5034 listNode *ln;
003f0840 5035} listTypeIterator;
a6dd455b 5036
be02a7c0
PN
5037/* Structure for an entry while iterating over a list. */
5038typedef struct {
003f0840 5039 listTypeIterator *li;
be02a7c0
PN
5040 unsigned char *zi; /* Entry in ziplist */
5041 listNode *ln; /* Entry in linked list */
003f0840 5042} listTypeEntry;
be02a7c0 5043
a6dd455b 5044/* Initialize an iterator at the specified index. */
003f0840
PN
5045static listTypeIterator *listTypeInitIterator(robj *subject, int index, unsigned char direction) {
5046 listTypeIterator *li = zmalloc(sizeof(listTypeIterator));
a6dd455b
PN
5047 li->subject = subject;
5048 li->encoding = subject->encoding;
be02a7c0 5049 li->direction = direction;
a6dd455b
PN
5050 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5051 li->zi = ziplistIndex(subject->ptr,index);
5052 } else if (li->encoding == REDIS_ENCODING_LIST) {
5053 li->ln = listIndex(subject->ptr,index);
5054 } else {
5055 redisPanic("Unknown list encoding");
5056 }
5057 return li;
5058}
5059
5060/* Clean up the iterator. */
003f0840 5061static void listTypeReleaseIterator(listTypeIterator *li) {
a6dd455b
PN
5062 zfree(li);
5063}
5064
be02a7c0
PN
5065/* Stores pointer to current the entry in the provided entry structure
5066 * and advances the position of the iterator. Returns 1 when the current
5067 * entry is in fact an entry, 0 otherwise. */
003f0840 5068static int listTypeNext(listTypeIterator *li, listTypeEntry *entry) {
dda20542
PN
5069 /* Protect from converting when iterating */
5070 redisAssert(li->subject->encoding == li->encoding);
5071
be02a7c0 5072 entry->li = li;
d2ee16ab 5073 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
be02a7c0
PN
5074 entry->zi = li->zi;
5075 if (entry->zi != NULL) {
5076 if (li->direction == REDIS_TAIL)
5077 li->zi = ziplistNext(li->subject->ptr,li->zi);
5078 else
5079 li->zi = ziplistPrev(li->subject->ptr,li->zi);
5080 return 1;
5081 }
d2ee16ab 5082 } else if (li->encoding == REDIS_ENCODING_LIST) {
be02a7c0
PN
5083 entry->ln = li->ln;
5084 if (entry->ln != NULL) {
5085 if (li->direction == REDIS_TAIL)
5086 li->ln = li->ln->next;
5087 else
5088 li->ln = li->ln->prev;
5089 return 1;
5090 }
d2ee16ab
PN
5091 } else {
5092 redisPanic("Unknown list encoding");
5093 }
be02a7c0 5094 return 0;
d2ee16ab
PN
5095}
5096
a6dd455b 5097/* Return entry or NULL at the current position of the iterator. */
003f0840
PN
5098static robj *listTypeGet(listTypeEntry *entry) {
5099 listTypeIterator *li = entry->li;
a6dd455b
PN
5100 robj *value = NULL;
5101 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
b6eb9703 5102 unsigned char *vstr;
a6dd455b 5103 unsigned int vlen;
b6eb9703 5104 long long vlong;
be02a7c0 5105 redisAssert(entry->zi != NULL);
b6eb9703
PN
5106 if (ziplistGet(entry->zi,&vstr,&vlen,&vlong)) {
5107 if (vstr) {
5108 value = createStringObject((char*)vstr,vlen);
a6dd455b 5109 } else {
b6eb9703 5110 value = createStringObjectFromLongLong(vlong);
a6dd455b
PN
5111 }
5112 }
5113 } else if (li->encoding == REDIS_ENCODING_LIST) {
be02a7c0
PN
5114 redisAssert(entry->ln != NULL);
5115 value = listNodeValue(entry->ln);
a6dd455b
PN
5116 incrRefCount(value);
5117 } else {
5118 redisPanic("Unknown list encoding");
5119 }
5120 return value;
5121}
5122
d2ee16ab 5123/* Compare the given object with the entry at the current position. */
003f0840
PN
5124static int listTypeEqual(listTypeEntry *entry, robj *o) {
5125 listTypeIterator *li = entry->li;
d2ee16ab
PN
5126 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
5127 redisAssert(o->encoding == REDIS_ENCODING_RAW);
be02a7c0 5128 return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr));
d2ee16ab 5129 } else if (li->encoding == REDIS_ENCODING_LIST) {
be02a7c0 5130 return equalStringObjects(o,listNodeValue(entry->ln));
d2ee16ab
PN
5131 } else {
5132 redisPanic("Unknown list encoding");
5133 }
5134}
5135
be02a7c0 5136/* Delete the element pointed to. */
003f0840
PN
5137static void listTypeDelete(listTypeEntry *entry) {
5138 listTypeIterator *li = entry->li;
a6dd455b 5139 if (li->encoding == REDIS_ENCODING_ZIPLIST) {
be02a7c0
PN
5140 unsigned char *p = entry->zi;
5141 li->subject->ptr = ziplistDelete(li->subject->ptr,&p);
5142
5143 /* Update position of the iterator depending on the direction */
5144 if (li->direction == REDIS_TAIL)
5145 li->zi = p;
a6dd455b 5146 else
be02a7c0
PN
5147 li->zi = ziplistPrev(li->subject->ptr,p);
5148 } else if (entry->li->encoding == REDIS_ENCODING_LIST) {
5149 listNode *next;
5150 if (li->direction == REDIS_TAIL)
5151 next = entry->ln->next;
a6dd455b 5152 else
be02a7c0
PN
5153 next = entry->ln->prev;
5154 listDelNode(li->subject->ptr,entry->ln);
5155 li->ln = next;
a6dd455b
PN
5156 } else {
5157 redisPanic("Unknown list encoding");
5158 }
5159}
3305306f 5160
003f0840
PN
5161static void listTypeConvert(robj *subject, int enc) {
5162 listTypeIterator *li;
5163 listTypeEntry entry;
d0686e07
PN
5164 redisAssert(subject->type == REDIS_LIST);
5165
5166 if (enc == REDIS_ENCODING_LIST) {
5167 list *l = listCreate();
cd627d4e 5168 listSetFreeMethod(l,decrRefCount);
d0686e07 5169
003f0840
PN
5170 /* listTypeGet returns a robj with incremented refcount */
5171 li = listTypeInitIterator(subject,0,REDIS_TAIL);
5172 while (listTypeNext(li,&entry)) listAddNodeTail(l,listTypeGet(&entry));
5173 listTypeReleaseIterator(li);
d0686e07
PN
5174
5175 subject->encoding = REDIS_ENCODING_LIST;
5176 zfree(subject->ptr);
5177 subject->ptr = l;
5178 } else {
5179 redisPanic("Unsupported list conversion");
5180 }
5181}
5182
c7d9d662
PN
5183static void pushGenericCommand(redisClient *c, int where) {
5184 robj *lobj = lookupKeyWrite(c->db,c->argv[1]);
3305306f 5185 if (lobj == NULL) {
95242ab5 5186 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 5187 addReply(c,shared.cone);
95242ab5 5188 return;
5189 }
1cd92e7f 5190 lobj = createZiplistObject();
09241813 5191 dbAdd(c->db,c->argv[1],lobj);
ed9b544e 5192 } else {
ed9b544e 5193 if (lobj->type != REDIS_LIST) {
5194 addReply(c,shared.wrongtypeerr);
5195 return;
5196 }
95242ab5 5197 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 5198 addReply(c,shared.cone);
95242ab5 5199 return;
5200 }
ed9b544e 5201 }
003f0840
PN
5202 listTypePush(lobj,c->argv[2],where);
5203 addReplyLongLong(c,listTypeLength(lobj));
ed9b544e 5204 server.dirty++;
ed9b544e 5205}
5206
5207static void lpushCommand(redisClient *c) {
5208 pushGenericCommand(c,REDIS_HEAD);
5209}
5210
5211static void rpushCommand(redisClient *c) {
5212 pushGenericCommand(c,REDIS_TAIL);
5213}
5214
5215static void llenCommand(redisClient *c) {
d72562f7
PN
5216 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
5217 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
003f0840 5218 addReplyUlong(c,listTypeLength(o));
ed9b544e 5219}
5220
5221static void lindexCommand(redisClient *c) {
697bd567
PN
5222 robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
5223 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
ed9b544e 5224 int index = atoi(c->argv[2]->ptr);
bd8db0ad 5225 robj *value = NULL;
dd88747b 5226
697bd567
PN
5227 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5228 unsigned char *p;
b6eb9703 5229 unsigned char *vstr;
697bd567 5230 unsigned int vlen;
b6eb9703 5231 long long vlong;
697bd567 5232 p = ziplistIndex(o->ptr,index);
b6eb9703
PN
5233 if (ziplistGet(p,&vstr,&vlen,&vlong)) {
5234 if (vstr) {
5235 value = createStringObject((char*)vstr,vlen);
697bd567 5236 } else {
b6eb9703 5237 value = createStringObjectFromLongLong(vlong);
697bd567 5238 }
bd8db0ad
PN
5239 addReplyBulk(c,value);
5240 decrRefCount(value);
697bd567
PN
5241 } else {
5242 addReply(c,shared.nullbulk);
5243 }
5244 } else if (o->encoding == REDIS_ENCODING_LIST) {
5245 listNode *ln = listIndex(o->ptr,index);
5246 if (ln != NULL) {
bd8db0ad
PN
5247 value = listNodeValue(ln);
5248 addReplyBulk(c,value);
697bd567
PN
5249 } else {
5250 addReply(c,shared.nullbulk);
5251 }
ed9b544e 5252 } else {
697bd567 5253 redisPanic("Unknown list encoding");
ed9b544e 5254 }
5255}
5256
5257static void lsetCommand(redisClient *c) {
697bd567
PN
5258 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr);
5259 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
ed9b544e 5260 int index = atoi(c->argv[2]->ptr);
697bd567 5261 robj *value = c->argv[3];
dd88747b 5262
003f0840 5263 listTypeTryConversion(o,value);
697bd567
PN
5264 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5265 unsigned char *p, *zl = o->ptr;
5266 p = ziplistIndex(zl,index);
5267 if (p == NULL) {
5268 addReply(c,shared.outofrangeerr);
5269 } else {
be02a7c0 5270 o->ptr = ziplistDelete(o->ptr,&p);
697bd567
PN
5271 value = getDecodedObject(value);
5272 o->ptr = ziplistInsert(o->ptr,p,value->ptr,sdslen(value->ptr));
5273 decrRefCount(value);
5274 addReply(c,shared.ok);
5275 server.dirty++;
5276 }
5277 } else if (o->encoding == REDIS_ENCODING_LIST) {
5278 listNode *ln = listIndex(o->ptr,index);
5279 if (ln == NULL) {
5280 addReply(c,shared.outofrangeerr);
5281 } else {
5282 decrRefCount((robj*)listNodeValue(ln));
5283 listNodeValue(ln) = value;
5284 incrRefCount(value);
5285 addReply(c,shared.ok);
5286 server.dirty++;
5287 }
ed9b544e 5288 } else {
697bd567 5289 redisPanic("Unknown list encoding");
ed9b544e 5290 }
5291}
5292
5293static void popGenericCommand(redisClient *c, int where) {
d72562f7
PN
5294 robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
5295 if (o == NULL || checkType(c,o,REDIS_LIST)) return;
3305306f 5296
003f0840 5297 robj *value = listTypePop(o,where);
d72562f7 5298 if (value == NULL) {
dd88747b 5299 addReply(c,shared.nullbulk);
5300 } else {
d72562f7
PN
5301 addReplyBulk(c,value);
5302 decrRefCount(value);
003f0840 5303 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5304 server.dirty++;
ed9b544e 5305 }
5306}
5307
5308static void lpopCommand(redisClient *c) {
5309 popGenericCommand(c,REDIS_HEAD);
5310}
5311
5312static void rpopCommand(redisClient *c) {
5313 popGenericCommand(c,REDIS_TAIL);
5314}
5315
5316static void lrangeCommand(redisClient *c) {
a6dd455b 5317 robj *o, *value;
ed9b544e 5318 int start = atoi(c->argv[2]->ptr);
5319 int end = atoi(c->argv[3]->ptr);
dd88747b 5320 int llen;
5321 int rangelen, j;
003f0840 5322 listTypeEntry entry;
dd88747b 5323
4e27f268 5324 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5325 || checkType(c,o,REDIS_LIST)) return;
003f0840 5326 llen = listTypeLength(o);
dd88747b 5327
5328 /* convert negative indexes */
5329 if (start < 0) start = llen+start;
5330 if (end < 0) end = llen+end;
5331 if (start < 0) start = 0;
5332 if (end < 0) end = 0;
5333
5334 /* indexes sanity checks */
5335 if (start > end || start >= llen) {
5336 /* Out of range start or start > end result in empty list */
5337 addReply(c,shared.emptymultibulk);
5338 return;
5339 }
5340 if (end >= llen) end = llen-1;
5341 rangelen = (end-start)+1;
3305306f 5342
dd88747b 5343 /* Return the result in form of a multi-bulk reply */
dd88747b 5344 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
003f0840 5345 listTypeIterator *li = listTypeInitIterator(o,start,REDIS_TAIL);
dd88747b 5346 for (j = 0; j < rangelen; j++) {
003f0840
PN
5347 redisAssert(listTypeNext(li,&entry));
5348 value = listTypeGet(&entry);
a6dd455b 5349 addReplyBulk(c,value);
be02a7c0 5350 decrRefCount(value);
ed9b544e 5351 }
003f0840 5352 listTypeReleaseIterator(li);
ed9b544e 5353}
5354
5355static void ltrimCommand(redisClient *c) {
3305306f 5356 robj *o;
ed9b544e 5357 int start = atoi(c->argv[2]->ptr);
5358 int end = atoi(c->argv[3]->ptr);
dd88747b 5359 int llen;
5360 int j, ltrim, rtrim;
5361 list *list;
5362 listNode *ln;
5363
5364 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
5365 checkType(c,o,REDIS_LIST)) return;
003f0840 5366 llen = listTypeLength(o);
dd88747b 5367
5368 /* convert negative indexes */
5369 if (start < 0) start = llen+start;
5370 if (end < 0) end = llen+end;
5371 if (start < 0) start = 0;
5372 if (end < 0) end = 0;
5373
5374 /* indexes sanity checks */
5375 if (start > end || start >= llen) {
5376 /* Out of range start or start > end result in empty list */
5377 ltrim = llen;
5378 rtrim = 0;
ed9b544e 5379 } else {
dd88747b 5380 if (end >= llen) end = llen-1;
5381 ltrim = start;
5382 rtrim = llen-end-1;
5383 }
ed9b544e 5384
dd88747b 5385 /* Remove list elements to perform the trim */
9ae6b0be
PN
5386 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
5387 o->ptr = ziplistDeleteRange(o->ptr,0,ltrim);
5388 o->ptr = ziplistDeleteRange(o->ptr,-rtrim,rtrim);
5389 } else if (o->encoding == REDIS_ENCODING_LIST) {
5390 list = o->ptr;
5391 for (j = 0; j < ltrim; j++) {
5392 ln = listFirst(list);
5393 listDelNode(list,ln);
5394 }
5395 for (j = 0; j < rtrim; j++) {
5396 ln = listLast(list);
5397 listDelNode(list,ln);
5398 }
5399 } else {
5400 redisPanic("Unknown list encoding");
ed9b544e 5401 }
003f0840 5402 if (listTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5403 server.dirty++;
5404 addReply(c,shared.ok);
ed9b544e 5405}
5406
5407static void lremCommand(redisClient *c) {
d2ee16ab 5408 robj *subject, *obj = c->argv[3];
dd88747b 5409 int toremove = atoi(c->argv[2]->ptr);
5410 int removed = 0;
003f0840 5411 listTypeEntry entry;
a4d1ba9a 5412
d2ee16ab
PN
5413 subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
5414 if (subject == NULL || checkType(c,subject,REDIS_LIST)) return;
dd88747b 5415
d2ee16ab
PN
5416 /* Make sure obj is raw when we're dealing with a ziplist */
5417 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5418 obj = getDecodedObject(obj);
5419
003f0840 5420 listTypeIterator *li;
dd88747b 5421 if (toremove < 0) {
5422 toremove = -toremove;
003f0840 5423 li = listTypeInitIterator(subject,-1,REDIS_HEAD);
d2ee16ab 5424 } else {
003f0840 5425 li = listTypeInitIterator(subject,0,REDIS_TAIL);
dd88747b 5426 }
dd88747b 5427
003f0840
PN
5428 while (listTypeNext(li,&entry)) {
5429 if (listTypeEqual(&entry,obj)) {
5430 listTypeDelete(&entry);
dd88747b 5431 server.dirty++;
5432 removed++;
3fbf9001 5433 if (toremove && removed == toremove) break;
ed9b544e 5434 }
5435 }
003f0840 5436 listTypeReleaseIterator(li);
d2ee16ab
PN
5437
5438 /* Clean up raw encoded object */
5439 if (subject->encoding == REDIS_ENCODING_ZIPLIST)
5440 decrRefCount(obj);
5441
003f0840 5442 if (listTypeLength(subject) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5443 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 5444}
5445
12f9d551 5446/* This is the semantic of this command:
0f5f7e9a 5447 * RPOPLPUSH srclist dstlist:
12f9d551 5448 * IF LLEN(srclist) > 0
5449 * element = RPOP srclist
5450 * LPUSH dstlist element
5451 * RETURN element
5452 * ELSE
5453 * RETURN nil
5454 * END
5455 * END
5456 *
5457 * The idea is to be able to get an element from a list in a reliable way
5458 * since the element is not just returned but pushed against another list
5459 * as well. This command was originally proposed by Ezra Zygmuntowicz.
5460 */
0f5f7e9a 5461static void rpoplpushcommand(redisClient *c) {
0f62e177 5462 robj *sobj, *value;
dd88747b 5463 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5464 checkType(c,sobj,REDIS_LIST)) return;
12f9d551 5465
003f0840 5466 if (listTypeLength(sobj) == 0) {
12f9d551 5467 addReply(c,shared.nullbulk);
5468 } else {
dd88747b 5469 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
0f62e177 5470 if (dobj && checkType(c,dobj,REDIS_LIST)) return;
003f0840 5471 value = listTypePop(sobj,REDIS_TAIL);
12f9d551 5472
dd88747b 5473 /* Add the element to the target list (unless it's directly
5474 * passed to some BLPOP-ing client */
0f62e177
PN
5475 if (!handleClientsWaitingListPush(c,c->argv[2],value)) {
5476 /* Create the list if the key does not exist */
5477 if (!dobj) {
1cd92e7f 5478 dobj = createZiplistObject();
09241813 5479 dbAdd(c->db,c->argv[2],dobj);
12f9d551 5480 }
003f0840 5481 listTypePush(dobj,value,REDIS_HEAD);
12f9d551 5482 }
dd88747b 5483
5484 /* Send the element to the client as reply as well */
0f62e177
PN
5485 addReplyBulk(c,value);
5486
003f0840 5487 /* listTypePop returns an object with its refcount incremented */
0f62e177 5488 decrRefCount(value);
dd88747b 5489
0f62e177 5490 /* Delete the source list when it is empty */
003f0840 5491 if (listTypeLength(sobj) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5492 server.dirty++;
12f9d551 5493 }
5494}
5495
ed9b544e 5496/* ==================================== Sets ================================ */
5497
d0b58d53
PN
5498/* Factory method to return a set that *can* hold "value". When the object has
5499 * an integer-encodable value, an intset will be returned. Otherwise a regular
5500 * hash table. */
5501static robj *setTypeCreate(robj *value) {
5502 if (getLongLongFromObject(value,NULL) == REDIS_OK)
5503 return createIntsetObject();
5504 return createSetObject();
5505}
5506
35cabcb5 5507static int setTypeAdd(robj *subject, robj *value) {
d0b58d53 5508 long long llval;
35cabcb5
PN
5509 if (subject->encoding == REDIS_ENCODING_HT) {
5510 if (dictAdd(subject->ptr,value,NULL) == DICT_OK) {
5511 incrRefCount(value);
5512 return 1;
5513 }
d0b58d53
PN
5514 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5515 if (getLongLongFromObject(value,&llval) == REDIS_OK) {
70ff3511 5516 uint8_t success = 0;
d0b58d53 5517 subject->ptr = intsetAdd(subject->ptr,llval,&success);
70ff3511
PN
5518 if (success) {
5519 /* Convert to regular set when the intset contains
5520 * too many entries. */
5521 if (intsetLen(subject->ptr) > server.set_max_intset_entries)
5522 setTypeConvert(subject,REDIS_ENCODING_HT);
5523 return 1;
5524 }
d0b58d53
PN
5525 } else {
5526 /* Failed to get integer from object, convert to regular set. */
5527 setTypeConvert(subject,REDIS_ENCODING_HT);
5528
5529 /* The set *was* an intset and this value is not integer
5530 * encodable, so dictAdd should always work. */
5531 redisAssert(dictAdd(subject->ptr,value,NULL) == DICT_OK);
5532 incrRefCount(value);
5533 return 1;
5534 }
35cabcb5
PN
5535 } else {
5536 redisPanic("Unknown set encoding");
5537 }
5538 return 0;
5539}
5540
5541static int setTypeRemove(robj *subject, robj *value) {
d0b58d53 5542 long long llval;
35cabcb5
PN
5543 if (subject->encoding == REDIS_ENCODING_HT) {
5544 if (dictDelete(subject->ptr,value) == DICT_OK) {
5545 if (htNeedsResize(subject->ptr)) dictResize(subject->ptr);
5546 return 1;
5547 }
d0b58d53
PN
5548 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5549 if (getLongLongFromObject(value,&llval) == REDIS_OK) {
5550 uint8_t success;
5551 subject->ptr = intsetRemove(subject->ptr,llval,&success);
5552 if (success) return 1;
5553 }
35cabcb5
PN
5554 } else {
5555 redisPanic("Unknown set encoding");
5556 }
5557 return 0;
5558}
5559
5560static int setTypeIsMember(robj *subject, robj *value) {
d0b58d53 5561 long long llval;
35cabcb5
PN
5562 if (subject->encoding == REDIS_ENCODING_HT) {
5563 return dictFind((dict*)subject->ptr,value) != NULL;
d0b58d53
PN
5564 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5565 if (getLongLongFromObject(value,&llval) == REDIS_OK) {
5566 return intsetFind((intset*)subject->ptr,llval);
5567 }
35cabcb5
PN
5568 } else {
5569 redisPanic("Unknown set encoding");
5570 }
d0b58d53 5571 return 0;
35cabcb5
PN
5572}
5573
5574/* Structure to hold set iteration abstraction. */
5575typedef struct {
d0b58d53 5576 robj *subject;
35cabcb5 5577 int encoding;
d0b58d53 5578 int ii; /* intset iterator */
35cabcb5
PN
5579 dictIterator *di;
5580} setIterator;
5581
5582static setIterator *setTypeInitIterator(robj *subject) {
5583 setIterator *si = zmalloc(sizeof(setIterator));
d0b58d53 5584 si->subject = subject;
35cabcb5
PN
5585 si->encoding = subject->encoding;
5586 if (si->encoding == REDIS_ENCODING_HT) {
5587 si->di = dictGetIterator(subject->ptr);
d0b58d53
PN
5588 } else if (si->encoding == REDIS_ENCODING_INTSET) {
5589 si->ii = 0;
35cabcb5
PN
5590 } else {
5591 redisPanic("Unknown set encoding");
5592 }
5593 return si;
5594}
5595
5596static void setTypeReleaseIterator(setIterator *si) {
5597 if (si->encoding == REDIS_ENCODING_HT)
5598 dictReleaseIterator(si->di);
5599 zfree(si);
5600}
5601
5602/* Move to the next entry in the set. Returns the object at the current
5603 * position, or NULL when the end is reached. This object will have its
5604 * refcount incremented, so the caller needs to take care of this. */
5605static robj *setTypeNext(setIterator *si) {
5606 robj *ret = NULL;
5607 if (si->encoding == REDIS_ENCODING_HT) {
5608 dictEntry *de = dictNext(si->di);
5609 if (de != NULL) {
5610 ret = dictGetEntryKey(de);
5611 incrRefCount(ret);
5612 }
d0b58d53
PN
5613 } else if (si->encoding == REDIS_ENCODING_INTSET) {
5614 long long llval;
5615 if (intsetGet(si->subject->ptr,si->ii++,&llval))
5616 ret = createStringObjectFromLongLong(llval);
35cabcb5
PN
5617 }
5618 return ret;
5619}
5620
5621
5622/* Return random element from set. The returned object will always have
5623 * an incremented refcount. */
5624robj *setTypeRandomElement(robj *subject) {
5625 robj *ret = NULL;
5626 if (subject->encoding == REDIS_ENCODING_HT) {
5627 dictEntry *de = dictGetRandomKey(subject->ptr);
5628 ret = dictGetEntryKey(de);
5629 incrRefCount(ret);
d0b58d53
PN
5630 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5631 long long llval = intsetRandom(subject->ptr);
5632 ret = createStringObjectFromLongLong(llval);
35cabcb5
PN
5633 } else {
5634 redisPanic("Unknown set encoding");
5635 }
5636 return ret;
5637}
5638
5639static unsigned long setTypeSize(robj *subject) {
5640 if (subject->encoding == REDIS_ENCODING_HT) {
5641 return dictSize((dict*)subject->ptr);
d0b58d53
PN
5642 } else if (subject->encoding == REDIS_ENCODING_INTSET) {
5643 return intsetLen((intset*)subject->ptr);
35cabcb5
PN
5644 } else {
5645 redisPanic("Unknown set encoding");
5646 }
5647}
5648
d0b58d53
PN
5649static void setTypeConvert(robj *subject, int enc) {
5650 setIterator *si;
5651 robj *element;
5652 redisAssert(subject->type == REDIS_SET);
5653
5654 if (enc == REDIS_ENCODING_HT) {
5655 dict *d = dictCreate(&setDictType,NULL);
5656
5657 /* setTypeGet returns a robj with incremented refcount */
5658 si = setTypeInitIterator(subject);
5659 while ((element = setTypeNext(si)) != NULL)
5660 redisAssert(dictAdd(d,element,NULL) == DICT_OK);
5661 setTypeReleaseIterator(si);
5662
5663 subject->encoding = REDIS_ENCODING_HT;
5664 zfree(subject->ptr);
5665 subject->ptr = d;
5666 } else {
5667 redisPanic("Unsupported set conversion");
5668 }
5669}
5670
ed9b544e 5671static void saddCommand(redisClient *c) {
ed9b544e 5672 robj *set;
5673
3305306f 5674 set = lookupKeyWrite(c->db,c->argv[1]);
5675 if (set == NULL) {
d0b58d53 5676 set = setTypeCreate(c->argv[2]);
09241813 5677 dbAdd(c->db,c->argv[1],set);
ed9b544e 5678 } else {
ed9b544e 5679 if (set->type != REDIS_SET) {
c937aa89 5680 addReply(c,shared.wrongtypeerr);
ed9b544e 5681 return;
5682 }
5683 }
35cabcb5 5684 if (setTypeAdd(set,c->argv[2])) {
ed9b544e 5685 server.dirty++;
c937aa89 5686 addReply(c,shared.cone);
ed9b544e 5687 } else {
c937aa89 5688 addReply(c,shared.czero);
ed9b544e 5689 }
5690}
5691
5692static void sremCommand(redisClient *c) {
3305306f 5693 robj *set;
ed9b544e 5694
dd88747b 5695 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5696 checkType(c,set,REDIS_SET)) return;
5697
35cabcb5
PN
5698 if (setTypeRemove(set,c->argv[2])) {
5699 if (setTypeSize(set) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5700 server.dirty++;
dd88747b 5701 addReply(c,shared.cone);
ed9b544e 5702 } else {
dd88747b 5703 addReply(c,shared.czero);
ed9b544e 5704 }
5705}
5706
a4460ef4 5707static void smoveCommand(redisClient *c) {
b978abbf 5708 robj *srcset, *dstset, *ele;
a4460ef4 5709 srcset = lookupKeyWrite(c->db,c->argv[1]);
5710 dstset = lookupKeyWrite(c->db,c->argv[2]);
b978abbf 5711 ele = c->argv[3];
a4460ef4 5712
b978abbf
PN
5713 /* If the source key does not exist return 0 */
5714 if (srcset == NULL) {
5715 addReply(c,shared.czero);
a4460ef4 5716 return;
5717 }
b978abbf
PN
5718
5719 /* If the source key has the wrong type, or the destination key
5720 * is set and has the wrong type, return with an error. */
5721 if (checkType(c,srcset,REDIS_SET) ||
5722 (dstset && checkType(c,dstset,REDIS_SET))) return;
5723
5724 /* If srcset and dstset are equal, SMOVE is a no-op */
5725 if (srcset == dstset) {
5726 addReply(c,shared.cone);
a4460ef4 5727 return;
5728 }
b978abbf
PN
5729
5730 /* If the element cannot be removed from the src set, return 0. */
5731 if (!setTypeRemove(srcset,ele)) {
a4460ef4 5732 addReply(c,shared.czero);
5733 return;
5734 }
b978abbf
PN
5735
5736 /* Remove the src set from the database when empty */
5737 if (setTypeSize(srcset) == 0) dbDelete(c->db,c->argv[1]);
a4460ef4 5738 server.dirty++;
b978abbf
PN
5739
5740 /* Create the destination set when it doesn't exist */
a4460ef4 5741 if (!dstset) {
b978abbf 5742 dstset = setTypeCreate(ele);
09241813 5743 dbAdd(c->db,c->argv[2],dstset);
a4460ef4 5744 }
b978abbf
PN
5745
5746 /* An extra key has changed when ele was successfully added to dstset */
5747 if (setTypeAdd(dstset,ele)) server.dirty++;
a4460ef4 5748 addReply(c,shared.cone);
5749}
5750
ed9b544e 5751static void sismemberCommand(redisClient *c) {
3305306f 5752 robj *set;
ed9b544e 5753
dd88747b 5754 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5755 checkType(c,set,REDIS_SET)) return;
5756
35cabcb5 5757 if (setTypeIsMember(set,c->argv[2]))
dd88747b 5758 addReply(c,shared.cone);
5759 else
c937aa89 5760 addReply(c,shared.czero);
ed9b544e 5761}
5762
5763static void scardCommand(redisClient *c) {
3305306f 5764 robj *o;
dd88747b 5765
5766 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5767 checkType(c,o,REDIS_SET)) return;
e0a62c7f 5768
35cabcb5 5769 addReplyUlong(c,setTypeSize(o));
ed9b544e 5770}
5771
12fea928 5772static void spopCommand(redisClient *c) {
35cabcb5 5773 robj *set, *ele;
12fea928 5774
dd88747b 5775 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5776 checkType(c,set,REDIS_SET)) return;
5777
35cabcb5
PN
5778 ele = setTypeRandomElement(set);
5779 if (ele == NULL) {
12fea928 5780 addReply(c,shared.nullbulk);
5781 } else {
35cabcb5 5782 setTypeRemove(set,ele);
dd88747b 5783 addReplyBulk(c,ele);
35cabcb5
PN
5784 decrRefCount(ele);
5785 if (setTypeSize(set) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 5786 server.dirty++;
12fea928 5787 }
5788}
5789
2abb95a9 5790static void srandmemberCommand(redisClient *c) {
35cabcb5 5791 robj *set, *ele;
2abb95a9 5792
dd88747b 5793 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5794 checkType(c,set,REDIS_SET)) return;
5795
35cabcb5
PN
5796 ele = setTypeRandomElement(set);
5797 if (ele == NULL) {
2abb95a9 5798 addReply(c,shared.nullbulk);
5799 } else {
dd88747b 5800 addReplyBulk(c,ele);
35cabcb5 5801 decrRefCount(ele);
2abb95a9 5802 }
5803}
5804
ed9b544e 5805static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
35cabcb5 5806 return setTypeSize(*(robj**)s1)-setTypeSize(*(robj**)s2);
ed9b544e 5807}
5808
35cabcb5
PN
5809static void sinterGenericCommand(redisClient *c, robj **setkeys, unsigned long setnum, robj *dstkey) {
5810 robj **sets = zmalloc(sizeof(robj*)*setnum);
5811 setIterator *si;
5812 robj *ele, *lenobj = NULL, *dstset = NULL;
682ac724 5813 unsigned long j, cardinality = 0;
ed9b544e 5814
35cabcb5
PN
5815 for (j = 0; j < setnum; j++) {
5816 robj *setobj = dstkey ?
5817 lookupKeyWrite(c->db,setkeys[j]) :
5818 lookupKeyRead(c->db,setkeys[j]);
3305306f 5819 if (!setobj) {
35cabcb5 5820 zfree(sets);
5faa6025 5821 if (dstkey) {
09241813 5822 if (dbDelete(c->db,dstkey))
fdcaae84 5823 server.dirty++;
0d36ded0 5824 addReply(c,shared.czero);
5faa6025 5825 } else {
4e27f268 5826 addReply(c,shared.emptymultibulk);
5faa6025 5827 }
ed9b544e 5828 return;
5829 }
35cabcb5
PN
5830 if (checkType(c,setobj,REDIS_SET)) {
5831 zfree(sets);
ed9b544e 5832 return;
5833 }
35cabcb5 5834 sets[j] = setobj;
ed9b544e 5835 }
5836 /* Sort sets from the smallest to largest, this will improve our
5837 * algorithm's performace */
35cabcb5 5838 qsort(sets,setnum,sizeof(robj*),qsortCompareSetsByCardinality);
ed9b544e 5839
5840 /* The first thing we should output is the total number of elements...
5841 * since this is a multi-bulk write, but at this stage we don't know
5842 * the intersection set size, so we use a trick, append an empty object
5843 * to the output list and save the pointer to later modify it with the
5844 * right length */
5845 if (!dstkey) {
5846 lenobj = createObject(REDIS_STRING,NULL);
5847 addReply(c,lenobj);
5848 decrRefCount(lenobj);
5849 } else {
5850 /* If we have a target key where to store the resulting set
5851 * create this key with an empty set inside */
d0b58d53 5852 dstset = createIntsetObject();
ed9b544e 5853 }
5854
5855 /* Iterate all the elements of the first (smallest) set, and test
5856 * the element against all the other sets, if at least one set does
5857 * not include the element it is discarded */
35cabcb5
PN
5858 si = setTypeInitIterator(sets[0]);
5859 while((ele = setTypeNext(si)) != NULL) {
5860 for (j = 1; j < setnum; j++)
5861 if (!setTypeIsMember(sets[j],ele)) break;
5862
5863 /* Only take action when all sets contain the member */
5864 if (j == setnum) {
5865 if (!dstkey) {
5866 addReplyBulk(c,ele);
5867 cardinality++;
5868 } else {
5869 setTypeAdd(dstset,ele);
5870 }
ed9b544e 5871 }
35cabcb5 5872 decrRefCount(ele);
ed9b544e 5873 }
35cabcb5 5874 setTypeReleaseIterator(si);
ed9b544e 5875
83cdfe18 5876 if (dstkey) {
3ea27d37 5877 /* Store the resulting set into the target, if the intersection
5878 * is not an empty set. */
09241813 5879 dbDelete(c->db,dstkey);
35cabcb5 5880 if (setTypeSize(dstset) > 0) {
09241813 5881 dbAdd(c->db,dstkey,dstset);
35cabcb5 5882 addReplyLongLong(c,setTypeSize(dstset));
3ea27d37 5883 } else {
5884 decrRefCount(dstset);
d36c4e97 5885 addReply(c,shared.czero);
3ea27d37 5886 }
40d224a9 5887 server.dirty++;
d36c4e97 5888 } else {
5889 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 5890 }
35cabcb5 5891 zfree(sets);
ed9b544e 5892}
5893
5894static void sinterCommand(redisClient *c) {
5895 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5896}
5897
5898static void sinterstoreCommand(redisClient *c) {
5899 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5900}
5901
/* Operation selectors for the generic set operation commands. */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
f4f56e1d 5905
35cabcb5
PN
5906static void sunionDiffGenericCommand(redisClient *c, robj **setkeys, int setnum, robj *dstkey, int op) {
5907 robj **sets = zmalloc(sizeof(robj*)*setnum);
5908 setIterator *si;
5909 robj *ele, *dstset = NULL;
40d224a9 5910 int j, cardinality = 0;
5911
35cabcb5
PN
5912 for (j = 0; j < setnum; j++) {
5913 robj *setobj = dstkey ?
5914 lookupKeyWrite(c->db,setkeys[j]) :
5915 lookupKeyRead(c->db,setkeys[j]);
40d224a9 5916 if (!setobj) {
35cabcb5 5917 sets[j] = NULL;
40d224a9 5918 continue;
5919 }
35cabcb5
PN
5920 if (checkType(c,setobj,REDIS_SET)) {
5921 zfree(sets);
40d224a9 5922 return;
5923 }
35cabcb5 5924 sets[j] = setobj;
40d224a9 5925 }
5926
5927 /* We need a temp set object to store our union. If the dstkey
5928 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5929 * this set object will be the resulting object to set into the target key*/
d0b58d53 5930 dstset = createIntsetObject();
40d224a9 5931
40d224a9 5932 /* Iterate all the elements of all the sets, add every element a single
5933 * time to the result set */
35cabcb5
PN
5934 for (j = 0; j < setnum; j++) {
5935 if (op == REDIS_OP_DIFF && j == 0 && !sets[j]) break; /* result set is empty */
5936 if (!sets[j]) continue; /* non existing keys are like empty sets */
40d224a9 5937
35cabcb5
PN
5938 si = setTypeInitIterator(sets[j]);
5939 while((ele = setTypeNext(si)) != NULL) {
f4f56e1d 5940 if (op == REDIS_OP_UNION || j == 0) {
35cabcb5 5941 if (setTypeAdd(dstset,ele)) {
40d224a9 5942 cardinality++;
5943 }
f4f56e1d 5944 } else if (op == REDIS_OP_DIFF) {
35cabcb5 5945 if (setTypeRemove(dstset,ele)) {
f4f56e1d 5946 cardinality--;
5947 }
40d224a9 5948 }
35cabcb5 5949 decrRefCount(ele);
40d224a9 5950 }
35cabcb5 5951 setTypeReleaseIterator(si);
51829ed3 5952
35cabcb5 5953 /* Exit when result set is empty. */
d36c4e97 5954 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5955 }
5956
f4f56e1d 5957 /* Output the content of the resulting set, if not in STORE mode */
5958 if (!dstkey) {
5959 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
35cabcb5
PN
5960 si = setTypeInitIterator(dstset);
5961 while((ele = setTypeNext(si)) != NULL) {
dd88747b 5962 addReplyBulk(c,ele);
35cabcb5 5963 decrRefCount(ele);
f4f56e1d 5964 }
35cabcb5 5965 setTypeReleaseIterator(si);
d36c4e97 5966 decrRefCount(dstset);
83cdfe18
AG
5967 } else {
5968 /* If we have a target key where to store the resulting set
5969 * create this key with the result set inside */
09241813 5970 dbDelete(c->db,dstkey);
35cabcb5 5971 if (setTypeSize(dstset) > 0) {
09241813 5972 dbAdd(c->db,dstkey,dstset);
35cabcb5 5973 addReplyLongLong(c,setTypeSize(dstset));
3ea27d37 5974 } else {
5975 decrRefCount(dstset);
d36c4e97 5976 addReply(c,shared.czero);
3ea27d37 5977 }
40d224a9 5978 server.dirty++;
5979 }
35cabcb5 5980 zfree(sets);
40d224a9 5981}
5982
5983static void sunionCommand(redisClient *c) {
f4f56e1d 5984 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 5985}
5986
5987static void sunionstoreCommand(redisClient *c) {
f4f56e1d 5988 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5989}
5990
5991static void sdiffCommand(redisClient *c) {
5992 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5993}
5994
5995static void sdiffstoreCommand(redisClient *c) {
5996 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 5997}
5998
6b47e12e 5999/* ==================================== ZSets =============================== */
6000
6001/* ZSETs are ordered sets using two data structures to hold the same elements
6002 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
6003 * data structure.
6004 *
6005 * The elements are added to a hash table mapping Redis objects to scores.
6006 * At the same time the elements are added to a skip list mapping scores
6007 * to Redis objects (so objects are sorted by scores in this "view"). */
6008
6009/* This skiplist implementation is almost a C translation of the original
6010 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
6011 * Alternative to Balanced Trees", modified in three ways:
6012 * a) this implementation allows for repeated values.
6013 * b) the comparison is not just by key (our 'score') but by satellite data.
6014 * c) there is a back pointer, so it's a doubly linked list with the back
6015 * pointers being only at "level 1". This allows to traverse the list
6016 * from tail to head, useful for ZREVRANGE. */
6017
6018static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
6019 zskiplistNode *zn = zmalloc(sizeof(*zn));
6020
6021 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2f4dd7e0 6022 if (level > 1)
2b37892e 6023 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
2f4dd7e0 6024 else
6025 zn->span = NULL;
6b47e12e 6026 zn->score = score;
6027 zn->obj = obj;
6028 return zn;
6029}
6030
6031static zskiplist *zslCreate(void) {
6032 int j;
6033 zskiplist *zsl;
e0a62c7f 6034
6b47e12e 6035 zsl = zmalloc(sizeof(*zsl));
6036 zsl->level = 1;
cc812361 6037 zsl->length = 0;
6b47e12e 6038 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 6039 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 6040 zsl->header->forward[j] = NULL;
94e543b5 6041
6042 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
6043 if (j < ZSKIPLIST_MAXLEVEL-1)
6044 zsl->header->span[j] = 0;
69d95c3e 6045 }
e3870fab 6046 zsl->header->backward = NULL;
6047 zsl->tail = NULL;
6b47e12e 6048 return zsl;
6049}
6050
fd8ccf44 6051static void zslFreeNode(zskiplistNode *node) {
6052 decrRefCount(node->obj);
ad807e6f 6053 zfree(node->forward);
69d95c3e 6054 zfree(node->span);
fd8ccf44 6055 zfree(node);
6056}
6057
6058static void zslFree(zskiplist *zsl) {
ad807e6f 6059 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 6060
ad807e6f 6061 zfree(zsl->header->forward);
69d95c3e 6062 zfree(zsl->header->span);
ad807e6f 6063 zfree(zsl->header);
fd8ccf44 6064 while(node) {
599379dd 6065 next = node->forward[0];
fd8ccf44 6066 zslFreeNode(node);
6067 node = next;
6068 }
ad807e6f 6069 zfree(zsl);
fd8ccf44 6070}
6071
6b47e12e 6072static int zslRandomLevel(void) {
6073 int level = 1;
6074 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
6075 level += 1;
10c2baa5 6076 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 6077}
6078
6079static void zslInsert(zskiplist *zsl, double score, robj *obj) {
6080 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 6081 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 6082 int i, level;
6083
6084 x = zsl->header;
6085 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
6086 /* store rank that is crossed to reach the insert position */
6087 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 6088
9d60e6e4 6089 while (x->forward[i] &&
6090 (x->forward[i]->score < score ||
6091 (x->forward[i]->score == score &&
69d95c3e 6092 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 6093 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 6094 x = x->forward[i];
69d95c3e 6095 }
6b47e12e 6096 update[i] = x;
6097 }
6b47e12e 6098 /* we assume the key is not already inside, since we allow duplicated
6099 * scores, and the re-insertion of score and redis object should never
6100 * happpen since the caller of zslInsert() should test in the hash table
6101 * if the element is already inside or not. */
6102 level = zslRandomLevel();
6103 if (level > zsl->level) {
69d95c3e 6104 for (i = zsl->level; i < level; i++) {
2b37892e 6105 rank[i] = 0;
6b47e12e 6106 update[i] = zsl->header;
2b37892e 6107 update[i]->span[i-1] = zsl->length;
69d95c3e 6108 }
6b47e12e 6109 zsl->level = level;
6110 }
6111 x = zslCreateNode(level,score,obj);
6112 for (i = 0; i < level; i++) {
6113 x->forward[i] = update[i]->forward[i];
6114 update[i]->forward[i] = x;
69d95c3e
PN
6115
6116 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
6117 if (i > 0) {
6118 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
6119 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
6120 }
6b47e12e 6121 }
69d95c3e
PN
6122
6123 /* increment span for untouched levels */
6124 for (i = level; i < zsl->level; i++) {
2b37892e 6125 update[i]->span[i-1]++;
69d95c3e
PN
6126 }
6127
bb975144 6128 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 6129 if (x->forward[0])
6130 x->forward[0]->backward = x;
6131 else
6132 zsl->tail = x;
cc812361 6133 zsl->length++;
6b47e12e 6134}
6135
84105336
PN
6136/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
6137void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
6138 int i;
6139 for (i = 0; i < zsl->level; i++) {
6140 if (update[i]->forward[i] == x) {
6141 if (i > 0) {
6142 update[i]->span[i-1] += x->span[i-1] - 1;
6143 }
6144 update[i]->forward[i] = x->forward[i];
6145 } else {
6146 /* invariant: i > 0, because update[0]->forward[0]
6147 * is always equal to x */
6148 update[i]->span[i-1] -= 1;
6149 }
6150 }
6151 if (x->forward[0]) {
6152 x->forward[0]->backward = x->backward;
6153 } else {
6154 zsl->tail = x->backward;
6155 }
6156 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
6157 zsl->level--;
6158 zsl->length--;
6159}
6160
50c55df5 6161/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 6162static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 6163 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6164 int i;
6165
6166 x = zsl->header;
6167 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 6168 while (x->forward[i] &&
6169 (x->forward[i]->score < score ||
6170 (x->forward[i]->score == score &&
6171 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 6172 x = x->forward[i];
6173 update[i] = x;
6174 }
6175 /* We may have multiple elements with the same score, what we need
6176 * is to find the element with both the right score and object. */
6177 x = x->forward[0];
bf028098 6178 if (x && score == x->score && equalStringObjects(x->obj,obj)) {
84105336 6179 zslDeleteNode(zsl, x, update);
9d60e6e4 6180 zslFreeNode(x);
9d60e6e4 6181 return 1;
6182 } else {
6183 return 0; /* not found */
e197b441 6184 }
6185 return 0; /* not found */
fd8ccf44 6186}
6187
1807985b 6188/* Delete all the elements with score between min and max from the skiplist.
6189 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
6190 * Note that this function takes the reference to the hash table view of the
6191 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 6192static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 6193 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6194 unsigned long removed = 0;
6195 int i;
6196
6197 x = zsl->header;
6198 for (i = zsl->level-1; i >= 0; i--) {
6199 while (x->forward[i] && x->forward[i]->score < min)
6200 x = x->forward[i];
6201 update[i] = x;
6202 }
6203 /* We may have multiple elements with the same score, what we need
6204 * is to find the element with both the right score and object. */
6205 x = x->forward[0];
6206 while (x && x->score <= max) {
84105336
PN
6207 zskiplistNode *next = x->forward[0];
6208 zslDeleteNode(zsl, x, update);
1807985b 6209 dictDelete(dict,x->obj);
6210 zslFreeNode(x);
1807985b 6211 removed++;
6212 x = next;
6213 }
6214 return removed; /* not found */
6215}
1807985b 6216
9212eafd 6217/* Delete all the elements with rank between start and end from the skiplist.
2424490f 6218 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
6219static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
6220 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
6221 unsigned long traversed = 0, removed = 0;
6222 int i;
6223
9212eafd
PN
6224 x = zsl->header;
6225 for (i = zsl->level-1; i >= 0; i--) {
6226 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
6227 traversed += i > 0 ? x->span[i-1] : 1;
6228 x = x->forward[i];
1807985b 6229 }
9212eafd
PN
6230 update[i] = x;
6231 }
6232
6233 traversed++;
6234 x = x->forward[0];
6235 while (x && traversed <= end) {
84105336
PN
6236 zskiplistNode *next = x->forward[0];
6237 zslDeleteNode(zsl, x, update);
1807985b 6238 dictDelete(dict,x->obj);
6239 zslFreeNode(x);
1807985b 6240 removed++;
9212eafd 6241 traversed++;
1807985b 6242 x = next;
6243 }
9212eafd 6244 return removed;
1807985b 6245}
6246
50c55df5 6247/* Find the first node having a score equal or greater than the specified one.
6248 * Returns NULL if there is no match. */
6249static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
6250 zskiplistNode *x;
6251 int i;
6252
6253 x = zsl->header;
6254 for (i = zsl->level-1; i >= 0; i--) {
6255 while (x->forward[i] && x->forward[i]->score < score)
6256 x = x->forward[i];
6257 }
6258 /* We may have multiple elements with the same score, what we need
6259 * is to find the element with both the right score and object. */
6260 return x->forward[0];
6261}
6262
27b0ccca
PN
6263/* Find the rank for an element by both score and key.
6264 * Returns 0 when the element cannot be found, rank otherwise.
6265 * Note that the rank is 1-based due to the span of zsl->header to the
6266 * first element. */
003f0840 6267static unsigned long zslistTypeGetRank(zskiplist *zsl, double score, robj *o) {
27b0ccca
PN
6268 zskiplistNode *x;
6269 unsigned long rank = 0;
6270 int i;
6271
6272 x = zsl->header;
6273 for (i = zsl->level-1; i >= 0; i--) {
6274 while (x->forward[i] &&
6275 (x->forward[i]->score < score ||
6276 (x->forward[i]->score == score &&
6277 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 6278 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
6279 x = x->forward[i];
6280 }
6281
6282 /* x might be equal to zsl->header, so test if obj is non-NULL */
bf028098 6283 if (x->obj && equalStringObjects(x->obj,o)) {
27b0ccca
PN
6284 return rank;
6285 }
6286 }
6287 return 0;
6288}
6289
e74825c2 6290/* Finds an element by its rank. The rank argument needs to be 1-based. */
003f0840 6291zskiplistNode* zslistTypeGetElementByRank(zskiplist *zsl, unsigned long rank) {
e74825c2
PN
6292 zskiplistNode *x;
6293 unsigned long traversed = 0;
6294 int i;
6295
6296 x = zsl->header;
6297 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 6298 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
6299 {
a50ea45c 6300 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
6301 x = x->forward[i];
6302 }
e74825c2
PN
6303 if (traversed == rank) {
6304 return x;
6305 }
6306 }
6307 return NULL;
6308}
6309
fd8ccf44 6310/* The actual Z-commands implementations */
6311
7db723ad 6312/* This generic command implements both ZADD and ZINCRBY.
e2665397 6313 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 6314 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 6315static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 6316 robj *zsetobj;
6317 zset *zs;
6318 double *score;
6319
5fc9229c 6320 if (isnan(scoreval)) {
6321 addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
6322 return;
6323 }
6324
e2665397 6325 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 6326 if (zsetobj == NULL) {
6327 zsetobj = createZsetObject();
09241813 6328 dbAdd(c->db,key,zsetobj);
fd8ccf44 6329 } else {
6330 if (zsetobj->type != REDIS_ZSET) {
6331 addReply(c,shared.wrongtypeerr);
6332 return;
6333 }
6334 }
fd8ccf44 6335 zs = zsetobj->ptr;
e2665397 6336
7db723ad 6337 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 6338 * needs to handle the two different conditions. It's all about setting
6339 * '*score', that is, the new score to set, to the right value. */
6340 score = zmalloc(sizeof(double));
6341 if (doincrement) {
6342 dictEntry *de;
6343
6344 /* Read the old score. If the element was not present starts from 0 */
6345 de = dictFind(zs->dict,ele);
6346 if (de) {
6347 double *oldscore = dictGetEntryVal(de);
6348 *score = *oldscore + scoreval;
6349 } else {
6350 *score = scoreval;
6351 }
5fc9229c 6352 if (isnan(*score)) {
6353 addReplySds(c,
6354 sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
6355 zfree(score);
6356 /* Note that we don't need to check if the zset may be empty and
6357 * should be removed here, as we can only obtain Nan as score if
6358 * there was already an element in the sorted set. */
6359 return;
6360 }
e2665397 6361 } else {
6362 *score = scoreval;
6363 }
6364
6365 /* What follows is a simple remove and re-insert operation that is common
7db723ad 6366 * to both ZADD and ZINCRBY... */
e2665397 6367 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 6368 /* case 1: New element */
e2665397 6369 incrRefCount(ele); /* added to hash */
6370 zslInsert(zs->zsl,*score,ele);
6371 incrRefCount(ele); /* added to skiplist */
fd8ccf44 6372 server.dirty++;
e2665397 6373 if (doincrement)
e2665397 6374 addReplyDouble(c,*score);
91d71bfc 6375 else
6376 addReply(c,shared.cone);
fd8ccf44 6377 } else {
6378 dictEntry *de;
6379 double *oldscore;
e0a62c7f 6380
fd8ccf44 6381 /* case 2: Score update operation */
e2665397 6382 de = dictFind(zs->dict,ele);
dfc5e96c 6383 redisAssert(de != NULL);
fd8ccf44 6384 oldscore = dictGetEntryVal(de);
6385 if (*score != *oldscore) {
6386 int deleted;
6387
e2665397 6388 /* Remove and insert the element in the skip list with new score */
6389 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 6390 redisAssert(deleted != 0);
e2665397 6391 zslInsert(zs->zsl,*score,ele);
6392 incrRefCount(ele);
6393 /* Update the score in the hash table */
6394 dictReplace(zs->dict,ele,score);
fd8ccf44 6395 server.dirty++;
2161a965 6396 } else {
6397 zfree(score);
fd8ccf44 6398 }
e2665397 6399 if (doincrement)
6400 addReplyDouble(c,*score);
6401 else
6402 addReply(c,shared.czero);
fd8ccf44 6403 }
6404}
6405
e2665397 6406static void zaddCommand(redisClient *c) {
6407 double scoreval;
6408
bd79a6bd 6409 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 6410 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
6411}
6412
7db723ad 6413static void zincrbyCommand(redisClient *c) {
e2665397 6414 double scoreval;
6415
bd79a6bd 6416 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 6417 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
6418}
6419
1b7106e7 6420static void zremCommand(redisClient *c) {
6421 robj *zsetobj;
6422 zset *zs;
dd88747b 6423 dictEntry *de;
6424 double *oldscore;
6425 int deleted;
1b7106e7 6426
dd88747b 6427 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6428 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 6429
dd88747b 6430 zs = zsetobj->ptr;
6431 de = dictFind(zs->dict,c->argv[2]);
6432 if (de == NULL) {
6433 addReply(c,shared.czero);
6434 return;
1b7106e7 6435 }
dd88747b 6436 /* Delete from the skiplist */
6437 oldscore = dictGetEntryVal(de);
6438 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
6439 redisAssert(deleted != 0);
6440
6441 /* Delete from the hash table */
6442 dictDelete(zs->dict,c->argv[2]);
6443 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
09241813 6444 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 6445 server.dirty++;
6446 addReply(c,shared.cone);
1b7106e7 6447}
6448
1807985b 6449static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
6450 double min;
6451 double max;
dd88747b 6452 long deleted;
1807985b 6453 robj *zsetobj;
6454 zset *zs;
6455
bd79a6bd
PN
6456 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
6457 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
bbe025e0 6458
dd88747b 6459 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6460 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 6461
dd88747b 6462 zs = zsetobj->ptr;
6463 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
6464 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
09241813 6465 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 6466 server.dirty += deleted;
482b672d 6467 addReplyLongLong(c,deleted);
1807985b 6468}
6469
9212eafd 6470static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
6471 long start;
6472 long end;
dd88747b 6473 int llen;
6474 long deleted;
9212eafd
PN
6475 robj *zsetobj;
6476 zset *zs;
6477
bd79a6bd
PN
6478 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6479 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6480
dd88747b 6481 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6482 checkType(c,zsetobj,REDIS_ZSET)) return;
6483 zs = zsetobj->ptr;
6484 llen = zs->zsl->length;
9212eafd 6485
dd88747b 6486 /* convert negative indexes */
6487 if (start < 0) start = llen+start;
6488 if (end < 0) end = llen+end;
6489 if (start < 0) start = 0;
6490 if (end < 0) end = 0;
9212eafd 6491
dd88747b 6492 /* indexes sanity checks */
6493 if (start > end || start >= llen) {
6494 addReply(c,shared.czero);
6495 return;
9212eafd 6496 }
dd88747b 6497 if (end >= llen) end = llen-1;
6498
6499 /* increment start and end because zsl*Rank functions
6500 * use 1-based rank */
6501 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
6502 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
09241813 6503 if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
dd88747b 6504 server.dirty += deleted;
482b672d 6505 addReplyLongLong(c, deleted);
9212eafd
PN
6506}
6507
8f92e768
PN
6508typedef struct {
6509 dict *dict;
6510 double weight;
6511} zsetopsrc;
6512
6513static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
6514 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
6515 unsigned long size1, size2;
6516 size1 = d1->dict ? dictSize(d1->dict) : 0;
6517 size2 = d2->dict ? dictSize(d2->dict) : 0;
6518 return size1 - size2;
6519}
6520
d2764cd6
PN
/* How scores of the same element found in several inputs are combined. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3
/* Score held by a dict entry; an entry with a NULL value counts as 1.0
 * (presumably the case for members of plain sets -- verify against the set
 * dict type). */
#define zunionInterDictValue(_e) \
    (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))
d2764cd6
PN
6525
6526inline static void zunionInterAggregate(double *target, double val, int aggregate) {
6527 if (aggregate == REDIS_AGGR_SUM) {
6528 *target = *target + val;
6529 } else if (aggregate == REDIS_AGGR_MIN) {
6530 *target = val < *target ? val : *target;
6531 } else if (aggregate == REDIS_AGGR_MAX) {
6532 *target = val > *target ? val : *target;
6533 } else {
6534 /* safety net */
f83c6cb5 6535 redisPanic("Unknown ZUNION/INTER aggregate type");
d2764cd6
PN
6536 }
6537}
6538
2830ca53 6539static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
bc000c1d 6540 int i, j, setnum;
d2764cd6 6541 int aggregate = REDIS_AGGR_SUM;
8f92e768 6542 zsetopsrc *src;
2830ca53
PN
6543 robj *dstobj;
6544 zset *dstzset;
b287c9bb
PN
6545 dictIterator *di;
6546 dictEntry *de;
6547
bc000c1d
JC
6548 /* expect setnum input keys to be given */
6549 setnum = atoi(c->argv[2]->ptr);
6550 if (setnum < 1) {
5d373da9 6551 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
2830ca53 6552 return;
b287c9bb 6553 }
2830ca53
PN
6554
6555 /* test if the expected number of keys would overflow */
bc000c1d 6556 if (3+setnum > c->argc) {
b287c9bb
PN
6557 addReply(c,shared.syntaxerr);
6558 return;
6559 }
6560
2830ca53 6561 /* read keys to be used for input */
bc000c1d
JC
6562 src = zmalloc(sizeof(zsetopsrc) * setnum);
6563 for (i = 0, j = 3; i < setnum; i++, j++) {
6564 robj *obj = lookupKeyWrite(c->db,c->argv[j]);
6565 if (!obj) {
8f92e768 6566 src[i].dict = NULL;
b287c9bb 6567 } else {
bc000c1d
JC
6568 if (obj->type == REDIS_ZSET) {
6569 src[i].dict = ((zset*)obj->ptr)->dict;
6570 } else if (obj->type == REDIS_SET) {
6571 src[i].dict = (obj->ptr);
6572 } else {
8f92e768 6573 zfree(src);
b287c9bb
PN
6574 addReply(c,shared.wrongtypeerr);
6575 return;
6576 }
b287c9bb 6577 }
2830ca53
PN
6578
6579 /* default all weights to 1 */
8f92e768 6580 src[i].weight = 1.0;
b287c9bb
PN
6581 }
6582
2830ca53
PN
6583 /* parse optional extra arguments */
6584 if (j < c->argc) {
d2764cd6 6585 int remaining = c->argc - j;
b287c9bb 6586
2830ca53 6587 while (remaining) {
bc000c1d 6588 if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 6589 j++; remaining--;
bc000c1d 6590 for (i = 0; i < setnum; i++, j++, remaining--) {
bd79a6bd 6591 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
bbe025e0 6592 return;
2830ca53 6593 }
d2764cd6
PN
6594 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
6595 j++; remaining--;
6596 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
6597 aggregate = REDIS_AGGR_SUM;
6598 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
6599 aggregate = REDIS_AGGR_MIN;
6600 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
6601 aggregate = REDIS_AGGR_MAX;
6602 } else {
6603 zfree(src);
6604 addReply(c,shared.syntaxerr);
6605 return;
6606 }
6607 j++; remaining--;
2830ca53 6608 } else {
8f92e768 6609 zfree(src);
2830ca53
PN
6610 addReply(c,shared.syntaxerr);
6611 return;
6612 }
6613 }
6614 }
b287c9bb 6615
d2764cd6
PN
6616 /* sort sets from the smallest to largest, this will improve our
6617 * algorithm's performance */
bc000c1d 6618 qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
d2764cd6 6619
2830ca53
PN
6620 dstobj = createZsetObject();
6621 dstzset = dstobj->ptr;
6622
6623 if (op == REDIS_OP_INTER) {
8f92e768
PN
6624 /* skip going over all entries if the smallest zset is NULL or empty */
6625 if (src[0].dict && dictSize(src[0].dict) > 0) {
6626 /* precondition: as src[0].dict is non-empty and the zsets are ordered
6627 * from small to large, all src[i > 0].dict are non-empty too */
6628 di = dictGetIterator(src[0].dict);
2830ca53 6629 while((de = dictNext(di)) != NULL) {
d2764cd6 6630 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6631 *score = src[0].weight * zunionInterDictValue(de);
2830ca53 6632
bc000c1d 6633 for (j = 1; j < setnum; j++) {
d2764cd6 6634 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6635 if (other) {
bc000c1d 6636 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6637 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6638 } else {
6639 break;
6640 }
6641 }
b287c9bb 6642
2830ca53 6643 /* skip entry when not present in every source dict */
bc000c1d 6644 if (j != setnum) {
2830ca53
PN
6645 zfree(score);
6646 } else {
6647 robj *o = dictGetEntryKey(de);
6648 dictAdd(dstzset->dict,o,score);
6649 incrRefCount(o); /* added to dictionary */
6650 zslInsert(dstzset->zsl,*score,o);
6651 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
6652 }
6653 }
2830ca53
PN
6654 dictReleaseIterator(di);
6655 }
6656 } else if (op == REDIS_OP_UNION) {
bc000c1d 6657 for (i = 0; i < setnum; i++) {
8f92e768 6658 if (!src[i].dict) continue;
2830ca53 6659
8f92e768 6660 di = dictGetIterator(src[i].dict);
2830ca53
PN
6661 while((de = dictNext(di)) != NULL) {
6662 /* skip key when already processed */
6663 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
6664
d2764cd6 6665 double *score = zmalloc(sizeof(double)), value;
bc000c1d 6666 *score = src[i].weight * zunionInterDictValue(de);
2830ca53 6667
d2764cd6
PN
6668 /* because the zsets are sorted by size, its only possible
6669 * for sets at larger indices to hold this entry */
bc000c1d 6670 for (j = (i+1); j < setnum; j++) {
d2764cd6 6671 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 6672 if (other) {
bc000c1d 6673 value = src[j].weight * zunionInterDictValue(other);
d2764cd6 6674 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
6675 }
6676 }
b287c9bb 6677
2830ca53
PN
6678 robj *o = dictGetEntryKey(de);
6679 dictAdd(dstzset->dict,o,score);
6680 incrRefCount(o); /* added to dictionary */
6681 zslInsert(dstzset->zsl,*score,o);
6682 incrRefCount(o); /* added to skiplist */
6683 }
6684 dictReleaseIterator(di);
b287c9bb 6685 }
2830ca53
PN
6686 } else {
6687 /* unknown operator */
6688 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
6689 }
6690
09241813 6691 dbDelete(c->db,dstkey);
3ea27d37 6692 if (dstzset->zsl->length) {
09241813 6693 dbAdd(c->db,dstkey,dstobj);
482b672d 6694 addReplyLongLong(c, dstzset->zsl->length);
3ea27d37 6695 server.dirty++;
6696 } else {
8bca8773 6697 decrRefCount(dstobj);
3ea27d37 6698 addReply(c, shared.czero);
6699 }
8f92e768 6700 zfree(src);
b287c9bb
PN
6701}
6702
5d373da9 6703static void zunionstoreCommand(redisClient *c) {
2830ca53 6704 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
6705}
6706
5d373da9 6707static void zinterstoreCommand(redisClient *c) {
2830ca53 6708 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
6709}
6710
e3870fab 6711static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 6712 robj *o;
bbe025e0
AM
6713 long start;
6714 long end;
752da584 6715 int withscores = 0;
dd88747b 6716 int llen;
6717 int rangelen, j;
6718 zset *zsetobj;
6719 zskiplist *zsl;
6720 zskiplistNode *ln;
6721 robj *ele;
752da584 6722
bd79a6bd
PN
6723 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
6724 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 6725
752da584 6726 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
6727 withscores = 1;
6728 } else if (c->argc >= 5) {
6729 addReply(c,shared.syntaxerr);
6730 return;
6731 }
cc812361 6732
4e27f268 6733 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
6734 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 6735 zsetobj = o->ptr;
6736 zsl = zsetobj->zsl;
6737 llen = zsl->length;
cc812361 6738
dd88747b 6739 /* convert negative indexes */
6740 if (start < 0) start = llen+start;
6741 if (end < 0) end = llen+end;
6742 if (start < 0) start = 0;
6743 if (end < 0) end = 0;
cc812361 6744
dd88747b 6745 /* indexes sanity checks */
6746 if (start > end || start >= llen) {
6747 /* Out of range start or start > end result in empty list */
6748 addReply(c,shared.emptymultibulk);
6749 return;
6750 }
6751 if (end >= llen) end = llen-1;
6752 rangelen = (end-start)+1;
cc812361 6753
dd88747b 6754 /* check if starting point is trivial, before searching
6755 * the element in log(N) time */
6756 if (reverse) {
003f0840 6757 ln = start == 0 ? zsl->tail : zslistTypeGetElementByRank(zsl, llen-start);
dd88747b 6758 } else {
6759 ln = start == 0 ?
003f0840 6760 zsl->header->forward[0] : zslistTypeGetElementByRank(zsl, start+1);
dd88747b 6761 }
cc812361 6762
dd88747b 6763 /* Return the result in form of a multi-bulk reply */
6764 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
6765 withscores ? (rangelen*2) : rangelen));
6766 for (j = 0; j < rangelen; j++) {
6767 ele = ln->obj;
6768 addReplyBulk(c,ele);
6769 if (withscores)
6770 addReplyDouble(c,ln->score);
6771 ln = reverse ? ln->backward : ln->forward[0];
cc812361 6772 }
6773}
6774
e3870fab 6775static void zrangeCommand(redisClient *c) {
6776 zrangeGenericCommand(c,0);
6777}
6778
6779static void zrevrangeCommand(redisClient *c) {
6780 zrangeGenericCommand(c,1);
6781}
6782
f44dd428 6783/* This command implements both ZRANGEBYSCORE and ZCOUNT.
6784 * If justcount is non-zero, just the count is returned. */
6785static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 6786 robj *o;
f44dd428 6787 double min, max;
6788 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 6789 int offset = 0, limit = -1;
0500ef27
SH
6790 int withscores = 0;
6791 int badsyntax = 0;
6792
f44dd428 6793 /* Parse the min-max interval. If one of the values is prefixed
6794 * by the "(" character, it's considered "open". For instance
6795 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6796 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6797 if (((char*)c->argv[2]->ptr)[0] == '(') {
6798 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6799 minex = 1;
6800 } else {
6801 min = strtod(c->argv[2]->ptr,NULL);
6802 }
6803 if (((char*)c->argv[3]->ptr)[0] == '(') {
6804 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6805 maxex = 1;
6806 } else {
6807 max = strtod(c->argv[3]->ptr,NULL);
6808 }
6809
6810 /* Parse "WITHSCORES": note that if the command was called with
6811 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6812 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 6813 if (c->argc == 5 || c->argc == 8) {
3a3978b1 6814 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6815 withscores = 1;
6816 else
6817 badsyntax = 1;
0500ef27 6818 }
3a3978b1 6819 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 6820 badsyntax = 1;
0500ef27 6821 if (badsyntax) {
454d4e43 6822 addReplySds(c,
6823 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 6824 return;
0500ef27
SH
6825 }
6826
f44dd428 6827 /* Parse "LIMIT" */
0500ef27 6828 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 6829 addReply(c,shared.syntaxerr);
6830 return;
0500ef27 6831 } else if (c->argc == (7 + withscores)) {
80181f78 6832 offset = atoi(c->argv[5]->ptr);
6833 limit = atoi(c->argv[6]->ptr);
0b13687c 6834 if (offset < 0) offset = 0;
80181f78 6835 }
50c55df5 6836
f44dd428 6837 /* Ok, lookup the key and get the range */
50c55df5 6838 o = lookupKeyRead(c->db,c->argv[1]);
6839 if (o == NULL) {
4e27f268 6840 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6841 } else {
6842 if (o->type != REDIS_ZSET) {
6843 addReply(c,shared.wrongtypeerr);
6844 } else {
6845 zset *zsetobj = o->ptr;
6846 zskiplist *zsl = zsetobj->zsl;
6847 zskiplistNode *ln;
f44dd428 6848 robj *ele, *lenobj = NULL;
6849 unsigned long rangelen = 0;
50c55df5 6850
f44dd428 6851 /* Get the first node with the score >= min, or with
6852 * score > min if 'minex' is true. */
50c55df5 6853 ln = zslFirstWithScore(zsl,min);
f44dd428 6854 while (minex && ln && ln->score == min) ln = ln->forward[0];
6855
50c55df5 6856 if (ln == NULL) {
6857 /* No element matching the speciifed interval */
f44dd428 6858 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6859 return;
6860 }
6861
6862 /* We don't know in advance how many matching elements there
6863 * are in the list, so we push this object that will represent
6864 * the multi-bulk length in the output buffer, and will "fix"
6865 * it later */
f44dd428 6866 if (!justcount) {
6867 lenobj = createObject(REDIS_STRING,NULL);
6868 addReply(c,lenobj);
6869 decrRefCount(lenobj);
6870 }
50c55df5 6871
f44dd428 6872 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 6873 if (offset) {
6874 offset--;
6875 ln = ln->forward[0];
6876 continue;
6877 }
6878 if (limit == 0) break;
f44dd428 6879 if (!justcount) {
6880 ele = ln->obj;
dd88747b 6881 addReplyBulk(c,ele);
f44dd428 6882 if (withscores)
6883 addReplyDouble(c,ln->score);
6884 }
50c55df5 6885 ln = ln->forward[0];
6886 rangelen++;
80181f78 6887 if (limit > 0) limit--;
50c55df5 6888 }
f44dd428 6889 if (justcount) {
482b672d 6890 addReplyLongLong(c,(long)rangelen);
f44dd428 6891 } else {
6892 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6893 withscores ? (rangelen*2) : rangelen);
6894 }
50c55df5 6895 }
6896 }
6897}
6898
f44dd428 6899static void zrangebyscoreCommand(redisClient *c) {
6900 genericZrangebyscoreCommand(c,0);
6901}
6902
6903static void zcountCommand(redisClient *c) {
6904 genericZrangebyscoreCommand(c,1);
6905}
6906
3c41331e 6907static void zcardCommand(redisClient *c) {
e197b441 6908 robj *o;
6909 zset *zs;
dd88747b 6910
6911 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6912 checkType(c,o,REDIS_ZSET)) return;
6913
6914 zs = o->ptr;
6915 addReplyUlong(c,zs->zsl->length);
e197b441 6916}
6917
6e333bbe 6918static void zscoreCommand(redisClient *c) {
6919 robj *o;
6920 zset *zs;
dd88747b 6921 dictEntry *de;
6922
6923 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6924 checkType(c,o,REDIS_ZSET)) return;
6925
6926 zs = o->ptr;
6927 de = dictFind(zs->dict,c->argv[2]);
6928 if (!de) {
96d8b4ee 6929 addReply(c,shared.nullbulk);
6e333bbe 6930 } else {
dd88747b 6931 double *score = dictGetEntryVal(de);
6e333bbe 6932
dd88747b 6933 addReplyDouble(c,*score);
6e333bbe 6934 }
6935}
6936
798d9e55 6937static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 6938 robj *o;
dd88747b 6939 zset *zs;
6940 zskiplist *zsl;
6941 dictEntry *de;
6942 unsigned long rank;
6943 double *score;
6944
6945 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6946 checkType(c,o,REDIS_ZSET)) return;
6947
6948 zs = o->ptr;
6949 zsl = zs->zsl;
6950 de = dictFind(zs->dict,c->argv[2]);
6951 if (!de) {
69d95c3e
PN
6952 addReply(c,shared.nullbulk);
6953 return;
6954 }
69d95c3e 6955
dd88747b 6956 score = dictGetEntryVal(de);
003f0840 6957 rank = zslistTypeGetRank(zsl, *score, c->argv[2]);
dd88747b 6958 if (rank) {
6959 if (reverse) {
482b672d 6960 addReplyLongLong(c, zsl->length - rank);
27b0ccca 6961 } else {
482b672d 6962 addReplyLongLong(c, rank-1);
69d95c3e 6963 }
dd88747b 6964 } else {
6965 addReply(c,shared.nullbulk);
978c2c94 6966 }
6967}
6968
798d9e55
PN
6969static void zrankCommand(redisClient *c) {
6970 zrankGenericCommand(c, 0);
6971}
6972
6973static void zrevrankCommand(redisClient *c) {
6974 zrankGenericCommand(c, 1);
6975}
6976
7fb16bac
PN
6977/* ========================= Hashes utility functions ======================= */
6978#define REDIS_HASH_KEY 1
6979#define REDIS_HASH_VALUE 2
978c2c94 6980
7fb16bac
PN
6981/* Check the length of a number of objects to see if we need to convert a
6982 * zipmap to a real hash. Note that we only check string encoded objects
6983 * as their string length can be queried in constant time. */
d1578a33 6984static void hashTypeTryConversion(robj *subject, robj **argv, int start, int end) {
7fb16bac
PN
6985 int i;
6986 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
978c2c94 6987
7fb16bac
PN
6988 for (i = start; i <= end; i++) {
6989 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6990 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6991 {
6992 convertToRealHash(subject);
978c2c94 6993 return;
6994 }
6995 }
7fb16bac 6996}
bae2c7ec 6997
97224de7 6998/* Encode given objects in-place when the hash uses a dict. */
d1578a33 6999static void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
97224de7 7000 if (subject->encoding == REDIS_ENCODING_HT) {
3f973463
PN
7001 if (o1) *o1 = tryObjectEncoding(*o1);
7002 if (o2) *o2 = tryObjectEncoding(*o2);
97224de7
PN
7003 }
7004}
7005
7fb16bac 7006/* Get the value from a hash identified by key. Returns either a string
a3f3af86
PN
7007 * object or NULL if the value cannot be found. The refcount of the object
7008 * is always increased by 1 when the value was found. */
d1578a33 7009static robj *hashTypeGet(robj *o, robj *key) {
7fb16bac 7010 robj *value = NULL;
978c2c94 7011 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7fb16bac
PN
7012 unsigned char *v;
7013 unsigned int vlen;
7014 key = getDecodedObject(key);
7015 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
7016 value = createStringObject((char*)v,vlen);
7017 }
7018 decrRefCount(key);
7019 } else {
7020 dictEntry *de = dictFind(o->ptr,key);
7021 if (de != NULL) {
7022 value = dictGetEntryVal(de);
a3f3af86 7023 incrRefCount(value);
7fb16bac
PN
7024 }
7025 }
7026 return value;
7027}
978c2c94 7028
7fb16bac
PN
7029/* Test if the key exists in the given hash. Returns 1 if the key
7030 * exists and 0 when it doesn't. */
d1578a33 7031static int hashTypeExists(robj *o, robj *key) {
7fb16bac
PN
7032 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7033 key = getDecodedObject(key);
7034 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
7035 decrRefCount(key);
7036 return 1;
7037 }
7038 decrRefCount(key);
7039 } else {
7040 if (dictFind(o->ptr,key) != NULL) {
7041 return 1;
7042 }
7043 }
7044 return 0;
7045}
bae2c7ec 7046
7fb16bac
PN
7047/* Add an element, discard the old if the key already exists.
7048 * Return 0 on insert and 1 on update. */
d1578a33 7049static int hashTypeSet(robj *o, robj *key, robj *value) {
7fb16bac
PN
7050 int update = 0;
7051 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7052 key = getDecodedObject(key);
7053 value = getDecodedObject(value);
7054 o->ptr = zipmapSet(o->ptr,
7055 key->ptr,sdslen(key->ptr),
7056 value->ptr,sdslen(value->ptr), &update);
7057 decrRefCount(key);
7058 decrRefCount(value);
7059
7060 /* Check if the zipmap needs to be upgraded to a real hash table */
7061 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
bae2c7ec 7062 convertToRealHash(o);
978c2c94 7063 } else {
7fb16bac
PN
7064 if (dictReplace(o->ptr,key,value)) {
7065 /* Insert */
7066 incrRefCount(key);
978c2c94 7067 } else {
7fb16bac 7068 /* Update */
978c2c94 7069 update = 1;
7070 }
7fb16bac 7071 incrRefCount(value);
978c2c94 7072 }
7fb16bac 7073 return update;
978c2c94 7074}
7075
7fb16bac
PN
7076/* Delete an element from a hash.
7077 * Return 1 on deleted and 0 on not found. */
d1578a33 7078static int hashTypeDelete(robj *o, robj *key) {
7fb16bac
PN
7079 int deleted = 0;
7080 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7081 key = getDecodedObject(key);
7082 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
7083 decrRefCount(key);
7084 } else {
7085 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
7086 /* Always check if the dictionary needs a resize after a delete. */
7087 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
d33278d1 7088 }
7fb16bac
PN
7089 return deleted;
7090}
d33278d1 7091
7fb16bac 7092/* Return the number of elements in a hash. */
d1578a33 7093static unsigned long hashTypeLength(robj *o) {
7fb16bac
PN
7094 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
7095 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
7096}
7097
7098/* Structure to hold hash iteration abstration. Note that iteration over
7099 * hashes involves both fields and values. Because it is possible that
7100 * not both are required, store pointers in the iterator to avoid
7101 * unnecessary memory allocation for fields/values. */
7102typedef struct {
7103 int encoding;
7104 unsigned char *zi;
7105 unsigned char *zk, *zv;
7106 unsigned int zklen, zvlen;
7107
7108 dictIterator *di;
7109 dictEntry *de;
d1578a33 7110} hashTypeIterator;
7fb16bac 7111
d1578a33
PN
7112static hashTypeIterator *hashTypeInitIterator(robj *subject) {
7113 hashTypeIterator *hi = zmalloc(sizeof(hashTypeIterator));
7fb16bac
PN
7114 hi->encoding = subject->encoding;
7115 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7116 hi->zi = zipmapRewind(subject->ptr);
7117 } else if (hi->encoding == REDIS_ENCODING_HT) {
7118 hi->di = dictGetIterator(subject->ptr);
d33278d1 7119 } else {
7fb16bac 7120 redisAssert(NULL);
d33278d1 7121 }
c44d3b56 7122 return hi;
7fb16bac 7123}
d33278d1 7124
d1578a33 7125static void hashTypeReleaseIterator(hashTypeIterator *hi) {
7fb16bac
PN
7126 if (hi->encoding == REDIS_ENCODING_HT) {
7127 dictReleaseIterator(hi->di);
d33278d1 7128 }
c44d3b56 7129 zfree(hi);
7fb16bac 7130}
d33278d1 7131
7fb16bac
PN
7132/* Move to the next entry in the hash. Return REDIS_OK when the next entry
7133 * could be found and REDIS_ERR when the iterator reaches the end. */
d1578a33 7134static int hashTypeNext(hashTypeIterator *hi) {
7fb16bac
PN
7135 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7136 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
7137 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
7138 } else {
7139 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
7140 }
7141 return REDIS_OK;
7142}
d33278d1 7143
0c390abc 7144/* Get key or value object at current iteration position.
a3f3af86 7145 * This increases the refcount of the field object by 1. */
d1578a33 7146static robj *hashTypeCurrent(hashTypeIterator *hi, int what) {
7fb16bac
PN
7147 robj *o;
7148 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
7149 if (what & REDIS_HASH_KEY) {
7150 o = createStringObject((char*)hi->zk,hi->zklen);
7151 } else {
7152 o = createStringObject((char*)hi->zv,hi->zvlen);
d33278d1 7153 }
d33278d1 7154 } else {
7fb16bac
PN
7155 if (what & REDIS_HASH_KEY) {
7156 o = dictGetEntryKey(hi->de);
7157 } else {
7158 o = dictGetEntryVal(hi->de);
d33278d1 7159 }
a3f3af86 7160 incrRefCount(o);
d33278d1 7161 }
7fb16bac 7162 return o;
d33278d1
PN
7163}
7164
d1578a33 7165static robj *hashTypeLookupWriteOrCreate(redisClient *c, robj *key) {
7fb16bac 7166 robj *o = lookupKeyWrite(c->db,key);
01426b05
PN
7167 if (o == NULL) {
7168 o = createHashObject();
09241813 7169 dbAdd(c->db,key,o);
01426b05
PN
7170 } else {
7171 if (o->type != REDIS_HASH) {
7172 addReply(c,shared.wrongtypeerr);
7fb16bac 7173 return NULL;
01426b05
PN
7174 }
7175 }
7fb16bac
PN
7176 return o;
7177}
01426b05 7178
7fb16bac
PN
7179/* ============================= Hash commands ============================== */
7180static void hsetCommand(redisClient *c) {
6e9e463f 7181 int update;
7fb16bac 7182 robj *o;
bbe025e0 7183
d1578a33
PN
7184 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7185 hashTypeTryConversion(o,c->argv,2,3);
7186 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7187 update = hashTypeSet(o,c->argv[2],c->argv[3]);
6e9e463f 7188 addReply(c, update ? shared.czero : shared.cone);
7fb16bac
PN
7189 server.dirty++;
7190}
01426b05 7191
1f1c7695
PN
7192static void hsetnxCommand(redisClient *c) {
7193 robj *o;
d1578a33
PN
7194 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7195 hashTypeTryConversion(o,c->argv,2,3);
1f1c7695 7196
d1578a33 7197 if (hashTypeExists(o, c->argv[2])) {
1f1c7695 7198 addReply(c, shared.czero);
01426b05 7199 } else {
d1578a33
PN
7200 hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
7201 hashTypeSet(o,c->argv[2],c->argv[3]);
1f1c7695
PN
7202 addReply(c, shared.cone);
7203 server.dirty++;
7204 }
7205}
01426b05 7206
7fb16bac
PN
7207static void hmsetCommand(redisClient *c) {
7208 int i;
7209 robj *o;
01426b05 7210
7fb16bac
PN
7211 if ((c->argc % 2) == 1) {
7212 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
7213 return;
7214 }
01426b05 7215
d1578a33
PN
7216 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7217 hashTypeTryConversion(o,c->argv,2,c->argc-1);
7fb16bac 7218 for (i = 2; i < c->argc; i += 2) {
d1578a33
PN
7219 hashTypeTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
7220 hashTypeSet(o,c->argv[i],c->argv[i+1]);
7fb16bac
PN
7221 }
7222 addReply(c, shared.ok);
edc2f63a 7223 server.dirty++;
7fb16bac
PN
7224}
7225
7226static void hincrbyCommand(redisClient *c) {
7227 long long value, incr;
7228 robj *o, *current, *new;
7229
bd79a6bd 7230 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
d1578a33
PN
7231 if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
7232 if ((current = hashTypeGet(o,c->argv[2])) != NULL) {
946342c1
PN
7233 if (getLongLongFromObjectOrReply(c,current,&value,
7234 "hash value is not an integer") != REDIS_OK) {
7235 decrRefCount(current);
7236 return;
7237 }
a3f3af86 7238 decrRefCount(current);
7fb16bac
PN
7239 } else {
7240 value = 0;
01426b05
PN
7241 }
7242
7fb16bac 7243 value += incr;
3f973463 7244 new = createStringObjectFromLongLong(value);
d1578a33
PN
7245 hashTypeTryObjectEncoding(o,&c->argv[2],NULL);
7246 hashTypeSet(o,c->argv[2],new);
7fb16bac
PN
7247 decrRefCount(new);
7248 addReplyLongLong(c,value);
01426b05 7249 server.dirty++;
01426b05
PN
7250}
7251
978c2c94 7252static void hgetCommand(redisClient *c) {
7fb16bac 7253 robj *o, *value;
dd88747b 7254 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
7255 checkType(c,o,REDIS_HASH)) return;
7256
d1578a33 7257 if ((value = hashTypeGet(o,c->argv[2])) != NULL) {
7fb16bac 7258 addReplyBulk(c,value);
a3f3af86 7259 decrRefCount(value);
dd88747b 7260 } else {
7fb16bac 7261 addReply(c,shared.nullbulk);
69d95c3e 7262 }
69d95c3e
PN
7263}
7264
09aeb579
PN
7265static void hmgetCommand(redisClient *c) {
7266 int i;
7fb16bac
PN
7267 robj *o, *value;
7268 o = lookupKeyRead(c->db,c->argv[1]);
7269 if (o != NULL && o->type != REDIS_HASH) {
7270 addReply(c,shared.wrongtypeerr);
09aeb579
PN
7271 }
7272
7fb16bac
PN
7273 /* Note the check for o != NULL happens inside the loop. This is
7274 * done because objects that cannot be found are considered to be
7275 * an empty hash. The reply should then be a series of NULLs. */
09aeb579 7276 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7fb16bac 7277 for (i = 2; i < c->argc; i++) {
d1578a33 7278 if (o != NULL && (value = hashTypeGet(o,c->argv[i])) != NULL) {
7fb16bac 7279 addReplyBulk(c,value);
a3f3af86 7280 decrRefCount(value);
7fb16bac
PN
7281 } else {
7282 addReply(c,shared.nullbulk);
09aeb579
PN
7283 }
7284 }
7285}
7286
07efaf74 7287static void hdelCommand(redisClient *c) {
dd88747b 7288 robj *o;
dd88747b 7289 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
7290 checkType(c,o,REDIS_HASH)) return;
07efaf74 7291
d1578a33
PN
7292 if (hashTypeDelete(o,c->argv[2])) {
7293 if (hashTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
7fb16bac
PN
7294 addReply(c,shared.cone);
7295 server.dirty++;
dd88747b 7296 } else {
7fb16bac 7297 addReply(c,shared.czero);
07efaf74 7298 }
7299}
7300
92b27fe9 7301static void hlenCommand(redisClient *c) {
7302 robj *o;
dd88747b 7303 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 7304 checkType(c,o,REDIS_HASH)) return;
7305
d1578a33 7306 addReplyUlong(c,hashTypeLength(o));
92b27fe9 7307}
7308
78409a0f 7309static void genericHgetallCommand(redisClient *c, int flags) {
7fb16bac 7310 robj *o, *lenobj, *obj;
78409a0f 7311 unsigned long count = 0;
d1578a33 7312 hashTypeIterator *hi;
78409a0f 7313
4e27f268 7314 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 7315 || checkType(c,o,REDIS_HASH)) return;
7316
7317 lenobj = createObject(REDIS_STRING,NULL);
7318 addReply(c,lenobj);
7319 decrRefCount(lenobj);
7320
d1578a33
PN
7321 hi = hashTypeInitIterator(o);
7322 while (hashTypeNext(hi) != REDIS_ERR) {
7fb16bac 7323 if (flags & REDIS_HASH_KEY) {
d1578a33 7324 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
7fb16bac 7325 addReplyBulk(c,obj);
a3f3af86 7326 decrRefCount(obj);
7fb16bac 7327 count++;
78409a0f 7328 }
7fb16bac 7329 if (flags & REDIS_HASH_VALUE) {
d1578a33 7330 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
7fb16bac 7331 addReplyBulk(c,obj);
a3f3af86 7332 decrRefCount(obj);
7fb16bac 7333 count++;
78409a0f 7334 }
78409a0f 7335 }
d1578a33 7336 hashTypeReleaseIterator(hi);
7fb16bac 7337
78409a0f 7338 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
7339}
7340
7341static void hkeysCommand(redisClient *c) {
7fb16bac 7342 genericHgetallCommand(c,REDIS_HASH_KEY);
78409a0f 7343}
7344
7345static void hvalsCommand(redisClient *c) {
7fb16bac 7346 genericHgetallCommand(c,REDIS_HASH_VALUE);
78409a0f 7347}
7348
7349static void hgetallCommand(redisClient *c) {
7fb16bac 7350 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
78409a0f 7351}
7352
a86f14b1 7353static void hexistsCommand(redisClient *c) {
7354 robj *o;
a86f14b1 7355 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
7356 checkType(c,o,REDIS_HASH)) return;
7357
d1578a33 7358 addReply(c, hashTypeExists(o,c->argv[2]) ? shared.cone : shared.czero);
a86f14b1 7359}
7360
ada386b2 7361static void convertToRealHash(robj *o) {
7362 unsigned char *key, *val, *p, *zm = o->ptr;
7363 unsigned int klen, vlen;
7364 dict *dict = dictCreate(&hashDictType,NULL);
7365
7366 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
7367 p = zipmapRewind(zm);
7368 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
7369 robj *keyobj, *valobj;
7370
7371 keyobj = createStringObject((char*)key,klen);
7372 valobj = createStringObject((char*)val,vlen);
05df7621 7373 keyobj = tryObjectEncoding(keyobj);
7374 valobj = tryObjectEncoding(valobj);
ada386b2 7375 dictAdd(dict,keyobj,valobj);
7376 }
7377 o->encoding = REDIS_ENCODING_HT;
7378 o->ptr = dict;
7379 zfree(zm);
7380}
7381
6b47e12e 7382/* ========================= Non type-specific commands ==================== */
7383
ed9b544e 7384static void flushdbCommand(redisClient *c) {
ca37e9cd 7385 server.dirty += dictSize(c->db->dict);
9b30e1a2 7386 touchWatchedKeysOnFlush(c->db->id);
3305306f 7387 dictEmpty(c->db->dict);
7388 dictEmpty(c->db->expires);
ed9b544e 7389 addReply(c,shared.ok);
ed9b544e 7390}
7391
7392static void flushallCommand(redisClient *c) {
9b30e1a2 7393 touchWatchedKeysOnFlush(-1);
ca37e9cd 7394 server.dirty += emptyDb();
ed9b544e 7395 addReply(c,shared.ok);
500ece7c 7396 if (server.bgsavechildpid != -1) {
7397 kill(server.bgsavechildpid,SIGKILL);
7398 rdbRemoveTempFile(server.bgsavechildpid);
7399 }
f78fd11b 7400 rdbSave(server.dbfilename);
ca37e9cd 7401 server.dirty++;
ed9b544e 7402}
7403
56906eef 7404static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 7405 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 7406 so->type = type;
7407 so->pattern = pattern;
7408 return so;
7409}
7410
7411/* Return the value associated to the key with a name obtained
55017f9d
PN
7412 * substituting the first occurence of '*' in 'pattern' with 'subst'.
7413 * The returned object will always have its refcount increased by 1
7414 * when it is non-NULL. */
56906eef 7415static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6d7d1370 7416 char *p, *f;
ed9b544e 7417 sds spat, ssub;
6d7d1370
PN
7418 robj keyobj, fieldobj, *o;
7419 int prefixlen, sublen, postfixlen, fieldlen;
ed9b544e 7420 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
7421 struct {
f1017b3f 7422 long len;
7423 long free;
ed9b544e 7424 char buf[REDIS_SORTKEY_MAX+1];
6d7d1370 7425 } keyname, fieldname;
ed9b544e 7426
28173a49 7427 /* If the pattern is "#" return the substitution object itself in order
7428 * to implement the "SORT ... GET #" feature. */
7429 spat = pattern->ptr;
7430 if (spat[0] == '#' && spat[1] == '\0') {
55017f9d 7431 incrRefCount(subst);
28173a49 7432 return subst;
7433 }
7434
7435 /* The substitution object may be specially encoded. If so we create
9d65a1bb 7436 * a decoded object on the fly. Otherwise getDecodedObject will just
7437 * increment the ref count, that we'll decrement later. */
7438 subst = getDecodedObject(subst);
942a3961 7439
ed9b544e 7440 ssub = subst->ptr;
7441 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
7442 p = strchr(spat,'*');
ed5a857a 7443 if (!p) {
7444 decrRefCount(subst);
7445 return NULL;
7446 }
ed9b544e 7447
6d7d1370
PN
7448 /* Find out if we're dealing with a hash dereference. */
7449 if ((f = strstr(p+1, "->")) != NULL) {
7450 fieldlen = sdslen(spat)-(f-spat);
7451 /* this also copies \0 character */
7452 memcpy(fieldname.buf,f+2,fieldlen-1);
7453 fieldname.len = fieldlen-2;
7454 } else {
7455 fieldlen = 0;
7456 }
7457
ed9b544e 7458 prefixlen = p-spat;
7459 sublen = sdslen(ssub);
6d7d1370 7460 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
ed9b544e 7461 memcpy(keyname.buf,spat,prefixlen);
7462 memcpy(keyname.buf+prefixlen,ssub,sublen);
7463 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
7464 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
7465 keyname.len = prefixlen+sublen+postfixlen;
942a3961 7466 decrRefCount(subst);
7467
6d7d1370
PN
7468 /* Lookup substituted key */
7469 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
7470 o = lookupKeyRead(db,&keyobj);
55017f9d
PN
7471 if (o == NULL) return NULL;
7472
7473 if (fieldlen > 0) {
7474 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6d7d1370 7475
705dad38
PN
7476 /* Retrieve value from hash by the field name. This operation
7477 * already increases the refcount of the returned object. */
6d7d1370 7478 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
d1578a33 7479 o = hashTypeGet(o, &fieldobj);
705dad38 7480 } else {
55017f9d 7481 if (o->type != REDIS_STRING) return NULL;
b6f07345 7482
705dad38
PN
7483 /* Every object that this function returns needs to have its refcount
7484 * increased. sortCommand decreases it again. */
7485 incrRefCount(o);
6d7d1370
PN
7486 }
7487
7488 return o;
ed9b544e 7489}
7490
7491/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
7492 * the additional parameter is not standard but a BSD-specific we have to
7493 * pass sorting parameters via the global 'server' structure */
7494static int sortCompare(const void *s1, const void *s2) {
7495 const redisSortObject *so1 = s1, *so2 = s2;
7496 int cmp;
7497
7498 if (!server.sort_alpha) {
7499 /* Numeric sorting. Here it's trivial as we precomputed scores */
7500 if (so1->u.score > so2->u.score) {
7501 cmp = 1;
7502 } else if (so1->u.score < so2->u.score) {
7503 cmp = -1;
7504 } else {
7505 cmp = 0;
7506 }
7507 } else {
7508 /* Alphanumeric sorting */
7509 if (server.sort_bypattern) {
7510 if (!so1->u.cmpobj || !so2->u.cmpobj) {
7511 /* At least one compare object is NULL */
7512 if (so1->u.cmpobj == so2->u.cmpobj)
7513 cmp = 0;
7514 else if (so1->u.cmpobj == NULL)
7515 cmp = -1;
7516 else
7517 cmp = 1;
7518 } else {
7519 /* We have both the objects, use strcoll */
7520 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
7521 }
7522 } else {
08ee9b57 7523 /* Compare elements directly. */
7524 cmp = compareStringObjects(so1->obj,so2->obj);
ed9b544e 7525 }
7526 }
7527 return server.sort_desc ? -cmp : cmp;
7528}
7529
7530/* The SORT command is the most complex command in Redis. Warning: this code
7531 * is optimized for speed and a bit less for readability */
7532static void sortCommand(redisClient *c) {
ed9b544e 7533 list *operations;
a03611e1 7534 unsigned int outputlen = 0;
ed9b544e 7535 int desc = 0, alpha = 0;
7536 int limit_start = 0, limit_count = -1, start, end;
7537 int j, dontsort = 0, vectorlen;
7538 int getop = 0; /* GET operation counter */
443c6409 7539 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 7540 redisSortObject *vector; /* Resulting vector to sort */
7541
7542 /* Lookup the key to sort. It must be of the right types */
3305306f 7543 sortval = lookupKeyRead(c->db,c->argv[1]);
7544 if (sortval == NULL) {
4e27f268 7545 addReply(c,shared.emptymultibulk);
ed9b544e 7546 return;
7547 }
a5eb649b 7548 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
7549 sortval->type != REDIS_ZSET)
7550 {
c937aa89 7551 addReply(c,shared.wrongtypeerr);
ed9b544e 7552 return;
7553 }
7554
7555 /* Create a list of operations to perform for every sorted element.
7556 * Operations can be GET/DEL/INCR/DECR */
7557 operations = listCreate();
092dac2a 7558 listSetFreeMethod(operations,zfree);
ed9b544e 7559 j = 2;
7560
7561 /* Now we need to protect sortval incrementing its count, in the future
7562 * SORT may have options able to overwrite/delete keys during the sorting
7563 * and the sorted key itself may get destroied */
7564 incrRefCount(sortval);
7565
7566 /* The SORT command has an SQL-alike syntax, parse it */
7567 while(j < c->argc) {
7568 int leftargs = c->argc-j-1;
7569 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
7570 desc = 0;
7571 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
7572 desc = 1;
7573 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
7574 alpha = 1;
7575 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
7576 limit_start = atoi(c->argv[j+1]->ptr);
7577 limit_count = atoi(c->argv[j+2]->ptr);
7578 j+=2;
443c6409 7579 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
7580 storekey = c->argv[j+1];
7581 j++;
ed9b544e 7582 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
7583 sortby = c->argv[j+1];
7584 /* If the BY pattern does not contain '*', i.e. it is constant,
7585 * we don't need to sort nor to lookup the weight keys. */
7586 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
7587 j++;
7588 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
7589 listAddNodeTail(operations,createSortOperation(
7590 REDIS_SORT_GET,c->argv[j+1]));
7591 getop++;
7592 j++;
ed9b544e 7593 } else {
7594 decrRefCount(sortval);
7595 listRelease(operations);
c937aa89 7596 addReply(c,shared.syntaxerr);
ed9b544e 7597 return;
7598 }
7599 j++;
7600 }
7601
7602 /* Load the sorting vector with all the objects to sort */
a5eb649b 7603 switch(sortval->type) {
003f0840 7604 case REDIS_LIST: vectorlen = listTypeLength(sortval); break;
a5eb649b 7605 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
7606 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
f83c6cb5 7607 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
a5eb649b 7608 }
ed9b544e 7609 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 7610 j = 0;
a5eb649b 7611
ed9b544e 7612 if (sortval->type == REDIS_LIST) {
003f0840
PN
7613 listTypeIterator *li = listTypeInitIterator(sortval,0,REDIS_TAIL);
7614 listTypeEntry entry;
7615 while(listTypeNext(li,&entry)) {
7616 vector[j].obj = listTypeGet(&entry);
ed9b544e 7617 vector[j].u.score = 0;
7618 vector[j].u.cmpobj = NULL;
ed9b544e 7619 j++;
7620 }
003f0840 7621 listTypeReleaseIterator(li);
ed9b544e 7622 } else {
a5eb649b 7623 dict *set;
ed9b544e 7624 dictIterator *di;
7625 dictEntry *setele;
7626
a5eb649b 7627 if (sortval->type == REDIS_SET) {
7628 set = sortval->ptr;
7629 } else {
7630 zset *zs = sortval->ptr;
7631 set = zs->dict;
7632 }
7633
ed9b544e 7634 di = dictGetIterator(set);
ed9b544e 7635 while((setele = dictNext(di)) != NULL) {
7636 vector[j].obj = dictGetEntryKey(setele);
7637 vector[j].u.score = 0;
7638 vector[j].u.cmpobj = NULL;
7639 j++;
7640 }
7641 dictReleaseIterator(di);
7642 }
dfc5e96c 7643 redisAssert(j == vectorlen);
ed9b544e 7644
7645 /* Now it's time to load the right scores in the sorting vector */
7646 if (dontsort == 0) {
7647 for (j = 0; j < vectorlen; j++) {
6d7d1370 7648 robj *byval;
ed9b544e 7649 if (sortby) {
6d7d1370 7650 /* lookup value to sort by */
3305306f 7651 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
705dad38 7652 if (!byval) continue;
ed9b544e 7653 } else {
6d7d1370
PN
7654 /* use object itself to sort by */
7655 byval = vector[j].obj;
7656 }
7657
7658 if (alpha) {
08ee9b57 7659 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6d7d1370
PN
7660 } else {
7661 if (byval->encoding == REDIS_ENCODING_RAW) {
7662 vector[j].u.score = strtod(byval->ptr,NULL);
16fa22f1 7663 } else if (byval->encoding == REDIS_ENCODING_INT) {
6d7d1370
PN
7664 /* Don't need to decode the object if it's
7665 * integer-encoded (the only encoding supported) so
7666 * far. We can just cast it */
16fa22f1
PN
7667 vector[j].u.score = (long)byval->ptr;
7668 } else {
7669 redisAssert(1 != 1);
942a3961 7670 }
ed9b544e 7671 }
6d7d1370 7672
705dad38
PN
7673 /* when the object was retrieved using lookupKeyByPattern,
7674 * its refcount needs to be decreased. */
7675 if (sortby) {
7676 decrRefCount(byval);
ed9b544e 7677 }
7678 }
7679 }
7680
7681 /* We are ready to sort the vector... perform a bit of sanity check
7682 * on the LIMIT option too. We'll use a partial version of quicksort. */
7683 start = (limit_start < 0) ? 0 : limit_start;
7684 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
7685 if (start >= vectorlen) {
7686 start = vectorlen-1;
7687 end = vectorlen-2;
7688 }
7689 if (end >= vectorlen) end = vectorlen-1;
7690
7691 if (dontsort == 0) {
7692 server.sort_desc = desc;
7693 server.sort_alpha = alpha;
7694 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 7695 if (sortby && (start != 0 || end != vectorlen-1))
7696 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
7697 else
7698 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 7699 }
7700
7701 /* Send command output to the output buffer, performing the specified
7702 * GET/DEL/INCR/DECR operations if any. */
7703 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 7704 if (storekey == NULL) {
7705 /* STORE option not specified, sent the sorting result to client */
7706 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
7707 for (j = start; j <= end; j++) {
7708 listNode *ln;
c7df85a4 7709 listIter li;
7710
dd88747b 7711 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 7712 listRewind(operations,&li);
7713 while((ln = listNext(&li))) {
443c6409 7714 redisSortOperation *sop = ln->value;
7715 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7716 vector[j].obj);
7717
7718 if (sop->type == REDIS_SORT_GET) {
55017f9d 7719 if (!val) {
443c6409 7720 addReply(c,shared.nullbulk);
7721 } else {
dd88747b 7722 addReplyBulk(c,val);
55017f9d 7723 decrRefCount(val);
443c6409 7724 }
7725 } else {
dfc5e96c 7726 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 7727 }
7728 }
ed9b544e 7729 }
443c6409 7730 } else {
74e0f445 7731 robj *sobj = createZiplistObject();
443c6409 7732
7733 /* STORE option specified, set the sorting result as a List object */
7734 for (j = start; j <= end; j++) {
7735 listNode *ln;
c7df85a4 7736 listIter li;
7737
443c6409 7738 if (!getop) {
003f0840 7739 listTypePush(sobj,vector[j].obj,REDIS_TAIL);
a03611e1
PN
7740 } else {
7741 listRewind(operations,&li);
7742 while((ln = listNext(&li))) {
7743 redisSortOperation *sop = ln->value;
7744 robj *val = lookupKeyByPattern(c->db,sop->pattern,
7745 vector[j].obj);
7746
7747 if (sop->type == REDIS_SORT_GET) {
7748 if (!val) val = createStringObject("",0);
7749
003f0840 7750 /* listTypePush does an incrRefCount, so we should take care
a03611e1
PN
7751 * care of the incremented refcount caused by either
7752 * lookupKeyByPattern or createStringObject("",0) */
003f0840 7753 listTypePush(sobj,val,REDIS_TAIL);
a03611e1 7754 decrRefCount(val);
443c6409 7755 } else {
a03611e1
PN
7756 /* always fails */
7757 redisAssert(sop->type == REDIS_SORT_GET);
443c6409 7758 }
ed9b544e 7759 }
ed9b544e 7760 }
ed9b544e 7761 }
846d8b3e 7762 dbReplace(c->db,storekey,sobj);
443c6409 7763 /* Note: we add 1 because the DB is dirty anyway since even if the
7764 * SORT result is empty a new key is set and maybe the old content
7765 * replaced. */
7766 server.dirty += 1+outputlen;
7767 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 7768 }
7769
7770 /* Cleanup */
a03611e1
PN
7771 if (sortval->type == REDIS_LIST)
7772 for (j = 0; j < vectorlen; j++)
7773 decrRefCount(vector[j].obj);
ed9b544e 7774 decrRefCount(sortval);
7775 listRelease(operations);
7776 for (j = 0; j < vectorlen; j++) {
16fa22f1 7777 if (alpha && vector[j].u.cmpobj)
ed9b544e 7778 decrRefCount(vector[j].u.cmpobj);
7779 }
7780 zfree(vector);
7781}
7782
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer, 'n' the amount of bytes. The buffer must
 * have room for the longest possible output (20 digits, the unit letter
 * and the terminator, so 22 bytes are always enough).
 *
 * Units are powers of 1024. Amounts below 1024 are printed as an integer
 * number of bytes, everything else with two decimals. The function always
 * writes a string into 's': amounts too big for the largest unit handled
 * here are printed as plain bytes instead of leaving the buffer untouched. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Larger than any unit we handle: fall back to plain bytes. */
        sprintf(s,"%lluB",n);
    }
}
7803
1c85b79f 7804/* Create the string returned by the INFO command. This is decoupled
7805 * by the INFO command itself as we need to report the same information
7806 * on memory corruption problems. */
7807static sds genRedisInfoString(void) {
ed9b544e 7808 sds info;
7809 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 7810 int j;
ec6c7a1d 7811 char hmem[64];
55a8298f 7812
b72f6a4b 7813 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 7814 info = sdscatprintf(sdsempty(),
7815 "redis_version:%s\r\n"
5436146c
PN
7816 "redis_git_sha1:%s\r\n"
7817 "redis_git_dirty:%d\r\n"
f1017b3f 7818 "arch_bits:%s\r\n"
7a932b74 7819 "multiplexing_api:%s\r\n"
0d7170a4 7820 "process_id:%ld\r\n"
682ac724 7821 "uptime_in_seconds:%ld\r\n"
7822 "uptime_in_days:%ld\r\n"
ed9b544e 7823 "connected_clients:%d\r\n"
7824 "connected_slaves:%d\r\n"
f86a74e9 7825 "blocked_clients:%d\r\n"
5fba9f71 7826 "used_memory:%zu\r\n"
ec6c7a1d 7827 "used_memory_human:%s\r\n"
ed9b544e 7828 "changes_since_last_save:%lld\r\n"
be2bb6b0 7829 "bgsave_in_progress:%d\r\n"
682ac724 7830 "last_save_time:%ld\r\n"
b3fad521 7831 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 7832 "total_connections_received:%lld\r\n"
7833 "total_commands_processed:%lld\r\n"
2a6a2ed1 7834 "expired_keys:%lld\r\n"
3be2c9d7 7835 "hash_max_zipmap_entries:%zu\r\n"
7836 "hash_max_zipmap_value:%zu\r\n"
ffc6b7f8 7837 "pubsub_channels:%ld\r\n"
7838 "pubsub_patterns:%u\r\n"
7d98e08c 7839 "vm_enabled:%d\r\n"
a0f643ea 7840 "role:%s\r\n"
ed9b544e 7841 ,REDIS_VERSION,
5436146c 7842 REDIS_GIT_SHA1,
274e45e3 7843 strtol(REDIS_GIT_DIRTY,NULL,10) > 0,
f1017b3f 7844 (sizeof(long) == 8) ? "64" : "32",
7a932b74 7845 aeGetApiName(),
0d7170a4 7846 (long) getpid(),
a0f643ea 7847 uptime,
7848 uptime/(3600*24),
ed9b544e 7849 listLength(server.clients)-listLength(server.slaves),
7850 listLength(server.slaves),
d5d55fc3 7851 server.blpop_blocked_clients,
b72f6a4b 7852 zmalloc_used_memory(),
ec6c7a1d 7853 hmem,
ed9b544e 7854 server.dirty,
9d65a1bb 7855 server.bgsavechildpid != -1,
ed9b544e 7856 server.lastsave,
b3fad521 7857 server.bgrewritechildpid != -1,
ed9b544e 7858 server.stat_numconnections,
7859 server.stat_numcommands,
2a6a2ed1 7860 server.stat_expiredkeys,
55a8298f 7861 server.hash_max_zipmap_entries,
7862 server.hash_max_zipmap_value,
ffc6b7f8 7863 dictSize(server.pubsub_channels),
7864 listLength(server.pubsub_patterns),
7d98e08c 7865 server.vm_enabled != 0,
a0f643ea 7866 server.masterhost == NULL ? "master" : "slave"
ed9b544e 7867 );
a0f643ea 7868 if (server.masterhost) {
7869 info = sdscatprintf(info,
7870 "master_host:%s\r\n"
7871 "master_port:%d\r\n"
7872 "master_link_status:%s\r\n"
7873 "master_last_io_seconds_ago:%d\r\n"
7874 ,server.masterhost,
7875 server.masterport,
7876 (server.replstate == REDIS_REPL_CONNECTED) ?
7877 "up" : "down",
f72b934d 7878 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 7879 );
7880 }
7d98e08c 7881 if (server.vm_enabled) {
1064ef87 7882 lockThreadedIO();
7d98e08c 7883 info = sdscatprintf(info,
7884 "vm_conf_max_memory:%llu\r\n"
7885 "vm_conf_page_size:%llu\r\n"
7886 "vm_conf_pages:%llu\r\n"
7887 "vm_stats_used_pages:%llu\r\n"
7888 "vm_stats_swapped_objects:%llu\r\n"
7889 "vm_stats_swappin_count:%llu\r\n"
7890 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 7891 "vm_stats_io_newjobs_len:%lu\r\n"
7892 "vm_stats_io_processing_len:%lu\r\n"
7893 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 7894 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 7895 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 7896 ,(unsigned long long) server.vm_max_memory,
7897 (unsigned long long) server.vm_page_size,
7898 (unsigned long long) server.vm_pages,
7899 (unsigned long long) server.vm_stats_used_pages,
7900 (unsigned long long) server.vm_stats_swapped_objects,
7901 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 7902 (unsigned long long) server.vm_stats_swapouts,
7903 (unsigned long) listLength(server.io_newjobs),
7904 (unsigned long) listLength(server.io_processing),
7905 (unsigned long) listLength(server.io_processed),
d5d55fc3 7906 (unsigned long) server.io_active_threads,
7907 (unsigned long) server.vm_blocked_clients
7d98e08c 7908 );
1064ef87 7909 unlockThreadedIO();
7d98e08c 7910 }
c3cb078d 7911 for (j = 0; j < server.dbnum; j++) {
7912 long long keys, vkeys;
7913
7914 keys = dictSize(server.db[j].dict);
7915 vkeys = dictSize(server.db[j].expires);
7916 if (keys || vkeys) {
9d65a1bb 7917 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 7918 j, keys, vkeys);
7919 }
7920 }
1c85b79f 7921 return info;
7922}
7923
7924static void infoCommand(redisClient *c) {
7925 sds info = genRedisInfoString();
83c6a618 7926 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7927 (unsigned long)sdslen(info)));
ed9b544e 7928 addReplySds(c,info);
70003d28 7929 addReply(c,shared.crlf);
ed9b544e 7930}
7931
3305306f 7932static void monitorCommand(redisClient *c) {
7933 /* ignore MONITOR if aleady slave or in monitor mode */
7934 if (c->flags & REDIS_SLAVE) return;
7935
7936 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7937 c->slaveseldb = 0;
6b47e12e 7938 listAddNodeTail(server.monitors,c);
3305306f 7939 addReply(c,shared.ok);
7940}
7941
7942/* ================================= Expire ================================= */
7943static int removeExpire(redisDb *db, robj *key) {
09241813 7944 if (dictDelete(db->expires,key->ptr) == DICT_OK) {
3305306f 7945 return 1;
7946 } else {
7947 return 0;
7948 }
7949}
7950
7951static int setExpire(redisDb *db, robj *key, time_t when) {
09241813 7952 sds copy = sdsdup(key->ptr);
7953 if (dictAdd(db->expires,copy,(void*)when) == DICT_ERR) {
7954 sdsfree(copy);
3305306f 7955 return 0;
7956 } else {
3305306f 7957 return 1;
7958 }
7959}
7960
bb32ede5 7961/* Return the expire time of the specified key, or -1 if no expire
7962 * is associated with this key (i.e. the key is non volatile) */
7963static time_t getExpire(redisDb *db, robj *key) {
7964 dictEntry *de;
7965
7966 /* No expire? return ASAP */
7967 if (dictSize(db->expires) == 0 ||
09241813 7968 (de = dictFind(db->expires,key->ptr)) == NULL) return -1;
bb32ede5 7969
7970 return (time_t) dictGetEntryVal(de);
7971}
7972
3305306f 7973static int expireIfNeeded(redisDb *db, robj *key) {
7974 time_t when;
7975 dictEntry *de;
7976
7977 /* No expire? return ASAP */
7978 if (dictSize(db->expires) == 0 ||
09241813 7979 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
3305306f 7980
7981 /* Lookup the expire */
7982 when = (time_t) dictGetEntryVal(de);
7983 if (time(NULL) <= when) return 0;
7984
7985 /* Delete the key */
09241813 7986 dbDelete(db,key);
2a6a2ed1 7987 server.stat_expiredkeys++;
09241813 7988 return 1;
3305306f 7989}
7990
7991static int deleteIfVolatile(redisDb *db, robj *key) {
7992 dictEntry *de;
7993
7994 /* No expire? return ASAP */
7995 if (dictSize(db->expires) == 0 ||
09241813 7996 (de = dictFind(db->expires,key->ptr)) == NULL) return 0;
3305306f 7997
7998 /* Delete the key */
0c66a471 7999 server.dirty++;
2a6a2ed1 8000 server.stat_expiredkeys++;
09241813 8001 dictDelete(db->expires,key->ptr);
8002 return dictDelete(db->dict,key->ptr) == DICT_OK;
3305306f 8003}
8004
bbe025e0 8005static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 8006 dictEntry *de;
bbe025e0
AM
8007 time_t seconds;
8008
bd79a6bd 8009 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
bbe025e0
AM
8010
8011 seconds -= offset;
3305306f 8012
09241813 8013 de = dictFind(c->db->dict,key->ptr);
3305306f 8014 if (de == NULL) {
8015 addReply(c,shared.czero);
8016 return;
8017 }
d4dd6556 8018 if (seconds <= 0) {
09241813 8019 if (dbDelete(c->db,key)) server.dirty++;
43e5ccdf 8020 addReply(c, shared.cone);
3305306f 8021 return;
8022 } else {
8023 time_t when = time(NULL)+seconds;
802e8373 8024 if (setExpire(c->db,key,when)) {
3305306f 8025 addReply(c,shared.cone);
77423026 8026 server.dirty++;
8027 } else {
3305306f 8028 addReply(c,shared.czero);
77423026 8029 }
3305306f 8030 return;
8031 }
8032}
8033
802e8373 8034static void expireCommand(redisClient *c) {
bbe025e0 8035 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 8036}
8037
8038static void expireatCommand(redisClient *c) {
bbe025e0 8039 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 8040}
8041
fd88489a 8042static void ttlCommand(redisClient *c) {
8043 time_t expire;
8044 int ttl = -1;
8045
8046 expire = getExpire(c->db,c->argv[1]);
8047 if (expire != -1) {
8048 ttl = (int) (expire-time(NULL));
8049 if (ttl < 0) ttl = -1;
8050 }
8051 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
8052}
8053
6e469882 8054/* ================================ MULTI/EXEC ============================== */
8055
8056/* Client state initialization for MULTI/EXEC */
8057static void initClientMultiState(redisClient *c) {
8058 c->mstate.commands = NULL;
8059 c->mstate.count = 0;
8060}
8061
8062/* Release all the resources associated with MULTI/EXEC state */
8063static void freeClientMultiState(redisClient *c) {
8064 int j;
8065
8066 for (j = 0; j < c->mstate.count; j++) {
8067 int i;
8068 multiCmd *mc = c->mstate.commands+j;
8069
8070 for (i = 0; i < mc->argc; i++)
8071 decrRefCount(mc->argv[i]);
8072 zfree(mc->argv);
8073 }
8074 zfree(c->mstate.commands);
8075}
8076
8077/* Add a new command into the MULTI commands queue */
8078static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
8079 multiCmd *mc;
8080 int j;
8081
8082 c->mstate.commands = zrealloc(c->mstate.commands,
8083 sizeof(multiCmd)*(c->mstate.count+1));
8084 mc = c->mstate.commands+c->mstate.count;
8085 mc->cmd = cmd;
8086 mc->argc = c->argc;
8087 mc->argv = zmalloc(sizeof(robj*)*c->argc);
8088 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
8089 for (j = 0; j < c->argc; j++)
8090 incrRefCount(mc->argv[j]);
8091 c->mstate.count++;
8092}
8093
8094static void multiCommand(redisClient *c) {
6531c94d 8095 if (c->flags & REDIS_MULTI) {
8096 addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
8097 return;
8098 }
6e469882 8099 c->flags |= REDIS_MULTI;
36c548f0 8100 addReply(c,shared.ok);
6e469882 8101}
8102
18b6cb76
DJ
8103static void discardCommand(redisClient *c) {
8104 if (!(c->flags & REDIS_MULTI)) {
8105 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
8106 return;
8107 }
8108
8109 freeClientMultiState(c);
8110 initClientMultiState(c);
8111 c->flags &= (~REDIS_MULTI);
a2645226 8112 unwatchAllKeys(c);
18b6cb76
DJ
8113 addReply(c,shared.ok);
8114}
8115
66c8853f 8116/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
8117 * implememntation for more information. */
8118static void execCommandReplicateMulti(redisClient *c) {
8119 struct redisCommand *cmd;
8120 robj *multistring = createStringObject("MULTI",5);
8121
8122 cmd = lookupCommand("multi");
8123 if (server.appendonly)
8124 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
8125 if (listLength(server.slaves))
8126 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
8127 decrRefCount(multistring);
8128}
8129
6e469882 8130static void execCommand(redisClient *c) {
8131 int j;
8132 robj **orig_argv;
8133 int orig_argc;
8134
8135 if (!(c->flags & REDIS_MULTI)) {
8136 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
8137 return;
8138 }
8139
37ab76c9 8140 /* Check if we need to abort the EXEC if some WATCHed key was touched.
8141 * A failed EXEC will return a multi bulk nil object. */
8142 if (c->flags & REDIS_DIRTY_CAS) {
8143 freeClientMultiState(c);
8144 initClientMultiState(c);
8145 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
8146 unwatchAllKeys(c);
8147 addReply(c,shared.nullmultibulk);
8148 return;
8149 }
8150
66c8853f 8151 /* Replicate a MULTI request now that we are sure the block is executed.
8152 * This way we'll deliver the MULTI/..../EXEC block as a whole and
8153 * both the AOF and the replication link will have the same consistency
8154 * and atomicity guarantees. */
8155 execCommandReplicateMulti(c);
8156
8157 /* Exec all the queued commands */
1ad4d316 8158 unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
6e469882 8159 orig_argv = c->argv;
8160 orig_argc = c->argc;
8161 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
8162 for (j = 0; j < c->mstate.count; j++) {
8163 c->argc = c->mstate.commands[j].argc;
8164 c->argv = c->mstate.commands[j].argv;
8165 call(c,c->mstate.commands[j].cmd);
8166 }
8167 c->argv = orig_argv;
8168 c->argc = orig_argc;
8169 freeClientMultiState(c);
8170 initClientMultiState(c);
1ad4d316 8171 c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
66c8853f 8172 /* Make sure the EXEC command is always replicated / AOF, since we
8173 * always send the MULTI command (we can't know beforehand if the
8174 * next operations will contain at least a modification to the DB). */
8175 server.dirty++;
6e469882 8176}
8177
4409877e 8178/* =========================== Blocking Operations ========================= */
8179
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blocking_keys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
8208
8209/* Set a client in blocking mode for the specified key, with the specified
8210 * timeout */
b177fd30 8211static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 8212 dictEntry *de;
8213 list *l;
b177fd30 8214 int j;
4409877e 8215
37ab76c9 8216 c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
8217 c->blocking_keys_num = numkeys;
4409877e 8218 c->blockingto = timeout;
b177fd30 8219 for (j = 0; j < numkeys; j++) {
8220 /* Add the key in the client structure, to map clients -> keys */
37ab76c9 8221 c->blocking_keys[j] = keys[j];
b177fd30 8222 incrRefCount(keys[j]);
4409877e 8223
b177fd30 8224 /* And in the other "side", to map keys -> clients */
37ab76c9 8225 de = dictFind(c->db->blocking_keys,keys[j]);
b177fd30 8226 if (de == NULL) {
8227 int retval;
8228
8229 /* For every key we take a list of clients blocked for it */
8230 l = listCreate();
37ab76c9 8231 retval = dictAdd(c->db->blocking_keys,keys[j],l);
b177fd30 8232 incrRefCount(keys[j]);
8233 assert(retval == DICT_OK);
8234 } else {
8235 l = dictGetEntryVal(de);
8236 }
8237 listAddNodeTail(l,c);
4409877e 8238 }
b177fd30 8239 /* Mark the client as a blocked client */
4409877e 8240 c->flags |= REDIS_BLOCKED;
d5d55fc3 8241 server.blpop_blocked_clients++;
4409877e 8242}
8243
8244/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 8245static void unblockClientWaitingData(redisClient *c) {
4409877e 8246 dictEntry *de;
8247 list *l;
b177fd30 8248 int j;
4409877e 8249
37ab76c9 8250 assert(c->blocking_keys != NULL);
b177fd30 8251 /* The client may wait for multiple keys, so unblock it for every key. */
37ab76c9 8252 for (j = 0; j < c->blocking_keys_num; j++) {
b177fd30 8253 /* Remove this client from the list of clients waiting for this key. */
37ab76c9 8254 de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
b177fd30 8255 assert(de != NULL);
8256 l = dictGetEntryVal(de);
8257 listDelNode(l,listSearchKey(l,c));
8258 /* If the list is empty we need to remove it to avoid wasting memory */
8259 if (listLength(l) == 0)
37ab76c9 8260 dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
8261 decrRefCount(c->blocking_keys[j]);
b177fd30 8262 }
8263 /* Cleanup the client structure */
37ab76c9 8264 zfree(c->blocking_keys);
8265 c->blocking_keys = NULL;
4409877e 8266 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 8267 server.blpop_blocked_clients--;
5921aa36 8268 /* We want to process data if there is some command waiting
b0d8747d 8269 * in the input buffer. Note that this is safe even if
8270 * unblockClientWaitingData() gets called from freeClient() because
8271 * freeClient() will be smart enough to call this function
8272 * *after* c->querybuf was set to NULL. */
4409877e 8273 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
8274}
8275
8276/* This should be called from any function PUSHing into lists.
8277 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
8278 * 'ele' is the element pushed.
8279 *
8280 * If the function returns 0 there was no client waiting for a list push
8281 * against this key.
8282 *
8283 * If the function returns 1 there was a client waiting for a list push
8284 * against this key, the element was passed to this client thus it's not
8285 * needed to actually add it to the list and the caller should return asap. */
8286static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
8287 struct dictEntry *de;
8288 redisClient *receiver;
8289 list *l;
8290 listNode *ln;
8291
37ab76c9 8292 de = dictFind(c->db->blocking_keys,key);
4409877e 8293 if (de == NULL) return 0;
8294 l = dictGetEntryVal(de);
8295 ln = listFirst(l);
8296 assert(ln != NULL);
8297 receiver = ln->value;
4409877e 8298
b177fd30 8299 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 8300 addReplyBulk(receiver,key);
8301 addReplyBulk(receiver,ele);
b0d8747d 8302 unblockClientWaitingData(receiver);
4409877e 8303 return 1;
8304}
8305
8306/* Blocking RPOP/LPOP */
8307static void blockingPopGenericCommand(redisClient *c, int where) {
8308 robj *o;
8309 time_t timeout;
b177fd30 8310 int j;
4409877e 8311
b177fd30 8312 for (j = 1; j < c->argc-1; j++) {
8313 o = lookupKeyWrite(c->db,c->argv[j]);
8314 if (o != NULL) {
8315 if (o->type != REDIS_LIST) {
8316 addReply(c,shared.wrongtypeerr);
4409877e 8317 return;
b177fd30 8318 } else {
8319 list *list = o->ptr;
8320 if (listLength(list) != 0) {
8321 /* If the list contains elements fall back to the usual
8322 * non-blocking POP operation */
8323 robj *argv[2], **orig_argv;
8324 int orig_argc;
e0a62c7f 8325
b177fd30 8326 /* We need to alter the command arguments before to call
8327 * popGenericCommand() as the command takes a single key. */
8328 orig_argv = c->argv;
8329 orig_argc = c->argc;
8330 argv[1] = c->argv[j];
8331 c->argv = argv;
8332 c->argc = 2;
8333
8334 /* Also the return value is different, we need to output
8335 * the multi bulk reply header and the key name. The
8336 * "real" command will add the last element (the value)
8337 * for us. If this souds like an hack to you it's just
8338 * because it is... */
8339 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 8340 addReplyBulk(c,argv[1]);
b177fd30 8341 popGenericCommand(c,where);
8342
8343 /* Fix the client structure with the original stuff */
8344 c->argv = orig_argv;
8345 c->argc = orig_argc;
8346 return;
8347 }
4409877e 8348 }
8349 }
8350 }
8351 /* If the list is empty or the key does not exists we must block */
b177fd30 8352 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 8353 if (timeout > 0) timeout += time(NULL);
b177fd30 8354 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 8355}
8356
8357static void blpopCommand(redisClient *c) {
8358 blockingPopGenericCommand(c,REDIS_HEAD);
8359}
8360
8361static void brpopCommand(redisClient *c) {
8362 blockingPopGenericCommand(c,REDIS_TAIL);
8363}
8364
ed9b544e 8365/* =============================== Replication ============================= */
8366
a4d1ba9a 8367static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 8368 ssize_t nwritten, ret = size;
8369 time_t start = time(NULL);
8370
8371 timeout++;
8372 while(size) {
8373 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
8374 nwritten = write(fd,ptr,size);
8375 if (nwritten == -1) return -1;
8376 ptr += nwritten;
8377 size -= nwritten;
8378 }
8379 if ((time(NULL)-start) > timeout) {
8380 errno = ETIMEDOUT;
8381 return -1;
8382 }
8383 }
8384 return ret;
8385}
8386
a4d1ba9a 8387static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 8388 ssize_t nread, totread = 0;
8389 time_t start = time(NULL);
8390
8391 timeout++;
8392 while(size) {
8393 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
8394 nread = read(fd,ptr,size);
8395 if (nread == -1) return -1;
8396 ptr += nread;
8397 size -= nread;
8398 totread += nread;
8399 }
8400 if ((time(NULL)-start) > timeout) {
8401 errno = ETIMEDOUT;
8402 return -1;
8403 }
8404 }
8405 return totread;
8406}
8407
/* Read a line from 'fd', one byte at a time via syncRead(), storing it as
 * a null terminated string in the 'size' bytes buffer pointed by 'ptr'.
 * 'timeout' is passed to syncRead() for every single byte.
 *
 * The newline is not stored, and a carriage return just before it is
 * replaced by the terminator. Reading stops at the newline or when
 * 'size'-1 bytes were stored, whichever comes first, so the buffer is
 * never written past its end and is always terminated.
 *
 * Returns the number of bytes stored before the newline (a stripped
 * carriage return is still counted), or -1 if syncRead() fails or 'size'
 * is not positive (errno is set to EINVAL in the latter case). */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) {
        errno = EINVAL;
        return -1;
    }
    *ptr = '\0';

    size--; /* Room reserved for the terminator. */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        }
        *ptr++ = c;
        *ptr = '\0';
        nread++;
        /* Account for the byte just stored: without this the loop never
         * notices that the buffer is full. */
        size--;
    }
    return nread;
}
8428
8429static void syncCommand(redisClient *c) {
40d224a9 8430 /* ignore SYNC if aleady slave or in monitor mode */
8431 if (c->flags & REDIS_SLAVE) return;
8432
8433 /* SYNC can't be issued when the server has pending data to send to
8434 * the client about already issued commands. We need a fresh reply
8435 * buffer registering the differences between the BGSAVE and the current
8436 * dataset, so that we can copy to other slaves if needed. */
8437 if (listLength(c->reply) != 0) {
8438 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
8439 return;
8440 }
8441
8442 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
8443 /* Here we need to check if there is a background saving operation
8444 * in progress, or if it is required to start one */
9d65a1bb 8445 if (server.bgsavechildpid != -1) {
40d224a9 8446 /* Ok a background save is in progress. Let's check if it is a good
8447 * one for replication, i.e. if there is another slave that is
8448 * registering differences since the server forked to save */
8449 redisClient *slave;
8450 listNode *ln;
c7df85a4 8451 listIter li;
40d224a9 8452
c7df85a4 8453 listRewind(server.slaves,&li);
8454 while((ln = listNext(&li))) {
40d224a9 8455 slave = ln->value;
8456 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 8457 }
8458 if (ln) {
8459 /* Perfect, the server is already registering differences for
8460 * another slave. Set the right state, and copy the buffer. */
8461 listRelease(c->reply);
8462 c->reply = listDup(slave->reply);
40d224a9 8463 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8464 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
8465 } else {
8466 /* No way, we need to wait for the next BGSAVE in order to
8467 * register differences */
8468 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
8469 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
8470 }
8471 } else {
8472 /* Ok we don't have a BGSAVE in progress, let's start one */
8473 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
8474 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
8475 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
8476 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
8477 return;
8478 }
8479 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8480 }
6208b3a7 8481 c->repldbfd = -1;
40d224a9 8482 c->flags |= REDIS_SLAVE;
8483 c->slaveseldb = 0;
6b47e12e 8484 listAddNodeTail(server.slaves,c);
40d224a9 8485 return;
8486}
8487
6208b3a7 8488static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
8489 redisClient *slave = privdata;
8490 REDIS_NOTUSED(el);
8491 REDIS_NOTUSED(mask);
8492 char buf[REDIS_IOBUF_LEN];
8493 ssize_t nwritten, buflen;
8494
8495 if (slave->repldboff == 0) {
8496 /* Write the bulk write count before to transfer the DB. In theory here
8497 * we don't know how much room there is in the output buffer of the
8498 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
8499 * operations) will never be smaller than the few bytes we need. */
8500 sds bulkcount;
8501
8502 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
8503 slave->repldbsize);
8504 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
8505 {
8506 sdsfree(bulkcount);
8507 freeClient(slave);
8508 return;
8509 }
8510 sdsfree(bulkcount);
8511 }
8512 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
8513 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
8514 if (buflen <= 0) {
8515 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
8516 (buflen == 0) ? "premature EOF" : strerror(errno));
8517 freeClient(slave);
8518 return;
8519 }
8520 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 8521 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 8522 strerror(errno));
8523 freeClient(slave);
8524 return;
8525 }
8526 slave->repldboff += nwritten;
8527 if (slave->repldboff == slave->repldbsize) {
8528 close(slave->repldbfd);
8529 slave->repldbfd = -1;
8530 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
8531 slave->replstate = REDIS_REPL_ONLINE;
8532 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 8533 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 8534 freeClient(slave);
8535 return;
8536 }
8537 addReplySds(slave,sdsempty());
8538 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
8539 }
8540}
ed9b544e 8541
a3b21203 8542/* This function is called at the end of every backgrond saving.
8543 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
8544 * otherwise REDIS_ERR is passed to the function.
8545 *
8546 * The goal of this function is to handle slaves waiting for a successful
8547 * background saving in order to perform non-blocking synchronization. */
8548static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 8549 listNode *ln;
8550 int startbgsave = 0;
c7df85a4 8551 listIter li;
ed9b544e 8552
c7df85a4 8553 listRewind(server.slaves,&li);
8554 while((ln = listNext(&li))) {
6208b3a7 8555 redisClient *slave = ln->value;
ed9b544e 8556
6208b3a7 8557 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
8558 startbgsave = 1;
8559 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
8560 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 8561 struct redis_stat buf;
e0a62c7f 8562
6208b3a7 8563 if (bgsaveerr != REDIS_OK) {
8564 freeClient(slave);
8565 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
8566 continue;
8567 }
8568 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 8569 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 8570 freeClient(slave);
8571 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
8572 continue;
8573 }
8574 slave->repldboff = 0;
8575 slave->repldbsize = buf.st_size;
8576 slave->replstate = REDIS_REPL_SEND_BULK;
8577 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 8578 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 8579 freeClient(slave);
8580 continue;
8581 }
8582 }
ed9b544e 8583 }
6208b3a7 8584 if (startbgsave) {
8585 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 8586 listIter li;
8587
8588 listRewind(server.slaves,&li);
6208b3a7 8589 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 8590 while((ln = listNext(&li))) {
6208b3a7 8591 redisClient *slave = ln->value;
ed9b544e 8592
6208b3a7 8593 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
8594 freeClient(slave);
8595 }
8596 }
8597 }
ed9b544e 8598}
8599
8600static int syncWithMaster(void) {
d0ccebcf 8601 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 8602 long dumpsize;
ed9b544e 8603 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 8604 int dfd, maxtries = 5;
ed9b544e 8605
8606 if (fd == -1) {
8607 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
8608 strerror(errno));
8609 return REDIS_ERR;
8610 }
d0ccebcf 8611
8612 /* AUTH with the master if required. */
8613 if(server.masterauth) {
8614 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
8615 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
8616 close(fd);
8617 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
8618 strerror(errno));
8619 return REDIS_ERR;
8620 }
8621 /* Read the AUTH result. */
8622 if (syncReadLine(fd,buf,1024,3600) == -1) {
8623 close(fd);
8624 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
8625 strerror(errno));
8626 return REDIS_ERR;
8627 }
8628 if (buf[0] != '+') {
8629 close(fd);
8630 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
8631 return REDIS_ERR;
8632 }
8633 }
8634
ed9b544e 8635 /* Issue the SYNC command */
8636 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
8637 close(fd);
8638 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
8639 strerror(errno));
8640 return REDIS_ERR;
8641 }
8642 /* Read the bulk write count */
8c4d91fc 8643 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 8644 close(fd);
8645 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
8646 strerror(errno));
8647 return REDIS_ERR;
8648 }
4aa701c1 8649 if (buf[0] != '$') {
8650 close(fd);
8651 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
8652 return REDIS_ERR;
8653 }
18e61fa2 8654 dumpsize = strtol(buf+1,NULL,10);
8655 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 8656 /* Read the bulk write data on a temp file */
8c5abee8 8657 while(maxtries--) {
8658 snprintf(tmpfile,256,
8659 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
8660 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
8661 if (dfd != -1) break;
5de9ad7c 8662 sleep(1);
8c5abee8 8663 }
ed9b544e 8664 if (dfd == -1) {
8665 close(fd);
8666 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
8667 return REDIS_ERR;
8668 }
8669 while(dumpsize) {
8670 int nread, nwritten;
8671
8672 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
8673 if (nread == -1) {
8674 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
8675 strerror(errno));
8676 close(fd);
8677 close(dfd);
8678 return REDIS_ERR;
8679 }
8680 nwritten = write(dfd,buf,nread);
8681 if (nwritten == -1) {
8682 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
8683 close(fd);
8684 close(dfd);
8685 return REDIS_ERR;
8686 }
8687 dumpsize -= nread;
8688 }
8689 close(dfd);
8690 if (rename(tmpfile,server.dbfilename) == -1) {
8691 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
8692 unlink(tmpfile);
8693 close(fd);
8694 return REDIS_ERR;
8695 }
8696 emptyDb();
f78fd11b 8697 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 8698 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
8699 close(fd);
8700 return REDIS_ERR;
8701 }
8702 server.master = createClient(fd);
8703 server.master->flags |= REDIS_MASTER;
179b3952 8704 server.master->authenticated = 1;
ed9b544e 8705 server.replstate = REDIS_REPL_CONNECTED;
8706 return REDIS_OK;
8707}
8708
321b0e13 8709static void slaveofCommand(redisClient *c) {
8710 if (!strcasecmp(c->argv[1]->ptr,"no") &&
8711 !strcasecmp(c->argv[2]->ptr,"one")) {
8712 if (server.masterhost) {
8713 sdsfree(server.masterhost);
8714 server.masterhost = NULL;
8715 if (server.master) freeClient(server.master);
8716 server.replstate = REDIS_REPL_NONE;
8717 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
8718 }
8719 } else {
8720 sdsfree(server.masterhost);
8721 server.masterhost = sdsdup(c->argv[1]->ptr);
8722 server.masterport = atoi(c->argv[2]->ptr);
8723 if (server.master) freeClient(server.master);
8724 server.replstate = REDIS_REPL_CONNECT;
8725 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
8726 server.masterhost, server.masterport);
8727 }
8728 addReply(c,shared.ok);
8729}
8730
3fd78bcd 8731/* ============================ Maxmemory directive ======================== */
8732
a5819310 8733/* Try to free one object form the pre-allocated objects free list.
8734 * This is useful under low mem conditions as by default we take 1 million
8735 * free objects allocated. On success REDIS_OK is returned, otherwise
8736 * REDIS_ERR. */
8737static int tryFreeOneObjectFromFreelist(void) {
f870935d 8738 robj *o;
8739
a5819310 8740 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
8741 if (listLength(server.objfreelist)) {
8742 listNode *head = listFirst(server.objfreelist);
8743 o = listNodeValue(head);
8744 listDelNode(server.objfreelist,head);
8745 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8746 zfree(o);
8747 return REDIS_OK;
8748 } else {
8749 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
8750 return REDIS_ERR;
8751 }
f870935d 8752}
8753
3fd78bcd 8754/* This function gets called when 'maxmemory' is set on the config file to limit
8755 * the max memory used by the server, and we are out of memory.
8756 * This function will try to, in order:
8757 *
8758 * - Free objects from the free list
8759 * - Try to remove keys with an EXPIRE set
8760 *
8761 * It is not possible to free enough memory to reach used-memory < maxmemory
8762 * the server will start refusing commands that will enlarge even more the
8763 * memory usage.
8764 */
8765static void freeMemoryIfNeeded(void) {
8766 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 8767 int j, k, freed = 0;
8768
8769 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
8770 for (j = 0; j < server.dbnum; j++) {
8771 int minttl = -1;
8772 robj *minkey = NULL;
8773 struct dictEntry *de;
8774
8775 if (dictSize(server.db[j].expires)) {
8776 freed = 1;
8777 /* From a sample of three keys drop the one nearest to
8778 * the natural expire */
8779 for (k = 0; k < 3; k++) {
8780 time_t t;
8781
8782 de = dictGetRandomKey(server.db[j].expires);
8783 t = (time_t) dictGetEntryVal(de);
8784 if (minttl == -1 || t < minttl) {
8785 minkey = dictGetEntryKey(de);
8786 minttl = t;
3fd78bcd 8787 }
3fd78bcd 8788 }
09241813 8789 dbDelete(server.db+j,minkey);
3fd78bcd 8790 }
3fd78bcd 8791 }
a5819310 8792 if (!freed) return; /* nothing to free... */
3fd78bcd 8793 }
8794}
8795
f80dff62 8796/* ============================== Append Only file ========================== */
8797
560db612 8798/* Called when the user switches from "appendonly yes" to "appendonly no"
8799 * at runtime using the CONFIG command. */
8800static void stopAppendOnly(void) {
8801 flushAppendOnlyFile();
8802 aof_fsync(server.appendfd);
8803 close(server.appendfd);
8804
8805 server.appendfd = -1;
8806 server.appendseldb = -1;
8807 server.appendonly = 0;
8808 /* rewrite operation in progress? kill it, wait child exit */
8809 if (server.bgsavechildpid != -1) {
8810 int statloc;
8811
8812 if (kill(server.bgsavechildpid,SIGKILL) != -1)
8813 wait3(&statloc,0,NULL);
8814 /* reset the buffer accumulating changes while the child saves */
8815 sdsfree(server.bgrewritebuf);
8816 server.bgrewritebuf = sdsempty();
8817 server.bgsavechildpid = -1;
8818 }
8819}
8820
8821/* Called when the user switches from "appendonly no" to "appendonly yes"
8822 * at runtime using the CONFIG command. */
8823static int startAppendOnly(void) {
8824 server.appendonly = 1;
8825 server.lastfsync = time(NULL);
8826 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
8827 if (server.appendfd == -1) {
8828 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
8829 return REDIS_ERR;
8830 }
8831 if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
8832 server.appendonly = 0;
8833 close(server.appendfd);
8834 redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.",strerror(errno));
8835 return REDIS_ERR;
8836 }
8837 return REDIS_OK;
8838}
8839
28ed1f33 8840/* Write the append only file buffer on disk.
8841 *
8842 * Since we are required to write the AOF before replying to the client,
8843 * and the only way the client socket can get a write is entering when the
8844 * the event loop, we accumulate all the AOF writes in a memory
8845 * buffer and write it on disk using this function just before entering
8846 * the event loop again. */
8847static void flushAppendOnlyFile(void) {
8848 time_t now;
8849 ssize_t nwritten;
8850
8851 if (sdslen(server.aofbuf) == 0) return;
8852
8853 /* We want to perform a single write. This should be guaranteed atomic
8854 * at least if the filesystem we are writing is a real physical one.
8855 * While this will save us against the server being killed I don't think
8856 * there is much to do about the whole server stopping for power problems
8857 * or alike */
8858 nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
8859 if (nwritten != (signed)sdslen(server.aofbuf)) {
8860 /* Ooops, we are in troubles. The best thing to do for now is
8861 * aborting instead of giving the illusion that everything is
8862 * working as expected. */
8863 if (nwritten == -1) {
8864 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8865 } else {
8866 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8867 }
8868 exit(1);
8869 }
8870 sdsfree(server.aofbuf);
8871 server.aofbuf = sdsempty();
8872
38db9171 8873 /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
8874 * childs performing heavy I/O on disk. */
8875 if (server.no_appendfsync_on_rewrite &&
8876 (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
8877 return;
28ed1f33 8878 /* Fsync if needed */
8879 now = time(NULL);
8880 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8881 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8882 now-server.lastfsync > 1))
8883 {
8884 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8885 * flushing metadata. */
8886 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
8887 server.lastfsync = now;
8888 }
8889}
8890
9376e434
PN
8891static sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
8892 int j;
8893 buf = sdscatprintf(buf,"*%d\r\n",argc);
8894 for (j = 0; j < argc; j++) {
8895 robj *o = getDecodedObject(argv[j]);
8896 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
8897 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8898 buf = sdscatlen(buf,"\r\n",2);
8899 decrRefCount(o);
8900 }
8901 return buf;
8902}
8903
8904static sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
8905 int argc = 3;
8906 long when;
8907 robj *argv[3];
8908
8909 /* Make sure we can use strtol */
8910 seconds = getDecodedObject(seconds);
8911 when = time(NULL)+strtol(seconds->ptr,NULL,10);
8912 decrRefCount(seconds);
8913
8914 argv[0] = createStringObject("EXPIREAT",8);
8915 argv[1] = key;
8916 argv[2] = createObject(REDIS_STRING,
8917 sdscatprintf(sdsempty(),"%ld",when));
8918 buf = catAppendOnlyGenericCommand(buf, argc, argv);
8919 decrRefCount(argv[0]);
8920 decrRefCount(argv[2]);
8921 return buf;
8922}
8923
f80dff62 8924static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8925 sds buf = sdsempty();
f80dff62 8926 robj *tmpargv[3];
8927
8928 /* The DB this command was targetting is not the same as the last command
8929 * we appendend. To issue a SELECT command is needed. */
8930 if (dictid != server.appendseldb) {
8931 char seldb[64];
8932
8933 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 8934 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 8935 (unsigned long)strlen(seldb),seldb);
f80dff62 8936 server.appendseldb = dictid;
8937 }
8938
f80dff62 8939 if (cmd->proc == expireCommand) {
9376e434
PN
8940 /* Translate EXPIRE into EXPIREAT */
8941 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8942 } else if (cmd->proc == setexCommand) {
8943 /* Translate SETEX to SET and EXPIREAT */
8944 tmpargv[0] = createStringObject("SET",3);
f80dff62 8945 tmpargv[1] = argv[1];
9376e434
PN
8946 tmpargv[2] = argv[3];
8947 buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
8948 decrRefCount(tmpargv[0]);
8949 buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
8950 } else {
8951 buf = catAppendOnlyGenericCommand(buf,argc,argv);
f80dff62 8952 }
8953
28ed1f33 8954 /* Append to the AOF buffer. This will be flushed on disk just before
8955 * of re-entering the event loop, so before the client will get a
8956 * positive reply about the operation performed. */
8957 server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
8958
85a83172 8959 /* If a background append only file rewriting is in progress we want to
8960 * accumulate the differences between the child DB and the current one
8961 * in a buffer, so that when the child process will do its work we
8962 * can append the differences to the new append only file. */
8963 if (server.bgrewritechildpid != -1)
8964 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8965
8966 sdsfree(buf);
f80dff62 8967}
8968
8969/* In Redis commands are always executed in the context of a client, so in
8970 * order to load the append only file we need to create a fake client. */
8971static struct redisClient *createFakeClient(void) {
8972 struct redisClient *c = zmalloc(sizeof(*c));
8973
8974 selectDb(c,0);
8975 c->fd = -1;
8976 c->querybuf = sdsempty();
8977 c->argc = 0;
8978 c->argv = NULL;
8979 c->flags = 0;
9387d17d 8980 /* We set the fake client as a slave waiting for the synchronization
8981 * so that Redis will not try to send replies to this client. */
8982 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 8983 c->reply = listCreate();
8984 listSetFreeMethod(c->reply,decrRefCount);
8985 listSetDupMethod(c->reply,dupClientReplyValue);
4132ad8d 8986 initClientMultiState(c);
f80dff62 8987 return c;
8988}
8989
8990static void freeFakeClient(struct redisClient *c) {
8991 sdsfree(c->querybuf);
8992 listRelease(c->reply);
4132ad8d 8993 freeClientMultiState(c);
f80dff62 8994 zfree(c);
8995}
8996
8997/* Replay the append log file. On error REDIS_OK is returned. On non fatal
8998 * error (the append only file is zero-length) REDIS_ERR is returned. On
8999 * fatal error an error message is logged and the program exists. */
9000int loadAppendOnlyFile(char *filename) {
9001 struct redisClient *fakeClient;
9002 FILE *fp = fopen(filename,"r");
9003 struct redis_stat sb;
4132ad8d 9004 int appendonly = server.appendonly;
f80dff62 9005
9006 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
9007 return REDIS_ERR;
9008
9009 if (fp == NULL) {
9010 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
9011 exit(1);
9012 }
9013
4132ad8d
PN
9014 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
9015 * to the same file we're about to read. */
9016 server.appendonly = 0;
9017
f80dff62 9018 fakeClient = createFakeClient();
9019 while(1) {
9020 int argc, j;
9021 unsigned long len;
9022 robj **argv;
9023 char buf[128];
9024 sds argsds;
9025 struct redisCommand *cmd;
a89b7013 9026 int force_swapout;
f80dff62 9027
9028 if (fgets(buf,sizeof(buf),fp) == NULL) {
9029 if (feof(fp))
9030 break;
9031 else
9032 goto readerr;
9033 }
9034 if (buf[0] != '*') goto fmterr;
9035 argc = atoi(buf+1);
9036 argv = zmalloc(sizeof(robj*)*argc);
9037 for (j = 0; j < argc; j++) {
9038 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
9039 if (buf[0] != '$') goto fmterr;
9040 len = strtol(buf+1,NULL,10);
9041 argsds = sdsnewlen(NULL,len);
0f151ef1 9042 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 9043 argv[j] = createObject(REDIS_STRING,argsds);
9044 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
9045 }
9046
9047 /* Command lookup */
9048 cmd = lookupCommand(argv[0]->ptr);
9049 if (!cmd) {
9050 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
9051 exit(1);
9052 }
bdcb92f2 9053 /* Try object encoding */
f80dff62 9054 if (cmd->flags & REDIS_CMD_BULK)
05df7621 9055 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 9056 /* Run the command in the context of a fake client */
9057 fakeClient->argc = argc;
9058 fakeClient->argv = argv;
9059 cmd->proc(fakeClient);
9060 /* Discard the reply objects list from the fake client */
9061 while(listLength(fakeClient->reply))
9062 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
9063 /* Clean up, ready for the next command */
9064 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
9065 zfree(argv);
b492cf00 9066 /* Handle swapping while loading big datasets when VM is on */
a89b7013 9067 force_swapout = 0;
9068 if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
9069 force_swapout = 1;
9070
9071 if (server.vm_enabled && force_swapout) {
b492cf00 9072 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 9073 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 9074 }
9075 }
f80dff62 9076 }
4132ad8d
PN
9077
9078 /* This point can only be reached when EOF is reached without errors.
9079 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
9080 if (fakeClient->flags & REDIS_MULTI) goto readerr;
9081
f80dff62 9082 fclose(fp);
9083 freeFakeClient(fakeClient);
4132ad8d 9084 server.appendonly = appendonly;
f80dff62 9085 return REDIS_OK;
9086
9087readerr:
9088 if (feof(fp)) {
9089 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
9090 } else {
9091 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
9092 }
9093 exit(1);
9094fmterr:
9095 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
9096 exit(1);
9097}
9098
/* Write the binary-safe string 's' of 'len' bytes into 'fp' in the bulk
 * format: $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 as soon as one of the writes fails. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char header[128];
    int hlen = 0;

    /* Build the "$<count>\r\n" header by hand. */
    header[hlen++] = '$';
    hlen += ll2string(header+1,sizeof(header)-1,len);
    header[hlen++] = '\r';
    header[hlen++] = '\n';

    /* The payload write is skipped for the empty string. */
    return fwrite(header,hlen,1,fp) != 0 &&
           (len == 0 || fwrite(s,len,1,fp) != 0) &&
           fwrite("\r\n",2,1,fp) != 0;
}
9113
/* Write the double 'd', formatted with "%.17g", into 'fp' in the bulk
 * format: $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if a write fails. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    int paylen, hdrlen;

    /* The payload is formatted together with its trailing CRLF, which is
     * not part of the advertised count: hence the -2. */
    paylen = snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    hdrlen = snprintf(header,sizeof(header),"$%lu\r\n",
        (unsigned long)paylen-2);

    if (fwrite(header,hdrlen,1,fp) == 0) return 0;
    if (fwrite(payload,paylen,1,fp) == 0) return 0;
    return 1;
}
9124
/* Write the long long 'l' into 'fp' in the bulk format:
 * $<count>\r\n<payload>\r\n
 *
 * The whole reply is assembled in memory and emitted with a single fwrite().
 * Returns 1 on success, 0 if the write fails. */
static int fwriteBulkLongLong(FILE *fp, long long l) {
    char digits[128], out[128];
    unsigned int ndigits, total;

    ndigits = ll2string(digits,32,l);
    total = snprintf(out,sizeof(out),"$%u\r\n%s\r\n",ndigits,digits);
    return (fwrite(out,total,1,fp) == 0) ? 0 : 1;
}
9134
9eaef89f
PN
9135/* Delegate writing an object to writing a bulk string or bulk long long. */
9136static int fwriteBulkObject(FILE *fp, robj *obj) {
9137 /* Avoid using getDecodedObject to help copy-on-write (we are often
9138 * in a child process when this function is called). */
9139 if (obj->encoding == REDIS_ENCODING_INT) {
9140 return fwriteBulkLongLong(fp,(long)obj->ptr);
9141 } else if (obj->encoding == REDIS_ENCODING_RAW) {
9142 return fwriteBulkString(fp,obj->ptr,sdslen(obj->ptr));
9143 } else {
9144 redisPanic("Unknown string encoding");
9145 }
9146}
9147
9d65a1bb 9148/* Write a sequence of commands able to fully rebuild the dataset into
9149 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
9150static int rewriteAppendOnlyFile(char *filename) {
9151 dictIterator *di = NULL;
9152 dictEntry *de;
9153 FILE *fp;
9154 char tmpfile[256];
9155 int j;
9156 time_t now = time(NULL);
9157
9158 /* Note that we have to use a different temp name here compared to the
9159 * one used by rewriteAppendOnlyFileBackground() function. */
9160 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
9161 fp = fopen(tmpfile,"w");
9162 if (!fp) {
9163 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
9164 return REDIS_ERR;
9165 }
9166 for (j = 0; j < server.dbnum; j++) {
9167 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
9168 redisDb *db = server.db+j;
9169 dict *d = db->dict;
9170 if (dictSize(d) == 0) continue;
9171 di = dictGetIterator(d);
9172 if (!di) {
9173 fclose(fp);
9174 return REDIS_ERR;
9175 }
9176
9177 /* SELECT the new DB */
9178 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
9eaef89f 9179 if (fwriteBulkLongLong(fp,j) == 0) goto werr;
9d65a1bb 9180
9181 /* Iterate this DB writing every entry */
9182 while((de = dictNext(di)) != NULL) {
09241813 9183 sds keystr = dictGetEntryKey(de);
9184 robj key, *o;
e7546c63 9185 time_t expiretime;
9186 int swapped;
9187
09241813 9188 keystr = dictGetEntryKey(de);
560db612 9189 o = dictGetEntryVal(de);
09241813 9190 initStaticStringObject(key,keystr);
b9bc0eef 9191 /* If the value for this key is swapped, load a preview in memory.
9192 * We use a "swapped" flag to remember if we need to free the
9193 * value object instead to just increment the ref count anyway
9194 * in order to avoid copy-on-write of pages if we are forked() */
560db612 9195 if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
9196 o->storage == REDIS_VM_SWAPPING) {
e7546c63 9197 swapped = 0;
9198 } else {
560db612 9199 o = vmPreviewObject(o);
e7546c63 9200 swapped = 1;
9201 }
09241813 9202 expiretime = getExpire(db,&key);
9d65a1bb 9203
9204 /* Save the key and associated value */
9d65a1bb 9205 if (o->type == REDIS_STRING) {
9206 /* Emit a SET command */
9207 char cmd[]="*3\r\n$3\r\nSET\r\n";
9208 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9209 /* Key and value */
09241813 9210 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9c8e3cee 9211 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 9212 } else if (o->type == REDIS_LIST) {
9213 /* Emit the RPUSHes needed to rebuild the list */
6ddc908a
PN
9214 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
9215 if (o->encoding == REDIS_ENCODING_ZIPLIST) {
9216 unsigned char *zl = o->ptr;
9217 unsigned char *p = ziplistIndex(zl,0);
9218 unsigned char *vstr;
9219 unsigned int vlen;
9220 long long vlong;
9221
9222 while(ziplistGet(p,&vstr,&vlen,&vlong)) {
9223 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
846d8b3e 9224 if (fwriteBulkObject(fp,&key) == 0) goto werr;
6ddc908a
PN
9225 if (vstr) {
9226 if (fwriteBulkString(fp,(char*)vstr,vlen) == 0)
9227 goto werr;
9228 } else {
9229 if (fwriteBulkLongLong(fp,vlong) == 0)
9230 goto werr;
9231 }
9232 p = ziplistNext(zl,p);
9233 }
9234 } else if (o->encoding == REDIS_ENCODING_LIST) {
9235 list *list = o->ptr;
9236 listNode *ln;
9237 listIter li;
9238
9239 listRewind(list,&li);
9240 while((ln = listNext(&li))) {
9241 robj *eleobj = listNodeValue(ln);
9242
9243 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
846d8b3e 9244 if (fwriteBulkObject(fp,&key) == 0) goto werr;
6ddc908a
PN
9245 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9246 }
9247 } else {
9248 redisPanic("Unknown list encoding");
9d65a1bb 9249 }
9250 } else if (o->type == REDIS_SET) {
9251 /* Emit the SADDs needed to rebuild the set */
9252 dict *set = o->ptr;
9253 dictIterator *di = dictGetIterator(set);
9254 dictEntry *de;
9255
9256 while((de = dictNext(di)) != NULL) {
9257 char cmd[]="*3\r\n$4\r\nSADD\r\n";
9258 robj *eleobj = dictGetEntryKey(de);
9259
9260 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9261 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9c8e3cee 9262 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 9263 }
9264 dictReleaseIterator(di);
9265 } else if (o->type == REDIS_ZSET) {
9266 /* Emit the ZADDs needed to rebuild the sorted set */
9267 zset *zs = o->ptr;
9268 dictIterator *di = dictGetIterator(zs->dict);
9269 dictEntry *de;
9270
9271 while((de = dictNext(di)) != NULL) {
9272 char cmd[]="*4\r\n$4\r\nZADD\r\n";
9273 robj *eleobj = dictGetEntryKey(de);
9274 double *score = dictGetEntryVal(de);
9275
9276 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9277 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9d65a1bb 9278 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 9279 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 9280 }
9281 dictReleaseIterator(di);
9c8e3cee 9282 } else if (o->type == REDIS_HASH) {
9283 char cmd[]="*4\r\n$4\r\nHSET\r\n";
9284
9285 /* Emit the HSETs needed to rebuild the hash */
9286 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9287 unsigned char *p = zipmapRewind(o->ptr);
9288 unsigned char *field, *val;
9289 unsigned int flen, vlen;
9290
9291 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
9292 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9293 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9c8e3cee 9294 if (fwriteBulkString(fp,(char*)field,flen) == -1)
9295 return -1;
9296 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
9297 return -1;
9298 }
9299 } else {
9300 dictIterator *di = dictGetIterator(o->ptr);
9301 dictEntry *de;
9302
9303 while((de = dictNext(di)) != NULL) {
9304 robj *field = dictGetEntryKey(de);
9305 robj *val = dictGetEntryVal(de);
9306
9307 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9308 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9c8e3cee 9309 if (fwriteBulkObject(fp,field) == -1) return -1;
9310 if (fwriteBulkObject(fp,val) == -1) return -1;
9311 }
9312 dictReleaseIterator(di);
9313 }
9d65a1bb 9314 } else {
f83c6cb5 9315 redisPanic("Unknown object type");
9d65a1bb 9316 }
9317 /* Save the expire time */
9318 if (expiretime != -1) {
e96e4fbf 9319 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 9320 /* If this key is already expired skip it */
9321 if (expiretime < now) continue;
9322 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
09241813 9323 if (fwriteBulkObject(fp,&key) == 0) goto werr;
9eaef89f 9324 if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr;
9d65a1bb 9325 }
b9bc0eef 9326 if (swapped) decrRefCount(o);
9d65a1bb 9327 }
9328 dictReleaseIterator(di);
9329 }
9330
9331 /* Make sure data will not remain on the OS's output buffers */
9332 fflush(fp);
b0bd87f6 9333 aof_fsync(fileno(fp));
9d65a1bb 9334 fclose(fp);
e0a62c7f 9335
9d65a1bb 9336 /* Use RENAME to make sure the DB file is changed atomically only
9337 * if the generate DB file is ok. */
9338 if (rename(tmpfile,filename) == -1) {
9339 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
9340 unlink(tmpfile);
9341 return REDIS_ERR;
9342 }
9343 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
9344 return REDIS_OK;
9345
9346werr:
9347 fclose(fp);
9348 unlink(tmpfile);
e96e4fbf 9349 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 9350 if (di) dictReleaseIterator(di);
9351 return REDIS_ERR;
9352}
9353
9354/* This is how rewriting of the append only file in background works:
9355 *
9356 * 1) The user calls BGREWRITEAOF
9357 * 2) Redis calls this function, that forks():
9358 * 2a) the child rewrite the append only file in a temp file.
9359 * 2b) the parent accumulates differences in server.bgrewritebuf.
9360 * 3) When the child finished '2a' exists.
9361 * 4) The parent will trap the exit code, if it's OK, will append the
9362 * data accumulated into server.bgrewritebuf into the temp file, and
9363 * finally will rename(2) the temp file in the actual file name.
9364 * The the new file is reopened as the new append only file. Profit!
9365 */
9366static int rewriteAppendOnlyFileBackground(void) {
9367 pid_t childpid;
9368
9369 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 9370 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 9371 if ((childpid = fork()) == 0) {
9372 /* Child */
9373 char tmpfile[256];
9d65a1bb 9374
054e426d 9375 if (server.vm_enabled) vmReopenSwapFile();
9376 close(server.fd);
9d65a1bb 9377 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
9378 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 9379 _exit(0);
9d65a1bb 9380 } else {
478c2c6f 9381 _exit(1);
9d65a1bb 9382 }
9383 } else {
9384 /* Parent */
9385 if (childpid == -1) {
9386 redisLog(REDIS_WARNING,
9387 "Can't rewrite append only file in background: fork: %s",
9388 strerror(errno));
9389 return REDIS_ERR;
9390 }
9391 redisLog(REDIS_NOTICE,
9392 "Background append only file rewriting started by pid %d",childpid);
9393 server.bgrewritechildpid = childpid;
884d4b39 9394 updateDictResizePolicy();
85a83172 9395 /* We set appendseldb to -1 in order to force the next call to the
9396 * feedAppendOnlyFile() to issue a SELECT command, so the differences
9397 * accumulated by the parent into server.bgrewritebuf will start
9398 * with a SELECT statement and it will be safe to merge. */
9399 server.appendseldb = -1;
9d65a1bb 9400 return REDIS_OK;
9401 }
9402 return REDIS_OK; /* unreached */
9403}
9404
9405static void bgrewriteaofCommand(redisClient *c) {
9406 if (server.bgrewritechildpid != -1) {
9407 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
9408 return;
9409 }
9410 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 9411 char *status = "+Background append only file rewriting started\r\n";
9412 addReplySds(c,sdsnew(status));
9d65a1bb 9413 } else {
9414 addReply(c,shared.err);
9415 }
9416}
9417
/* Remove the temp file "temp-rewriteaof-bg-<pid>.aof" used by the background
 * AOF rewrite child whose pid is 'childpid'. The result of unlink(2) is
 * ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int)childpid);
    unlink(path);
}
9424
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make it clearer which functionality belongs to the blocking VM and which
 * to the threaded (non-blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
9445
560db612 9446/* =================== Virtual Memory - Blocking Side ====================== */
2e5eb04e 9447
560db612 9448/* Create a VM pointer object. This kind of objects are used in place of
9449 * values in the key -> value hash table, for swapped out objects. */
9450static vmpointer *createVmPointer(int vtype) {
9451 vmpointer *vp = zmalloc(sizeof(vmpointer));
2e5eb04e 9452
560db612 9453 vp->type = REDIS_VMPOINTER;
9454 vp->storage = REDIS_VM_SWAPPED;
9455 vp->vtype = vtype;
9456 return vp;
2e5eb04e 9457}
9458
75680a3c 9459static void vmInit(void) {
9460 off_t totsize;
996cb5f7 9461 int pipefds[2];
bcaa7a4f 9462 size_t stacksize;
8b5bb414 9463 struct flock fl;
75680a3c 9464
4ad37480 9465 if (server.vm_max_threads != 0)
9466 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
9467
054e426d 9468 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
8b5bb414 9469 /* Try to open the old swap file, otherwise create it */
6fa987e3 9470 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
9471 server.vm_fp = fopen(server.vm_swap_file,"w+b");
9472 }
75680a3c 9473 if (server.vm_fp == NULL) {
6fa987e3 9474 redisLog(REDIS_WARNING,
8b5bb414 9475 "Can't open the swap file: %s. Exiting.",
6fa987e3 9476 strerror(errno));
75680a3c 9477 exit(1);
9478 }
9479 server.vm_fd = fileno(server.vm_fp);
8b5bb414 9480 /* Lock the swap file for writing, this is useful in order to avoid
9481 * another instance to use the same swap file for a config error. */
9482 fl.l_type = F_WRLCK;
9483 fl.l_whence = SEEK_SET;
9484 fl.l_start = fl.l_len = 0;
9485 if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
9486 redisLog(REDIS_WARNING,
9487 "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
9488 exit(1);
9489 }
9490 /* Initialize */
75680a3c 9491 server.vm_next_page = 0;
9492 server.vm_near_pages = 0;
7d98e08c 9493 server.vm_stats_used_pages = 0;
9494 server.vm_stats_swapped_objects = 0;
9495 server.vm_stats_swapouts = 0;
9496 server.vm_stats_swapins = 0;
75680a3c 9497 totsize = server.vm_pages*server.vm_page_size;
9498 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
9499 if (ftruncate(server.vm_fd,totsize) == -1) {
9500 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
9501 strerror(errno));
9502 exit(1);
9503 } else {
9504 redisLog(REDIS_NOTICE,"Swap file allocated with success");
9505 }
7d30035d 9506 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 9507 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 9508 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 9509 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 9510
996cb5f7 9511 /* Initialize threaded I/O (used by Virtual Memory) */
9512 server.io_newjobs = listCreate();
9513 server.io_processing = listCreate();
9514 server.io_processed = listCreate();
d5d55fc3 9515 server.io_ready_clients = listCreate();
92f8e882 9516 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 9517 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
9518 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 9519 server.io_active_threads = 0;
996cb5f7 9520 if (pipe(pipefds) == -1) {
9521 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
9522 ,strerror(errno));
9523 exit(1);
9524 }
9525 server.io_ready_pipe_read = pipefds[0];
9526 server.io_ready_pipe_write = pipefds[1];
9527 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 9528 /* LZF requires a lot of stack */
9529 pthread_attr_init(&server.io_threads_attr);
9530 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
9531 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
9532 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 9533 /* Listen for events in the threaded I/O pipe */
9534 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
9535 vmThreadedIOCompletedJob, NULL) == AE_ERR)
9536 oom("creating file event");
75680a3c 9537}
9538
06224fec 9539/* Mark the page as used */
9540static void vmMarkPageUsed(off_t page) {
9541 off_t byte = page/8;
9542 int bit = page&7;
970e10bb 9543 redisAssert(vmFreePage(page) == 1);
06224fec 9544 server.vm_bitmap[byte] |= 1<<bit;
9545}
9546
9547/* Mark N contiguous pages as used, with 'page' being the first. */
9548static void vmMarkPagesUsed(off_t page, off_t count) {
9549 off_t j;
9550
9551 for (j = 0; j < count; j++)
7d30035d 9552 vmMarkPageUsed(page+j);
7d98e08c 9553 server.vm_stats_used_pages += count;
7c775e09 9554 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
9555 (long long)count, (long long)page);
06224fec 9556}
9557
9558/* Mark the page as free */
9559static void vmMarkPageFree(off_t page) {
9560 off_t byte = page/8;
9561 int bit = page&7;
970e10bb 9562 redisAssert(vmFreePage(page) == 0);
06224fec 9563 server.vm_bitmap[byte] &= ~(1<<bit);
9564}
9565
9566/* Mark N contiguous pages as free, with 'page' being the first. */
9567static void vmMarkPagesFree(off_t page, off_t count) {
9568 off_t j;
9569
9570 for (j = 0; j < count; j++)
7d30035d 9571 vmMarkPageFree(page+j);
7d98e08c 9572 server.vm_stats_used_pages -= count;
7c775e09 9573 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
9574 (long long)count, (long long)page);
06224fec 9575}
9576
9577/* Test if the page is free */
9578static int vmFreePage(off_t page) {
9579 off_t byte = page/8;
9580 int bit = page&7;
7d30035d 9581 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 9582}
9583
9584/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 9585 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 9586 * REDIS_ERR is returned.
06224fec 9587 *
9588 * This function uses a simple algorithm: we try to allocate
9589 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
9590 * again from the start of the swap file searching for free spaces.
9591 *
9592 * If it looks pretty clear that there are no free pages near our offset
9593 * we try to find less populated places doing a forward jump of
9594 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
9595 * without hurry, and then we jump again and so forth...
e0a62c7f 9596 *
06224fec 9597 * This function can be improved using a free list to avoid to guess
9598 * too much, since we could collect data about freed pages.
9599 *
9600 * note: I implemented this function just after watching an episode of
9601 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
9602 */
c7df85a4 9603static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 9604 off_t base, offset = 0, since_jump = 0, numfree = 0;
9605
9606 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
9607 server.vm_near_pages = 0;
9608 server.vm_next_page = 0;
9609 }
9610 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
9611 base = server.vm_next_page;
9612
9613 while(offset < server.vm_pages) {
9614 off_t this = base+offset;
9615
9616 /* If we overflow, restart from page zero */
9617 if (this >= server.vm_pages) {
9618 this -= server.vm_pages;
9619 if (this == 0) {
9620 /* Just overflowed, what we found on tail is no longer
9621 * interesting, as it's no longer contiguous. */
9622 numfree = 0;
9623 }
9624 }
9625 if (vmFreePage(this)) {
9626 /* This is a free page */
9627 numfree++;
9628 /* Already got N free pages? Return to the caller, with success */
9629 if (numfree == n) {
7d30035d 9630 *first = this-(n-1);
9631 server.vm_next_page = this+1;
7c775e09 9632 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 9633 return REDIS_OK;
06224fec 9634 }
9635 } else {
9636 /* The current one is not a free page */
9637 numfree = 0;
9638 }
9639
9640 /* Fast-forward if the current page is not free and we already
9641 * searched enough near this place. */
9642 since_jump++;
9643 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
9644 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
9645 since_jump = 0;
9646 /* Note that even if we rewind after the jump, we are don't need
9647 * to make sure numfree is set to zero as we only jump *if* it
9648 * is set to zero. */
9649 } else {
9650 /* Otherwise just check the next page */
9651 offset++;
9652 }
9653 }
3a66edc7 9654 return REDIS_ERR;
9655}
9656
a5819310 9657/* Write the specified object at the specified page of the swap file */
9658static int vmWriteObjectOnSwap(robj *o, off_t page) {
9659 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9660 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
9661 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9662 redisLog(REDIS_WARNING,
9ebed7cf 9663 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 9664 strerror(errno));
9665 return REDIS_ERR;
9666 }
9667 rdbSaveObject(server.vm_fp,o);
ba76a8f9 9668 fflush(server.vm_fp);
a5819310 9669 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9670 return REDIS_OK;
9671}
9672
a4798f73 9673/* Transfers the 'val' object to disk. Store all the information
9674 * a 'vmpointer' object containing all the information needed to load the
9675 * object back later is returned.
9676 *
3a66edc7 9677 * If we can't find enough contiguous empty pages to swap the object on disk
a4798f73 9678 * NULL is returned. */
560db612 9679static vmpointer *vmSwapObjectBlocking(robj *val) {
b9bc0eef 9680 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 9681 off_t page;
560db612 9682 vmpointer *vp;
3a66edc7 9683
560db612 9684 assert(val->storage == REDIS_VM_MEMORY);
9685 assert(val->refcount == 1);
9686 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return NULL;
9687 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return NULL;
9688
9689 vp = createVmPointer(val->type);
9690 vp->page = page;
9691 vp->usedpages = pages;
3a66edc7 9692 decrRefCount(val); /* Deallocate the object from memory. */
9693 vmMarkPagesUsed(page,pages);
560db612 9694 redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)",
9695 (void*) val,
7d30035d 9696 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 9697 server.vm_stats_swapped_objects++;
9698 server.vm_stats_swapouts++;
560db612 9699 return vp;
3a66edc7 9700}
9701
a5819310 9702static robj *vmReadObjectFromSwap(off_t page, int type) {
9703 robj *o;
3a66edc7 9704
a5819310 9705 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
9706 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 9707 redisLog(REDIS_WARNING,
d5d55fc3 9708 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 9709 strerror(errno));
478c2c6f 9710 _exit(1);
3a66edc7 9711 }
a5819310 9712 o = rdbLoadObject(type,server.vm_fp);
9713 if (o == NULL) {
d5d55fc3 9714 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 9715 _exit(1);
3a66edc7 9716 }
a5819310 9717 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
9718 return o;
9719}
9720
560db612 9721/* Load the specified object from swap to memory.
a5819310 9722 * The newly allocated object is returned.
9723 *
9724 * If preview is true the unserialized object is returned to the caller but
560db612 9725 * the pages are not marked as freed, nor the vp object is freed. */
9726static robj *vmGenericLoadObject(vmpointer *vp, int preview) {
a5819310 9727 robj *val;
9728
560db612 9729 redisAssert(vp->type == REDIS_VMPOINTER &&
9730 (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING));
9731 val = vmReadObjectFromSwap(vp->page,vp->vtype);
7e69548d 9732 if (!preview) {
560db612 9733 redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp);
9734 vmMarkPagesFree(vp->page,vp->usedpages);
9735 zfree(vp);
7d98e08c 9736 server.vm_stats_swapped_objects--;
38aba9a1 9737 } else {
560db612 9738 redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp);
7e69548d 9739 }
7d98e08c 9740 server.vm_stats_swapins++;
3a66edc7 9741 return val;
06224fec 9742}
9743
560db612 9744/* Plain object loading, from swap to memory.
9745 *
9746 * 'o' is actually a redisVmPointer structure that will be freed by the call.
9747 * The return value is the loaded object. */
9748static robj *vmLoadObject(robj *o) {
996cb5f7 9749 /* If we are loading the object in background, stop it, we
9750 * need to load this object synchronously ASAP. */
560db612 9751 if (o->storage == REDIS_VM_LOADING)
9752 vmCancelThreadedIOJob(o);
9753 return vmGenericLoadObject((vmpointer*)o,0);
7e69548d 9754}
9755
9756/* Just load the value on disk, without to modify the key.
9757 * This is useful when we want to perform some operation on the value
9758 * without to really bring it from swap to memory, like while saving the
9759 * dataset or rewriting the append only log. */
560db612 9760static robj *vmPreviewObject(robj *o) {
9761 return vmGenericLoadObject((vmpointer*)o,1);
7e69548d 9762}
9763
4ef8de8a 9764/* How a good candidate is this object for swapping?
9765 * The better candidate it is, the greater the returned value.
9766 *
9767 * Currently we try to perform a fast estimation of the object size in
9768 * memory, and combine it with aging informations.
9769 *
9770 * Basically swappability = idle-time * log(estimated size)
9771 *
9772 * Bigger objects are preferred over smaller objects, but not
9773 * proportionally, this is why we use the logarithm. This algorithm is
9774 * just a first try and will probably be tuned later. */
9775static double computeObjectSwappability(robj *o) {
560db612 9776 /* actual age can be >= minage, but not < minage. As we use wrapping
9777 * 21 bit clocks with minutes resolution for the LRU. */
9778 time_t minage = abs(server.lruclock - o->lru);
4ef8de8a 9779 long asize = 0;
9780 list *l;
9781 dict *d;
9782 struct dictEntry *de;
9783 int z;
9784
560db612 9785 if (minage <= 0) return 0;
4ef8de8a 9786 switch(o->type) {
9787 case REDIS_STRING:
9788 if (o->encoding != REDIS_ENCODING_RAW) {
9789 asize = sizeof(*o);
9790 } else {
9791 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
9792 }
9793 break;
9794 case REDIS_LIST:
9795 l = o->ptr;
9796 listNode *ln = listFirst(l);
9797
9798 asize = sizeof(list);
9799 if (ln) {
9800 robj *ele = ln->value;
9801 long elesize;
9802
9803 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9804 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
4ef8de8a 9805 asize += (sizeof(listNode)+elesize)*listLength(l);
9806 }
9807 break;
9808 case REDIS_SET:
9809 case REDIS_ZSET:
9810 z = (o->type == REDIS_ZSET);
9811 d = z ? ((zset*)o->ptr)->dict : o->ptr;
9812
9813 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9814 if (z) asize += sizeof(zset)-sizeof(dict);
9815 if (dictSize(d)) {
9816 long elesize;
9817 robj *ele;
9818
9819 de = dictGetRandomKey(d);
9820 ele = dictGetEntryKey(de);
9821 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9822 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
4ef8de8a 9823 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9824 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
9825 }
9826 break;
a97b9060 9827 case REDIS_HASH:
9828 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
9829 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
9830 unsigned int len = zipmapLen((unsigned char*)o->ptr);
9831 unsigned int klen, vlen;
9832 unsigned char *key, *val;
9833
9834 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
9835 klen = 0;
9836 vlen = 0;
9837 }
9838 asize = len*(klen+vlen+3);
9839 } else if (o->encoding == REDIS_ENCODING_HT) {
9840 d = o->ptr;
9841 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
9842 if (dictSize(d)) {
9843 long elesize;
9844 robj *ele;
9845
9846 de = dictGetRandomKey(d);
9847 ele = dictGetEntryKey(de);
9848 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9849 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
a97b9060 9850 ele = dictGetEntryVal(de);
9851 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
560db612 9852 (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
a97b9060 9853 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
9854 }
9855 }
9856 break;
4ef8de8a 9857 }
560db612 9858 return (double)minage*log(1+asize);
4ef8de8a 9859}
9860
9861/* Try to swap an object that's a good candidate for swapping.
9862 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 9863 * to swap any object at all.
9864 *
9865 * If 'usethreaded' is true, Redis will try to swap the object in background
9866 * using I/O threads. */
9867static int vmSwapOneObject(int usethreads) {
4ef8de8a 9868 int j, i;
9869 struct dictEntry *best = NULL;
9870 double best_swappability = 0;
b9bc0eef 9871 redisDb *best_db = NULL;
44262c58 9872 robj *val;
9873 sds key;
4ef8de8a 9874
9875 for (j = 0; j < server.dbnum; j++) {
9876 redisDb *db = server.db+j;
b72f6a4b 9877 /* Why maxtries is set to 100?
9878 * Because this way (usually) we'll find 1 object even if just 1% - 2%
9879 * are swappable objects */
b0d8747d 9880 int maxtries = 100;
4ef8de8a 9881
9882 if (dictSize(db->dict) == 0) continue;
9883 for (i = 0; i < 5; i++) {
9884 dictEntry *de;
9885 double swappability;
9886
e3cadb8a 9887 if (maxtries) maxtries--;
4ef8de8a 9888 de = dictGetRandomKey(db->dict);
4ef8de8a 9889 val = dictGetEntryVal(de);
1064ef87 9890 /* Only swap objects that are currently in memory.
9891 *
560db612 9892 * Also don't swap shared objects: not a good idea in general and
9893 * we need to ensure that the main thread does not touch the
1064ef87 9894 * object while the I/O thread is using it, but we can't
9895 * control other keys without adding additional mutex. */
560db612 9896 if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) {
e3cadb8a 9897 if (maxtries) i--; /* don't count this try */
9898 continue;
9899 }
4ef8de8a 9900 swappability = computeObjectSwappability(val);
9901 if (!best || swappability > best_swappability) {
9902 best = de;
9903 best_swappability = swappability;
b9bc0eef 9904 best_db = db;
4ef8de8a 9905 }
9906 }
9907 }
7c775e09 9908 if (best == NULL) return REDIS_ERR;
4ef8de8a 9909 key = dictGetEntryKey(best);
9910 val = dictGetEntryVal(best);
9911
e3cadb8a 9912 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
44262c58 9913 key, best_swappability);
4ef8de8a 9914
4ef8de8a 9915 /* Swap it */
a69a0c9c 9916 if (usethreads) {
4c8f2370 9917 robj *keyobj = createStringObject(key,sdslen(key));
9918 vmSwapObjectThreaded(keyobj,val,best_db);
9919 decrRefCount(keyobj);
4ef8de8a 9920 return REDIS_OK;
9921 } else {
560db612 9922 vmpointer *vp;
9923
9924 if ((vp = vmSwapObjectBlocking(val)) != NULL) {
9925 dictGetEntryVal(best) = vp;
a69a0c9c 9926 return REDIS_OK;
9927 } else {
9928 return REDIS_ERR;
9929 }
4ef8de8a 9930 }
9931}
9932
/* Swap out one object synchronously. See vmSwapOneObject() for the return
 * value. */
static int vmSwapOneObjectBlocking() {
    int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
9936
/* Swap out one object using the I/O threads. See vmSwapOneObject() for the
 * return value. */
static int vmSwapOneObjectThreaded() {
    int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
9940
7e69548d 9941/* Return true if it's safe to swap out objects in a given moment.
9942 * Basically we don't want to swap objects out while there is a BGSAVE
9943 * or a BGAEOREWRITE running in backgroud. */
9944static int vmCanSwapOut(void) {
9945 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9946}
9947
996cb5f7 9948/* =================== Virtual Memory - Threaded I/O ======================= */
9949
b9bc0eef 9950static void freeIOJob(iojob *j) {
d5d55fc3 9951 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9952 j->type == REDIS_IOJOB_DO_SWAP ||
9953 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
560db612 9954 {
e4ed181d 9955 /* we fix the storage type, otherwise decrRefCount() will try to
9956 * kill the I/O thread Job (that does no longer exists). */
9957 if (j->val->storage == REDIS_VM_SWAPPING)
560db612 9958 j->val->storage = REDIS_VM_MEMORY;
b9bc0eef 9959 decrRefCount(j->val);
560db612 9960 }
9961 decrRefCount(j->key);
b9bc0eef 9962 zfree(j);
9963}
9964
996cb5f7 9965/* Every time a thread finished a Job, it writes a byte into the write side
9966 * of an unix pipe in order to "awake" the main thread, and this function
9967 * is called. */
9968static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9969 int mask)
9970{
9971 char buf[1];
b0d8747d 9972 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 9973 REDIS_NOTUSED(el);
9974 REDIS_NOTUSED(mask);
9975 REDIS_NOTUSED(privdata);
9976
9977 /* For every byte we read in the read side of the pipe, there is one
9978 * I/O job completed to process. */
9979 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 9980 iojob *j;
9981 listNode *ln;
b9bc0eef 9982 struct dictEntry *de;
9983
996cb5f7 9984 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 9985
9986 /* Get the processed element (the oldest one) */
9987 lockThreadedIO();
1064ef87 9988 assert(listLength(server.io_processed) != 0);
f6c0bba8 9989 if (toprocess == -1) {
9990 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9991 if (toprocess <= 0) toprocess = 1;
9992 }
b9bc0eef 9993 ln = listFirst(server.io_processed);
9994 j = ln->value;
9995 listDelNode(server.io_processed,ln);
9996 unlockThreadedIO();
9997 /* If this job is marked as canceled, just ignore it */
9998 if (j->canceled) {
9999 freeIOJob(j);
10000 continue;
10001 }
10002 /* Post process it in the main thread, as there are things we
10003 * can do just here to avoid race conditions and/or invasive locks */
560db612 10004 redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr);
44262c58 10005 de = dictFind(j->db->dict,j->key->ptr);
e4ed181d 10006 redisAssert(de != NULL);
b9bc0eef 10007 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 10008 redisDb *db;
560db612 10009 vmpointer *vp = dictGetEntryVal(de);
d5d55fc3 10010
b9bc0eef 10011 /* Key loaded, bring it at home */
560db612 10012 vmMarkPagesFree(vp->page,vp->usedpages);
b9bc0eef 10013 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
560db612 10014 (unsigned char*) j->key->ptr);
b9bc0eef 10015 server.vm_stats_swapped_objects--;
10016 server.vm_stats_swapins++;
d5d55fc3 10017 dictGetEntryVal(de) = j->val;
10018 incrRefCount(j->val);
10019 db = j->db;
d5d55fc3 10020 /* Handle clients waiting for this key to be loaded. */
560db612 10021 handleClientsBlockedOnSwappedKey(db,j->key);
10022 freeIOJob(j);
10023 zfree(vp);
b9bc0eef 10024 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
10025 /* Now we know the amount of pages required to swap this object.
10026 * Let's find some space for it, and queue this task again
10027 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 10028 if (!vmCanSwapOut() ||
10029 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
10030 {
10031 /* Ooops... no space or we can't swap as there is
10032 * a fork()ed Redis trying to save stuff on disk. */
560db612 10033 j->val->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 10034 freeIOJob(j);
10035 } else {
c7df85a4 10036 /* Note that we need to mark this pages as used now,
10037 * if the job will be canceled, we'll mark them as freed
10038 * again. */
10039 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 10040 j->type = REDIS_IOJOB_DO_SWAP;
10041 lockThreadedIO();
10042 queueIOJob(j);
10043 unlockThreadedIO();
10044 }
10045 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
560db612 10046 vmpointer *vp;
b9bc0eef 10047
10048 /* Key swapped. We can finally free some memory. */
560db612 10049 if (j->val->storage != REDIS_VM_SWAPPING) {
10050 vmpointer *vp = (vmpointer*) j->id;
10051 printf("storage: %d\n",vp->storage);
10052 printf("key->name: %s\n",(char*)j->key->ptr);
6c96ba7d 10053 printf("val: %p\n",(void*)j->val);
10054 printf("val->type: %d\n",j->val->type);
10055 printf("val->ptr: %s\n",(char*)j->val->ptr);
10056 }
560db612 10057 redisAssert(j->val->storage == REDIS_VM_SWAPPING);
10058 vp = createVmPointer(j->val->type);
10059 vp->page = j->page;
10060 vp->usedpages = j->pages;
10061 dictGetEntryVal(de) = vp;
e4ed181d 10062 /* Fix the storage otherwise decrRefCount will attempt to
10063 * remove the associated I/O job */
10064 j->val->storage = REDIS_VM_MEMORY;
560db612 10065 decrRefCount(j->val);
b9bc0eef 10066 redisLog(REDIS_DEBUG,
10067 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
560db612 10068 (unsigned char*) j->key->ptr,
b9bc0eef 10069 (unsigned long long) j->page, (unsigned long long) j->pages);
10070 server.vm_stats_swapped_objects++;
10071 server.vm_stats_swapouts++;
10072 freeIOJob(j);
f11b8647 10073 /* Put a few more swap requests in queue if we are still
10074 * out of memory */
b0d8747d 10075 if (trytoswap && vmCanSwapOut() &&
10076 zmalloc_used_memory() > server.vm_max_memory)
10077 {
f11b8647 10078 int more = 1;
10079 while(more) {
10080 lockThreadedIO();
10081 more = listLength(server.io_newjobs) <
10082 (unsigned) server.vm_max_threads;
10083 unlockThreadedIO();
10084 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 10085 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
10086 trytoswap = 0;
10087 break;
10088 }
f11b8647 10089 }
10090 }
b9bc0eef 10091 }
c953f24b 10092 processed++;
f6c0bba8 10093 if (processed == toprocess) return;
996cb5f7 10094 }
10095 if (retval < 0 && errno != EAGAIN) {
10096 redisLog(REDIS_WARNING,
10097 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
10098 strerror(errno));
10099 }
10100}
10101
10102static void lockThreadedIO(void) {
10103 pthread_mutex_lock(&server.io_mutex);
10104}
10105
10106static void unlockThreadedIO(void) {
10107 pthread_mutex_unlock(&server.io_mutex);
10108}
10109
10110/* Remove the specified object from the threaded I/O queue if still not
10111 * processed, otherwise make sure to flag it as canceled. */
10112static void vmCancelThreadedIOJob(robj *o) {
10113 list *lists[3] = {
6c96ba7d 10114 server.io_newjobs, /* 0 */
10115 server.io_processing, /* 1 */
10116 server.io_processed /* 2 */
996cb5f7 10117 };
10118 int i;
10119
10120 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 10121again:
996cb5f7 10122 lockThreadedIO();
560db612 10123 /* Search for a matching object in one of the queues */
996cb5f7 10124 for (i = 0; i < 3; i++) {
10125 listNode *ln;
c7df85a4 10126 listIter li;
996cb5f7 10127
c7df85a4 10128 listRewind(lists[i],&li);
10129 while ((ln = listNext(&li)) != NULL) {
996cb5f7 10130 iojob *job = ln->value;
10131
6c96ba7d 10132 if (job->canceled) continue; /* Skip this, already canceled. */
560db612 10133 if (job->id == o) {
dbc289ae 10134 redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n",
10135 (void*)job, (char*)job->key->ptr, job->type, i);
427a2153 10136 /* Mark the pages as free since the swap didn't happened
10137 * or happened but is now discarded. */
970e10bb 10138 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 10139 vmMarkPagesFree(job->page,job->pages);
10140 /* Cancel the job. It depends on the list the job is
10141 * living in. */
996cb5f7 10142 switch(i) {
10143 case 0: /* io_newjobs */
6c96ba7d 10144 /* If the job was yet not processed the best thing to do
996cb5f7 10145 * is to remove it from the queue at all */
6c96ba7d 10146 freeIOJob(job);
996cb5f7 10147 listDelNode(lists[i],ln);
10148 break;
10149 case 1: /* io_processing */
d5d55fc3 10150 /* Oh Shi- the thread is messing with the Job:
10151 *
10152 * Probably it's accessing the object if this is a
10153 * PREPARE_SWAP or DO_SWAP job.
10154 * If it's a LOAD job it may be reading from disk and
10155 * if we don't wait for the job to terminate before to
10156 * cancel it, maybe in a few microseconds data can be
10157 * corrupted in this pages. So the short story is:
10158 *
10159 * Better to wait for the job to move into the
10160 * next queue (processed)... */
10161
10162 /* We try again and again until the job is completed. */
10163 unlockThreadedIO();
10164 /* But let's wait some time for the I/O thread
10165 * to finish with this job. After all this condition
10166 * should be very rare. */
10167 usleep(1);
10168 goto again;
996cb5f7 10169 case 2: /* io_processed */
2e111efe 10170 /* The job was already processed, that's easy...
10171 * just mark it as canceled so that we'll ignore it
10172 * when processing completed jobs. */
996cb5f7 10173 job->canceled = 1;
10174 break;
10175 }
c7df85a4 10176 /* Finally we have to adjust the storage type of the object
10177 * in order to "UNDO" the operaiton. */
996cb5f7 10178 if (o->storage == REDIS_VM_LOADING)
10179 o->storage = REDIS_VM_SWAPPED;
10180 else if (o->storage == REDIS_VM_SWAPPING)
10181 o->storage = REDIS_VM_MEMORY;
10182 unlockThreadedIO();
e4ed181d 10183 redisLog(REDIS_DEBUG,"*** DONE");
996cb5f7 10184 return;
10185 }
10186 }
10187 }
10188 unlockThreadedIO();
560db612 10189 printf("Not found: %p\n", (void*)o);
10190 redisAssert(1 != 1); /* We should never reach this */
996cb5f7 10191}
10192
b9bc0eef 10193static void *IOThreadEntryPoint(void *arg) {
10194 iojob *j;
10195 listNode *ln;
10196 REDIS_NOTUSED(arg);
10197
10198 pthread_detach(pthread_self());
10199 while(1) {
10200 /* Get a new job to process */
10201 lockThreadedIO();
10202 if (listLength(server.io_newjobs) == 0) {
10203 /* No new jobs in queue, exit. */
9ebed7cf 10204 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
10205 (long) pthread_self());
b9bc0eef 10206 server.io_active_threads--;
10207 unlockThreadedIO();
10208 return NULL;
10209 }
10210 ln = listFirst(server.io_newjobs);
10211 j = ln->value;
10212 listDelNode(server.io_newjobs,ln);
10213 /* Add the job in the processing queue */
10214 j->thread = pthread_self();
10215 listAddNodeTail(server.io_processing,j);
10216 ln = listLast(server.io_processing); /* We use ln later to remove it */
10217 unlockThreadedIO();
9ebed7cf 10218 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
10219 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 10220
10221 /* Process the Job */
10222 if (j->type == REDIS_IOJOB_LOAD) {
560db612 10223 vmpointer *vp = (vmpointer*)j->id;
10224 j->val = vmReadObjectFromSwap(j->page,vp->vtype);
b9bc0eef 10225 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
10226 FILE *fp = fopen("/dev/null","w+");
10227 j->pages = rdbSavedObjectPages(j->val,fp);
10228 fclose(fp);
10229 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 10230 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
10231 j->canceled = 1;
b9bc0eef 10232 }
10233
10234 /* Done: insert the job into the processed queue */
9ebed7cf 10235 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
10236 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 10237 lockThreadedIO();
10238 listDelNode(server.io_processing,ln);
10239 listAddNodeTail(server.io_processed,j);
10240 unlockThreadedIO();
e0a62c7f 10241
b9bc0eef 10242 /* Signal the main thread there is new stuff to process */
10243 assert(write(server.io_ready_pipe_write,"x",1) == 1);
10244 }
10245 return NULL; /* never reached */
10246}
10247
10248static void spawnIOThread(void) {
10249 pthread_t thread;
478c2c6f 10250 sigset_t mask, omask;
a97b9060 10251 int err;
b9bc0eef 10252
478c2c6f 10253 sigemptyset(&mask);
10254 sigaddset(&mask,SIGCHLD);
10255 sigaddset(&mask,SIGHUP);
10256 sigaddset(&mask,SIGPIPE);
10257 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 10258 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
10259 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
10260 strerror(err));
10261 usleep(1000000);
10262 }
478c2c6f 10263 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 10264 server.io_active_threads++;
10265}
10266
4ee9488d 10267/* We need to wait for the last thread to exit before we are able to
10268 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 10269static void waitEmptyIOJobsQueue(void) {
4ee9488d 10270 while(1) {
76b7233a 10271 int io_processed_len;
10272
4ee9488d 10273 lockThreadedIO();
054e426d 10274 if (listLength(server.io_newjobs) == 0 &&
10275 listLength(server.io_processing) == 0 &&
10276 server.io_active_threads == 0)
10277 {
4ee9488d 10278 unlockThreadedIO();
10279 return;
10280 }
76b7233a 10281 /* While waiting for empty jobs queue condition we post-process some
10282 * finshed job, as I/O threads may be hanging trying to write against
10283 * the io_ready_pipe_write FD but there are so much pending jobs that
10284 * it's blocking. */
10285 io_processed_len = listLength(server.io_processed);
4ee9488d 10286 unlockThreadedIO();
76b7233a 10287 if (io_processed_len) {
10288 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
10289 usleep(1000); /* 1 millisecond */
10290 } else {
10291 usleep(10000); /* 10 milliseconds */
10292 }
4ee9488d 10293 }
10294}
10295
054e426d 10296static void vmReopenSwapFile(void) {
478c2c6f 10297 /* Note: we don't close the old one as we are in the child process
10298 * and don't want to mess at all with the original file object. */
054e426d 10299 server.vm_fp = fopen(server.vm_swap_file,"r+b");
10300 if (server.vm_fp == NULL) {
10301 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
10302 server.vm_swap_file);
478c2c6f 10303 _exit(1);
054e426d 10304 }
10305 server.vm_fd = fileno(server.vm_fp);
10306}
10307
b9bc0eef 10308/* This function must be called while with threaded IO locked */
10309static void queueIOJob(iojob *j) {
6c96ba7d 10310 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
10311 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 10312 listAddNodeTail(server.io_newjobs,j);
10313 if (server.io_active_threads < server.vm_max_threads)
10314 spawnIOThread();
10315}
10316
10317static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
10318 iojob *j;
e0a62c7f 10319
b9bc0eef 10320 j = zmalloc(sizeof(*j));
10321 j->type = REDIS_IOJOB_PREPARE_SWAP;
10322 j->db = db;
78ebe4c8 10323 j->key = key;
7dd8e7cf 10324 incrRefCount(key);
560db612 10325 j->id = j->val = val;
b9bc0eef 10326 incrRefCount(val);
10327 j->canceled = 0;
10328 j->thread = (pthread_t) -1;
560db612 10329 val->storage = REDIS_VM_SWAPPING;
b9bc0eef 10330
10331 lockThreadedIO();
10332 queueIOJob(j);
10333 unlockThreadedIO();
10334 return REDIS_OK;
10335}
10336
b0d8747d 10337/* ============ Virtual Memory - Blocking clients on missing keys =========== */
10338
d5d55fc3 10339/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
10340 * If there is not already a job loading the key, it is craeted.
10341 * The key is added to the io_keys list in the client structure, and also
10342 * in the hash table mapping swapped keys to waiting clients, that is,
10343 * server.io_waited_keys. */
10344static int waitForSwappedKey(redisClient *c, robj *key) {
10345 struct dictEntry *de;
10346 robj *o;
10347 list *l;
10348
10349 /* If the key does not exist or is already in RAM we don't need to
10350 * block the client at all. */
09241813 10351 de = dictFind(c->db->dict,key->ptr);
d5d55fc3 10352 if (de == NULL) return 0;
560db612 10353 o = dictGetEntryVal(de);
d5d55fc3 10354 if (o->storage == REDIS_VM_MEMORY) {
10355 return 0;
10356 } else if (o->storage == REDIS_VM_SWAPPING) {
10357 /* We were swapping the key, undo it! */
10358 vmCancelThreadedIOJob(o);
10359 return 0;
10360 }
e0a62c7f 10361
d5d55fc3 10362 /* OK: the key is either swapped, or being loaded just now. */
10363
10364 /* Add the key to the list of keys this client is waiting for.
10365 * This maps clients to keys they are waiting for. */
10366 listAddNodeTail(c->io_keys,key);
10367 incrRefCount(key);
10368
10369 /* Add the client to the swapped keys => clients waiting map. */
10370 de = dictFind(c->db->io_keys,key);
10371 if (de == NULL) {
10372 int retval;
10373
10374 /* For every key we take a list of clients blocked for it */
10375 l = listCreate();
10376 retval = dictAdd(c->db->io_keys,key,l);
10377 incrRefCount(key);
10378 assert(retval == DICT_OK);
10379 } else {
10380 l = dictGetEntryVal(de);
10381 }
10382 listAddNodeTail(l,c);
10383
10384 /* Are we already loading the key from disk? If not create a job */
10385 if (o->storage == REDIS_VM_SWAPPED) {
10386 iojob *j;
560db612 10387 vmpointer *vp = (vmpointer*)o;
d5d55fc3 10388
10389 o->storage = REDIS_VM_LOADING;
10390 j = zmalloc(sizeof(*j));
10391 j->type = REDIS_IOJOB_LOAD;
10392 j->db = c->db;
560db612 10393 j->id = (robj*)vp;
10394 j->key = key;
10395 incrRefCount(key);
10396 j->page = vp->page;
d5d55fc3 10397 j->val = NULL;
10398 j->canceled = 0;
10399 j->thread = (pthread_t) -1;
10400 lockThreadedIO();
10401 queueIOJob(j);
10402 unlockThreadedIO();
10403 }
10404 return 1;
10405}
10406
6f078746
PN
10407/* Preload keys for any command with first, last and step values for
10408 * the command keys prototype, as defined in the command table. */
10409static void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10410 int j, last;
10411 if (cmd->vm_firstkey == 0) return;
10412 last = cmd->vm_lastkey;
10413 if (last < 0) last = argc+last;
10414 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep) {
10415 redisAssert(j < argc);
10416 waitForSwappedKey(c,argv[j]);
10417 }
10418}
10419
5d373da9 10420/* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
739ba0d2
PN
10421 * Note that the number of keys to preload is user-defined, so we need to
10422 * apply a sanity check against argc. */
ca1788b5 10423static void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
76583ea4 10424 int i, num;
ca1788b5 10425 REDIS_NOTUSED(cmd);
ca1788b5
PN
10426
10427 num = atoi(argv[2]->ptr);
739ba0d2 10428 if (num > (argc-3)) return;
76583ea4 10429 for (i = 0; i < num; i++) {
ca1788b5 10430 waitForSwappedKey(c,argv[3+i]);
76583ea4
PN
10431 }
10432}
10433
3805e04f
PN
10434/* Preload keys needed to execute the entire MULTI/EXEC block.
10435 *
10436 * This function is called by blockClientOnSwappedKeys when EXEC is issued,
10437 * and will block the client when any command requires a swapped out value. */
10438static void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
10439 int i, margc;
10440 struct redisCommand *mcmd;
10441 robj **margv;
10442 REDIS_NOTUSED(cmd);
10443 REDIS_NOTUSED(argc);
10444 REDIS_NOTUSED(argv);
10445
10446 if (!(c->flags & REDIS_MULTI)) return;
10447 for (i = 0; i < c->mstate.count; i++) {
10448 mcmd = c->mstate.commands[i].cmd;
10449 margc = c->mstate.commands[i].argc;
10450 margv = c->mstate.commands[i].argv;
10451
10452 if (mcmd->vm_preload_proc != NULL) {
10453 mcmd->vm_preload_proc(c,mcmd,margc,margv);
10454 } else {
10455 waitForMultipleSwappedKeys(c,mcmd,margc,margv);
10456 }
76583ea4
PN
10457 }
10458}
10459
b0d8747d 10460/* Is this client attempting to run a command against swapped keys?
d5d55fc3 10461 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 10462 *
d5d55fc3 10463 * The important idea about this function is that it can fail! If keys will
10464 * still be swapped when the client is resumed, this key lookups will
10465 * just block loading keys from disk. In practical terms this should only
10466 * happen with SORT BY command or if there is a bug in this function.
10467 *
10468 * Return 1 if the client is marked as blocked, 0 if the client can
10469 * continue as the keys it is going to access appear to be in memory. */
0a6f3f0f 10470static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
76583ea4 10471 if (cmd->vm_preload_proc != NULL) {
ca1788b5 10472 cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
76583ea4 10473 } else {
6f078746 10474 waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
76583ea4
PN
10475 }
10476
d5d55fc3 10477 /* If the client was blocked for at least one key, mark it as blocked. */
10478 if (listLength(c->io_keys)) {
10479 c->flags |= REDIS_IO_WAIT;
10480 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
10481 server.vm_blocked_clients++;
10482 return 1;
10483 } else {
10484 return 0;
10485 }
10486}
10487
10488/* Remove the 'key' from the list of blocked keys for a given client.
10489 *
10490 * The function returns 1 when there are no longer blocking keys after
10491 * the current one was removed (and the client can be unblocked). */
10492static int dontWaitForSwappedKey(redisClient *c, robj *key) {
10493 list *l;
10494 listNode *ln;
10495 listIter li;
10496 struct dictEntry *de;
10497
10498 /* Remove the key from the list of keys this client is waiting for. */
10499 listRewind(c->io_keys,&li);
10500 while ((ln = listNext(&li)) != NULL) {
bf028098 10501 if (equalStringObjects(ln->value,key)) {
d5d55fc3 10502 listDelNode(c->io_keys,ln);
10503 break;
10504 }
10505 }
10506 assert(ln != NULL);
10507
10508 /* Remove the client form the key => waiting clients map. */
10509 de = dictFind(c->db->io_keys,key);
10510 assert(de != NULL);
10511 l = dictGetEntryVal(de);
10512 ln = listSearchKey(l,c);
10513 assert(ln != NULL);
10514 listDelNode(l,ln);
10515 if (listLength(l) == 0)
10516 dictDelete(c->db->io_keys,key);
10517
10518 return listLength(c->io_keys) == 0;
10519}
10520
560db612 10521/* Every time we now a key was loaded back in memory, we handle clients
10522 * waiting for this key if any. */
d5d55fc3 10523static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
10524 struct dictEntry *de;
10525 list *l;
10526 listNode *ln;
10527 int len;
10528
10529 de = dictFind(db->io_keys,key);
10530 if (!de) return;
10531
10532 l = dictGetEntryVal(de);
10533 len = listLength(l);
10534 /* Note: we can't use something like while(listLength(l)) as the list
10535 * can be freed by the calling function when we remove the last element. */
10536 while (len--) {
10537 ln = listFirst(l);
10538 redisClient *c = ln->value;
10539
10540 if (dontWaitForSwappedKey(c,key)) {
10541 /* Put the client in the list of clients ready to go as we
10542 * loaded all the keys about it. */
10543 listAddNodeTail(server.io_ready_clients,c);
10544 }
10545 }
b0d8747d 10546}
b0d8747d 10547
500ece7c 10548/* =========================== Remote Configuration ========================= */
10549
10550static void configSetCommand(redisClient *c) {
10551 robj *o = getDecodedObject(c->argv[3]);
2e5eb04e 10552 long long ll;
10553
500ece7c 10554 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
10555 zfree(server.dbfilename);
10556 server.dbfilename = zstrdup(o->ptr);
10557 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
10558 zfree(server.requirepass);
10559 server.requirepass = zstrdup(o->ptr);
10560 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
10561 zfree(server.masterauth);
10562 server.masterauth = zstrdup(o->ptr);
10563 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
2e5eb04e 10564 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10565 ll < 0) goto badfmt;
10566 server.maxmemory = ll;
10567 } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
10568 if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
10569 ll < 0 || ll > LONG_MAX) goto badfmt;
10570 server.maxidletime = ll;
1b677732 10571 } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
10572 if (!strcasecmp(o->ptr,"no")) {
10573 server.appendfsync = APPENDFSYNC_NO;
10574 } else if (!strcasecmp(o->ptr,"everysec")) {
10575 server.appendfsync = APPENDFSYNC_EVERYSEC;
10576 } else if (!strcasecmp(o->ptr,"always")) {
10577 server.appendfsync = APPENDFSYNC_ALWAYS;
10578 } else {
10579 goto badfmt;
10580 }
38db9171 10581 } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
10582 int yn = yesnotoi(o->ptr);
10583
10584 if (yn == -1) goto badfmt;
10585 server.no_appendfsync_on_rewrite = yn;
2e5eb04e 10586 } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
10587 int old = server.appendonly;
10588 int new = yesnotoi(o->ptr);
10589
10590 if (new == -1) goto badfmt;
10591 if (old != new) {
10592 if (new == 0) {
10593 stopAppendOnly();
10594 } else {
10595 if (startAppendOnly() == REDIS_ERR) {
10596 addReplySds(c,sdscatprintf(sdsempty(),
10597 "-ERR Unable to turn on AOF. Check server logs.\r\n"));
10598 decrRefCount(o);
10599 return;
10600 }
10601 }
10602 }
a34e0a25 10603 } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
10604 int vlen, j;
10605 sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
10606
10607 /* Perform sanity check before setting the new config:
10608 * - Even number of args
10609 * - Seconds >= 1, changes >= 0 */
10610 if (vlen & 1) {
10611 sdsfreesplitres(v,vlen);
10612 goto badfmt;
10613 }
10614 for (j = 0; j < vlen; j++) {
10615 char *eptr;
10616 long val;
10617
10618 val = strtoll(v[j], &eptr, 10);
10619 if (eptr[0] != '\0' ||
10620 ((j & 1) == 0 && val < 1) ||
10621 ((j & 1) == 1 && val < 0)) {
10622 sdsfreesplitres(v,vlen);
10623 goto badfmt;
10624 }
10625 }
10626 /* Finally set the new config */
10627 resetServerSaveParams();
10628 for (j = 0; j < vlen; j += 2) {
10629 time_t seconds;
10630 int changes;
10631
10632 seconds = strtoll(v[j],NULL,10);
10633 changes = strtoll(v[j+1],NULL,10);
10634 appendServerSaveParams(seconds, changes);
10635 }
10636 sdsfreesplitres(v,vlen);
500ece7c 10637 } else {
10638 addReplySds(c,sdscatprintf(sdsempty(),
10639 "-ERR not supported CONFIG parameter %s\r\n",
10640 (char*)c->argv[2]->ptr));
10641 decrRefCount(o);
10642 return;
10643 }
10644 decrRefCount(o);
10645 addReply(c,shared.ok);
a34e0a25 10646 return;
10647
10648badfmt: /* Bad format errors */
10649 addReplySds(c,sdscatprintf(sdsempty(),
10650 "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
10651 (char*)o->ptr,
10652 (char*)c->argv[2]->ptr));
10653 decrRefCount(o);
500ece7c 10654}
10655
10656static void configGetCommand(redisClient *c) {
10657 robj *o = getDecodedObject(c->argv[2]);
10658 robj *lenobj = createObject(REDIS_STRING,NULL);
10659 char *pattern = o->ptr;
10660 int matches = 0;
10661
10662 addReply(c,lenobj);
10663 decrRefCount(lenobj);
10664
10665 if (stringmatch(pattern,"dbfilename",0)) {
10666 addReplyBulkCString(c,"dbfilename");
10667 addReplyBulkCString(c,server.dbfilename);
10668 matches++;
10669 }
10670 if (stringmatch(pattern,"requirepass",0)) {
10671 addReplyBulkCString(c,"requirepass");
10672 addReplyBulkCString(c,server.requirepass);
10673 matches++;
10674 }
10675 if (stringmatch(pattern,"masterauth",0)) {
10676 addReplyBulkCString(c,"masterauth");
10677 addReplyBulkCString(c,server.masterauth);
10678 matches++;
10679 }
10680 if (stringmatch(pattern,"maxmemory",0)) {
10681 char buf[128];
10682
2e5eb04e 10683 ll2string(buf,128,server.maxmemory);
500ece7c 10684 addReplyBulkCString(c,"maxmemory");
10685 addReplyBulkCString(c,buf);
10686 matches++;
10687 }
2e5eb04e 10688 if (stringmatch(pattern,"timeout",0)) {
10689 char buf[128];
10690
10691 ll2string(buf,128,server.maxidletime);
10692 addReplyBulkCString(c,"timeout");
10693 addReplyBulkCString(c,buf);
10694 matches++;
10695 }
10696 if (stringmatch(pattern,"appendonly",0)) {
10697 addReplyBulkCString(c,"appendonly");
10698 addReplyBulkCString(c,server.appendonly ? "yes" : "no");
10699 matches++;
10700 }
38db9171 10701 if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
10702 addReplyBulkCString(c,"no-appendfsync-on-rewrite");
10703 addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
10704 matches++;
10705 }
1b677732 10706 if (stringmatch(pattern,"appendfsync",0)) {
10707 char *policy;
10708
10709 switch(server.appendfsync) {
10710 case APPENDFSYNC_NO: policy = "no"; break;
10711 case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
10712 case APPENDFSYNC_ALWAYS: policy = "always"; break;
10713 default: policy = "unknown"; break; /* too harmless to panic */
10714 }
10715 addReplyBulkCString(c,"appendfsync");
10716 addReplyBulkCString(c,policy);
10717 matches++;
10718 }
a34e0a25 10719 if (stringmatch(pattern,"save",0)) {
10720 sds buf = sdsempty();
10721 int j;
10722
10723 for (j = 0; j < server.saveparamslen; j++) {
10724 buf = sdscatprintf(buf,"%ld %d",
10725 server.saveparams[j].seconds,
10726 server.saveparams[j].changes);
10727 if (j != server.saveparamslen-1)
10728 buf = sdscatlen(buf," ",1);
10729 }
10730 addReplyBulkCString(c,"save");
10731 addReplyBulkCString(c,buf);
10732 sdsfree(buf);
10733 matches++;
10734 }
500ece7c 10735 decrRefCount(o);
10736 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
10737}
10738
10739static void configCommand(redisClient *c) {
10740 if (!strcasecmp(c->argv[1]->ptr,"set")) {
10741 if (c->argc != 4) goto badarity;
10742 configSetCommand(c);
10743 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
10744 if (c->argc != 3) goto badarity;
10745 configGetCommand(c);
10746 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
10747 if (c->argc != 2) goto badarity;
10748 server.stat_numcommands = 0;
10749 server.stat_numconnections = 0;
10750 server.stat_expiredkeys = 0;
10751 server.stat_starttime = time(NULL);
10752 addReply(c,shared.ok);
10753 } else {
10754 addReplySds(c,sdscatprintf(sdsempty(),
10755 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
10756 }
10757 return;
10758
10759badarity:
10760 addReplySds(c,sdscatprintf(sdsempty(),
10761 "-ERR Wrong number of arguments for CONFIG %s\r\n",
10762 (char*) c->argv[1]->ptr));
10763}
10764
befec3cd 10765/* =========================== Pubsub implementation ======================== */
10766
ffc6b7f8 10767static void freePubsubPattern(void *p) {
10768 pubsubPattern *pat = p;
10769
10770 decrRefCount(pat->pattern);
10771 zfree(pat);
10772}
10773
10774static int listMatchPubsubPattern(void *a, void *b) {
10775 pubsubPattern *pa = a, *pb = b;
10776
10777 return (pa->client == pb->client) &&
bf028098 10778 (equalStringObjects(pa->pattern,pb->pattern));
ffc6b7f8 10779}
10780
10781/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
10782 * 0 if the client was already subscribed to that channel. */
10783static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 10784 struct dictEntry *de;
10785 list *clients = NULL;
10786 int retval = 0;
10787
ffc6b7f8 10788 /* Add the channel to the client -> channels hash table */
10789 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 10790 retval = 1;
ffc6b7f8 10791 incrRefCount(channel);
10792 /* Add the client to the channel -> list of clients hash table */
10793 de = dictFind(server.pubsub_channels,channel);
befec3cd 10794 if (de == NULL) {
10795 clients = listCreate();
ffc6b7f8 10796 dictAdd(server.pubsub_channels,channel,clients);
10797 incrRefCount(channel);
befec3cd 10798 } else {
10799 clients = dictGetEntryVal(de);
10800 }
10801 listAddNodeTail(clients,c);
10802 }
10803 /* Notify the client */
10804 addReply(c,shared.mbulk3);
10805 addReply(c,shared.subscribebulk);
ffc6b7f8 10806 addReplyBulk(c,channel);
482b672d 10807 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 10808 return retval;
10809}
10810
ffc6b7f8 10811/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10812 * 0 if the client was not subscribed to the specified channel. */
10813static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 10814 struct dictEntry *de;
10815 list *clients;
10816 listNode *ln;
10817 int retval = 0;
10818
ffc6b7f8 10819 /* Remove the channel from the client -> channels hash table */
10820 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 10821 we have in the hash tables. Protect it... */
ffc6b7f8 10822 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 10823 retval = 1;
ffc6b7f8 10824 /* Remove the client from the channel -> clients list hash table */
10825 de = dictFind(server.pubsub_channels,channel);
befec3cd 10826 assert(de != NULL);
10827 clients = dictGetEntryVal(de);
10828 ln = listSearchKey(clients,c);
10829 assert(ln != NULL);
10830 listDelNode(clients,ln);
ff767a75 10831 if (listLength(clients) == 0) {
10832 /* Free the list and associated hash entry at all if this was
10833 * the latest client, so that it will be possible to abuse
ffc6b7f8 10834 * Redis PUBSUB creating millions of channels. */
10835 dictDelete(server.pubsub_channels,channel);
ff767a75 10836 }
befec3cd 10837 }
10838 /* Notify the client */
10839 if (notify) {
10840 addReply(c,shared.mbulk3);
10841 addReply(c,shared.unsubscribebulk);
ffc6b7f8 10842 addReplyBulk(c,channel);
482b672d 10843 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10844 listLength(c->pubsub_patterns));
10845
10846 }
10847 decrRefCount(channel); /* it is finally safe to release it */
10848 return retval;
10849}
10850
10851/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
10852static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
10853 int retval = 0;
10854
10855 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
10856 retval = 1;
10857 pubsubPattern *pat;
10858 listAddNodeTail(c->pubsub_patterns,pattern);
10859 incrRefCount(pattern);
10860 pat = zmalloc(sizeof(*pat));
10861 pat->pattern = getDecodedObject(pattern);
10862 pat->client = c;
10863 listAddNodeTail(server.pubsub_patterns,pat);
10864 }
10865 /* Notify the client */
10866 addReply(c,shared.mbulk3);
10867 addReply(c,shared.psubscribebulk);
10868 addReplyBulk(c,pattern);
482b672d 10869 addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
ffc6b7f8 10870 return retval;
10871}
10872
10873/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
10874 * 0 if the client was not subscribed to the specified channel. */
10875static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
10876 listNode *ln;
10877 pubsubPattern pat;
10878 int retval = 0;
10879
10880 incrRefCount(pattern); /* Protect the object. May be the same we remove */
10881 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
10882 retval = 1;
10883 listDelNode(c->pubsub_patterns,ln);
10884 pat.client = c;
10885 pat.pattern = pattern;
10886 ln = listSearchKey(server.pubsub_patterns,&pat);
10887 listDelNode(server.pubsub_patterns,ln);
10888 }
10889 /* Notify the client */
10890 if (notify) {
10891 addReply(c,shared.mbulk3);
10892 addReply(c,shared.punsubscribebulk);
10893 addReplyBulk(c,pattern);
482b672d 10894 addReplyLongLong(c,dictSize(c->pubsub_channels)+
ffc6b7f8 10895 listLength(c->pubsub_patterns));
befec3cd 10896 }
ffc6b7f8 10897 decrRefCount(pattern);
befec3cd 10898 return retval;
10899}
10900
ffc6b7f8 10901/* Unsubscribe from all the channels. Return the number of channels the
10902 * client was subscribed from. */
10903static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
10904 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 10905 dictEntry *de;
10906 int count = 0;
10907
10908 while((de = dictNext(di)) != NULL) {
ffc6b7f8 10909 robj *channel = dictGetEntryKey(de);
befec3cd 10910
ffc6b7f8 10911 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 10912 }
10913 dictReleaseIterator(di);
10914 return count;
10915}
10916
ffc6b7f8 10917/* Unsubscribe from all the patterns. Return the number of patterns the
10918 * client was subscribed from. */
10919static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
10920 listNode *ln;
10921 listIter li;
10922 int count = 0;
10923
10924 listRewind(c->pubsub_patterns,&li);
10925 while ((ln = listNext(&li)) != NULL) {
10926 robj *pattern = ln->value;
10927
10928 count += pubsubUnsubscribePattern(c,pattern,notify);
10929 }
10930 return count;
10931}
10932
befec3cd 10933/* Publish a message */
ffc6b7f8 10934static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 10935 int receivers = 0;
10936 struct dictEntry *de;
ffc6b7f8 10937 listNode *ln;
10938 listIter li;
befec3cd 10939
ffc6b7f8 10940 /* Send to clients listening for that channel */
10941 de = dictFind(server.pubsub_channels,channel);
befec3cd 10942 if (de) {
10943 list *list = dictGetEntryVal(de);
10944 listNode *ln;
10945 listIter li;
10946
10947 listRewind(list,&li);
10948 while ((ln = listNext(&li)) != NULL) {
10949 redisClient *c = ln->value;
10950
10951 addReply(c,shared.mbulk3);
10952 addReply(c,shared.messagebulk);
ffc6b7f8 10953 addReplyBulk(c,channel);
befec3cd 10954 addReplyBulk(c,message);
10955 receivers++;
10956 }
10957 }
ffc6b7f8 10958 /* Send to clients listening to matching channels */
10959 if (listLength(server.pubsub_patterns)) {
10960 listRewind(server.pubsub_patterns,&li);
10961 channel = getDecodedObject(channel);
10962 while ((ln = listNext(&li)) != NULL) {
10963 pubsubPattern *pat = ln->value;
10964
10965 if (stringmatchlen((char*)pat->pattern->ptr,
10966 sdslen(pat->pattern->ptr),
10967 (char*)channel->ptr,
10968 sdslen(channel->ptr),0)) {
c8d0ea0e 10969 addReply(pat->client,shared.mbulk4);
10970 addReply(pat->client,shared.pmessagebulk);
10971 addReplyBulk(pat->client,pat->pattern);
ffc6b7f8 10972 addReplyBulk(pat->client,channel);
10973 addReplyBulk(pat->client,message);
10974 receivers++;
10975 }
10976 }
10977 decrRefCount(channel);
10978 }
befec3cd 10979 return receivers;
10980}
10981
10982static void subscribeCommand(redisClient *c) {
10983 int j;
10984
10985 for (j = 1; j < c->argc; j++)
ffc6b7f8 10986 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 10987}
10988
10989static void unsubscribeCommand(redisClient *c) {
10990 if (c->argc == 1) {
ffc6b7f8 10991 pubsubUnsubscribeAllChannels(c,1);
10992 return;
10993 } else {
10994 int j;
10995
10996 for (j = 1; j < c->argc; j++)
10997 pubsubUnsubscribeChannel(c,c->argv[j],1);
10998 }
10999}
11000
11001static void psubscribeCommand(redisClient *c) {
11002 int j;
11003
11004 for (j = 1; j < c->argc; j++)
11005 pubsubSubscribePattern(c,c->argv[j]);
11006}
11007
11008static void punsubscribeCommand(redisClient *c) {
11009 if (c->argc == 1) {
11010 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 11011 return;
11012 } else {
11013 int j;
11014
11015 for (j = 1; j < c->argc; j++)
ffc6b7f8 11016 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 11017 }
11018}
11019
11020static void publishCommand(redisClient *c) {
11021 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
482b672d 11022 addReplyLongLong(c,receivers);
befec3cd 11023}
11024
37ab76c9 11025/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
11026 *
11027 * The implementation uses a per-DB hash table mapping keys to list of clients
11028 * WATCHing those keys, so that given a key that is going to be modified
11029 * we can mark all the associated clients as dirty.
11030 *
11031 * Also every client contains a list of WATCHed keys so that's possible to
11032 * un-watch such keys when the client is freed or when UNWATCH is called. */
11033
11034/* In the client->watched_keys list we need to use watchedKey structures
11035 * as in order to identify a key in Redis we need both the key name and the
11036 * DB */
11037typedef struct watchedKey {
11038 robj *key;
11039 redisDb *db;
11040} watchedKey;
11041
11042/* Watch for the specified key */
11043static void watchForKey(redisClient *c, robj *key) {
11044 list *clients = NULL;
11045 listIter li;
11046 listNode *ln;
11047 watchedKey *wk;
11048
11049 /* Check if we are already watching for this key */
11050 listRewind(c->watched_keys,&li);
11051 while((ln = listNext(&li))) {
11052 wk = listNodeValue(ln);
11053 if (wk->db == c->db && equalStringObjects(key,wk->key))
11054 return; /* Key already watched */
11055 }
11056 /* This key is not already watched in this DB. Let's add it */
11057 clients = dictFetchValue(c->db->watched_keys,key);
11058 if (!clients) {
11059 clients = listCreate();
11060 dictAdd(c->db->watched_keys,key,clients);
11061 incrRefCount(key);
11062 }
11063 listAddNodeTail(clients,c);
11064 /* Add the new key to the lits of keys watched by this client */
11065 wk = zmalloc(sizeof(*wk));
11066 wk->key = key;
11067 wk->db = c->db;
11068 incrRefCount(key);
11069 listAddNodeTail(c->watched_keys,wk);
11070}
11071
11072/* Unwatch all the keys watched by this client. To clean the EXEC dirty
11073 * flag is up to the caller. */
11074static void unwatchAllKeys(redisClient *c) {
11075 listIter li;
11076 listNode *ln;
11077
11078 if (listLength(c->watched_keys) == 0) return;
11079 listRewind(c->watched_keys,&li);
11080 while((ln = listNext(&li))) {
11081 list *clients;
11082 watchedKey *wk;
11083
11084 /* Lookup the watched key -> clients list and remove the client
11085 * from the list */
11086 wk = listNodeValue(ln);
11087 clients = dictFetchValue(wk->db->watched_keys, wk->key);
11088 assert(clients != NULL);
11089 listDelNode(clients,listSearchKey(clients,c));
11090 /* Kill the entry at all if this was the only client */
11091 if (listLength(clients) == 0)
11092 dictDelete(wk->db->watched_keys, wk->key);
11093 /* Remove this watched key from the client->watched list */
11094 listDelNode(c->watched_keys,ln);
11095 decrRefCount(wk->key);
11096 zfree(wk);
11097 }
11098}
11099
ca3f830b 11100/* "Touch" a key, so that if this key is being WATCHed by some client the
37ab76c9 11101 * next EXEC will fail. */
11102static void touchWatchedKey(redisDb *db, robj *key) {
11103 list *clients;
11104 listIter li;
11105 listNode *ln;
11106
11107 if (dictSize(db->watched_keys) == 0) return;
11108 clients = dictFetchValue(db->watched_keys, key);
11109 if (!clients) return;
11110
11111 /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
11112 /* Check if we are already watching for this key */
11113 listRewind(clients,&li);
11114 while((ln = listNext(&li))) {
11115 redisClient *c = listNodeValue(ln);
11116
11117 c->flags |= REDIS_DIRTY_CAS;
11118 }
11119}
11120
9b30e1a2 11121/* On FLUSHDB or FLUSHALL all the watched keys that are present before the
11122 * flush but will be deleted as effect of the flushing operation should
11123 * be touched. "dbid" is the DB that's getting the flush. -1 if it is
11124 * a FLUSHALL operation (all the DBs flushed). */
11125static void touchWatchedKeysOnFlush(int dbid) {
11126 listIter li1, li2;
11127 listNode *ln;
11128
11129 /* For every client, check all the waited keys */
11130 listRewind(server.clients,&li1);
11131 while((ln = listNext(&li1))) {
11132 redisClient *c = listNodeValue(ln);
11133 listRewind(c->watched_keys,&li2);
11134 while((ln = listNext(&li2))) {
11135 watchedKey *wk = listNodeValue(ln);
11136
11137 /* For every watched key matching the specified DB, if the
11138 * key exists, mark the client as dirty, as the key will be
11139 * removed. */
11140 if (dbid == -1 || wk->db->id == dbid) {
09241813 11141 if (dictFind(wk->db->dict, wk->key->ptr) != NULL)
9b30e1a2 11142 c->flags |= REDIS_DIRTY_CAS;
11143 }
11144 }
11145 }
11146}
11147
37ab76c9 11148static void watchCommand(redisClient *c) {
11149 int j;
11150
6531c94d 11151 if (c->flags & REDIS_MULTI) {
11152 addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
11153 return;
11154 }
37ab76c9 11155 for (j = 1; j < c->argc; j++)
11156 watchForKey(c,c->argv[j]);
11157 addReply(c,shared.ok);
11158}
11159
11160static void unwatchCommand(redisClient *c) {
11161 unwatchAllKeys(c);
11162 c->flags &= (~REDIS_DIRTY_CAS);
11163 addReply(c,shared.ok);
11164}
11165
7f957c92 11166/* ================================= Debugging ============================== */
11167
ba798261 11168/* Compute the sha1 of string at 's' with 'len' bytes long.
11169 * The SHA1 is then xored againt the string pointed by digest.
11170 * Since xor is commutative, this operation is used in order to
11171 * "add" digests relative to unordered elements.
11172 *
11173 * So digest(a,b,c,d) will be the same of digest(b,a,c,d) */
11174static void xorDigest(unsigned char *digest, void *ptr, size_t len) {
11175 SHA1_CTX ctx;
11176 unsigned char hash[20], *s = ptr;
11177 int j;
11178
11179 SHA1Init(&ctx);
11180 SHA1Update(&ctx,s,len);
11181 SHA1Final(hash,&ctx);
11182
11183 for (j = 0; j < 20; j++)
11184 digest[j] ^= hash[j];
11185}
11186
11187static void xorObjectDigest(unsigned char *digest, robj *o) {
11188 o = getDecodedObject(o);
11189 xorDigest(digest,o->ptr,sdslen(o->ptr));
11190 decrRefCount(o);
11191}
11192
11193/* This function instead of just computing the SHA1 and xoring it
11194 * against diget, also perform the digest of "digest" itself and
11195 * replace the old value with the new one.
11196 *
11197 * So the final digest will be:
11198 *
11199 * digest = SHA1(digest xor SHA1(data))
11200 *
11201 * This function is used every time we want to preserve the order so
11202 * that digest(a,b,c,d) will be different than digest(b,c,d,a)
11203 *
11204 * Also note that mixdigest("foo") followed by mixdigest("bar")
11205 * will lead to a different digest compared to "fo", "obar".
11206 */
11207static void mixDigest(unsigned char *digest, void *ptr, size_t len) {
11208 SHA1_CTX ctx;
11209 char *s = ptr;
11210
11211 xorDigest(digest,s,len);
11212 SHA1Init(&ctx);
11213 SHA1Update(&ctx,digest,20);
11214 SHA1Final(digest,&ctx);
11215}
11216
11217static void mixObjectDigest(unsigned char *digest, robj *o) {
11218 o = getDecodedObject(o);
11219 mixDigest(digest,o->ptr,sdslen(o->ptr));
11220 decrRefCount(o);
11221}
11222
11223/* Compute the dataset digest. Since keys, sets elements, hashes elements
11224 * are not ordered, we use a trick: every aggregate digest is the xor
11225 * of the digests of their elements. This way the order will not change
11226 * the result. For list instead we use a feedback entering the output digest
11227 * as input in order to ensure that a different ordered list will result in
11228 * a different digest. */
11229static void computeDatasetDigest(unsigned char *final) {
11230 unsigned char digest[20];
11231 char buf[128];
11232 dictIterator *di = NULL;
11233 dictEntry *de;
11234 int j;
11235 uint32_t aux;
11236
11237 memset(final,0,20); /* Start with a clean result */
11238
11239 for (j = 0; j < server.dbnum; j++) {
11240 redisDb *db = server.db+j;
11241
11242 if (dictSize(db->dict) == 0) continue;
11243 di = dictGetIterator(db->dict);
11244
11245 /* hash the DB id, so the same dataset moved in a different
11246 * DB will lead to a different digest */
11247 aux = htonl(j);
11248 mixDigest(final,&aux,sizeof(aux));
11249
11250 /* Iterate this DB writing every entry */
11251 while((de = dictNext(di)) != NULL) {
09241813 11252 sds key;
11253 robj *keyobj, *o;
ba798261 11254 time_t expiretime;
11255
11256 memset(digest,0,20); /* This key-val digest */
11257 key = dictGetEntryKey(de);
09241813 11258 keyobj = createStringObject(key,sdslen(key));
11259
11260 mixDigest(digest,key,sdslen(key));
11261
11262 /* Make sure the key is loaded if VM is active */
11263 o = lookupKeyRead(db,keyobj);
cbae1d34 11264
ba798261 11265 aux = htonl(o->type);
11266 mixDigest(digest,&aux,sizeof(aux));
09241813 11267 expiretime = getExpire(db,keyobj);
ba798261 11268
11269 /* Save the key and associated value */
11270 if (o->type == REDIS_STRING) {
11271 mixObjectDigest(digest,o);
11272 } else if (o->type == REDIS_LIST) {
003f0840
PN
11273 listTypeIterator *li = listTypeInitIterator(o,0,REDIS_TAIL);
11274 listTypeEntry entry;
11275 while(listTypeNext(li,&entry)) {
11276 robj *eleobj = listTypeGet(&entry);
ba798261 11277 mixObjectDigest(digest,eleobj);
dc845730 11278 decrRefCount(eleobj);
ba798261 11279 }
003f0840 11280 listTypeReleaseIterator(li);
ba798261 11281 } else if (o->type == REDIS_SET) {
11282 dict *set = o->ptr;
11283 dictIterator *di = dictGetIterator(set);
11284 dictEntry *de;
11285
11286 while((de = dictNext(di)) != NULL) {
11287 robj *eleobj = dictGetEntryKey(de);
11288
11289 xorObjectDigest(digest,eleobj);
11290 }
11291 dictReleaseIterator(di);
11292 } else if (o->type == REDIS_ZSET) {
11293 zset *zs = o->ptr;
11294 dictIterator *di = dictGetIterator(zs->dict);
11295 dictEntry *de;
11296
11297 while((de = dictNext(di)) != NULL) {
11298 robj *eleobj = dictGetEntryKey(de);
11299 double *score = dictGetEntryVal(de);
11300 unsigned char eledigest[20];
11301
11302 snprintf(buf,sizeof(buf),"%.17g",*score);
11303 memset(eledigest,0,20);
11304 mixObjectDigest(eledigest,eleobj);
11305 mixDigest(eledigest,buf,strlen(buf));
11306 xorDigest(digest,eledigest,20);
11307 }
11308 dictReleaseIterator(di);
11309 } else if (o->type == REDIS_HASH) {
d1578a33 11310 hashTypeIterator *hi;
ba798261 11311 robj *obj;
11312
d1578a33
PN
11313 hi = hashTypeInitIterator(o);
11314 while (hashTypeNext(hi) != REDIS_ERR) {
ba798261 11315 unsigned char eledigest[20];
11316
11317 memset(eledigest,0,20);
d1578a33 11318 obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
ba798261 11319 mixObjectDigest(eledigest,obj);
11320 decrRefCount(obj);
d1578a33 11321 obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
ba798261 11322 mixObjectDigest(eledigest,obj);
11323 decrRefCount(obj);
11324 xorDigest(digest,eledigest,20);
11325 }
d1578a33 11326 hashTypeReleaseIterator(hi);
ba798261 11327 } else {
11328 redisPanic("Unknown object type");
11329 }
ba798261 11330 /* If the key has an expire, add it to the mix */
11331 if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
11332 /* We can finally xor the key-val digest to the final digest */
11333 xorDigest(final,digest,20);
09241813 11334 decrRefCount(keyobj);
ba798261 11335 }
11336 dictReleaseIterator(di);
11337 }
11338}
11339
7f957c92 11340static void debugCommand(redisClient *c) {
11341 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
11342 *((char*)-1) = 'x';
210e29f7 11343 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
11344 if (rdbSave(server.dbfilename) != REDIS_OK) {
11345 addReply(c,shared.err);
11346 return;
11347 }
11348 emptyDb();
11349 if (rdbLoad(server.dbfilename) != REDIS_OK) {
11350 addReply(c,shared.err);
11351 return;
11352 }
11353 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
11354 addReply(c,shared.ok);
71c2b467 11355 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
11356 emptyDb();
11357 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
11358 addReply(c,shared.err);
11359 return;
11360 }
11361 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
11362 addReply(c,shared.ok);
333298da 11363 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
09241813 11364 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11365 robj *val;
333298da 11366
11367 if (!de) {
11368 addReply(c,shared.nokeyerr);
11369 return;
11370 }
333298da 11371 val = dictGetEntryVal(de);
560db612 11372 if (!server.vm_enabled || (val->storage == REDIS_VM_MEMORY ||
11373 val->storage == REDIS_VM_SWAPPING)) {
07efaf74 11374 char *strenc;
11375 char buf[128];
11376
11377 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
11378 strenc = strencoding[val->encoding];
11379 } else {
11380 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
11381 strenc = buf;
11382 }
ace06542 11383 addReplySds(c,sdscatprintf(sdsempty(),
09241813 11384 "+Value at:%p refcount:%d "
07efaf74 11385 "encoding:%s serializedlength:%lld\r\n",
09241813 11386 (void*)val, val->refcount,
07efaf74 11387 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 11388 } else {
560db612 11389 vmpointer *vp = (vmpointer*) val;
ace06542 11390 addReplySds(c,sdscatprintf(sdsempty(),
09241813 11391 "+Value swapped at: page %llu "
ace06542 11392 "using %llu pages\r\n",
09241813 11393 (unsigned long long) vp->page,
560db612 11394 (unsigned long long) vp->usedpages));
ace06542 11395 }
78ebe4c8 11396 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
11397 lookupKeyRead(c->db,c->argv[2]);
11398 addReply(c,shared.ok);
7d30035d 11399 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
09241813 11400 dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
11401 robj *val;
560db612 11402 vmpointer *vp;
7d30035d 11403
11404 if (!server.vm_enabled) {
11405 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
11406 return;
11407 }
11408 if (!de) {
11409 addReply(c,shared.nokeyerr);
11410 return;
11411 }
7d30035d 11412 val = dictGetEntryVal(de);
4ef8de8a 11413 /* Swap it */
560db612 11414 if (val->storage != REDIS_VM_MEMORY) {
7d30035d 11415 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
560db612 11416 } else if (val->refcount != 1) {
11417 addReplySds(c,sdsnew("-ERR Object is shared\r\n"));
11418 } else if ((vp = vmSwapObjectBlocking(val)) != NULL) {
11419 dictGetEntryVal(de) = vp;
7d30035d 11420 addReply(c,shared.ok);
11421 } else {
11422 addReply(c,shared.err);
11423 }
59305dc7 11424 } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
11425 long keys, j;
11426 robj *key, *val;
11427 char buf[128];
11428
11429 if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
11430 return;
11431 for (j = 0; j < keys; j++) {
11432 snprintf(buf,sizeof(buf),"key:%lu",j);
11433 key = createStringObject(buf,strlen(buf));
11434 if (lookupKeyRead(c->db,key) != NULL) {
11435 decrRefCount(key);
11436 continue;
11437 }
11438 snprintf(buf,sizeof(buf),"value:%lu",j);
11439 val = createStringObject(buf,strlen(buf));
09241813 11440 dbAdd(c->db,key,val);
11441 decrRefCount(key);
59305dc7 11442 }
11443 addReply(c,shared.ok);
ba798261 11444 } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
11445 unsigned char digest[20];
11446 sds d = sdsnew("+");
11447 int j;
11448
11449 computeDatasetDigest(digest);
11450 for (j = 0; j < 20; j++)
11451 d = sdscatprintf(d, "%02x",digest[j]);
11452
11453 d = sdscatlen(d,"\r\n",2);
11454 addReplySds(c,d);
7f957c92 11455 } else {
333298da 11456 addReplySds(c,sdsnew(
bdcb92f2 11457 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 11458 }
11459}
56906eef 11460
6c96ba7d 11461static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 11462 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
fdfb02e7 11463 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
dfc5e96c 11464#ifdef HAVE_BACKTRACE
11465 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11466 *((char*)-1) = 'x';
11467#endif
11468}
11469
c651fd9e 11470static void _redisPanic(char *msg, char *file, int line) {
11471 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
17772754 11472 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
c651fd9e 11473#ifdef HAVE_BACKTRACE
11474 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
11475 *((char*)-1) = 'x';
11476#endif
11477}
11478
bcfc686d 11479/* =================================== Main! ================================ */
56906eef 11480
bcfc686d 11481#ifdef __linux__
/* Read the current vm.overcommit_memory setting.
 *
 * Returns the number found in /proc/sys/vm/overcommit_memory, or -1 when
 * the file cannot be opened, cannot be read, or does not start with a
 * number. The last case matters because the caller treats 0 as "overcommit
 * disabled": unparsable content must not be reported as 0. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64], *end;
    long value;

    if (!fp) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    value = strtol(buf,&end,10);
    if (end == buf) return -1; /* no digits were consumed */
    return (int)value;
}
11495
11496void linuxOvercommitMemoryWarning(void) {
11497 if (linuxOvercommitMemoryValue() == 0) {
7ccd2d0a 11498 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
bcfc686d 11499 }
11500}
11501#endif /* __linux__ */
11502
11503static void daemonize(void) {
11504 int fd;
11505 FILE *fp;
11506
11507 if (fork() != 0) exit(0); /* parent exits */
11508 setsid(); /* create a new session */
11509
11510 /* Every output goes to /dev/null. If Redis is daemonized but
11511 * the 'logfile' is set to 'stdout' in the configuration file
11512 * it will not log at all. */
11513 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
11514 dup2(fd, STDIN_FILENO);
11515 dup2(fd, STDOUT_FILENO);
11516 dup2(fd, STDERR_FILENO);
11517 if (fd > STDERR_FILENO) close(fd);
11518 }
11519 /* Try to write the pid file */
11520 fp = fopen(server.pidfile,"w");
11521 if (fp) {
11522 fprintf(fp,"%d\n",getpid());
11523 fclose(fp);
56906eef 11524 }
56906eef 11525}
11526
42ab0172 11527static void version() {
8a3b0d2d 11528 printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
11529 REDIS_GIT_SHA1, atoi(REDIS_GIT_DIRTY) > 0);
42ab0172
AO
11530 exit(0);
11531}
11532
/* Print the command line synopsis on stderr and exit with status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
11538
bcfc686d 11539int main(int argc, char **argv) {
9651a787 11540 time_t start;
11541
bcfc686d 11542 initServerConfig();
1a132bbc 11543 sortCommandTable();
bcfc686d 11544 if (argc == 2) {
44efe66e 11545 if (strcmp(argv[1], "-v") == 0 ||
11546 strcmp(argv[1], "--version") == 0) version();
11547 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 11548 resetServerSaveParams();
11549 loadServerConfig(argv[1]);
723fb69b
AO
11550 } else if ((argc > 2)) {
11551 usage();
bcfc686d 11552 } else {
11553 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
11554 }
bcfc686d 11555 if (server.daemonize) daemonize();
71c54b21 11556 initServer();
bcfc686d 11557 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
11558#ifdef __linux__
11559 linuxOvercommitMemoryWarning();
11560#endif
9651a787 11561 start = time(NULL);
bcfc686d 11562 if (server.appendonly) {
11563 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 11564 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 11565 } else {
11566 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 11567 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 11568 }
bcfc686d 11569 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 11570 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 11571 aeMain(server.el);
11572 aeDeleteEventLoop(server.el);
11573 return 0;
11574}
11575
11576/* ============================= Backtrace support ========================= */
11577
11578#ifdef HAVE_BACKTRACE
11579static char *findFuncName(void *pointer, unsigned long *offset);
11580
56906eef 11581static void *getMcontextEip(ucontext_t *uc) {
11582#if defined(__FreeBSD__)
11583 return (void*) uc->uc_mcontext.mc_eip;
11584#elif defined(__dietlibc__)
11585 return (void*) uc->uc_mcontext.eip;
06db1f50 11586#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 11587 #if __x86_64__
11588 return (void*) uc->uc_mcontext->__ss.__rip;
11589 #else
56906eef 11590 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 11591 #endif
06db1f50 11592#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 11593 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 11594 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 11595 #else
11596 return (void*) uc->uc_mcontext->__ss.__eip;
e0a62c7f 11597 #endif
54bac49d 11598#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 11599 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 11600#elif defined(__ia64__) /* Linux IA64 */
11601 return (void*) uc->uc_mcontext.sc_ip;
11602#else
11603 return NULL;
56906eef 11604#endif
11605}
11606
11607static void segvHandler(int sig, siginfo_t *info, void *secret) {
11608 void *trace[100];
11609 char **messages = NULL;
11610 int i, trace_size = 0;
11611 unsigned long offset=0;
56906eef 11612 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 11613 sds infostring;
56906eef 11614 REDIS_NOTUSED(info);
11615
11616 redisLog(REDIS_WARNING,
11617 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 11618 infostring = genRedisInfoString();
11619 redisLog(REDIS_WARNING, "%s",infostring);
11620 /* It's not safe to sdsfree() the returned string under memory
11621 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 11622
56906eef 11623 trace_size = backtrace(trace, 100);
de96dbfe 11624 /* overwrite sigaction with caller's address */
b91cf5ef 11625 if (getMcontextEip(uc) != NULL) {
11626 trace[1] = getMcontextEip(uc);
11627 }
56906eef 11628 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 11629
d76412d1 11630 for (i=1; i<trace_size; ++i) {
56906eef 11631 char *fn = findFuncName(trace[i], &offset), *p;
11632
11633 p = strchr(messages[i],'+');
11634 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
11635 redisLog(REDIS_WARNING,"%s", messages[i]);
11636 } else {
11637 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
11638 }
11639 }
b177fd30 11640 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 11641 _exit(0);
fe3bbfbe 11642}
56906eef 11643
fab43727 11644static void sigtermHandler(int sig) {
11645 REDIS_NOTUSED(sig);
b58ba105 11646
fab43727 11647 redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
11648 server.shutdown_asap = 1;
b58ba105
AM
11649}
11650
56906eef 11651static void setupSigSegvAction(void) {
11652 struct sigaction act;
11653
11654 sigemptyset (&act.sa_mask);
11655 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
11656 * is used. Otherwise, sa_handler is used */
11657 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
11658 act.sa_sigaction = segvHandler;
11659 sigaction (SIGSEGV, &act, NULL);
11660 sigaction (SIGBUS, &act, NULL);
12fea928 11661 sigaction (SIGFPE, &act, NULL);
11662 sigaction (SIGILL, &act, NULL);
11663 sigaction (SIGBUS, &act, NULL);
b58ba105
AM
11664
11665 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
fab43727 11666 act.sa_handler = sigtermHandler;
b58ba105 11667 sigaction (SIGTERM, &act, NULL);
e65fdc78 11668 return;
56906eef 11669}
e65fdc78 11670
bcfc686d 11671#include "staticsymbols.h"
11672/* This function try to convert a pointer into a function name. It's used in
11673 * oreder to provide a backtrace under segmentation fault that's able to
11674 * display functions declared as static (otherwise the backtrace is useless). */
11675static char *findFuncName(void *pointer, unsigned long *offset){
11676 int i, ret = -1;
11677 unsigned long off, minoff = 0;
ed9b544e 11678
bcfc686d 11679 /* Try to match against the Symbol with the smallest offset */
11680 for (i=0; symsTable[i].pointer; i++) {
11681 unsigned long lp = (unsigned long) pointer;
0bc03378 11682
bcfc686d 11683 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
11684 off=lp-symsTable[i].pointer;
11685 if (ret < 0 || off < minoff) {
11686 minoff=off;
11687 ret=i;
11688 }
11689 }
0bc03378 11690 }
bcfc686d 11691 if (ret == -1) return NULL;
11692 *offset = minoff;
11693 return symsTable[ret].name;
0bc03378 11694}
bcfc686d 11695#else /* HAVE_BACKTRACE */
static void setupSigSegvAction(void) {
    /* Built without HAVE_BACKTRACE: no signal handler is installed. */
}
bcfc686d 11698#endif /* HAVE_BACKTRACE */
0bc03378 11699
ed9b544e 11700
ed9b544e 11701
bcfc686d 11702/* The End */
11703
11704
ed9b544e 11705