]> git.saurik.com Git - redis.git/blame - redis.c
log error and quit when the AOF contains an unfinished MULTI
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
24df7698 30#define REDIS_VERSION "1.3.10"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
40#include <signal.h>
fbf9bcdb 41
42#ifdef HAVE_BACKTRACE
c9468bcf 43#include <execinfo.h>
44#include <ucontext.h>
fbf9bcdb 45#endif /* HAVE_BACKTRACE */
46
ed9b544e 47#include <sys/wait.h>
48#include <errno.h>
49#include <assert.h>
50#include <ctype.h>
51#include <stdarg.h>
52#include <inttypes.h>
53#include <arpa/inet.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#include <sys/time.h>
57#include <sys/resource.h>
2895e862 58#include <sys/uio.h>
f78fd11b 59#include <limits.h>
a7866db6 60#include <math.h>
92f8e882 61#include <pthread.h>
0bc1b2f6 62
63#if defined(__sun)
5043dff3 64#include "solarisfixes.h"
65#endif
ed9b544e 66
c9468bcf 67#include "redis.h"
ed9b544e 68#include "ae.h" /* Event driven programming library */
69#include "sds.h" /* Dynamic safe strings */
70#include "anet.h" /* Networking the easy way */
71#include "dict.h" /* Hash tables */
72#include "adlist.h" /* Linked lists */
73#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 74#include "lzf.h" /* LZF compression library */
75#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
5234952b 76#include "zipmap.h"
ed9b544e 77
/* Error codes */
#define REDIS_OK                0
#define REDIS_ERR               (-1)

/* Static server configuration */
#define REDIS_SERVERPORT        6379    /* TCP port */
#define REDIS_MAXIDLETIME       (60*5)  /* default client timeout */
#define REDIS_IOBUF_LEN         1024
#define REDIS_LOADBUF_LEN       1024
#define REDIS_STATIC_ARGS       8
#define REDIS_DEFAULT_DBNUM     16
#define REDIS_CONFIGLINE_MAX    1024
#define REDIS_OBJFREELIST_MAX   1000000 /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME     60      /* Slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON    10 /* lookup 10 expires per loop */
#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
#define REDIS_WRITEV_THRESHOLD      3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT    256

/* Hash table parameters */
#define REDIS_HT_MINFILL        10      /* Minimal hash table fill 10% */
ed9b544e 103
/* Command flags. The values are distinct bits so they can be OR-ed together
 * in the 'flags' field of a command table entry. */
#define REDIS_CMD_BULK          1       /* Bulk write command */
#define REDIS_CMD_INLINE        2       /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
 * this flag will return an error when the 'maxmemory' option is set in the
 * config file and the server is using more than maxmemory bytes of memory.
 * In short, these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM       4
#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 113
/* Object types */
#define REDIS_STRING 0
#define REDIS_LIST 1
#define REDIS_SET 2
#define REDIS_ZSET 3
#define REDIS_HASH 4

/* Objects encoding. Some kinds of objects, like Strings and Hashes, can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of these values. */
#define REDIS_ENCODING_RAW 0    /* Raw representation */
#define REDIS_ENCODING_INT 1    /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
#define REDIS_ENCODING_HT 3     /* Encoded as a hash table */

/* Human readable encoding names, listed in the same order as the
 * REDIS_ENCODING_* values above so the table can be indexed by encoding. */
static char *strencoding[] = {
    "raw", "int", "zipmap", "hashtable"
};
132
/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME 253
#define REDIS_SELECTDB 254
#define REDIS_EOF 255

/* Defines related to the dump file format. To store 32 bits lengths for short
 * keys requires a lot of space, so we check the most significant 2 bits of
 * the first byte to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 =>  01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: specially encoded object will follow. The six bits
 *           number specify the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte, most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN 0
#define REDIS_RDB_14BITLEN 1
#define REDIS_RDB_32BITLEN 2
#define REDIS_RDB_ENCVAL 3
#define REDIS_RDB_LENERR UINT_MAX

/* When a length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * accordingly to the following defines: */
#define REDIS_RDB_ENC_INT8 0        /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1       /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2       /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF 3         /* string compressed with LZF */
a4d1ba9a 164
/* Virtual memory object->storage field values. */
#define REDIS_VM_MEMORY 0       /* The object is on memory */
#define REDIS_VM_SWAPPED 1      /* The object is on disk */
#define REDIS_VM_SWAPPING 2     /* Redis is swapping this object on disk */
#define REDIS_VM_LOADING 3      /* Redis is loading this object from disk */

/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES 65536
#define REDIS_VM_MAX_RANDOM_JUMP 4096
#define REDIS_VM_MAX_THREADS 32
#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 182
/* Client flags. Distinct bits, OR-ed together in the client 'flags' field. */
#define REDIS_SLAVE 1       /* This client is a slave server */
#define REDIS_MASTER 2      /* This client is a master server */
#define REDIS_MONITOR 4     /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI 8       /* This client is in a MULTI context */
#define REDIS_BLOCKED 16    /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT 32    /* The client is waiting for Virtual Memory I/O */

/* Slave replication state - slave side */
#define REDIS_REPL_NONE 0       /* No active replication */
#define REDIS_REPL_CONNECT 1    /* Must connect to master */
#define REDIS_REPL_CONNECTED 2  /* Connected to master */

/* Slave replication state - from the point of view of master
 * Note that in SEND_BULK and ONLINE state the slave receives new updates
 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
 * to start the next background saving in order to send updates to it. */
#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
204
/* List related stuff */
#define REDIS_HEAD 0
#define REDIS_TAIL 1

/* Sort operations */
#define REDIS_SORT_GET 0
#define REDIS_SORT_ASC 1
#define REDIS_SORT_DESC 2
#define REDIS_SORTKEY_MAX 1024

/* Log levels */
#define REDIS_DEBUG 0
#define REDIS_VERBOSE 1
#define REDIS_NOTICE 2
#define REDIS_WARNING 3

/* Anti-warning macro: evaluates its argument and discards the result. */
#define REDIS_NOTUSED(V) ((void) (V))

#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P 0.25      /* Skiplist P = 1/4 */

/* Append only defines */
#define APPENDFSYNC_NO 0
#define APPENDFSYNC_ALWAYS 1
#define APPENDFSYNC_EVERYSEC 2

/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
235
/* We can print the stacktrace, so our assert is defined this way.
 *
 * redisAssert(_e): when the expression is false, calls _redisAssert() with
 * the stringified expression, file and line, then terminates with _exit(1).
 * redisPanic(_e): unconditionally calls _redisPanic() with the stringified
 * argument, file and line, then terminates with _exit(1).
 *
 * Both expansions are fully parenthesized so they behave as a single
 * expression wherever they are used. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
static void _redisAssert(char *estr, char *file, int line);
static void _redisPanic(char *msg, char *file, int line);
dfc5e96c 241
ed9b544e 242/*================================= Data types ============================== */
243
/* A redis object, that is a type able to hold a string / list / set */

/* The VM object structure */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
} vm; /* NOTE(review): the trailing declarator also defines a file-scope
       * variable named 'vm'. It is kept so that no existing reference
       * breaks - confirm whether anything actually uses it. */

/* The actual Redis Object */
typedef struct redisObject {
    void *ptr;
    unsigned char type;
    unsigned char encoding;
    unsigned char storage;  /* If this object is a key, where is the value?
                             * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
    unsigned char vtype; /* If this object is a key, and value is swapped out,
                          * this is the type of the swapped out object. */
    int refcount;
    /* VM fields, these are only allocated if VM is active, otherwise the
     * object allocation function will just allocate
     * sizeof(redisObject) minus sizeof(redisObjectVM), so using
     * Redis without VM active will not have any overhead. */
    struct redisObjectVM vm;
} robj;
269
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the robj variable (not a pointer) to initialize, _ptr the value
 * stored in its 'ptr' field. The 'storage' field is only set when
 * server.vm_enabled is true.
 *
 * The expansion deliberately has no trailing semicolon: the caller supplies
 * it, so the macro can be used as a single statement, e.g. inside an
 * unbraced if/else. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
281
3305306f 282typedef struct redisDb {
4409877e 283 dict *dict; /* The keyspace for this DB */
284 dict *expires; /* Timeout of keys with a timeout set */
285 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 286 dict *io_keys; /* Keys with clients waiting for VM I/O */
3305306f 287 int id;
288} redisDb;
289
6e469882 290/* Client MULTI/EXEC state */
291typedef struct multiCmd {
292 robj **argv;
293 int argc;
294 struct redisCommand *cmd;
295} multiCmd;
296
297typedef struct multiState {
298 multiCmd *commands; /* Array of MULTI commands */
299 int count; /* Total number of MULTI commands */
300} multiState;
301
ed9b544e 302/* With multiplexing we need to take per-clinet state.
303 * Clients are taken in a liked list. */
304typedef struct redisClient {
305 int fd;
3305306f 306 redisDb *db;
ed9b544e 307 int dictid;
308 sds querybuf;
e8a74421 309 robj **argv, **mbargv;
310 int argc, mbargc;
40d224a9 311 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 312 int multibulk; /* multi bulk command format active */
ed9b544e 313 list *reply;
314 int sentlen;
315 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 316 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 317 int slaveseldb; /* slave selected db, if this client is a slave */
318 int authenticated; /* when requirepass is non-NULL */
319 int replstate; /* replication state if this is a slave */
320 int repldbfd; /* replication DB file descriptor */
6e469882 321 long repldboff; /* replication DB file offset */
40d224a9 322 off_t repldbsize; /* replication DB file size */
6e469882 323 multiState mstate; /* MULTI/EXEC state */
d5d55fc3 324 robj **blockingkeys; /* The key we are waiting to terminate a blocking
4409877e 325 * operation such as BLPOP. Otherwise NULL. */
b177fd30 326 int blockingkeysnum; /* Number of blocking keys */
4409877e 327 time_t blockingto; /* Blocking operation timeout. If UNIX current time
328 * is >= blockingto then the operation timed out. */
92f8e882 329 list *io_keys; /* Keys this client is waiting to be loaded from the
330 * swap file in order to continue. */
ffc6b7f8 331 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
332 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 333} redisClient;
334
/* A pair of thresholds: an amount of time in seconds and a number of
 * changes. The server keeps an array of these in 'saveparams'. */
struct saveparam {
    time_t seconds;
    int changes;
};
339
340/* Global server state structure */
341struct redisServer {
342 int port;
343 int fd;
3305306f 344 redisDb *db;
ed9b544e 345 long long dirty; /* changes to DB from the last save */
346 list *clients;
87eca727 347 list *slaves, *monitors;
ed9b544e 348 char neterr[ANET_ERR_LEN];
349 aeEventLoop *el;
350 int cronloops; /* number of times the cron function run */
351 list *objfreelist; /* A list of freed objects to avoid malloc() */
352 time_t lastsave; /* Unix time of last save succeeede */
ed9b544e 353 /* Fields used only for stats */
354 time_t stat_starttime; /* server start time */
355 long long stat_numcommands; /* number of processed commands */
356 long long stat_numconnections; /* number of connections received */
2a6a2ed1 357 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 358 /* Configuration */
359 int verbosity;
360 int glueoutputbuf;
361 int maxidletime;
362 int dbnum;
363 int daemonize;
44b38ef4 364 int appendonly;
48f0308a 365 int appendfsync;
366 time_t lastfsync;
44b38ef4 367 int appendfd;
368 int appendseldb;
ed329fcf 369 char *pidfile;
9f3c422c 370 pid_t bgsavechildpid;
9d65a1bb 371 pid_t bgrewritechildpid;
372 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
ed9b544e 373 struct saveparam *saveparams;
374 int saveparamslen;
375 char *logfile;
376 char *bindaddr;
377 char *dbfilename;
44b38ef4 378 char *appendfilename;
abcb223e 379 char *requirepass;
121f70cf 380 int rdbcompression;
8ca3e9d1 381 int activerehashing;
ed9b544e 382 /* Replication related */
383 int isslave;
d0ccebcf 384 char *masterauth;
ed9b544e 385 char *masterhost;
386 int masterport;
40d224a9 387 redisClient *master; /* client that is master for this slave */
ed9b544e 388 int replstate;
285add55 389 unsigned int maxclients;
4ef8de8a 390 unsigned long long maxmemory;
d5d55fc3 391 unsigned int blpop_blocked_clients;
392 unsigned int vm_blocked_clients;
ed9b544e 393 /* Sort parameters - qsort_r() is only available under BSD so we
394 * have to take this state global, in order to pass it to sortCompare() */
395 int sort_desc;
396 int sort_alpha;
397 int sort_bypattern;
75680a3c 398 /* Virtual memory configuration */
399 int vm_enabled;
054e426d 400 char *vm_swap_file;
75680a3c 401 off_t vm_page_size;
402 off_t vm_pages;
4ef8de8a 403 unsigned long long vm_max_memory;
cbba7dd7 404 /* Hashes config */
405 size_t hash_max_zipmap_entries;
406 size_t hash_max_zipmap_value;
75680a3c 407 /* Virtual memory state */
408 FILE *vm_fp;
409 int vm_fd;
410 off_t vm_next_page; /* Next probably empty page */
411 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 412 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 413 time_t unixtime; /* Unix time sampled every second. */
92f8e882 414 /* Virtual memory I/O threads stuff */
92f8e882 415 /* An I/O thread process an element taken from the io_jobs queue and
996cb5f7 416 * put the result of the operation in the io_done list. While the
417 * job is being processed, it's put on io_processing queue. */
418 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
419 list *io_processing; /* List of VM I/O jobs being processed */
420 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 421 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 422 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 423 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
424 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 425 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 426 int io_active_threads; /* Number of running I/O threads */
427 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 428 /* Our main thread is blocked on the event loop, locking for sockets ready
429 * to be read or written, so when a threaded I/O operation is ready to be
430 * processed by the main thread, the I/O thread will use a unix pipe to
431 * awake the main thread. The followings are the two pipe FDs. */
432 int io_ready_pipe_read;
433 int io_ready_pipe_write;
7d98e08c 434 /* Virtual memory stats */
435 unsigned long long vm_stats_used_pages;
436 unsigned long long vm_stats_swapped_objects;
437 unsigned long long vm_stats_swapouts;
438 unsigned long long vm_stats_swapins;
befec3cd 439 /* Pubsub */
ffc6b7f8 440 dict *pubsub_channels; /* Map channels to list of subscribed clients */
441 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 442 /* Misc */
b9bc0eef 443 FILE *devnull;
ed9b544e 444};
445
ffc6b7f8 446typedef struct pubsubPattern {
447 redisClient *client;
448 robj *pattern;
449} pubsubPattern;
450
ed9b544e 451typedef void redisCommandProc(redisClient *c);
452struct redisCommand {
453 char *name;
454 redisCommandProc *proc;
455 int arity;
456 int flags;
76583ea4
PN
457 /* Use a function to determine which keys need to be loaded
458 * in the background prior to executing this command. Takes precedence
459 * over vm_firstkey and others, ignored when NULL */
460 redisCommandProc *vm_preload_proc;
7c775e09 461 /* What keys should be loaded in background when calling this command? */
462 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
463 int vm_lastkey; /* THe last argument that's a key */
464 int vm_keystep; /* The step between first and last key */
ed9b544e 465};
466
/* A symbol name paired with an address stored as an unsigned long. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
471
ed9b544e 472typedef struct _redisSortObject {
473 robj *obj;
474 union {
475 double score;
476 robj *cmpobj;
477 } u;
478} redisSortObject;
479
480typedef struct _redisSortOperation {
481 int type;
482 robj *pattern;
483} redisSortOperation;
484
6b47e12e 485/* ZSETs use a specialized version of Skiplists */
486
487typedef struct zskiplistNode {
488 struct zskiplistNode **forward;
e3870fab 489 struct zskiplistNode *backward;
912b9165 490 unsigned int *span;
6b47e12e 491 double score;
492 robj *obj;
493} zskiplistNode;
494
495typedef struct zskiplist {
e3870fab 496 struct zskiplistNode *header, *tail;
d13f767c 497 unsigned long length;
6b47e12e 498 int level;
499} zskiplist;
500
1812e024 501typedef struct zset {
502 dict *dict;
6b47e12e 503 zskiplist *zsl;
1812e024 504} zset;
505
6b47e12e 506/* Our shared "common" objects */
507
05df7621 508#define REDIS_SHARED_INTEGERS 10000
ed9b544e 509struct sharedObjectsStruct {
c937aa89 510 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 511 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 512 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
513 *outofrangeerr, *plus,
ed9b544e 514 *select0, *select1, *select2, *select3, *select4,
befec3cd 515 *select5, *select6, *select7, *select8, *select9,
c8d0ea0e 516 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
517 *mbulk4, *psubscribebulk, *punsubscribebulk,
518 *integers[REDIS_SHARED_INTEGERS];
ed9b544e 519} shared;
520
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero, R_PosInf, R_NegInf, R_Nan;
526
92f8e882 527/* VM threaded I/O request message */
b9bc0eef 528#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
529#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
530#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 531typedef struct iojob {
996cb5f7 532 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 533 redisDb *db;/* Redis database */
92f8e882 534 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 535 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
92f8e882 536 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
537 off_t page; /* Swap page where to read/write the object */
248ea310 538 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 539 int canceled; /* True if this command was canceled by blocking side of VM */
540 pthread_t thread; /* ID of the thread processing this entry */
541} iojob;
92f8e882 542
ed9b544e 543/*================================ Prototypes =============================== */
544
545static void freeStringObject(robj *o);
546static void freeListObject(robj *o);
547static void freeSetObject(robj *o);
548static void decrRefCount(void *o);
549static robj *createObject(int type, void *ptr);
550static void freeClient(redisClient *c);
f78fd11b 551static int rdbLoad(char *filename);
ed9b544e 552static void addReply(redisClient *c, robj *obj);
553static void addReplySds(redisClient *c, sds s);
554static void incrRefCount(robj *o);
f78fd11b 555static int rdbSaveBackground(char *filename);
ed9b544e 556static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 557static robj *dupStringObject(robj *o);
248ea310 558static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
dd142b9c 559static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
44b38ef4 560static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 561static int syncWithMaster(void);
05df7621 562static robj *tryObjectEncoding(robj *o);
9d65a1bb 563static robj *getDecodedObject(robj *o);
3305306f 564static int removeExpire(redisDb *db, robj *key);
565static int expireIfNeeded(redisDb *db, robj *key);
566static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 567static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 568static int deleteKey(redisDb *db, robj *key);
bb32ede5 569static time_t getExpire(redisDb *db, robj *key);
570static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 571static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 572static void freeMemoryIfNeeded(void);
de96dbfe 573static int processCommand(redisClient *c);
56906eef 574static void setupSigSegvAction(void);
a3b21203 575static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 576static void aofRemoveTempFile(pid_t childpid);
0ea663ea 577static size_t stringObjectLen(robj *o);
638e42ac 578static void processInputBuffer(redisClient *c);
6b47e12e 579static zskiplist *zslCreate(void);
fd8ccf44 580static void zslFree(zskiplist *zsl);
2b59cfdf 581static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 582static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 583static void initClientMultiState(redisClient *c);
584static void freeClientMultiState(redisClient *c);
585static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 586static void unblockClientWaitingData(redisClient *c);
4409877e 587static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 588static void vmInit(void);
a35ddf12 589static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 590static robj *vmLoadObject(robj *key);
7e69548d 591static robj *vmPreviewObject(robj *key);
a69a0c9c 592static int vmSwapOneObjectBlocking(void);
593static int vmSwapOneObjectThreaded(void);
7e69548d 594static int vmCanSwapOut(void);
a5819310 595static int tryFreeOneObjectFromFreelist(void);
996cb5f7 596static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
597static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
598static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 599static void lockThreadedIO(void);
600static void unlockThreadedIO(void);
601static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
602static void freeIOJob(iojob *j);
603static void queueIOJob(iojob *j);
a5819310 604static int vmWriteObjectOnSwap(robj *o, off_t page);
605static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 606static void waitEmptyIOJobsQueue(void);
607static void vmReopenSwapFile(void);
970e10bb 608static int vmFreePage(off_t page);
76583ea4 609static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
d5d55fc3 610static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
611static int dontWaitForSwappedKey(redisClient *c, robj *key);
612static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
613static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
614static struct redisCommand *lookupCommand(char *name);
615static void call(redisClient *c, struct redisCommand *cmd);
616static void resetClient(redisClient *c);
ada386b2 617static void convertToRealHash(robj *o);
ffc6b7f8 618static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
619static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
620static void freePubsubPattern(void *p);
621static int listMatchPubsubPattern(void *a, void *b);
622static int compareStringObjects(robj *a, robj *b);
befec3cd 623static void usage();
8f63ddca 624static int rewriteAppendOnlyFileBackground(void);
ed9b544e 625
abcb223e 626static void authCommand(redisClient *c);
ed9b544e 627static void pingCommand(redisClient *c);
628static void echoCommand(redisClient *c);
629static void setCommand(redisClient *c);
630static void setnxCommand(redisClient *c);
526d00a5 631static void setexCommand(redisClient *c);
ed9b544e 632static void getCommand(redisClient *c);
633static void delCommand(redisClient *c);
634static void existsCommand(redisClient *c);
635static void incrCommand(redisClient *c);
636static void decrCommand(redisClient *c);
637static void incrbyCommand(redisClient *c);
638static void decrbyCommand(redisClient *c);
639static void selectCommand(redisClient *c);
640static void randomkeyCommand(redisClient *c);
641static void keysCommand(redisClient *c);
642static void dbsizeCommand(redisClient *c);
643static void lastsaveCommand(redisClient *c);
644static void saveCommand(redisClient *c);
645static void bgsaveCommand(redisClient *c);
9d65a1bb 646static void bgrewriteaofCommand(redisClient *c);
ed9b544e 647static void shutdownCommand(redisClient *c);
648static void moveCommand(redisClient *c);
649static void renameCommand(redisClient *c);
650static void renamenxCommand(redisClient *c);
651static void lpushCommand(redisClient *c);
652static void rpushCommand(redisClient *c);
653static void lpopCommand(redisClient *c);
654static void rpopCommand(redisClient *c);
655static void llenCommand(redisClient *c);
656static void lindexCommand(redisClient *c);
657static void lrangeCommand(redisClient *c);
658static void ltrimCommand(redisClient *c);
659static void typeCommand(redisClient *c);
660static void lsetCommand(redisClient *c);
661static void saddCommand(redisClient *c);
662static void sremCommand(redisClient *c);
a4460ef4 663static void smoveCommand(redisClient *c);
ed9b544e 664static void sismemberCommand(redisClient *c);
665static void scardCommand(redisClient *c);
12fea928 666static void spopCommand(redisClient *c);
2abb95a9 667static void srandmemberCommand(redisClient *c);
ed9b544e 668static void sinterCommand(redisClient *c);
669static void sinterstoreCommand(redisClient *c);
40d224a9 670static void sunionCommand(redisClient *c);
671static void sunionstoreCommand(redisClient *c);
f4f56e1d 672static void sdiffCommand(redisClient *c);
673static void sdiffstoreCommand(redisClient *c);
ed9b544e 674static void syncCommand(redisClient *c);
675static void flushdbCommand(redisClient *c);
676static void flushallCommand(redisClient *c);
677static void sortCommand(redisClient *c);
678static void lremCommand(redisClient *c);
0f5f7e9a 679static void rpoplpushcommand(redisClient *c);
ed9b544e 680static void infoCommand(redisClient *c);
70003d28 681static void mgetCommand(redisClient *c);
87eca727 682static void monitorCommand(redisClient *c);
3305306f 683static void expireCommand(redisClient *c);
802e8373 684static void expireatCommand(redisClient *c);
f6b141c5 685static void getsetCommand(redisClient *c);
fd88489a 686static void ttlCommand(redisClient *c);
321b0e13 687static void slaveofCommand(redisClient *c);
7f957c92 688static void debugCommand(redisClient *c);
f6b141c5 689static void msetCommand(redisClient *c);
690static void msetnxCommand(redisClient *c);
fd8ccf44 691static void zaddCommand(redisClient *c);
7db723ad 692static void zincrbyCommand(redisClient *c);
cc812361 693static void zrangeCommand(redisClient *c);
50c55df5 694static void zrangebyscoreCommand(redisClient *c);
f44dd428 695static void zcountCommand(redisClient *c);
e3870fab 696static void zrevrangeCommand(redisClient *c);
3c41331e 697static void zcardCommand(redisClient *c);
1b7106e7 698static void zremCommand(redisClient *c);
6e333bbe 699static void zscoreCommand(redisClient *c);
1807985b 700static void zremrangebyscoreCommand(redisClient *c);
6e469882 701static void multiCommand(redisClient *c);
702static void execCommand(redisClient *c);
18b6cb76 703static void discardCommand(redisClient *c);
4409877e 704static void blpopCommand(redisClient *c);
705static void brpopCommand(redisClient *c);
4b00bebd 706static void appendCommand(redisClient *c);
39191553 707static void substrCommand(redisClient *c);
69d95c3e 708static void zrankCommand(redisClient *c);
798d9e55 709static void zrevrankCommand(redisClient *c);
978c2c94 710static void hsetCommand(redisClient *c);
1f1c7695 711static void hsetnxCommand(redisClient *c);
978c2c94 712static void hgetCommand(redisClient *c);
09aeb579
PN
713static void hmsetCommand(redisClient *c);
714static void hmgetCommand(redisClient *c);
07efaf74 715static void hdelCommand(redisClient *c);
92b27fe9 716static void hlenCommand(redisClient *c);
9212eafd 717static void zremrangebyrankCommand(redisClient *c);
2830ca53
PN
718static void zunionCommand(redisClient *c);
719static void zinterCommand(redisClient *c);
78409a0f 720static void hkeysCommand(redisClient *c);
721static void hvalsCommand(redisClient *c);
722static void hgetallCommand(redisClient *c);
a86f14b1 723static void hexistsCommand(redisClient *c);
500ece7c 724static void configCommand(redisClient *c);
01426b05 725static void hincrbyCommand(redisClient *c);
befec3cd 726static void subscribeCommand(redisClient *c);
727static void unsubscribeCommand(redisClient *c);
ffc6b7f8 728static void psubscribeCommand(redisClient *c);
729static void punsubscribeCommand(redisClient *c);
befec3cd 730static void publishCommand(redisClient *c);
f6b141c5 731
ed9b544e 732/*================================= Globals ================================= */
733
734/* Global vars */
735static struct redisServer server; /* server global state */
/* Command table: one entry per supported command, terminated by an entry
 * whose fields are all NULL/zero.
 * Each entry lists, in order: the command name, the function implementing
 * it, an integer that looks like the expected argument count (presumably a
 * negative value means "at least N arguments" -- TODO confirm against
 * struct redisCommand), the REDIS_CMD_* flags, an optional callback (NULL
 * for every command here except zunion/zinter), and three trailing integers
 * (presumably first key, last key and key step -- TODO confirm). */
736static struct redisCommand cmdTable[] = {
76583ea4
PN
737 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
738 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
739 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
526d00a5 740 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
741 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
742 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
743 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
744 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
745 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
746 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
747 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
748 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
749 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
750 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
751 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
754 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
755 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
756 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
757 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
758 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
760 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
761 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
762 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
763 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
764 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
765 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
766 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
767 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
768 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
769 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
770 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
771 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
772 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
773 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
774 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
776 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
777 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
778 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
781 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
782 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
783 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
784 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
785 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
786 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
788 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
789 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
790 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
1f1c7695 791 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 792 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
d33278d1 793 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 794 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
01426b05 795 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
796 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
797 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
800 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 801 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
802 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
803 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
804 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
805 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
806 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
807 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
808 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
809 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
810 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
811 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
812 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
814 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
815 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
816 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
818 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
819 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
820 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
823 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
824 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
825 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
958cd5f3 826 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
827 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
831 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
832 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
835 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 837 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 838 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 840 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 842 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
76583ea4 843 {NULL,NULL,0,0,NULL,0,0,0}
ed9b544e 844};
bcfc686d 845
ed9b544e 846/*============================ Utility functions ============================ */
847
/* Glob-style pattern matching on length-delimited buffers.
 *
 * Supported syntax:
 *   *        matches any sequence of characters (including none)
 *   ?        matches exactly one character
 *   [abc]    matches one character of the set; [^abc] negates the set,
 *            [a-z] is a range, and a backslash escapes the next character
 *   \x       matches the character x literally
 *
 * pattern/patternLen and string/stringLen describe the two buffers; neither
 * needs to be NUL terminated and no byte past the given lengths is read.
 * When nocase is non-zero characters are compared after tolower().
 * A character class that is not closed before the end of the pattern is
 * treated as if it were closed there.
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse consecutive stars without looking past the end of
             * the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every suffix. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one character: with nothing left to
             * consume it can't match (and string[0] must not be read). */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen >= 2 && pattern[0] == '\\') {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * advance below leaves us exactly at the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs one character to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing stars may remain. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
970
/* Glob-style match of two NUL terminated C strings: measures both with
 * strlen() and delegates to stringmatchlen(). Returns its result. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
974
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The string is an optional '-' followed by decimal digits and an optional
 * case insensitive unit: none or "b" (bytes), "k", "m", "g" (powers of
 * 1000), "kb", "mb", "gb" (powers of 1024).
 *
 * If err is not NULL, *err is set to 1 on parsing error and to 0 otherwise.
 * With an unknown unit the number is returned unscaled. If the number has
 * too many digits, or the scaled value does not fit in a long long,
 * LLONG_MAX (LLONG_MIN for negative values) is returned. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* isdigit() is only defined for unsigned char values and EOF. */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    /* Copy the numeric part (sign included) into a NUL terminated buffer. */
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    /* Signed overflow is undefined behavior: detect it before multiplying
     * (mul is always positive here). */
    if (val > LLONG_MAX/mul || val < LLONG_MIN/mul) {
        if (err) *err = 1;
        return (val < 0) ? LLONG_MIN : LLONG_MAX;
    }
    return val*mul;
}
1021
56906eef 1022static void redisLog(int level, const char *fmt, ...) {
ed9b544e 1023 va_list ap;
1024 FILE *fp;
1025
1026 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1027 if (!fp) return;
1028
1029 va_start(ap, fmt);
1030 if (level >= server.verbosity) {
6766f45e 1031 char *c = ".-*#";
1904ecc1 1032 char buf[64];
1033 time_t now;
1034
1035 now = time(NULL);
6c9385e0 1036 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 1037 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 1038 vfprintf(fp, fmt, ap);
1039 fprintf(fp,"\n");
1040 fflush(fp);
1041 }
1042 va_end(ap);
1043
1044 if (server.logfile) fclose(fp);
1045}
1046
1047/*====================== Hash table type implementation ==================== */
1048
1049/* This is a hash table type that uses the SDS dynamic strings library as
1050 * keys and redis objects as values (objects can hold SDS strings,
1051 * lists, sets). */
1052
/* Dict destructor callback that releases val with zfree().
 * privdata is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    zfree(val);
    DICT_NOTUSED(privdata);
}
1058
4409877e 1059static void dictListDestructor(void *privdata, void *val)
1060{
1061 DICT_NOTUSED(privdata);
1062 listRelease((list*)val);
1063}
1064
ed9b544e 1065static int sdsDictKeyCompare(void *privdata, const void *key1,
1066 const void *key2)
1067{
1068 int l1,l2;
1069 DICT_NOTUSED(privdata);
1070
1071 l1 = sdslen((sds)key1);
1072 l2 = sdslen((sds)key2);
1073 if (l1 != l2) return 0;
1074 return memcmp(key1, key2, l1) == 0;
1075}
1076
/* Dict destructor callback for redis objects: drops one reference with
 * decrRefCount(). NULL is ignored, since values of swapped out keys are
 * set to NULL. privdata is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    if (val != NULL)
        decrRefCount(val);
}
1084
942a3961 1085static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1086 const void *key2)
1087{
1088 const robj *o1 = key1, *o2 = key2;
1089 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1090}
1091
942a3961 1092static unsigned int dictObjHash(const void *key) {
ed9b544e 1093 const robj *o = key;
1094 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1095}
1096
942a3961 1097static int dictEncObjKeyCompare(void *privdata, const void *key1,
1098 const void *key2)
1099{
9d65a1bb 1100 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1101 int cmp;
942a3961 1102
2a1198b4 1103 if (o1->encoding == REDIS_ENCODING_INT &&
1104 o2->encoding == REDIS_ENCODING_INT &&
db5946fc 1105 o1->ptr == o2->ptr) return 1;
2a1198b4 1106
9d65a1bb 1107 o1 = getDecodedObject(o1);
1108 o2 = getDecodedObject(o2);
1109 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1110 decrRefCount(o1);
1111 decrRefCount(o2);
1112 return cmp;
942a3961 1113}
1114
1115static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1116 robj *o = (robj*) key;
942a3961 1117
ed9e4966 1118 if (o->encoding == REDIS_ENCODING_RAW) {
1119 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1120 } else {
1121 if (o->encoding == REDIS_ENCODING_INT) {
1122 char buf[32];
1123 int len;
1124
1125 len = snprintf(buf,32,"%ld",(long)o->ptr);
1126 return dictGenHashFunction((unsigned char*)buf, len);
1127 } else {
1128 unsigned int hash;
1129
1130 o = getDecodedObject(o);
1131 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1132 decrRefCount(o);
1133 return hash;
1134 }
1135 }
942a3961 1136}
1137
f2d9f50f 1138/* Sets type and expires.
 * Keys are hashed and compared with the dictEncObj* callbacks and released
 * with dictRedisObjectDestructor; no value callbacks are installed.
 * NOTE(review): the "expires" mention may be stale, since keyptrDictType is
 * the table labelled Db->expires -- confirm. */
ed9b544e 1139static dictType setDictType = {
942a3961 1140 dictEncObjHash, /* hash function */
ed9b544e 1141 NULL, /* key dup */
1142 NULL, /* val dup */
942a3961 1143 dictEncObjKeyCompare, /* key compare */
ed9b544e 1144 dictRedisObjectDestructor, /* key destructor */
1145 NULL /* val destructor */
1146};
1147
f2d9f50f 1148/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys use the dictEncObj* callbacks and are released with
 * dictRedisObjectDestructor; values are released with dictVanillaFree. */
1812e024 1149static dictType zsetDictType = {
1150 dictEncObjHash, /* hash function */
1151 NULL, /* key dup */
1152 NULL, /* val dup */
1153 dictEncObjKeyCompare, /* key compare */
1154 dictRedisObjectDestructor, /* key destructor */
da0a1620 1155 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1156};
1157
f2d9f50f 1158/* Db->dict: keys use the dictObj* callbacks; both keys and values are
 * released with dictRedisObjectDestructor. */
5234952b 1159static dictType dbDictType = {
942a3961 1160 dictObjHash, /* hash function */
ed9b544e 1161 NULL, /* key dup */
1162 NULL, /* val dup */
942a3961 1163 dictObjKeyCompare, /* key compare */
ed9b544e 1164 dictRedisObjectDestructor, /* key destructor */
1165 dictRedisObjectDestructor /* val destructor */
1166};
1167
f2d9f50f 1168/* Db->expires: keys use the dictObj* callbacks and are released with
 * dictRedisObjectDestructor; no value destructor is installed, so values
 * are not freed by the dict. */
1169static dictType keyptrDictType = {
1170 dictObjHash, /* hash function */
1171 NULL, /* key dup */
1172 NULL, /* val dup */
1173 dictObjKeyCompare, /* key compare */
1174 dictRedisObjectDestructor, /* key destructor */
1175 NULL /* val destructor */
1176};
1177
5234952b 1178/* Hash type hash table (note that small hashes are represented with zipmaps).
 * Keys use the dictEncObj* callbacks; both keys and values are released
 * with dictRedisObjectDestructor. */
1179static dictType hashDictType = {
1180 dictEncObjHash, /* hash function */
1181 NULL, /* key dup */
1182 NULL, /* val dup */
1183 dictEncObjKeyCompare, /* key compare */
1184 dictRedisObjectDestructor, /* key destructor */
1185 dictRedisObjectDestructor /* val destructor */
1186};
1187
4409877e 1188/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1189 * lists as values. It's used for blocking operations (BLPOP) and to
1190 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * Values are released with dictListDestructor. */
4409877e 1191static dictType keylistDictType = {
1192 dictObjHash, /* hash function */
1193 NULL, /* key dup */
1194 NULL, /* val dup */
1195 dictObjKeyCompare, /* key compare */
1196 dictRedisObjectDestructor, /* key destructor */
1197 dictListDestructor /* val destructor */
1198};
1199
42ab0172
AO
1200static void version();
1201
ed9b544e 1202/* ========================= Random utility functions ======================= */
1203
1204/* Redis generally does not try to recover from out of memory conditions
1205 * when allocating objects or strings, it is not clear if it will be possible
1206 * to report this condition to the client since the networking layer itself
1207 * is based on heap allocation for send buffers, so we simply abort.
1208 * At least the code will be simpler to read... */
1209static void oom(const char *msg) {
71c54b21 1210 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1211 sleep(1);
1212 abort();
1213}
1214
1215/* ====================== Redis server networking stuff ===================== */
56906eef 1216static void closeTimedoutClients(void) {
ed9b544e 1217 redisClient *c;
ed9b544e 1218 listNode *ln;
1219 time_t now = time(NULL);
c7df85a4 1220 listIter li;
ed9b544e 1221
c7df85a4 1222 listRewind(server.clients,&li);
1223 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1224 c = listNodeValue(ln);
f86a74e9 1225 if (server.maxidletime &&
1226 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1227 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1228 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1229 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1230 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1231 {
f870935d 1232 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1233 freeClient(c);
f86a74e9 1234 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1235 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1236 addReply(c,shared.nullmultibulk);
b0d8747d 1237 unblockClientWaitingData(c);
f86a74e9 1238 }
ed9b544e 1239 }
1240 }
ed9b544e 1241}
1242
12fea928 1243static int htNeedsResize(dict *dict) {
1244 long long size, used;
1245
1246 size = dictSlots(dict);
1247 used = dictSize(dict);
1248 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1249 (used*100/size < REDIS_HT_MINFILL));
1250}
1251
0bc03378 1252/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1253 * we resize the hash table to save memory */
56906eef 1254static void tryResizeHashTables(void) {
0bc03378 1255 int j;
1256
1257 for (j = 0; j < server.dbnum; j++) {
5413c40d 1258 if (htNeedsResize(server.db[j].dict))
0bc03378 1259 dictResize(server.db[j].dict);
12fea928 1260 if (htNeedsResize(server.db[j].expires))
1261 dictResize(server.db[j].expires);
0bc03378 1262 }
1263}
1264
8ca3e9d1 1265/* Our hash table implementation performs rehashing incrementally while
1266 * we write/read from the hash table. Still if the server is idle, the hash
1267 * table will use two tables for a long time. So we try to use 1 millisecond
1268 * of CPU time at every serverCron() loop in order to rehash some key. */
1269static void incrementallyRehash(void) {
1270 int j;
1271
1272 for (j = 0; j < server.dbnum; j++) {
1273 if (dictIsRehashing(server.db[j].dict)) {
1274 dictRehashMilliseconds(server.db[j].dict,1);
1275 break; /* already used our millisecond for this loop... */
1276 }
1277 }
1278}
1279
9d65a1bb 1280/* A background saving child (BGSAVE) terminated its work. Handle this. */
1281void backgroundSaveDoneHandler(int statloc) {
1282 int exitcode = WEXITSTATUS(statloc);
1283 int bysignal = WIFSIGNALED(statloc);
1284
1285 if (!bysignal && exitcode == 0) {
1286 redisLog(REDIS_NOTICE,
1287 "Background saving terminated with success");
1288 server.dirty = 0;
1289 server.lastsave = time(NULL);
1290 } else if (!bysignal && exitcode != 0) {
1291 redisLog(REDIS_WARNING, "Background saving error");
1292 } else {
1293 redisLog(REDIS_WARNING,
454eea7c 1294 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1295 rdbRemoveTempFile(server.bgsavechildpid);
1296 }
1297 server.bgsavechildpid = -1;
1298 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1299 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1300 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1301}
1302
1303/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1304 * Handle this. */
1305void backgroundRewriteDoneHandler(int statloc) {
1306 int exitcode = WEXITSTATUS(statloc);
1307 int bysignal = WIFSIGNALED(statloc);
1308
1309 if (!bysignal && exitcode == 0) {
1310 int fd;
1311 char tmpfile[256];
1312
1313 redisLog(REDIS_NOTICE,
1314 "Background append only file rewriting terminated with success");
1315 /* Now it's time to flush the differences accumulated by the parent */
1316 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1317 fd = open(tmpfile,O_WRONLY|O_APPEND);
1318 if (fd == -1) {
1319 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1320 goto cleanup;
1321 }
1322 /* Flush our data... */
1323 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1324 (signed) sdslen(server.bgrewritebuf)) {
1325 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1326 close(fd);
1327 goto cleanup;
1328 }
b32627cd 1329 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1330 /* Now our work is to rename the temp file into the stable file. And
1331 * switch the file descriptor used by the server for append only. */
1332 if (rename(tmpfile,server.appendfilename) == -1) {
1333 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1334 close(fd);
1335 goto cleanup;
1336 }
1337 /* Mission completed... almost */
1338 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1339 if (server.appendfd != -1) {
1340 /* If append only is actually enabled... */
1341 close(server.appendfd);
1342 server.appendfd = fd;
1343 fsync(fd);
85a83172 1344 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1345 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1346 } else {
1347 /* If append only is disabled we just generate a dump in this
1348 * format. Why not? */
1349 close(fd);
1350 }
1351 } else if (!bysignal && exitcode != 0) {
1352 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1353 } else {
1354 redisLog(REDIS_WARNING,
454eea7c 1355 "Background append only file rewriting terminated by signal %d",
1356 WTERMSIG(statloc));
9d65a1bb 1357 }
1358cleanup:
1359 sdsfree(server.bgrewritebuf);
1360 server.bgrewritebuf = sdsempty();
1361 aofRemoveTempFile(server.bgrewritechildpid);
1362 server.bgrewritechildpid = -1;
1363}
1364
884d4b39 1365/* This function is called once a background process of some kind terminates,
1366 * as we want to avoid resizing the hash tables when there is a child in order
1367 * to play well with copy-on-write (otherwise when a resize happens lots of
1368 * memory pages are copied). The goal of this function is to update the ability
1369 * for dict.c to resize the hash tables accordingly to the fact we have o not
1370 * running childs. */
1371static void updateDictResizePolicy(void) {
1372 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1373 dictEnableResize();
1374 else
1375 dictDisableResize();
1376}
1377
/* Periodic housekeeping callback. On each run it: refreshes the cached
 * server.unixtime, logs database and client statistics every 50 runs,
 * resizes/rehashes the hash tables when no background child is active,
 * closes timed out clients, handles a terminated background child or starts
 * a background save when a save point is reached, expires a sample of
 * volatile keys, swaps objects out when over vm_max_memory, and tries to
 * connect to the master when needed.
 * The three parameters are unused. Always returns 100 (presumably the delay
 * in milliseconds before the event loop calls it again -- TODO confirm
 * against the ae time event API). */
56906eef 1378static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1379 int j, loops = server.cronloops++;
ed9b544e 1380 REDIS_NOTUSED(eventLoop);
1381 REDIS_NOTUSED(id);
1382 REDIS_NOTUSED(clientData);
1383
3a66edc7 1384 /* We take a cached value of the unix time in the global state because
1385 * with virtual memory and aging there is to store the current time
1386 * in objects at every object access, and accuracy is not needed.
1387 * To access a global var is faster than calling time(NULL) */
1388 server.unixtime = time(NULL);
1389
0bc03378 1390 /* Show some info about non-empty databases */
ed9b544e 1391 for (j = 0; j < server.dbnum; j++) {
dec423d9 1392 long long size, used, vkeys;
94754ccc 1393
3305306f 1394 size = dictSlots(server.db[j].dict);
1395 used = dictSize(server.db[j].dict);
94754ccc 1396 vkeys = dictSize(server.db[j].expires);
1763929f 1397 if (!(loops % 50) && (used || vkeys)) {
f870935d 1398 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1399 /* dictPrintStats(server.dict); */
ed9b544e 1400 }
ed9b544e 1401 }
1402
0bc03378 1403 /* We don't want to resize the hash tables while a background saving
1404 * is in progress: the saving child is created using fork() that is
1405 * implemented with a copy-on-write semantic in most modern systems, so
1406 * if we resize the HT while there is the saving child at work actually
1407 * a lot of memory movements in the parent will cause a lot of pages
1408 * copied. */
8ca3e9d1 1409 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1410 if (!(loops % 10)) tryResizeHashTables();
1411 if (server.activerehashing) incrementallyRehash();
884d4b39 1412 }
0bc03378 1413
ed9b544e 1414 /* Show information about connected clients */
1763929f 1415 if (!(loops % 50)) {
bdcb92f2 1416 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1417 listLength(server.clients)-listLength(server.slaves),
1418 listLength(server.slaves),
bdcb92f2 1419 zmalloc_used_memory());
ed9b544e 1420 }
1421
1422 /* Close connections of timedout clients */
1763929f 1423 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1424 closeTimedoutClients();
1425
9d65a1bb 1426 /* Check if a background saving or AOF rewrite in progress terminated */
1427 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1428 int statloc;
9d65a1bb 1429 pid_t pid;
1430
/* NOTE(review): any non-zero return value that is not the BGSAVE child pid
 * is handled as the rewrite child; this appears to include a wait3() error
 * return, in which case statloc would not have been set -- confirm. */
1431 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1432 if (pid == server.bgsavechildpid) {
1433 backgroundSaveDoneHandler(statloc);
ed9b544e 1434 } else {
9d65a1bb 1435 backgroundRewriteDoneHandler(statloc);
ed9b544e 1436 }
884d4b39 1437 updateDictResizePolicy();
ed9b544e 1438 }
1439 } else {
1440 /* If there is not a background saving in progress check if
1441 * we have to save now */
1442 time_t now = time(NULL);
1443 for (j = 0; j < server.saveparamslen; j++) {
1444 struct saveparam *sp = server.saveparams+j;
1445
1446 if (server.dirty >= sp->changes &&
1447 now-server.lastsave > sp->seconds) {
1448 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1449 sp->changes, sp->seconds);
f78fd11b 1450 rdbSaveBackground(server.dbfilename);
ed9b544e 1451 break;
1452 }
1453 }
1454 }
94754ccc 1455
f2324293 1456 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1457 * will use few CPU cycles if there are few expiring keys, otherwise
1458 * it will get more aggressive to avoid that too much memory is used by
1459 * keys that can be removed from the keyspace. */
94754ccc 1460 for (j = 0; j < server.dbnum; j++) {
f2324293 1461 int expired;
94754ccc 1462 redisDb *db = server.db+j;
94754ccc 1463
f2324293 1464 /* Continue to expire if at the end of the cycle more than 25%
1465 * of the keys were expired. */
1466 do {
4ef8de8a 1467 long num = dictSize(db->expires);
94754ccc 1468 time_t now = time(NULL);
1469
f2324293 1470 expired = 0;
94754ccc 1471 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1472 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1473 while (num--) {
1474 dictEntry *de;
1475 time_t t;
1476
1477 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1478 t = (time_t) dictGetEntryVal(de);
1479 if (now > t) {
1480 deleteKey(db,dictGetEntryKey(de));
f2324293 1481 expired++;
2a6a2ed1 1482 server.stat_expiredkeys++;
94754ccc 1483 }
1484 }
f2324293 1485 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1486 }
1487
4ef8de8a 1488 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1489 * is enabled. Try to free objects from the free list first. */
7e69548d 1490 if (vmCanSwapOut()) {
1491 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1492 server.vm_max_memory)
1493 {
72e9fd40 1494 int retval;
1495
a5819310 1496 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1497 retval = (server.vm_max_threads == 0) ?
1498 vmSwapOneObjectBlocking() :
1499 vmSwapOneObjectThreaded();
1763929f 1500 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1501 zmalloc_used_memory() >
1502 (server.vm_max_memory+server.vm_max_memory/10))
1503 {
1504 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1505 }
72e9fd40 1506 /* Note that when using threaded I/O we free just one object,
1507 * because anyway when the I/O thread in charge to swap this
1508 * object out will finish, the handler of completed jobs
1509 * will try to swap more objects if we are still out of memory. */
1510 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1511 }
1512 }
1513
ed9b544e 1514 /* Check if we should connect to a MASTER */
1763929f 1515 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1516 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1517 if (syncWithMaster() == REDIS_OK) {
1518 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
8f63ddca 1519 if (server.appendonly) rewriteAppendOnlyFileBackground();
ed9b544e 1520 }
1521 }
1763929f 1522 return 100;
ed9b544e 1523}
1524
d5d55fc3 1525/* This function gets called every time Redis is entering the
1526 * main loop of the event driven library, that is, before to sleep
1527 * for ready file descriptors. */
1528static void beforeSleep(struct aeEventLoop *eventLoop) {
1529 REDIS_NOTUSED(eventLoop);
1530
1531 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1532 listIter li;
1533 listNode *ln;
1534
1535 listRewind(server.io_ready_clients,&li);
1536 while((ln = listNext(&li))) {
1537 redisClient *c = ln->value;
1538 struct redisCommand *cmd;
1539
1540 /* Resume the client. */
1541 listDelNode(server.io_ready_clients,ln);
1542 c->flags &= (~REDIS_IO_WAIT);
1543 server.vm_blocked_clients--;
1544 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1545 readQueryFromClient, c);
1546 cmd = lookupCommand(c->argv[0]->ptr);
1547 assert(cmd != NULL);
1548 call(c,cmd);
1549 resetClient(c);
1550 /* There may be more data to process in the input buffer. */
1551 if (c->querybuf && sdslen(c->querybuf) > 0)
1552 processInputBuffer(c);
1553 }
1554 }
1555}
1556
ed9b544e 1557static void createSharedObjects(void) {
05df7621 1558 int j;
1559
ed9b544e 1560 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1561 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1562 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1563 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1564 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1565 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1566 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1567 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1568 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1569 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1570 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1571 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1572 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1573 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1574 "-ERR no such key\r\n"));
ed9b544e 1575 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1576 "-ERR syntax error\r\n"));
c937aa89 1577 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1578 "-ERR source and destination objects are the same\r\n"));
1579 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1580 "-ERR index out of range\r\n"));
ed9b544e 1581 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1582 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1583 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1584 shared.select0 = createStringObject("select 0\r\n",10);
1585 shared.select1 = createStringObject("select 1\r\n",10);
1586 shared.select2 = createStringObject("select 2\r\n",10);
1587 shared.select3 = createStringObject("select 3\r\n",10);
1588 shared.select4 = createStringObject("select 4\r\n",10);
1589 shared.select5 = createStringObject("select 5\r\n",10);
1590 shared.select6 = createStringObject("select 6\r\n",10);
1591 shared.select7 = createStringObject("select 7\r\n",10);
1592 shared.select8 = createStringObject("select 8\r\n",10);
1593 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1594 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
c8d0ea0e 1595 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
befec3cd 1596 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1597 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1598 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1599 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1600 shared.mbulk3 = createStringObject("*3\r\n",4);
c8d0ea0e 1601 shared.mbulk4 = createStringObject("*4\r\n",4);
05df7621 1602 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1603 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1604 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1605 }
ed9b544e 1606}
1607
1608static void appendServerSaveParams(time_t seconds, int changes) {
1609 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1610 server.saveparams[server.saveparamslen].seconds = seconds;
1611 server.saveparams[server.saveparamslen].changes = changes;
1612 server.saveparamslen++;
1613}
1614
bcfc686d 1615static void resetServerSaveParams() {
ed9b544e 1616 zfree(server.saveparams);
1617 server.saveparams = NULL;
1618 server.saveparamslen = 0;
1619}
1620
1621static void initServerConfig() {
1622 server.dbnum = REDIS_DEFAULT_DBNUM;
1623 server.port = REDIS_SERVERPORT;
f870935d 1624 server.verbosity = REDIS_VERBOSE;
ed9b544e 1625 server.maxidletime = REDIS_MAXIDLETIME;
1626 server.saveparams = NULL;
1627 server.logfile = NULL; /* NULL = log on standard output */
1628 server.bindaddr = NULL;
1629 server.glueoutputbuf = 1;
1630 server.daemonize = 0;
44b38ef4 1631 server.appendonly = 0;
4e141d5a 1632 server.appendfsync = APPENDFSYNC_ALWAYS;
48f0308a 1633 server.lastfsync = time(NULL);
44b38ef4 1634 server.appendfd = -1;
1635 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1636 server.pidfile = zstrdup("/var/run/redis.pid");
1637 server.dbfilename = zstrdup("dump.rdb");
1638 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1639 server.requirepass = NULL;
b0553789 1640 server.rdbcompression = 1;
8ca3e9d1 1641 server.activerehashing = 1;
285add55 1642 server.maxclients = 0;
d5d55fc3 1643 server.blpop_blocked_clients = 0;
3fd78bcd 1644 server.maxmemory = 0;
75680a3c 1645 server.vm_enabled = 0;
054e426d 1646 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1647 server.vm_page_size = 256; /* 256 bytes per page */
1648 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1649 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1650 server.vm_max_threads = 4;
d5d55fc3 1651 server.vm_blocked_clients = 0;
cbba7dd7 1652 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1653 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
75680a3c 1654
bcfc686d 1655 resetServerSaveParams();
ed9b544e 1656
1657 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1658 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1659 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1660 /* Replication related */
1661 server.isslave = 0;
d0ccebcf 1662 server.masterauth = NULL;
ed9b544e 1663 server.masterhost = NULL;
1664 server.masterport = 6379;
1665 server.master = NULL;
1666 server.replstate = REDIS_REPL_NONE;
a7866db6 1667
1668 /* Double constants initialization */
1669 R_Zero = 0.0;
1670 R_PosInf = 1.0/R_Zero;
1671 R_NegInf = -1.0/R_Zero;
1672 R_Nan = R_Zero/R_Zero;
ed9b544e 1673}
1674
1675static void initServer() {
1676 int j;
1677
1678 signal(SIGHUP, SIG_IGN);
1679 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1680 setupSigSegvAction();
ed9b544e 1681
b9bc0eef 1682 server.devnull = fopen("/dev/null","w");
1683 if (server.devnull == NULL) {
1684 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1685 exit(1);
1686 }
ed9b544e 1687 server.clients = listCreate();
1688 server.slaves = listCreate();
87eca727 1689 server.monitors = listCreate();
ed9b544e 1690 server.objfreelist = listCreate();
1691 createSharedObjects();
1692 server.el = aeCreateEventLoop();
3305306f 1693 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1694 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1695 if (server.fd == -1) {
1696 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1697 exit(1);
1698 }
3305306f 1699 for (j = 0; j < server.dbnum; j++) {
5234952b 1700 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1701 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
4409877e 1702 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1703 if (server.vm_enabled)
1704 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1705 server.db[j].id = j;
1706 }
ffc6b7f8 1707 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1708 server.pubsub_patterns = listCreate();
1709 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1710 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1711 server.cronloops = 0;
9f3c422c 1712 server.bgsavechildpid = -1;
9d65a1bb 1713 server.bgrewritechildpid = -1;
1714 server.bgrewritebuf = sdsempty();
ed9b544e 1715 server.lastsave = time(NULL);
1716 server.dirty = 0;
ed9b544e 1717 server.stat_numcommands = 0;
1718 server.stat_numconnections = 0;
2a6a2ed1 1719 server.stat_expiredkeys = 0;
ed9b544e 1720 server.stat_starttime = time(NULL);
3a66edc7 1721 server.unixtime = time(NULL);
d8f8b666 1722 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1723 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1724 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1725
1726 if (server.appendonly) {
3bb225d6 1727 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1728 if (server.appendfd == -1) {
1729 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1730 strerror(errno));
1731 exit(1);
1732 }
1733 }
75680a3c 1734
1735 if (server.vm_enabled) vmInit();
ed9b544e 1736}
1737
1738/* Empty the whole database */
ca37e9cd 1739static long long emptyDb() {
ed9b544e 1740 int j;
ca37e9cd 1741 long long removed = 0;
ed9b544e 1742
3305306f 1743 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1744 removed += dictSize(server.db[j].dict);
3305306f 1745 dictEmpty(server.db[j].dict);
1746 dictEmpty(server.db[j].expires);
1747 }
ca37e9cd 1748 return removed;
ed9b544e 1749}
1750
/* Convert a "yes" / "no" string (matched case insensitively) into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1756
ed9b544e 1757/* I agree, this is a very rudimental way to load a configuration...
1758 will improve later if the config gets more complex */
1759static void loadServerConfig(char *filename) {
c9a111ac 1760 FILE *fp;
ed9b544e 1761 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1762 int linenum = 0;
1763 sds line = NULL;
c9a111ac 1764
1765 if (filename[0] == '-' && filename[1] == '\0')
1766 fp = stdin;
1767 else {
1768 if ((fp = fopen(filename,"r")) == NULL) {
9a22de82 1769 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
c9a111ac 1770 exit(1);
1771 }
ed9b544e 1772 }
c9a111ac 1773
ed9b544e 1774 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1775 sds *argv;
1776 int argc, j;
1777
1778 linenum++;
1779 line = sdsnew(buf);
1780 line = sdstrim(line," \t\r\n");
1781
1782 /* Skip comments and blank lines*/
1783 if (line[0] == '#' || line[0] == '\0') {
1784 sdsfree(line);
1785 continue;
1786 }
1787
1788 /* Split into arguments */
1789 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1790 sdstolower(argv[0]);
1791
1792 /* Execute config directives */
bb0b03a3 1793 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1794 server.maxidletime = atoi(argv[1]);
0150db36 1795 if (server.maxidletime < 0) {
ed9b544e 1796 err = "Invalid timeout value"; goto loaderr;
1797 }
bb0b03a3 1798 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1799 server.port = atoi(argv[1]);
1800 if (server.port < 1 || server.port > 65535) {
1801 err = "Invalid port"; goto loaderr;
1802 }
bb0b03a3 1803 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1804 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1805 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1806 int seconds = atoi(argv[1]);
1807 int changes = atoi(argv[2]);
1808 if (seconds < 1 || changes < 0) {
1809 err = "Invalid save parameters"; goto loaderr;
1810 }
1811 appendServerSaveParams(seconds,changes);
bb0b03a3 1812 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1813 if (chdir(argv[1]) == -1) {
1814 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1815 argv[1], strerror(errno));
1816 exit(1);
1817 }
bb0b03a3 1818 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1819 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1820 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1821 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1822 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1823 else {
1824 err = "Invalid log level. Must be one of debug, notice, warning";
1825 goto loaderr;
1826 }
bb0b03a3 1827 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1828 FILE *logfp;
ed9b544e 1829
1830 server.logfile = zstrdup(argv[1]);
bb0b03a3 1831 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1832 zfree(server.logfile);
1833 server.logfile = NULL;
1834 }
1835 if (server.logfile) {
1836 /* Test if we are able to open the file. The server will not
1837 * be able to abort just for this problem later... */
c9a111ac 1838 logfp = fopen(server.logfile,"a");
1839 if (logfp == NULL) {
ed9b544e 1840 err = sdscatprintf(sdsempty(),
1841 "Can't open the log file: %s", strerror(errno));
1842 goto loaderr;
1843 }
c9a111ac 1844 fclose(logfp);
ed9b544e 1845 }
bb0b03a3 1846 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1847 server.dbnum = atoi(argv[1]);
1848 if (server.dbnum < 1) {
1849 err = "Invalid number of databases"; goto loaderr;
1850 }
b3f83f12
JZ
1851 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1852 loadServerConfig(argv[1]);
285add55 1853 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1854 server.maxclients = atoi(argv[1]);
3fd78bcd 1855 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
2b619329 1856 server.maxmemory = memtoll(argv[1],NULL);
bb0b03a3 1857 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1858 server.masterhost = sdsnew(argv[1]);
1859 server.masterport = atoi(argv[2]);
1860 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1861 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1862 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1863 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1864 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1865 err = "argument must be 'yes' or 'no'"; goto loaderr;
1866 }
121f70cf 1867 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1868 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
8ca3e9d1 1869 err = "argument must be 'yes' or 'no'"; goto loaderr;
1870 }
1871 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1872 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
121f70cf 1873 err = "argument must be 'yes' or 'no'"; goto loaderr;
1874 }
bb0b03a3 1875 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1876 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1877 err = "argument must be 'yes' or 'no'"; goto loaderr;
1878 }
44b38ef4 1879 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1880 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1881 err = "argument must be 'yes' or 'no'"; goto loaderr;
1882 }
48f0308a 1883 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1884 if (!strcasecmp(argv[1],"no")) {
48f0308a 1885 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1886 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1887 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1888 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1889 server.appendfsync = APPENDFSYNC_EVERYSEC;
1890 } else {
1891 err = "argument must be 'no', 'always' or 'everysec'";
1892 goto loaderr;
1893 }
bb0b03a3 1894 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1895 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1896 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 1897 zfree(server.pidfile);
054e426d 1898 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1899 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 1900 zfree(server.dbfilename);
054e426d 1901 server.dbfilename = zstrdup(argv[1]);
75680a3c 1902 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1903 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1904 err = "argument must be 'yes' or 'no'"; goto loaderr;
1905 }
054e426d 1906 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1907 zfree(server.vm_swap_file);
054e426d 1908 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1909 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2b619329 1910 server.vm_max_memory = memtoll(argv[1],NULL);
4ef8de8a 1911 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2b619329 1912 server.vm_page_size = memtoll(argv[1], NULL);
4ef8de8a 1913 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2b619329 1914 server.vm_pages = memtoll(argv[1], NULL);
92f8e882 1915 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1916 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 1917 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2b619329 1918 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
cbba7dd7 1919 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2b619329 1920 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
ed9b544e 1921 } else {
1922 err = "Bad directive or wrong number of arguments"; goto loaderr;
1923 }
1924 for (j = 0; j < argc; j++)
1925 sdsfree(argv[j]);
1926 zfree(argv);
1927 sdsfree(line);
1928 }
c9a111ac 1929 if (fp != stdin) fclose(fp);
ed9b544e 1930 return;
1931
1932loaderr:
1933 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1934 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1935 fprintf(stderr, ">>> '%s'\n", line);
1936 fprintf(stderr, "%s\n", err);
1937 exit(1);
1938}
1939
1940static void freeClientArgv(redisClient *c) {
1941 int j;
1942
1943 for (j = 0; j < c->argc; j++)
1944 decrRefCount(c->argv[j]);
e8a74421 1945 for (j = 0; j < c->mbargc; j++)
1946 decrRefCount(c->mbargv[j]);
ed9b544e 1947 c->argc = 0;
e8a74421 1948 c->mbargc = 0;
ed9b544e 1949}
1950
1951static void freeClient(redisClient *c) {
1952 listNode *ln;
1953
4409877e 1954 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 1955 * call, we have to set querybuf to NULL *before* to call
1956 * unblockClientWaitingData() to avoid processInputBuffer() will get
1957 * called. Also it is important to remove the file events after
1958 * this, because this call adds the READABLE event. */
4409877e 1959 sdsfree(c->querybuf);
1960 c->querybuf = NULL;
1961 if (c->flags & REDIS_BLOCKED)
b0d8747d 1962 unblockClientWaitingData(c);
4409877e 1963
ffc6b7f8 1964 /* Unsubscribe from all the pubsub channels */
1965 pubsubUnsubscribeAllChannels(c,0);
1966 pubsubUnsubscribeAllPatterns(c,0);
1967 dictRelease(c->pubsub_channels);
1968 listRelease(c->pubsub_patterns);
befec3cd 1969 /* Obvious cleanup */
ed9b544e 1970 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1971 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 1972 listRelease(c->reply);
1973 freeClientArgv(c);
1974 close(c->fd);
92f8e882 1975 /* Remove from the list of clients */
ed9b544e 1976 ln = listSearchKey(server.clients,c);
dfc5e96c 1977 redisAssert(ln != NULL);
ed9b544e 1978 listDelNode(server.clients,ln);
d5d55fc3 1979 /* Remove from the list of clients waiting for swapped keys */
1980 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1981 ln = listSearchKey(server.io_ready_clients,c);
1982 if (ln) {
1983 listDelNode(server.io_ready_clients,ln);
1984 server.vm_blocked_clients--;
1985 }
1986 }
1987 while (server.vm_enabled && listLength(c->io_keys)) {
1988 ln = listFirst(c->io_keys);
1989 dontWaitForSwappedKey(c,ln->value);
92f8e882 1990 }
b3e3d0d7 1991 listRelease(c->io_keys);
befec3cd 1992 /* Master/slave cleanup */
ed9b544e 1993 if (c->flags & REDIS_SLAVE) {
6208b3a7 1994 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1995 close(c->repldbfd);
87eca727 1996 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1997 ln = listSearchKey(l,c);
dfc5e96c 1998 redisAssert(ln != NULL);
87eca727 1999 listDelNode(l,ln);
ed9b544e 2000 }
2001 if (c->flags & REDIS_MASTER) {
2002 server.master = NULL;
2003 server.replstate = REDIS_REPL_CONNECT;
2004 }
befec3cd 2005 /* Release memory */
93ea3759 2006 zfree(c->argv);
e8a74421 2007 zfree(c->mbargv);
6e469882 2008 freeClientMultiState(c);
ed9b544e 2009 zfree(c);
2010}
2011
cc30e368 2012#define GLUEREPLY_UP_TO (1024)
ed9b544e 2013static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 2014 int copylen = 0;
2015 char buf[GLUEREPLY_UP_TO];
6208b3a7 2016 listNode *ln;
c7df85a4 2017 listIter li;
ed9b544e 2018 robj *o;
2019
c7df85a4 2020 listRewind(c->reply,&li);
2021 while((ln = listNext(&li))) {
c28b42ac 2022 int objlen;
2023
ed9b544e 2024 o = ln->value;
c28b42ac 2025 objlen = sdslen(o->ptr);
2026 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2027 memcpy(buf+copylen,o->ptr,objlen);
2028 copylen += objlen;
ed9b544e 2029 listDelNode(c->reply,ln);
c28b42ac 2030 } else {
2031 if (copylen == 0) return;
2032 break;
ed9b544e 2033 }
ed9b544e 2034 }
c28b42ac 2035 /* Now the output buffer is empty, add the new single element */
2036 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2037 listAddNodeHead(c->reply,o);
ed9b544e 2038}
2039
2040static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2041 redisClient *c = privdata;
2042 int nwritten = 0, totwritten = 0, objlen;
2043 robj *o;
2044 REDIS_NOTUSED(el);
2045 REDIS_NOTUSED(mask);
2046
2895e862 2047 /* Use writev() if we have enough buffers to send */
7ea870c0 2048 if (!server.glueoutputbuf &&
e0a62c7f 2049 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 2050 !(c->flags & REDIS_MASTER))
2895e862 2051 {
2052 sendReplyToClientWritev(el, fd, privdata, mask);
2053 return;
2054 }
2895e862 2055
ed9b544e 2056 while(listLength(c->reply)) {
c28b42ac 2057 if (server.glueoutputbuf && listLength(c->reply) > 1)
2058 glueReplyBuffersIfNeeded(c);
2059
ed9b544e 2060 o = listNodeValue(listFirst(c->reply));
2061 objlen = sdslen(o->ptr);
2062
2063 if (objlen == 0) {
2064 listDelNode(c->reply,listFirst(c->reply));
2065 continue;
2066 }
2067
2068 if (c->flags & REDIS_MASTER) {
6f376729 2069 /* Don't reply to a master */
ed9b544e 2070 nwritten = objlen - c->sentlen;
2071 } else {
a4d1ba9a 2072 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2073 if (nwritten <= 0) break;
2074 }
2075 c->sentlen += nwritten;
2076 totwritten += nwritten;
2077 /* If we fully sent the object on head go to the next one */
2078 if (c->sentlen == objlen) {
2079 listDelNode(c->reply,listFirst(c->reply));
2080 c->sentlen = 0;
2081 }
6f376729 2082 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2083 * bytes, in a single threaded server it's a good idea to serve
6f376729 2084 * other clients as well, even if a very large request comes from
2085 * super fast link that is always able to accept data (in real world
12f9d551 2086 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2087 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2088 }
2089 if (nwritten == -1) {
2090 if (errno == EAGAIN) {
2091 nwritten = 0;
2092 } else {
f870935d 2093 redisLog(REDIS_VERBOSE,
ed9b544e 2094 "Error writing to client: %s", strerror(errno));
2095 freeClient(c);
2096 return;
2097 }
2098 }
2099 if (totwritten > 0) c->lastinteraction = time(NULL);
2100 if (listLength(c->reply) == 0) {
2101 c->sentlen = 0;
2102 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2103 }
2104}
2105
2895e862 2106static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2107{
2108 redisClient *c = privdata;
2109 int nwritten = 0, totwritten = 0, objlen, willwrite;
2110 robj *o;
2111 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2112 int offset, ion = 0;
2113 REDIS_NOTUSED(el);
2114 REDIS_NOTUSED(mask);
2115
2116 listNode *node;
2117 while (listLength(c->reply)) {
2118 offset = c->sentlen;
2119 ion = 0;
2120 willwrite = 0;
2121
2122 /* fill-in the iov[] array */
2123 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2124 o = listNodeValue(node);
2125 objlen = sdslen(o->ptr);
2126
e0a62c7f 2127 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2128 break;
2129
2130 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2131 break; /* no more iovecs */
2132
2133 iov[ion].iov_base = ((char*)o->ptr) + offset;
2134 iov[ion].iov_len = objlen - offset;
2135 willwrite += objlen - offset;
2136 offset = 0; /* just for the first item */
2137 ion++;
2138 }
2139
2140 if(willwrite == 0)
2141 break;
2142
2143 /* write all collected blocks at once */
2144 if((nwritten = writev(fd, iov, ion)) < 0) {
2145 if (errno != EAGAIN) {
f870935d 2146 redisLog(REDIS_VERBOSE,
2895e862 2147 "Error writing to client: %s", strerror(errno));
2148 freeClient(c);
2149 return;
2150 }
2151 break;
2152 }
2153
2154 totwritten += nwritten;
2155 offset = c->sentlen;
2156
2157 /* remove written robjs from c->reply */
2158 while (nwritten && listLength(c->reply)) {
2159 o = listNodeValue(listFirst(c->reply));
2160 objlen = sdslen(o->ptr);
2161
2162 if(nwritten >= objlen - offset) {
2163 listDelNode(c->reply, listFirst(c->reply));
2164 nwritten -= objlen - offset;
2165 c->sentlen = 0;
2166 } else {
2167 /* partial write */
2168 c->sentlen += nwritten;
2169 break;
2170 }
2171 offset = 0;
2172 }
2173 }
2174
e0a62c7f 2175 if (totwritten > 0)
2895e862 2176 c->lastinteraction = time(NULL);
2177
2178 if (listLength(c->reply) == 0) {
2179 c->sentlen = 0;
2180 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2181 }
2182}
2183
ed9b544e 2184static struct redisCommand *lookupCommand(char *name) {
2185 int j = 0;
2186 while(cmdTable[j].name != NULL) {
bb0b03a3 2187 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 2188 j++;
2189 }
2190 return NULL;
2191}
2192
2193/* resetClient prepare the client to process the next command */
2194static void resetClient(redisClient *c) {
2195 freeClientArgv(c);
2196 c->bulklen = -1;
e8a74421 2197 c->multibulk = 0;
ed9b544e 2198}
2199
6e469882 2200/* Call() is the core of Redis execution of a command */
2201static void call(redisClient *c, struct redisCommand *cmd) {
2202 long long dirty;
2203
2204 dirty = server.dirty;
2205 cmd->proc(c);
4005fef1 2206 dirty = server.dirty-dirty;
2207
2208 if (server.appendonly && dirty)
6e469882 2209 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2210 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2211 listLength(server.slaves))
248ea310 2212 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2213 if (listLength(server.monitors))
dd142b9c 2214 replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2215 server.stat_numcommands++;
2216}
2217
ed9b544e 2218/* If this function gets called we already read a whole
2219 * command, arguments are in the client argv/argc fields.
2220 * processCommand() executes the command or prepares the
2221 * server for a bulk read from the client.
2222 *
2223 * If 1 is returned the client is still alive and valid and
2224 * other operations can be performed by the caller. Otherwise
2225 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2226static int processCommand(redisClient *c) {
2227 struct redisCommand *cmd;
ed9b544e 2228
3fd78bcd 2229 /* Free some memory if needed (maxmemory setting) */
2230 if (server.maxmemory) freeMemoryIfNeeded();
2231
e8a74421 2232 /* Handle the multi bulk command type. This is an alternative protocol
2233 * supported by Redis in order to receive commands that are composed of
2234 * multiple binary-safe "bulk" arguments. The latency of processing is
2235 * a bit higher but this allows things like multi-sets, so if this
2236 * protocol is used only for MSET and similar commands this is a big win. */
2237 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2238 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2239 if (c->multibulk <= 0) {
2240 resetClient(c);
2241 return 1;
2242 } else {
2243 decrRefCount(c->argv[c->argc-1]);
2244 c->argc--;
2245 return 1;
2246 }
2247 } else if (c->multibulk) {
2248 if (c->bulklen == -1) {
2249 if (((char*)c->argv[0]->ptr)[0] != '$') {
2250 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2251 resetClient(c);
2252 return 1;
2253 } else {
2254 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2255 decrRefCount(c->argv[0]);
2256 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2257 c->argc--;
2258 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2259 resetClient(c);
2260 return 1;
2261 }
2262 c->argc--;
2263 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2264 return 1;
2265 }
2266 } else {
2267 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2268 c->mbargv[c->mbargc] = c->argv[0];
2269 c->mbargc++;
2270 c->argc--;
2271 c->multibulk--;
2272 if (c->multibulk == 0) {
2273 robj **auxargv;
2274 int auxargc;
2275
2276 /* Here we need to swap the multi-bulk argc/argv with the
2277 * normal argc/argv of the client structure. */
2278 auxargv = c->argv;
2279 c->argv = c->mbargv;
2280 c->mbargv = auxargv;
2281
2282 auxargc = c->argc;
2283 c->argc = c->mbargc;
2284 c->mbargc = auxargc;
2285
2286 /* We need to set bulklen to something different than -1
2287 * in order for the code below to process the command without
2288 * to try to read the last argument of a bulk command as
2289 * a special argument. */
2290 c->bulklen = 0;
2291 /* continue below and process the command */
2292 } else {
2293 c->bulklen = -1;
2294 return 1;
2295 }
2296 }
2297 }
2298 /* -- end of multi bulk commands processing -- */
2299
ed9b544e 2300 /* The QUIT command is handled as a special case. Normal command
2301 * procs are unable to close the client connection safely */
bb0b03a3 2302 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2303 freeClient(c);
2304 return 0;
2305 }
d5d55fc3 2306
2307 /* Now lookup the command and check ASAP about trivial error conditions
2308 * such wrong arity, bad command name and so forth. */
ed9b544e 2309 cmd = lookupCommand(c->argv[0]->ptr);
2310 if (!cmd) {
2c14807b 2311 addReplySds(c,
2312 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2313 (char*)c->argv[0]->ptr));
ed9b544e 2314 resetClient(c);
2315 return 1;
2316 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2317 (c->argc < -cmd->arity)) {
454d4e43 2318 addReplySds(c,
2319 sdscatprintf(sdsempty(),
2320 "-ERR wrong number of arguments for '%s' command\r\n",
2321 cmd->name));
ed9b544e 2322 resetClient(c);
2323 return 1;
2324 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2325 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2326 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2327
2328 decrRefCount(c->argv[c->argc-1]);
2329 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2330 c->argc--;
2331 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2332 resetClient(c);
2333 return 1;
2334 }
2335 c->argc--;
2336 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2337 /* It is possible that the bulk read is already in the
8d0490e7 2338 * buffer. Check this condition and handle it accordingly.
2339 * This is just a fast path, alternative to call processInputBuffer().
2340 * It's a good idea since the code is small and this condition
2341 * happens most of the times. */
ed9b544e 2342 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2343 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2344 c->argc++;
2345 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2346 } else {
d5d55fc3 2347 /* Otherwise return... there is to read the last argument
2348 * from the socket. */
ed9b544e 2349 return 1;
2350 }
2351 }
942a3961 2352 /* Let's try to encode the bulk object to save space. */
2353 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2354 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2355
e63943a4 2356 /* Check if the user is authenticated */
2357 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2358 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2359 resetClient(c);
2360 return 1;
2361 }
2362
b61a28fe 2363 /* Handle the maxmemory directive */
2364 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2365 zmalloc_used_memory() > server.maxmemory)
2366 {
2367 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2368 resetClient(c);
2369 return 1;
2370 }
2371
d6cc8867 2372 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
e6cca5db 2373 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2374 &&
ffc6b7f8 2375 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2376 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2377 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2378 resetClient(c);
2379 return 1;
2380 }
2381
ed9b544e 2382 /* Exec the command */
18b6cb76 2383 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
6e469882 2384 queueMultiCommand(c,cmd);
2385 addReply(c,shared.queued);
2386 } else {
d5d55fc3 2387 if (server.vm_enabled && server.vm_max_threads > 0 &&
2388 blockClientOnSwappedKeys(cmd,c)) return 1;
6e469882 2389 call(c,cmd);
2390 }
ed9b544e 2391
2392 /* Prepare the client for the next command */
ed9b544e 2393 resetClient(c);
2394 return 1;
2395}
2396
248ea310 2397static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2398 listNode *ln;
c7df85a4 2399 listIter li;
ed9b544e 2400 int outc = 0, j;
93ea3759 2401 robj **outv;
248ea310 2402 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2403 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2404 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2405 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2406 robj *lenobj;
93ea3759 2407
2408 if (argc <= REDIS_STATIC_ARGS) {
2409 outv = static_outv;
2410 } else {
248ea310 2411 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2412 }
248ea310 2413
2414 lenobj = createObject(REDIS_STRING,
2415 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2416 lenobj->refcount = 0;
2417 outv[outc++] = lenobj;
ed9b544e 2418 for (j = 0; j < argc; j++) {
248ea310 2419 lenobj = createObject(REDIS_STRING,
2420 sdscatprintf(sdsempty(),"$%lu\r\n",
2421 (unsigned long) stringObjectLen(argv[j])));
2422 lenobj->refcount = 0;
2423 outv[outc++] = lenobj;
ed9b544e 2424 outv[outc++] = argv[j];
248ea310 2425 outv[outc++] = shared.crlf;
ed9b544e 2426 }
ed9b544e 2427
40d224a9 2428 /* Increment all the refcounts at start and decrement at end in order to
2429 * be sure to free objects if there is no slave in a replication state
2430 * able to be feed with commands */
2431 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2432 listRewind(slaves,&li);
2433 while((ln = listNext(&li))) {
ed9b544e 2434 redisClient *slave = ln->value;
40d224a9 2435
2436 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2437 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2438
2439 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2440 if (slave->slaveseldb != dictid) {
2441 robj *selectcmd;
2442
2443 switch(dictid) {
2444 case 0: selectcmd = shared.select0; break;
2445 case 1: selectcmd = shared.select1; break;
2446 case 2: selectcmd = shared.select2; break;
2447 case 3: selectcmd = shared.select3; break;
2448 case 4: selectcmd = shared.select4; break;
2449 case 5: selectcmd = shared.select5; break;
2450 case 6: selectcmd = shared.select6; break;
2451 case 7: selectcmd = shared.select7; break;
2452 case 8: selectcmd = shared.select8; break;
2453 case 9: selectcmd = shared.select9; break;
2454 default:
2455 selectcmd = createObject(REDIS_STRING,
2456 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2457 selectcmd->refcount = 0;
2458 break;
2459 }
2460 addReply(slave,selectcmd);
2461 slave->slaveseldb = dictid;
2462 }
2463 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2464 }
40d224a9 2465 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2466 if (outv != static_outv) zfree(outv);
ed9b544e 2467}
2468
dd142b9c 2469static sds sdscatrepr(sds s, char *p, size_t len) {
2470 s = sdscatlen(s,"\"",1);
2471 while(len--) {
2472 switch(*p) {
2473 case '\\':
2474 case '"':
2475 s = sdscatprintf(s,"\\%c",*p);
2476 break;
2477 case '\n': s = sdscatlen(s,"\\n",1); break;
2478 case '\r': s = sdscatlen(s,"\\r",1); break;
2479 case '\t': s = sdscatlen(s,"\\t",1); break;
2480 case '\a': s = sdscatlen(s,"\\a",1); break;
2481 case '\b': s = sdscatlen(s,"\\b",1); break;
2482 default:
2483 if (isprint(*p))
2484 s = sdscatprintf(s,"%c",*p);
2485 else
2486 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
2487 break;
2488 }
2489 p++;
2490 }
2491 return sdscatlen(s,"\"",1);
2492}
2493
2494static void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
2495 listNode *ln;
2496 listIter li;
2497 int j;
2498 sds cmdrepr = sdsnew("+");
2499 robj *cmdobj;
2500 struct timeval tv;
2501
2502 gettimeofday(&tv,NULL);
2503 cmdrepr = sdscatprintf(cmdrepr,"%ld.%ld ",(long)tv.tv_sec,(long)tv.tv_usec);
2504 if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
2505
2506 for (j = 0; j < argc; j++) {
2507 if (argv[j]->encoding == REDIS_ENCODING_INT) {
2508 cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
2509 } else {
2510 cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
2511 sdslen(argv[j]->ptr));
2512 }
2513 if (j != argc-1)
2514 cmdrepr = sdscatlen(cmdrepr," ",1);
2515 }
2516 cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
2517 cmdobj = createObject(REDIS_STRING,cmdrepr);
2518
2519 listRewind(monitors,&li);
2520 while((ln = listNext(&li))) {
2521 redisClient *monitor = ln->value;
2522 addReply(monitor,cmdobj);
2523 }
2524 decrRefCount(cmdobj);
2525}
2526
638e42ac 2527static void processInputBuffer(redisClient *c) {
ed9b544e 2528again:
4409877e 2529 /* Before to process the input buffer, make sure the client is not
2530 * waitig for a blocking operation such as BLPOP. Note that the first
2531 * iteration the client is never blocked, otherwise the processInputBuffer
2532 * would not be called at all, but after the execution of the first commands
2533 * in the input buffer the client may be blocked, and the "goto again"
2534 * will try to reiterate. The following line will make it return asap. */
92f8e882 2535 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2536 if (c->bulklen == -1) {
2537 /* Read the first line of the query */
2538 char *p = strchr(c->querybuf,'\n');
2539 size_t querylen;
644fafa3 2540
ed9b544e 2541 if (p) {
2542 sds query, *argv;
2543 int argc, j;
e0a62c7f 2544
ed9b544e 2545 query = c->querybuf;
2546 c->querybuf = sdsempty();
2547 querylen = 1+(p-(query));
2548 if (sdslen(query) > querylen) {
2549 /* leave data after the first line of the query in the buffer */
2550 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2551 }
2552 *p = '\0'; /* remove "\n" */
2553 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2554 sdsupdatelen(query);
2555
2556 /* Now we can split the query in arguments */
ed9b544e 2557 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2558 sdsfree(query);
2559
2560 if (c->argv) zfree(c->argv);
2561 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2562
2563 for (j = 0; j < argc; j++) {
ed9b544e 2564 if (sdslen(argv[j])) {
2565 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2566 c->argc++;
2567 } else {
2568 sdsfree(argv[j]);
2569 }
2570 }
2571 zfree(argv);
7c49733c 2572 if (c->argc) {
2573 /* Execute the command. If the client is still valid
2574 * after processCommand() return and there is something
2575 * on the query buffer try to process the next command. */
2576 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2577 } else {
2578 /* Nothing to process, argc == 0. Just process the query
2579 * buffer if it's not empty or return to the caller */
2580 if (sdslen(c->querybuf)) goto again;
2581 }
ed9b544e 2582 return;
644fafa3 2583 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2584 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2585 freeClient(c);
2586 return;
2587 }
2588 } else {
2589 /* Bulk read handling. Note that if we are at this point
2590 the client already sent a command terminated with a newline,
2591 we are reading the bulk data that is actually the last
2592 argument of the command. */
2593 int qbl = sdslen(c->querybuf);
2594
2595 if (c->bulklen <= qbl) {
2596 /* Copy everything but the final CRLF as final argument */
2597 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2598 c->argc++;
2599 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2600 /* Process the command. If the client is still valid after
2601 * the processing and there is more data in the buffer
2602 * try to parse it. */
2603 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2604 return;
2605 }
2606 }
2607}
2608
638e42ac 2609static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2610 redisClient *c = (redisClient*) privdata;
2611 char buf[REDIS_IOBUF_LEN];
2612 int nread;
2613 REDIS_NOTUSED(el);
2614 REDIS_NOTUSED(mask);
2615
2616 nread = read(fd, buf, REDIS_IOBUF_LEN);
2617 if (nread == -1) {
2618 if (errno == EAGAIN) {
2619 nread = 0;
2620 } else {
f870935d 2621 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2622 freeClient(c);
2623 return;
2624 }
2625 } else if (nread == 0) {
f870935d 2626 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2627 freeClient(c);
2628 return;
2629 }
2630 if (nread) {
2631 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2632 c->lastinteraction = time(NULL);
2633 } else {
2634 return;
2635 }
168ac5c6 2636 processInputBuffer(c);
638e42ac 2637}
2638
ed9b544e 2639static int selectDb(redisClient *c, int id) {
2640 if (id < 0 || id >= server.dbnum)
2641 return REDIS_ERR;
3305306f 2642 c->db = &server.db[id];
ed9b544e 2643 return REDIS_OK;
2644}
2645
40d224a9 2646static void *dupClientReplyValue(void *o) {
2647 incrRefCount((robj*)o);
12d090d2 2648 return o;
40d224a9 2649}
2650
/* List match method: returns 1 if the two string objects compare as equal
 * according to compareStringObjects(), 0 otherwise. */
static int listMatchObjects(void *a, void *b) {
    int cmp = compareStringObjects(a,b);

    return cmp == 0;
}
2654
ed9b544e 2655static redisClient *createClient(int fd) {
2656 redisClient *c = zmalloc(sizeof(*c));
2657
2658 anetNonBlock(NULL,fd);
2659 anetTcpNoDelay(NULL,fd);
2660 if (!c) return NULL;
2661 selectDb(c,0);
2662 c->fd = fd;
2663 c->querybuf = sdsempty();
2664 c->argc = 0;
93ea3759 2665 c->argv = NULL;
ed9b544e 2666 c->bulklen = -1;
e8a74421 2667 c->multibulk = 0;
2668 c->mbargc = 0;
2669 c->mbargv = NULL;
ed9b544e 2670 c->sentlen = 0;
2671 c->flags = 0;
2672 c->lastinteraction = time(NULL);
abcb223e 2673 c->authenticated = 0;
40d224a9 2674 c->replstate = REDIS_REPL_NONE;
6b47e12e 2675 c->reply = listCreate();
ed9b544e 2676 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2677 listSetDupMethod(c->reply,dupClientReplyValue);
92f8e882 2678 c->blockingkeys = NULL;
2679 c->blockingkeysnum = 0;
2680 c->io_keys = listCreate();
2681 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2682 c->pubsub_channels = dictCreate(&setDictType,NULL);
2683 c->pubsub_patterns = listCreate();
2684 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2685 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2686 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2687 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2688 freeClient(c);
2689 return NULL;
2690 }
6b47e12e 2691 listAddNodeTail(server.clients,c);
6e469882 2692 initClientMultiState(c);
ed9b544e 2693 return c;
2694}
2695
2696static void addReply(redisClient *c, robj *obj) {
2697 if (listLength(c->reply) == 0 &&
6208b3a7 2698 (c->replstate == REDIS_REPL_NONE ||
2699 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2700 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2701 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2702
2703 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2704 obj = dupStringObject(obj);
2705 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2706 }
9d65a1bb 2707 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2708}
2709
2710static void addReplySds(redisClient *c, sds s) {
2711 robj *o = createObject(REDIS_STRING,s);
2712 addReply(c,o);
2713 decrRefCount(o);
2714}
2715
e2665397 2716static void addReplyDouble(redisClient *c, double d) {
2717 char buf[128];
2718
2719 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2720 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2721 (unsigned long) strlen(buf),buf));
e2665397 2722}
2723
f44dd428 2724static void addReplyLong(redisClient *c, long l) {
2725 char buf[128];
2726 size_t len;
2727
dd88747b 2728 if (l == 0) {
2729 addReply(c,shared.czero);
2730 return;
2731 } else if (l == 1) {
2732 addReply(c,shared.cone);
2733 return;
2734 }
f44dd428 2735 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2736 addReplySds(c,sdsnewlen(buf,len));
2737}
2738
aa7c2934
PN
2739static void addReplyLongLong(redisClient *c, long long ll) {
2740 char buf[128];
2741 size_t len;
2742
2743 if (ll == 0) {
2744 addReply(c,shared.czero);
2745 return;
2746 } else if (ll == 1) {
2747 addReply(c,shared.cone);
2748 return;
2749 }
2750 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2751 addReplySds(c,sdsnewlen(buf,len));
2752}
2753
92b27fe9 2754static void addReplyUlong(redisClient *c, unsigned long ul) {
2755 char buf[128];
2756 size_t len;
2757
dd88747b 2758 if (ul == 0) {
2759 addReply(c,shared.czero);
2760 return;
2761 } else if (ul == 1) {
2762 addReply(c,shared.cone);
2763 return;
2764 }
92b27fe9 2765 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2766 addReplySds(c,sdsnewlen(buf,len));
2767}
2768
942a3961 2769static void addReplyBulkLen(redisClient *c, robj *obj) {
2770 size_t len;
2771
2772 if (obj->encoding == REDIS_ENCODING_RAW) {
2773 len = sdslen(obj->ptr);
2774 } else {
2775 long n = (long)obj->ptr;
2776
e054afda 2777 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2778 len = 1;
2779 if (n < 0) {
2780 len++;
2781 n = -n;
2782 }
2783 while((n = n/10) != 0) {
2784 len++;
2785 }
2786 }
83c6a618 2787 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
942a3961 2788}
2789
dd88747b 2790static void addReplyBulk(redisClient *c, robj *obj) {
2791 addReplyBulkLen(c,obj);
2792 addReply(c,obj);
2793 addReply(c,shared.crlf);
2794}
2795
500ece7c 2796/* In the CONFIG command we need to add vanilla C string as bulk replies */
2797static void addReplyBulkCString(redisClient *c, char *s) {
2798 if (s == NULL) {
2799 addReply(c,shared.nullbulk);
2800 } else {
2801 robj *o = createStringObject(s,strlen(s));
2802 addReplyBulk(c,o);
2803 decrRefCount(o);
2804 }
2805}
2806
ed9b544e 2807static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2808 int cport, cfd;
2809 char cip[128];
285add55 2810 redisClient *c;
ed9b544e 2811 REDIS_NOTUSED(el);
2812 REDIS_NOTUSED(mask);
2813 REDIS_NOTUSED(privdata);
2814
2815 cfd = anetAccept(server.neterr, fd, cip, &cport);
2816 if (cfd == AE_ERR) {
f870935d 2817 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2818 return;
2819 }
f870935d 2820 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2821 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2822 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2823 close(cfd); /* May be already closed, just ingore errors */
2824 return;
2825 }
285add55 2826 /* If maxclient directive is set and this is one client more... close the
2827 * connection. Note that we create the client instead to check before
2828 * for this condition, since now the socket is already set in nonblocking
2829 * mode and we can send an error for free using the Kernel I/O */
2830 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2831 char *err = "-ERR max number of clients reached\r\n";
2832
2833 /* That's a best effort error message, don't check write errors */
fee803ba 2834 if (write(c->fd,err,strlen(err)) == -1) {
2835 /* Nothing to do, Just to avoid the warning... */
2836 }
285add55 2837 freeClient(c);
2838 return;
2839 }
ed9b544e 2840 server.stat_numconnections++;
2841}
2842
2843/* ======================= Redis objects implementation ===================== */
2844
2845static robj *createObject(int type, void *ptr) {
2846 robj *o;
2847
a5819310 2848 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2849 if (listLength(server.objfreelist)) {
2850 listNode *head = listFirst(server.objfreelist);
2851 o = listNodeValue(head);
2852 listDelNode(server.objfreelist,head);
a5819310 2853 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2854 } else {
75680a3c 2855 if (server.vm_enabled) {
a5819310 2856 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2857 o = zmalloc(sizeof(*o));
2858 } else {
2859 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2860 }
ed9b544e 2861 }
ed9b544e 2862 o->type = type;
942a3961 2863 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2864 o->ptr = ptr;
2865 o->refcount = 1;
3a66edc7 2866 if (server.vm_enabled) {
1064ef87 2867 /* Note that this code may run in the context of an I/O thread
2868 * and accessing to server.unixtime in theory is an error
2869 * (no locks). But in practice this is safe, and even if we read
2870 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2871 o->vm.atime = server.unixtime;
2872 o->storage = REDIS_VM_MEMORY;
2873 }
ed9b544e 2874 return o;
2875}
2876
2877static robj *createStringObject(char *ptr, size_t len) {
2878 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2879}
2880
3f973463
PN
2881static robj *createStringObjectFromLongLong(long long value) {
2882 robj *o;
2883 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2884 incrRefCount(shared.integers[value]);
2885 o = shared.integers[value];
2886 } else {
2887 o = createObject(REDIS_STRING, NULL);
2888 if (value >= LONG_MIN && value <= LONG_MAX) {
2889 o->encoding = REDIS_ENCODING_INT;
2890 o->ptr = (void*)((long)value);
2891 } else {
2892 o->ptr = sdscatprintf(sdsempty(),"%lld",value);
2893 }
2894 }
2895 return o;
2896}
2897
4ef8de8a 2898static robj *dupStringObject(robj *o) {
b9bc0eef 2899 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2900 return createStringObject(o->ptr,sdslen(o->ptr));
2901}
2902
ed9b544e 2903static robj *createListObject(void) {
2904 list *l = listCreate();
2905
ed9b544e 2906 listSetFreeMethod(l,decrRefCount);
2907 return createObject(REDIS_LIST,l);
2908}
2909
2910static robj *createSetObject(void) {
2911 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2912 return createObject(REDIS_SET,d);
2913}
2914
5234952b 2915static robj *createHashObject(void) {
2916 /* All the Hashes start as zipmaps. Will be automatically converted
2917 * into hash tables if there are enough elements or big elements
2918 * inside. */
2919 unsigned char *zm = zipmapNew();
2920 robj *o = createObject(REDIS_HASH,zm);
2921 o->encoding = REDIS_ENCODING_ZIPMAP;
2922 return o;
2923}
2924
1812e024 2925static robj *createZsetObject(void) {
6b47e12e 2926 zset *zs = zmalloc(sizeof(*zs));
2927
2928 zs->dict = dictCreate(&zsetDictType,NULL);
2929 zs->zsl = zslCreate();
2930 return createObject(REDIS_ZSET,zs);
1812e024 2931}
2932
ed9b544e 2933static void freeStringObject(robj *o) {
942a3961 2934 if (o->encoding == REDIS_ENCODING_RAW) {
2935 sdsfree(o->ptr);
2936 }
ed9b544e 2937}
2938
2939static void freeListObject(robj *o) {
2940 listRelease((list*) o->ptr);
2941}
2942
2943static void freeSetObject(robj *o) {
2944 dictRelease((dict*) o->ptr);
2945}
2946
fd8ccf44 2947static void freeZsetObject(robj *o) {
2948 zset *zs = o->ptr;
2949
2950 dictRelease(zs->dict);
2951 zslFree(zs->zsl);
2952 zfree(zs);
2953}
2954
ed9b544e 2955static void freeHashObject(robj *o) {
cbba7dd7 2956 switch (o->encoding) {
2957 case REDIS_ENCODING_HT:
2958 dictRelease((dict*) o->ptr);
2959 break;
2960 case REDIS_ENCODING_ZIPMAP:
2961 zfree(o->ptr);
2962 break;
2963 default:
f83c6cb5 2964 redisPanic("Unknown hash encoding type");
cbba7dd7 2965 break;
2966 }
ed9b544e 2967}
2968
2969static void incrRefCount(robj *o) {
2970 o->refcount++;
2971}
2972
2973static void decrRefCount(void *obj) {
2974 robj *o = obj;
94754ccc 2975
c651fd9e 2976 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
970e10bb 2977 /* Object is a key of a swapped out value, or in the process of being
2978 * loaded. */
996cb5f7 2979 if (server.vm_enabled &&
2980 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2981 {
996cb5f7 2982 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 2983 redisAssert(o->type == REDIS_STRING);
a35ddf12 2984 freeStringObject(o);
2985 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 2986 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 2987 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2988 !listAddNodeHead(server.objfreelist,o))
2989 zfree(o);
a5819310 2990 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 2991 server.vm_stats_swapped_objects--;
a35ddf12 2992 return;
2993 }
996cb5f7 2994 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 2995 if (--(o->refcount) == 0) {
996cb5f7 2996 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2997 vmCancelThreadedIOJob(obj);
ed9b544e 2998 switch(o->type) {
2999 case REDIS_STRING: freeStringObject(o); break;
3000 case REDIS_LIST: freeListObject(o); break;
3001 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 3002 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 3003 case REDIS_HASH: freeHashObject(o); break;
f83c6cb5 3004 default: redisPanic("Unknown object type"); break;
ed9b544e 3005 }
a5819310 3006 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 3007 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
3008 !listAddNodeHead(server.objfreelist,o))
3009 zfree(o);
a5819310 3010 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 3011 }
3012}
3013
942a3961 3014static robj *lookupKey(redisDb *db, robj *key) {
3015 dictEntry *de = dictFind(db->dict,key);
3a66edc7 3016 if (de) {
55cf8433 3017 robj *key = dictGetEntryKey(de);
3018 robj *val = dictGetEntryVal(de);
3a66edc7 3019
55cf8433 3020 if (server.vm_enabled) {
996cb5f7 3021 if (key->storage == REDIS_VM_MEMORY ||
3022 key->storage == REDIS_VM_SWAPPING)
3023 {
3024 /* If we were swapping the object out, stop it, this key
3025 * was requested. */
3026 if (key->storage == REDIS_VM_SWAPPING)
3027 vmCancelThreadedIOJob(key);
55cf8433 3028 /* Update the access time of the key for the aging algorithm. */
3029 key->vm.atime = server.unixtime;
3030 } else {
d5d55fc3 3031 int notify = (key->storage == REDIS_VM_LOADING);
3032
55cf8433 3033 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 3034 redisAssert(val == NULL);
55cf8433 3035 val = vmLoadObject(key);
3036 dictGetEntryVal(de) = val;
d5d55fc3 3037
3038 /* Clients blocked by the VM subsystem may be waiting for
3039 * this key... */
3040 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 3041 }
3042 }
3043 return val;
3a66edc7 3044 } else {
3045 return NULL;
3046 }
942a3961 3047}
3048
3049static robj *lookupKeyRead(redisDb *db, robj *key) {
3050 expireIfNeeded(db,key);
3051 return lookupKey(db,key);
3052}
3053
3054static robj *lookupKeyWrite(redisDb *db, robj *key) {
3055 deleteIfVolatile(db,key);
3056 return lookupKey(db,key);
3057}
3058
92b27fe9 3059static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3060 robj *o = lookupKeyRead(c->db, key);
3061 if (!o) addReply(c,reply);
3062 return o;
3063}
3064
3065static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3066 robj *o = lookupKeyWrite(c->db, key);
3067 if (!o) addReply(c,reply);
3068 return o;
3069}
3070
3071static int checkType(redisClient *c, robj *o, int type) {
3072 if (o->type != type) {
3073 addReply(c,shared.wrongtypeerr);
3074 return 1;
3075 }
3076 return 0;
3077}
3078
942a3961 3079static int deleteKey(redisDb *db, robj *key) {
3080 int retval;
3081
3082 /* We need to protect key from destruction: after the first dictDelete()
3083 * it may happen that 'key' is no longer valid if we don't increment
3084 * it's count. This may happen when we get the object reference directly
3085 * from the hash table with dictRandomKey() or dict iterators */
3086 incrRefCount(key);
3087 if (dictSize(db->expires)) dictDelete(db->expires,key);
3088 retval = dictDelete(db->dict,key);
3089 decrRefCount(key);
3090
3091 return retval == DICT_OK;
3092}
3093
724a51b1 3094/* Check if the nul-terminated string 's' can be represented by a long
3095 * (that is, is a number that fits into long without any other space or
3096 * character before or after the digits).
3097 *
3098 * If so, the function returns REDIS_OK and *longval is set to the value
3099 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 3100static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 3101 char buf[32], *endptr;
3102 long value;
3103 int slen;
e0a62c7f 3104
724a51b1 3105 value = strtol(s, &endptr, 10);
3106 if (endptr[0] != '\0') return REDIS_ERR;
3107 slen = snprintf(buf,32,"%ld",value);
3108
3109 /* If the number converted back into a string is not identical
3110 * then it's not possible to encode the string as integer */
f69f2cba 3111 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 3112 if (longval) *longval = value;
3113 return REDIS_OK;
3114}
3115
942a3961 3116/* Try to encode a string object in order to save space */
05df7621 3117static robj *tryObjectEncoding(robj *o) {
942a3961 3118 long value;
942a3961 3119 sds s = o->ptr;
3305306f 3120
942a3961 3121 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 3122 return o; /* Already encoded */
3305306f 3123
05df7621 3124 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 3125 * everywhere in the "object space" of Redis. Encoded objects can only
3126 * appear as "values" (and not, for instance, as keys) */
05df7621 3127 if (o->refcount > 1) return o;
3305306f 3128
942a3961 3129 /* Currently we try to encode only strings */
dfc5e96c 3130 redisAssert(o->type == REDIS_STRING);
94754ccc 3131
724a51b1 3132 /* Check if we can represent this string as a long integer */
05df7621 3133 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 3134
3135 /* Ok, this object can be encoded */
05df7621 3136 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3137 decrRefCount(o);
3138 incrRefCount(shared.integers[value]);
3139 return shared.integers[value];
3140 } else {
3141 o->encoding = REDIS_ENCODING_INT;
3142 sdsfree(o->ptr);
3143 o->ptr = (void*) value;
3144 return o;
3145 }
942a3961 3146}
3147
9d65a1bb 3148/* Get a decoded version of an encoded object (returned as a new object).
3149 * If the object is already raw-encoded just increment the ref count. */
3150static robj *getDecodedObject(robj *o) {
942a3961 3151 robj *dec;
e0a62c7f 3152
9d65a1bb 3153 if (o->encoding == REDIS_ENCODING_RAW) {
3154 incrRefCount(o);
3155 return o;
3156 }
942a3961 3157 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3158 char buf[32];
3159
3160 snprintf(buf,32,"%ld",(long)o->ptr);
3161 dec = createStringObject(buf,strlen(buf));
3162 return dec;
3163 } else {
08ee9b57 3164 redisPanic("Unknown encoding type");
942a3961 3165 }
3305306f 3166}
3167
d7f43c08 3168/* Compare two string objects via strcmp() or alike.
3169 * Note that the objects may be integer-encoded. In such a case we
3170 * use snprintf() to get a string representation of the numbers on the stack
1fd9bc8a 3171 * and compare the strings, it's much faster than calling getDecodedObject().
3172 *
3173 * Important note: if objects are not integer encoded, but binary-safe strings,
3174 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3175 * binary safe. */
724a51b1 3176static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3177 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3178 char bufa[128], bufb[128], *astr, *bstr;
3179 int bothsds = 1;
724a51b1 3180
e197b441 3181 if (a == b) return 0;
d7f43c08 3182 if (a->encoding != REDIS_ENCODING_RAW) {
3183 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3184 astr = bufa;
3185 bothsds = 0;
724a51b1 3186 } else {
d7f43c08 3187 astr = a->ptr;
724a51b1 3188 }
d7f43c08 3189 if (b->encoding != REDIS_ENCODING_RAW) {
3190 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3191 bstr = bufb;
3192 bothsds = 0;
3193 } else {
3194 bstr = b->ptr;
3195 }
3196 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3197}
3198
0ea663ea 3199static size_t stringObjectLen(robj *o) {
dfc5e96c 3200 redisAssert(o->type == REDIS_STRING);
0ea663ea 3201 if (o->encoding == REDIS_ENCODING_RAW) {
3202 return sdslen(o->ptr);
3203 } else {
3204 char buf[32];
3205
3206 return snprintf(buf,32,"%ld",(long)o->ptr);
3207 }
3208}
3209
bd79a6bd
PN
3210static int getDoubleFromObject(robj *o, double *target) {
3211 double value;
682c73e8 3212 char *eptr;
bbe025e0 3213
bd79a6bd
PN
3214 if (o == NULL) {
3215 value = 0;
3216 } else {
3217 redisAssert(o->type == REDIS_STRING);
3218 if (o->encoding == REDIS_ENCODING_RAW) {
3219 value = strtod(o->ptr, &eptr);
682c73e8 3220 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3221 } else if (o->encoding == REDIS_ENCODING_INT) {
3222 value = (long)o->ptr;
3223 } else {
946342c1 3224 redisPanic("Unknown string encoding");
bd79a6bd
PN
3225 }
3226 }
3227
bd79a6bd
PN
3228 *target = value;
3229 return REDIS_OK;
3230}
bbe025e0 3231
bd79a6bd
PN
3232static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3233 double value;
3234 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3235 if (msg != NULL) {
3236 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3237 } else {
3238 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3239 }
bbe025e0
AM
3240 return REDIS_ERR;
3241 }
3242
bd79a6bd 3243 *target = value;
bbe025e0
AM
3244 return REDIS_OK;
3245}
3246
bd79a6bd
PN
3247static int getLongLongFromObject(robj *o, long long *target) {
3248 long long value;
682c73e8 3249 char *eptr;
bbe025e0 3250
bd79a6bd
PN
3251 if (o == NULL) {
3252 value = 0;
3253 } else {
3254 redisAssert(o->type == REDIS_STRING);
3255 if (o->encoding == REDIS_ENCODING_RAW) {
3256 value = strtoll(o->ptr, &eptr, 10);
682c73e8 3257 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3258 } else if (o->encoding == REDIS_ENCODING_INT) {
3259 value = (long)o->ptr;
3260 } else {
946342c1 3261 redisPanic("Unknown string encoding");
bd79a6bd
PN
3262 }
3263 }
3264
bd79a6bd
PN
3265 *target = value;
3266 return REDIS_OK;
3267}
bbe025e0 3268
bd79a6bd
PN
3269static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3270 long long value;
3271 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3272 if (msg != NULL) {
3273 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3274 } else {
3275 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3276 }
bbe025e0
AM
3277 return REDIS_ERR;
3278 }
3279
bd79a6bd 3280 *target = value;
bbe025e0
AM
3281 return REDIS_OK;
3282}
3283
bd79a6bd
PN
3284static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3285 long long value;
bbe025e0 3286
bd79a6bd
PN
3287 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3288 if (value < LONG_MIN || value > LONG_MAX) {
3289 if (msg != NULL) {
3290 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3291 } else {
3292 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3293 }
bbe025e0
AM
3294 return REDIS_ERR;
3295 }
3296
bd79a6bd 3297 *target = value;
bbe025e0
AM
3298 return REDIS_OK;
3299}
3300
06233c45 3301/*============================ RDB saving/loading =========================== */
ed9b544e 3302
/* Write a single type/opcode byte to fp. Returns 0 on success, -1 if the
 * byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3307
/* Write a timestamp to fp as a 4-byte signed integer in host byte order
 * (the time_t is narrowed to int32_t first). Returns 0 on success, -1 on a
 * write failure. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t narrowed = (int32_t) t;

    return (fwrite(&narrowed,4,1,fp) == 0) ? -1 : 0;
}
3313
e3566d4b 3314/* check rdbLoadLen() comments for more info */
f78fd11b 3315static int rdbSaveLen(FILE *fp, uint32_t len) {
3316 unsigned char buf[2];
3317
3318 if (len < (1<<6)) {
3319 /* Save a 6 bit len */
10c43610 3320 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3321 if (fwrite(buf,1,1,fp) == 0) return -1;
3322 } else if (len < (1<<14)) {
3323 /* Save a 14 bit len */
10c43610 3324 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3325 buf[1] = len&0xFF;
17be1a4a 3326 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3327 } else {
3328 /* Save a 32 bit len */
10c43610 3329 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3330 if (fwrite(buf,1,1,fp) == 0) return -1;
3331 len = htonl(len);
3332 if (fwrite(&len,4,1,fp) == 0) return -1;
3333 }
3334 return 0;
3335}
3336
e3566d4b 3337/* String objects in the form "2391" "-100" without any space and with a
3338 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3339 * encoded as integers to save space */
b1befe6a 3340static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
e3566d4b 3341 long long value;
3342 char *endptr, buf[32];
3343
3344 /* Check if it's possible to encode this value as a number */
3345 value = strtoll(s, &endptr, 10);
3346 if (endptr[0] != '\0') return 0;
3347 snprintf(buf,32,"%lld",value);
3348
3349 /* If the number converted back into a string is not identical
3350 * then it's not possible to encode the string as integer */
b1befe6a 3351 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
e3566d4b 3352
3353 /* Finally check if it fits in our ranges */
3354 if (value >= -(1<<7) && value <= (1<<7)-1) {
3355 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3356 enc[1] = value&0xFF;
3357 return 2;
3358 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3359 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3360 enc[1] = value&0xFF;
3361 enc[2] = (value>>8)&0xFF;
3362 return 3;
3363 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3364 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3365 enc[1] = value&0xFF;
3366 enc[2] = (value>>8)&0xFF;
3367 enc[3] = (value>>16)&0xFF;
3368 enc[4] = (value>>24)&0xFF;
3369 return 5;
3370 } else {
3371 return 0;
3372 }
3373}
3374
b1befe6a 3375static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3376 size_t comprlen, outlen;
774e3047 3377 unsigned char byte;
3378 void *out;
3379
3380 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3381 if (len <= 4) return 0;
3382 outlen = len-4;
3a2694c4 3383 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3384 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3385 if (comprlen == 0) {
88e85998 3386 zfree(out);
774e3047 3387 return 0;
3388 }
3389 /* Data compressed! Let's save it on disk */
3390 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3391 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3392 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3393 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3394 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3395 zfree(out);
774e3047 3396 return comprlen;
3397
3398writeerr:
88e85998 3399 zfree(out);
774e3047 3400 return -1;
3401}
3402
e3566d4b 3403/* Save a string objet as [len][data] on disk. If the object is a string
3404 * representation of an integer value we try to safe it in a special form */
b1befe6a 3405static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3406 int enclen;
10c43610 3407
774e3047 3408 /* Try integer encoding */
e3566d4b 3409 if (len <= 11) {
3410 unsigned char buf[5];
b1befe6a 3411 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3412 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3413 return 0;
3414 }
3415 }
774e3047 3416
3417 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3418 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3419 if (server.rdbcompression && len > 20) {
774e3047 3420 int retval;
3421
b1befe6a 3422 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3423 if (retval == -1) return -1;
3424 if (retval > 0) return 0;
3425 /* retval == 0 means data can't be compressed, save the old way */
3426 }
3427
3428 /* Store verbatim */
10c43610 3429 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3430 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3431 return 0;
3432}
3433
942a3961 3434/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3435static int rdbSaveStringObject(FILE *fp, robj *obj) {
3436 int retval;
942a3961 3437
f2d9f50f 3438 /* Avoid incr/decr ref count business when possible.
3439 * This plays well with copy-on-write given that we are probably
3440 * in a child process (BGSAVE). Also this makes sure key objects
3441 * of swapped objects are not incRefCount-ed (an assert does not allow
3442 * this in order to avoid bugs) */
3443 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3444 obj = getDecodedObject(obj);
b1befe6a 3445 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3446 decrRefCount(obj);
3447 } else {
b1befe6a 3448 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3449 }
9d65a1bb 3450 return retval;
942a3961 3451}
3452
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation ("%.17g").
 * This 8 bit integer has special values in order to specify the following
 * conditions, in which case no string follows:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on a write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    size_t towrite = 1; /* the prefix byte is always written */

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        towrite += buf[0];
    }
    return (fwrite(buf,towrite,1,fp) == 0) ? -1 : 0;
}
3479
06233c45 3480/* Save a Redis object. */
3481static int rdbSaveObject(FILE *fp, robj *o) {
3482 if (o->type == REDIS_STRING) {
3483 /* Save a string value */
3484 if (rdbSaveStringObject(fp,o) == -1) return -1;
3485 } else if (o->type == REDIS_LIST) {
3486 /* Save a list value */
3487 list *list = o->ptr;
c7df85a4 3488 listIter li;
06233c45 3489 listNode *ln;
3490
06233c45 3491 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3492 listRewind(list,&li);
3493 while((ln = listNext(&li))) {
06233c45 3494 robj *eleobj = listNodeValue(ln);
3495
3496 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3497 }
3498 } else if (o->type == REDIS_SET) {
3499 /* Save a set value */
3500 dict *set = o->ptr;
3501 dictIterator *di = dictGetIterator(set);
3502 dictEntry *de;
3503
3504 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3505 while((de = dictNext(di)) != NULL) {
3506 robj *eleobj = dictGetEntryKey(de);
3507
3508 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3509 }
3510 dictReleaseIterator(di);
3511 } else if (o->type == REDIS_ZSET) {
3512 /* Save a set value */
3513 zset *zs = o->ptr;
3514 dictIterator *di = dictGetIterator(zs->dict);
3515 dictEntry *de;
3516
3517 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3518 while((de = dictNext(di)) != NULL) {
3519 robj *eleobj = dictGetEntryKey(de);
3520 double *score = dictGetEntryVal(de);
3521
3522 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3523 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3524 }
3525 dictReleaseIterator(di);
b1befe6a 3526 } else if (o->type == REDIS_HASH) {
3527 /* Save a hash value */
3528 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3529 unsigned char *p = zipmapRewind(o->ptr);
3530 unsigned int count = zipmapLen(o->ptr);
3531 unsigned char *key, *val;
3532 unsigned int klen, vlen;
3533
3534 if (rdbSaveLen(fp,count) == -1) return -1;
3535 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3536 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3537 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3538 }
3539 } else {
3540 dictIterator *di = dictGetIterator(o->ptr);
3541 dictEntry *de;
3542
3543 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3544 while((de = dictNext(di)) != NULL) {
3545 robj *key = dictGetEntryKey(de);
3546 robj *val = dictGetEntryVal(de);
3547
3548 if (rdbSaveStringObject(fp,key) == -1) return -1;
3549 if (rdbSaveStringObject(fp,val) == -1) return -1;
3550 }
3551 dictReleaseIterator(di);
3552 }
06233c45 3553 } else {
f83c6cb5 3554 redisPanic("Unknown object type");
06233c45 3555 }
3556 return 0;
3557}
3558
3559/* Return the length the object will have on disk if saved with
3560 * the rdbSaveObject() function. Currently we use a trick to get
3561 * this length with very little changes to the code. In the future
3562 * we could switch to a faster solution. */
b9bc0eef 3563static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3564 if (fp == NULL) fp = server.devnull;
06233c45 3565 rewind(fp);
3566 assert(rdbSaveObject(fp,o) != 1);
3567 return ftello(fp);
3568}
3569
06224fec 3570/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3571static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3572 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3573
06224fec 3574 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3575}
3576
ed9b544e 3577/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3578static int rdbSave(char *filename) {
ed9b544e 3579 dictIterator *di = NULL;
3580 dictEntry *de;
ed9b544e 3581 FILE *fp;
3582 char tmpfile[256];
3583 int j;
bb32ede5 3584 time_t now = time(NULL);
ed9b544e 3585
2316bb3b 3586 /* Wait for I/O therads to terminate, just in case this is a
3587 * foreground-saving, to avoid seeking the swap file descriptor at the
3588 * same time. */
3589 if (server.vm_enabled)
3590 waitEmptyIOJobsQueue();
3591
a3b21203 3592 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3593 fp = fopen(tmpfile,"w");
3594 if (!fp) {
3595 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3596 return REDIS_ERR;
3597 }
f78fd11b 3598 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3599 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3600 redisDb *db = server.db+j;
3601 dict *d = db->dict;
3305306f 3602 if (dictSize(d) == 0) continue;
ed9b544e 3603 di = dictGetIterator(d);
3604 if (!di) {
3605 fclose(fp);
3606 return REDIS_ERR;
3607 }
3608
3609 /* Write the SELECT DB opcode */
f78fd11b 3610 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3611 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3612
3613 /* Iterate this DB writing every entry */
3614 while((de = dictNext(di)) != NULL) {
3615 robj *key = dictGetEntryKey(de);
3616 robj *o = dictGetEntryVal(de);
bb32ede5 3617 time_t expiretime = getExpire(db,key);
3618
3619 /* Save the expire time */
3620 if (expiretime != -1) {
3621 /* If this key is already expired skip it */
3622 if (expiretime < now) continue;
3623 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3624 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3625 }
7e69548d 3626 /* Save the key and associated value. This requires special
3627 * handling if the value is swapped out. */
996cb5f7 3628 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3629 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3630 /* Save type, key, value */
3631 if (rdbSaveType(fp,o->type) == -1) goto werr;
3632 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3633 if (rdbSaveObject(fp,o) == -1) goto werr;
3634 } else {
996cb5f7 3635 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3636 robj *po;
7e69548d 3637 /* Get a preview of the object in memory */
3638 po = vmPreviewObject(key);
7e69548d 3639 /* Save type, key, value */
3640 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3641 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3642 if (rdbSaveObject(fp,po) == -1) goto werr;
3643 /* Remove the loaded object from memory */
3644 decrRefCount(po);
7e69548d 3645 }
ed9b544e 3646 }
3647 dictReleaseIterator(di);
3648 }
3649 /* EOF opcode */
f78fd11b 3650 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3651
3652 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3653 fflush(fp);
3654 fsync(fileno(fp));
3655 fclose(fp);
e0a62c7f 3656
ed9b544e 3657 /* Use RENAME to make sure the DB file is changed atomically only
3658 * if the generate DB file is ok. */
3659 if (rename(tmpfile,filename) == -1) {
325d1eb4 3660 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3661 unlink(tmpfile);
3662 return REDIS_ERR;
3663 }
3664 redisLog(REDIS_NOTICE,"DB saved on disk");
3665 server.dirty = 0;
3666 server.lastsave = time(NULL);
3667 return REDIS_OK;
3668
3669werr:
3670 fclose(fp);
3671 unlink(tmpfile);
3672 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3673 if (di) dictReleaseIterator(di);
3674 return REDIS_ERR;
3675}
3676
f78fd11b 3677static int rdbSaveBackground(char *filename) {
ed9b544e 3678 pid_t childpid;
3679
9d65a1bb 3680 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3681 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3682 if ((childpid = fork()) == 0) {
3683 /* Child */
054e426d 3684 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3685 close(server.fd);
f78fd11b 3686 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3687 _exit(0);
ed9b544e 3688 } else {
478c2c6f 3689 _exit(1);
ed9b544e 3690 }
3691 } else {
3692 /* Parent */
5a7c647e 3693 if (childpid == -1) {
3694 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3695 strerror(errno));
3696 return REDIS_ERR;
3697 }
ed9b544e 3698 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3699 server.bgsavechildpid = childpid;
884d4b39 3700 updateDictResizePolicy();
ed9b544e 3701 return REDIS_OK;
3702 }
3703 return REDIS_OK; /* unreached */
3704}
3705
/* Remove the temporary dump file that a saving process with the given pid
 * writes to (same "temp-<pid>.rdb" name rdbSave() uses). The result of
 * unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3712
/* Read a single type/opcode byte from fp. Returns the byte as a non
 * negative int, or -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    return (fread(&type,1,1,fp) == 0) ? -1 : type;
}
3718
/* Read a timestamp stored by rdbSaveTime(): a 4-byte signed integer in host
 * byte order. Returns -1 when the four bytes could not be read (note that a
 * stored value of -1 is indistinguishable from this error). */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stored;

    return (fread(&stored,4,1,fp) == 0) ? -1 : (time_t) stored;
}
3724
e3566d4b 3725/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3726 * of this file for a description of how this are stored on disk.
3727 *
3728 * isencoded is set to 1 if the readed length is not actually a length but
3729 * an "encoding type", check the above comments for more info */
c78a8ccc 3730static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3731 unsigned char buf[2];
3732 uint32_t len;
c78a8ccc 3733 int type;
f78fd11b 3734
e3566d4b 3735 if (isencoded) *isencoded = 0;
c78a8ccc 3736 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3737 type = (buf[0]&0xC0)>>6;
3738 if (type == REDIS_RDB_6BITLEN) {
3739 /* Read a 6 bit len */
3740 return buf[0]&0x3F;
3741 } else if (type == REDIS_RDB_ENCVAL) {
3742 /* Read a 6 bit len encoding type */
3743 if (isencoded) *isencoded = 1;
3744 return buf[0]&0x3F;
3745 } else if (type == REDIS_RDB_14BITLEN) {
3746 /* Read a 14 bit len */
3747 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3748 return ((buf[0]&0x3F)<<8)|buf[1];
3749 } else {
3750 /* Read a 32 bit len */
f78fd11b 3751 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3752 return ntohl(len);
f78fd11b 3753 }
f78fd11b 3754}
3755
e3566d4b 3756static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3757 unsigned char enc[4];
3758 long long val;
3759
3760 if (enctype == REDIS_RDB_ENC_INT8) {
3761 if (fread(enc,1,1,fp) == 0) return NULL;
3762 val = (signed char)enc[0];
3763 } else if (enctype == REDIS_RDB_ENC_INT16) {
3764 uint16_t v;
3765 if (fread(enc,2,1,fp) == 0) return NULL;
3766 v = enc[0]|(enc[1]<<8);
3767 val = (int16_t)v;
3768 } else if (enctype == REDIS_RDB_ENC_INT32) {
3769 uint32_t v;
3770 if (fread(enc,4,1,fp) == 0) return NULL;
3771 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3772 val = (int32_t)v;
3773 } else {
3774 val = 0; /* anti-warning */
f83c6cb5 3775 redisPanic("Unknown RDB integer encoding type");
e3566d4b 3776 }
3777 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3778}
3779
c78a8ccc 3780static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3781 unsigned int len, clen;
3782 unsigned char *c = NULL;
3783 sds val = NULL;
3784
c78a8ccc 3785 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3786 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3787 if ((c = zmalloc(clen)) == NULL) goto err;
3788 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3789 if (fread(c,clen,1,fp) == 0) goto err;
3790 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3791 zfree(c);
88e85998 3792 return createObject(REDIS_STRING,val);
3793err:
3794 zfree(c);
3795 sdsfree(val);
3796 return NULL;
3797}
3798
c78a8ccc 3799static robj *rdbLoadStringObject(FILE*fp) {
e3566d4b 3800 int isencoded;
3801 uint32_t len;
f78fd11b 3802 sds val;
3803
c78a8ccc 3804 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3805 if (isencoded) {
3806 switch(len) {
3807 case REDIS_RDB_ENC_INT8:
3808 case REDIS_RDB_ENC_INT16:
3809 case REDIS_RDB_ENC_INT32:
bdcb92f2 3810 return rdbLoadIntegerObject(fp,len);
88e85998 3811 case REDIS_RDB_ENC_LZF:
bdcb92f2 3812 return rdbLoadLzfStringObject(fp);
e3566d4b 3813 default:
f83c6cb5 3814 redisPanic("Unknown RDB encoding type");
e3566d4b 3815 }
3816 }
3817
f78fd11b 3818 if (len == REDIS_RDB_LENERR) return NULL;
3819 val = sdsnewlen(NULL,len);
3820 if (len && fread(val,len,1,fp) == 0) {
3821 sdsfree(val);
3822 return NULL;
3823 }
bdcb92f2 3824 return createObject(REDIS_STRING,val);
f78fd11b 3825}
3826
a7866db6 3827/* For information about double serialization check rdbSaveDoubleValue() */
3828static int rdbLoadDoubleValue(FILE *fp, double *val) {
3829 char buf[128];
3830 unsigned char len;
3831
3832 if (fread(&len,1,1,fp) == 0) return -1;
3833 switch(len) {
3834 case 255: *val = R_NegInf; return 0;
3835 case 254: *val = R_PosInf; return 0;
3836 case 253: *val = R_Nan; return 0;
3837 default:
3838 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3839 buf[len] = '\0';
a7866db6 3840 sscanf(buf, "%lg", val);
3841 return 0;
3842 }
3843}
3844
c78a8ccc 3845/* Load a Redis object of the specified type from the specified file.
3846 * On success a newly allocated object is returned, otherwise NULL. */
3847static robj *rdbLoadObject(int type, FILE *fp) {
3848 robj *o;
3849
bcd11906 3850 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 3851 if (type == REDIS_STRING) {
3852 /* Read string value */
3853 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3854 o = tryObjectEncoding(o);
c78a8ccc 3855 } else if (type == REDIS_LIST || type == REDIS_SET) {
3856 /* Read list/set value */
3857 uint32_t listlen;
3858
3859 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3860 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 3861 /* It's faster to expand the dict to the right size asap in order
3862 * to avoid rehashing */
3863 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3864 dictExpand(o->ptr,listlen);
c78a8ccc 3865 /* Load every single element of the list/set */
3866 while(listlen--) {
3867 robj *ele;
3868
3869 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3870 ele = tryObjectEncoding(ele);
c78a8ccc 3871 if (type == REDIS_LIST) {
3872 listAddNodeTail((list*)o->ptr,ele);
3873 } else {
3874 dictAdd((dict*)o->ptr,ele,NULL);
3875 }
3876 }
3877 } else if (type == REDIS_ZSET) {
3878 /* Read list/set value */
ada386b2 3879 size_t zsetlen;
c78a8ccc 3880 zset *zs;
3881
3882 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3883 o = createZsetObject();
3884 zs = o->ptr;
3885 /* Load every single element of the list/set */
3886 while(zsetlen--) {
3887 robj *ele;
3888 double *score = zmalloc(sizeof(double));
3889
3890 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3891 ele = tryObjectEncoding(ele);
c78a8ccc 3892 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3893 dictAdd(zs->dict,ele,score);
3894 zslInsert(zs->zsl,*score,ele);
3895 incrRefCount(ele); /* added to skiplist */
3896 }
ada386b2 3897 } else if (type == REDIS_HASH) {
3898 size_t hashlen;
3899
3900 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3901 o = createHashObject();
3902 /* Too many entries? Use an hash table. */
3903 if (hashlen > server.hash_max_zipmap_entries)
3904 convertToRealHash(o);
3905 /* Load every key/value, then set it into the zipmap or hash
3906 * table, as needed. */
3907 while(hashlen--) {
3908 robj *key, *val;
3909
3910 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3911 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3912 /* If we are using a zipmap and there are too big values
3913 * the object is converted to real hash table encoding. */
3914 if (o->encoding != REDIS_ENCODING_HT &&
3915 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3916 sdslen(val->ptr) > server.hash_max_zipmap_value))
3917 {
3918 convertToRealHash(o);
3919 }
3920
3921 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3922 unsigned char *zm = o->ptr;
3923
3924 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3925 val->ptr,sdslen(val->ptr),NULL);
3926 o->ptr = zm;
3927 decrRefCount(key);
3928 decrRefCount(val);
3929 } else {
05df7621 3930 key = tryObjectEncoding(key);
3931 val = tryObjectEncoding(val);
ada386b2 3932 dictAdd((dict*)o->ptr,key,val);
ada386b2 3933 }
3934 }
c78a8ccc 3935 } else {
f83c6cb5 3936 redisPanic("Unknown object type");
c78a8ccc 3937 }
3938 return o;
3939}
3940
f78fd11b 3941static int rdbLoad(char *filename) {
ed9b544e 3942 FILE *fp;
f78fd11b 3943 robj *keyobj = NULL;
3944 uint32_t dbid;
bb32ede5 3945 int type, retval, rdbver;
3305306f 3946 dict *d = server.db[0].dict;
bb32ede5 3947 redisDb *db = server.db+0;
f78fd11b 3948 char buf[1024];
bb32ede5 3949 time_t expiretime = -1, now = time(NULL);
b492cf00 3950 long long loadedkeys = 0;
bb32ede5 3951
ed9b544e 3952 fp = fopen(filename,"r");
3953 if (!fp) return REDIS_ERR;
3954 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 3955 buf[9] = '\0';
3956 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 3957 fclose(fp);
3958 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3959 return REDIS_ERR;
3960 }
f78fd11b 3961 rdbver = atoi(buf+5);
c78a8ccc 3962 if (rdbver != 1) {
f78fd11b 3963 fclose(fp);
3964 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3965 return REDIS_ERR;
3966 }
ed9b544e 3967 while(1) {
3968 robj *o;
3969
3970 /* Read type. */
f78fd11b 3971 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 3972 if (type == REDIS_EXPIRETIME) {
3973 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3974 /* We read the time so we need to read the object type again */
3975 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3976 }
ed9b544e 3977 if (type == REDIS_EOF) break;
3978 /* Handle SELECT DB opcode as a special case */
3979 if (type == REDIS_SELECTDB) {
c78a8ccc 3980 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 3981 goto eoferr;
ed9b544e 3982 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 3983 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 3984 exit(1);
3985 }
bb32ede5 3986 db = server.db+dbid;
3987 d = db->dict;
ed9b544e 3988 continue;
3989 }
3990 /* Read key */
c78a8ccc 3991 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3992 /* Read value */
3993 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
ed9b544e 3994 /* Add the new object in the hash table */
f78fd11b 3995 retval = dictAdd(d,keyobj,o);
ed9b544e 3996 if (retval == DICT_ERR) {
f78fd11b 3997 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
ed9b544e 3998 exit(1);
3999 }
bb32ede5 4000 /* Set the expire time if needed */
4001 if (expiretime != -1) {
4002 setExpire(db,keyobj,expiretime);
4003 /* Delete this key if already expired */
4004 if (expiretime < now) deleteKey(db,keyobj);
4005 expiretime = -1;
4006 }
f78fd11b 4007 keyobj = o = NULL;
b492cf00 4008 /* Handle swapping while loading big datasets when VM is on */
4009 loadedkeys++;
4010 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
4011 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 4012 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 4013 }
4014 }
ed9b544e 4015 }
4016 fclose(fp);
4017 return REDIS_OK;
4018
4019eoferr: /* unexpected end of file is handled here with a fatal exit */
e3566d4b 4020 if (keyobj) decrRefCount(keyobj);
f80dff62 4021 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 4022 exit(1);
4023 return REDIS_ERR; /* Just to avoid warning */
4024}
4025
4026/*================================== Commands =============================== */
4027
abcb223e 4028static void authCommand(redisClient *c) {
2e77c2ee 4029 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
4030 c->authenticated = 1;
4031 addReply(c,shared.ok);
4032 } else {
4033 c->authenticated = 0;
fa4c0aba 4034 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
4035 }
4036}
4037
ed9b544e 4038static void pingCommand(redisClient *c) {
4039 addReply(c,shared.pong);
4040}
4041
4042static void echoCommand(redisClient *c) {
dd88747b 4043 addReplyBulk(c,c->argv[1]);
ed9b544e 4044}
4045
4046/*=================================== Strings =============================== */
4047
526d00a5 4048static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
ed9b544e 4049 int retval;
10ce1276 4050 long seconds = 0; /* initialized to avoid an harmness warning */
ed9b544e 4051
526d00a5 4052 if (expire) {
4053 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
4054 return;
4055 if (seconds <= 0) {
4056 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
4057 return;
4058 }
4059 }
4060
4061 if (nx) deleteIfVolatile(c->db,key);
4062 retval = dictAdd(c->db->dict,key,val);
ed9b544e 4063 if (retval == DICT_ERR) {
4064 if (!nx) {
1b03836c 4065 /* If the key is about a swapped value, we want a new key object
4066 * to overwrite the old. So we delete the old key in the database.
4067 * This will also make sure that swap pages about the old object
4068 * will be marked as free. */
526d00a5 4069 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4070 incrRefCount(key);
4071 dictReplace(c->db->dict,key,val);
4072 incrRefCount(val);
ed9b544e 4073 } else {
c937aa89 4074 addReply(c,shared.czero);
ed9b544e 4075 return;
4076 }
4077 } else {
526d00a5 4078 incrRefCount(key);
4079 incrRefCount(val);
ed9b544e 4080 }
4081 server.dirty++;
526d00a5 4082 removeExpire(c->db,key);
4083 if (expire) setExpire(c->db,key,time(NULL)+seconds);
c937aa89 4084 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 4085}
4086
4087static void setCommand(redisClient *c) {
526d00a5 4088 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
ed9b544e 4089}
4090
4091static void setnxCommand(redisClient *c) {
526d00a5 4092 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4093}
4094
4095static void setexCommand(redisClient *c) {
4096 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
ed9b544e 4097}
4098
322fc7d8 4099static int getGenericCommand(redisClient *c) {
dd88747b 4100 robj *o;
e0a62c7f 4101
dd88747b 4102 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 4103 return REDIS_OK;
dd88747b 4104
4105 if (o->type != REDIS_STRING) {
4106 addReply(c,shared.wrongtypeerr);
4107 return REDIS_ERR;
ed9b544e 4108 } else {
dd88747b 4109 addReplyBulk(c,o);
4110 return REDIS_OK;
ed9b544e 4111 }
4112}
4113
322fc7d8 4114static void getCommand(redisClient *c) {
4115 getGenericCommand(c);
4116}
4117
f6b141c5 4118static void getsetCommand(redisClient *c) {
322fc7d8 4119 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 4120 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4121 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4122 } else {
4123 incrRefCount(c->argv[1]);
4124 }
4125 incrRefCount(c->argv[2]);
4126 server.dirty++;
4127 removeExpire(c->db,c->argv[1]);
4128}
4129
70003d28 4130static void mgetCommand(redisClient *c) {
70003d28 4131 int j;
e0a62c7f 4132
c937aa89 4133 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 4134 for (j = 1; j < c->argc; j++) {
3305306f 4135 robj *o = lookupKeyRead(c->db,c->argv[j]);
4136 if (o == NULL) {
c937aa89 4137 addReply(c,shared.nullbulk);
70003d28 4138 } else {
70003d28 4139 if (o->type != REDIS_STRING) {
c937aa89 4140 addReply(c,shared.nullbulk);
70003d28 4141 } else {
dd88747b 4142 addReplyBulk(c,o);
70003d28 4143 }
4144 }
4145 }
4146}
4147
6c446631 4148static void msetGenericCommand(redisClient *c, int nx) {
906573e7 4149 int j, busykeys = 0;
6c446631 4150
4151 if ((c->argc % 2) == 0) {
454d4e43 4152 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 4153 return;
4154 }
4155 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4156 * set nothing at all if at least one already key exists. */
4157 if (nx) {
4158 for (j = 1; j < c->argc; j += 2) {
906573e7 4159 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4160 busykeys++;
6c446631 4161 }
4162 }
4163 }
906573e7 4164 if (busykeys) {
4165 addReply(c, shared.czero);
4166 return;
4167 }
6c446631 4168
4169 for (j = 1; j < c->argc; j += 2) {
4170 int retval;
4171
05df7621 4172 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
6c446631 4173 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4174 if (retval == DICT_ERR) {
4175 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4176 incrRefCount(c->argv[j+1]);
4177 } else {
4178 incrRefCount(c->argv[j]);
4179 incrRefCount(c->argv[j+1]);
4180 }
4181 removeExpire(c->db,c->argv[j]);
4182 }
4183 server.dirty += (c->argc-1)/2;
4184 addReply(c, nx ? shared.cone : shared.ok);
4185}
4186
4187static void msetCommand(redisClient *c) {
4188 msetGenericCommand(c,0);
4189}
4190
4191static void msetnxCommand(redisClient *c) {
4192 msetGenericCommand(c,1);
4193}
4194
d68ed120 4195static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4196 long long value;
4197 int retval;
4198 robj *o;
e0a62c7f 4199
3305306f 4200 o = lookupKeyWrite(c->db,c->argv[1]);
ed9b544e 4201
bd79a6bd 4202 if (getLongLongFromObjectOrReply(c, o, &value, NULL) != REDIS_OK) return;
ed9b544e 4203
4204 value += incr;
4205 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
05df7621 4206 o = tryObjectEncoding(o);
3305306f 4207 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 4208 if (retval == DICT_ERR) {
3305306f 4209 dictReplace(c->db->dict,c->argv[1],o);
4210 removeExpire(c->db,c->argv[1]);
ed9b544e 4211 } else {
4212 incrRefCount(c->argv[1]);
4213 }
4214 server.dirty++;
c937aa89 4215 addReply(c,shared.colon);
ed9b544e 4216 addReply(c,o);
4217 addReply(c,shared.crlf);
4218}
4219
4220static void incrCommand(redisClient *c) {
a4d1ba9a 4221 incrDecrCommand(c,1);
ed9b544e 4222}
4223
4224static void decrCommand(redisClient *c) {
a4d1ba9a 4225 incrDecrCommand(c,-1);
ed9b544e 4226}
4227
4228static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4229 long long incr;
4230
bd79a6bd 4231 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4232 incrDecrCommand(c,incr);
ed9b544e 4233}
4234
4235static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4236 long long incr;
4237
bd79a6bd 4238 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4239 incrDecrCommand(c,-incr);
ed9b544e 4240}
4241
4b00bebd 4242static void appendCommand(redisClient *c) {
4243 int retval;
4244 size_t totlen;
4245 robj *o;
4246
4247 o = lookupKeyWrite(c->db,c->argv[1]);
4248 if (o == NULL) {
4249 /* Create the key */
4250 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4251 incrRefCount(c->argv[1]);
4252 incrRefCount(c->argv[2]);
4253 totlen = stringObjectLen(c->argv[2]);
4254 } else {
4255 dictEntry *de;
e0a62c7f 4256
4b00bebd 4257 de = dictFind(c->db->dict,c->argv[1]);
4258 assert(de != NULL);
4259
4260 o = dictGetEntryVal(de);
4261 if (o->type != REDIS_STRING) {
4262 addReply(c,shared.wrongtypeerr);
4263 return;
4264 }
4265 /* If the object is specially encoded or shared we have to make
4266 * a copy */
4267 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4268 robj *decoded = getDecodedObject(o);
4269
4270 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4271 decrRefCount(decoded);
4272 dictReplace(c->db->dict,c->argv[1],o);
4273 }
4274 /* APPEND! */
4275 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4276 o->ptr = sdscatlen(o->ptr,
4277 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4278 } else {
4279 o->ptr = sdscatprintf(o->ptr, "%ld",
4280 (unsigned long) c->argv[2]->ptr);
4281 }
4282 totlen = sdslen(o->ptr);
4283 }
4284 server.dirty++;
4285 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4286}
4287
39191553 4288static void substrCommand(redisClient *c) {
4289 robj *o;
4290 long start = atoi(c->argv[2]->ptr);
4291 long end = atoi(c->argv[3]->ptr);
dd88747b 4292 size_t rangelen, strlen;
4293 sds range;
39191553 4294
dd88747b 4295 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4296 checkType(c,o,REDIS_STRING)) return;
39191553 4297
dd88747b 4298 o = getDecodedObject(o);
4299 strlen = sdslen(o->ptr);
8fe7fad7 4300
dd88747b 4301 /* convert negative indexes */
4302 if (start < 0) start = strlen+start;
4303 if (end < 0) end = strlen+end;
4304 if (start < 0) start = 0;
4305 if (end < 0) end = 0;
39191553 4306
dd88747b 4307 /* indexes sanity checks */
4308 if (start > end || (size_t)start >= strlen) {
4309 /* Out of range start or start > end result in null reply */
4310 addReply(c,shared.nullbulk);
4311 decrRefCount(o);
4312 return;
39191553 4313 }
dd88747b 4314 if ((size_t)end >= strlen) end = strlen-1;
4315 rangelen = (end-start)+1;
4316
4317 /* Return the result */
4318 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4319 range = sdsnewlen((char*)o->ptr+start,rangelen);
4320 addReplySds(c,range);
4321 addReply(c,shared.crlf);
4322 decrRefCount(o);
39191553 4323}
4324
ed9b544e 4325/* ========================= Type agnostic commands ========================= */
4326
4327static void delCommand(redisClient *c) {
5109cdff 4328 int deleted = 0, j;
4329
4330 for (j = 1; j < c->argc; j++) {
4331 if (deleteKey(c->db,c->argv[j])) {
4332 server.dirty++;
4333 deleted++;
4334 }
4335 }
dd88747b 4336 addReplyLong(c,deleted);
ed9b544e 4337}
4338
4339static void existsCommand(redisClient *c) {
3305306f 4340 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
ed9b544e 4341}
4342
4343static void selectCommand(redisClient *c) {
4344 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4345
ed9b544e 4346 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4347 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4348 } else {
4349 addReply(c,shared.ok);
4350 }
4351}
4352
4353static void randomkeyCommand(redisClient *c) {
4354 dictEntry *de;
dc4be23e 4355 robj *key;
e0a62c7f 4356
3305306f 4357 while(1) {
4358 de = dictGetRandomKey(c->db->dict);
ce7bef07 4359 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4360 }
2b619329 4361
ed9b544e 4362 if (de == NULL) {
dc4be23e 4363 addReply(c,shared.nullbulk);
4364 return;
4365 }
4366
4367 key = dictGetEntryKey(de);
4368 if (server.vm_enabled) {
4369 key = dupStringObject(key);
4370 addReplyBulk(c,key);
4371 decrRefCount(key);
ed9b544e 4372 } else {
dc4be23e 4373 addReplyBulk(c,key);
ed9b544e 4374 }
4375}
4376
4377static void keysCommand(redisClient *c) {
4378 dictIterator *di;
4379 dictEntry *de;
4380 sds pattern = c->argv[1]->ptr;
4381 int plen = sdslen(pattern);
a3f9eec2 4382 unsigned long numkeys = 0;
ed9b544e 4383 robj *lenobj = createObject(REDIS_STRING,NULL);
4384
3305306f 4385 di = dictGetIterator(c->db->dict);
ed9b544e 4386 addReply(c,lenobj);
4387 decrRefCount(lenobj);
4388 while((de = dictNext(di)) != NULL) {
4389 robj *keyobj = dictGetEntryKey(de);
3305306f 4390
ed9b544e 4391 sds key = keyobj->ptr;
4392 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4393 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4394 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4395 addReplyBulk(c,keyobj);
3305306f 4396 numkeys++;
3305306f 4397 }
ed9b544e 4398 }
4399 }
4400 dictReleaseIterator(di);
a3f9eec2 4401 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4402}
4403
4404static void dbsizeCommand(redisClient *c) {
4405 addReplySds(c,
3305306f 4406 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4407}
4408
4409static void lastsaveCommand(redisClient *c) {
4410 addReplySds(c,
c937aa89 4411 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4412}
4413
4414static void typeCommand(redisClient *c) {
3305306f 4415 robj *o;
ed9b544e 4416 char *type;
3305306f 4417
4418 o = lookupKeyRead(c->db,c->argv[1]);
4419 if (o == NULL) {
c937aa89 4420 type = "+none";
ed9b544e 4421 } else {
ed9b544e 4422 switch(o->type) {
c937aa89 4423 case REDIS_STRING: type = "+string"; break;
4424 case REDIS_LIST: type = "+list"; break;
4425 case REDIS_SET: type = "+set"; break;
412a8bce 4426 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4427 case REDIS_HASH: type = "+hash"; break;
4428 default: type = "+unknown"; break;
ed9b544e 4429 }
4430 }
4431 addReplySds(c,sdsnew(type));
4432 addReply(c,shared.crlf);
4433}
4434
4435static void saveCommand(redisClient *c) {
9d65a1bb 4436 if (server.bgsavechildpid != -1) {
05557f6d 4437 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4438 return;
4439 }
f78fd11b 4440 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4441 addReply(c,shared.ok);
4442 } else {
4443 addReply(c,shared.err);
4444 }
4445}
4446
4447static void bgsaveCommand(redisClient *c) {
9d65a1bb 4448 if (server.bgsavechildpid != -1) {
ed9b544e 4449 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4450 return;
4451 }
f78fd11b 4452 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4453 char *status = "+Background saving started\r\n";
4454 addReplySds(c,sdsnew(status));
ed9b544e 4455 } else {
4456 addReply(c,shared.err);
4457 }
4458}
4459
4460static void shutdownCommand(redisClient *c) {
4461 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
a3b21203 4462 /* Kill the saving child if there is a background saving in progress.
4463 We want to avoid race conditions, for instance our saving child may
4464 overwrite the synchronous saving did by SHUTDOWN. */
9d65a1bb 4465 if (server.bgsavechildpid != -1) {
9f3c422c 4466 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4467 kill(server.bgsavechildpid,SIGKILL);
a3b21203 4468 rdbRemoveTempFile(server.bgsavechildpid);
9f3c422c 4469 }
ac945e2d 4470 if (server.appendonly) {
4471 /* Append only file: fsync() the AOF and exit */
4472 fsync(server.appendfd);
054e426d 4473 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4474 exit(0);
ed9b544e 4475 } else {
ac945e2d 4476 /* Snapshotting. Perform a SYNC SAVE and exit */
4477 if (rdbSave(server.dbfilename) == REDIS_OK) {
4478 if (server.daemonize)
4479 unlink(server.pidfile);
4480 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4481 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
054e426d 4482 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4483 exit(0);
4484 } else {
dd88747b 4485 /* Ooops.. error saving! The best we can do is to continue
4486 * operating. Note that if there was a background saving process,
4487 * in the next cron() Redis will be notified that the background
4488 * saving aborted, handling special stuff like slaves pending for
4489 * synchronization... */
e0a62c7f 4490 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
dd88747b 4491 addReplySds(c,
4492 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
ac945e2d 4493 }
ed9b544e 4494 }
4495}
4496
4497static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4498 robj *o;
4499
4500 /* To use the same key as src and dst is probably an error */
4501 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4502 addReply(c,shared.sameobjecterr);
ed9b544e 4503 return;
4504 }
4505
dd88747b 4506 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4507 return;
dd88747b 4508
ed9b544e 4509 incrRefCount(o);
3305306f 4510 deleteIfVolatile(c->db,c->argv[2]);
4511 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4512 if (nx) {
4513 decrRefCount(o);
c937aa89 4514 addReply(c,shared.czero);
ed9b544e 4515 return;
4516 }
3305306f 4517 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4518 } else {
4519 incrRefCount(c->argv[2]);
4520 }
3305306f 4521 deleteKey(c->db,c->argv[1]);
ed9b544e 4522 server.dirty++;
c937aa89 4523 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4524}
4525
4526static void renameCommand(redisClient *c) {
4527 renameGenericCommand(c,0);
4528}
4529
4530static void renamenxCommand(redisClient *c) {
4531 renameGenericCommand(c,1);
4532}
4533
4534static void moveCommand(redisClient *c) {
3305306f 4535 robj *o;
4536 redisDb *src, *dst;
ed9b544e 4537 int srcid;
4538
4539 /* Obtain source and target DB pointers */
3305306f 4540 src = c->db;
4541 srcid = c->db->id;
ed9b544e 4542 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4543 addReply(c,shared.outofrangeerr);
ed9b544e 4544 return;
4545 }
3305306f 4546 dst = c->db;
4547 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4548
4549 /* If the user is moving using as target the same
4550 * DB as the source DB it is probably an error. */
4551 if (src == dst) {
c937aa89 4552 addReply(c,shared.sameobjecterr);
ed9b544e 4553 return;
4554 }
4555
4556 /* Check if the element exists and get a reference */
3305306f 4557 o = lookupKeyWrite(c->db,c->argv[1]);
4558 if (!o) {
c937aa89 4559 addReply(c,shared.czero);
ed9b544e 4560 return;
4561 }
4562
4563 /* Try to add the element to the target DB */
3305306f 4564 deleteIfVolatile(dst,c->argv[1]);
4565 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4566 addReply(c,shared.czero);
ed9b544e 4567 return;
4568 }
3305306f 4569 incrRefCount(c->argv[1]);
ed9b544e 4570 incrRefCount(o);
4571
4572 /* OK! key moved, free the entry in the source DB */
3305306f 4573 deleteKey(src,c->argv[1]);
ed9b544e 4574 server.dirty++;
c937aa89 4575 addReply(c,shared.cone);
ed9b544e 4576}
4577
4578/* =================================== Lists ================================ */
4579static void pushGenericCommand(redisClient *c, int where) {
4580 robj *lobj;
ed9b544e 4581 list *list;
3305306f 4582
4583 lobj = lookupKeyWrite(c->db,c->argv[1]);
4584 if (lobj == NULL) {
95242ab5 4585 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4586 addReply(c,shared.cone);
95242ab5 4587 return;
4588 }
ed9b544e 4589 lobj = createListObject();
4590 list = lobj->ptr;
4591 if (where == REDIS_HEAD) {
6b47e12e 4592 listAddNodeHead(list,c->argv[2]);
ed9b544e 4593 } else {
6b47e12e 4594 listAddNodeTail(list,c->argv[2]);
ed9b544e 4595 }
3305306f 4596 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4597 incrRefCount(c->argv[1]);
4598 incrRefCount(c->argv[2]);
4599 } else {
ed9b544e 4600 if (lobj->type != REDIS_LIST) {
4601 addReply(c,shared.wrongtypeerr);
4602 return;
4603 }
95242ab5 4604 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4605 addReply(c,shared.cone);
95242ab5 4606 return;
4607 }
ed9b544e 4608 list = lobj->ptr;
4609 if (where == REDIS_HEAD) {
6b47e12e 4610 listAddNodeHead(list,c->argv[2]);
ed9b544e 4611 } else {
6b47e12e 4612 listAddNodeTail(list,c->argv[2]);
ed9b544e 4613 }
4614 incrRefCount(c->argv[2]);
4615 }
4616 server.dirty++;
520b5a33 4617 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
ed9b544e 4618}
4619
4620static void lpushCommand(redisClient *c) {
4621 pushGenericCommand(c,REDIS_HEAD);
4622}
4623
4624static void rpushCommand(redisClient *c) {
4625 pushGenericCommand(c,REDIS_TAIL);
4626}
4627
4628static void llenCommand(redisClient *c) {
3305306f 4629 robj *o;
ed9b544e 4630 list *l;
dd88747b 4631
4632 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4633 checkType(c,o,REDIS_LIST)) return;
e0a62c7f 4634
dd88747b 4635 l = o->ptr;
4636 addReplyUlong(c,listLength(l));
ed9b544e 4637}
4638
4639static void lindexCommand(redisClient *c) {
3305306f 4640 robj *o;
ed9b544e 4641 int index = atoi(c->argv[2]->ptr);
dd88747b 4642 list *list;
4643 listNode *ln;
4644
4645 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4646 checkType(c,o,REDIS_LIST)) return;
4647 list = o->ptr;
4648
4649 ln = listIndex(list, index);
4650 if (ln == NULL) {
c937aa89 4651 addReply(c,shared.nullbulk);
ed9b544e 4652 } else {
dd88747b 4653 robj *ele = listNodeValue(ln);
4654 addReplyBulk(c,ele);
ed9b544e 4655 }
4656}
4657
4658static void lsetCommand(redisClient *c) {
3305306f 4659 robj *o;
ed9b544e 4660 int index = atoi(c->argv[2]->ptr);
dd88747b 4661 list *list;
4662 listNode *ln;
4663
4664 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4665 checkType(c,o,REDIS_LIST)) return;
4666 list = o->ptr;
4667
4668 ln = listIndex(list, index);
4669 if (ln == NULL) {
4670 addReply(c,shared.outofrangeerr);
ed9b544e 4671 } else {
dd88747b 4672 robj *ele = listNodeValue(ln);
ed9b544e 4673
dd88747b 4674 decrRefCount(ele);
4675 listNodeValue(ln) = c->argv[3];
4676 incrRefCount(c->argv[3]);
4677 addReply(c,shared.ok);
4678 server.dirty++;
ed9b544e 4679 }
4680}
4681
4682static void popGenericCommand(redisClient *c, int where) {
3305306f 4683 robj *o;
dd88747b 4684 list *list;
4685 listNode *ln;
3305306f 4686
dd88747b 4687 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4688 checkType(c,o,REDIS_LIST)) return;
4689 list = o->ptr;
ed9b544e 4690
dd88747b 4691 if (where == REDIS_HEAD)
4692 ln = listFirst(list);
4693 else
4694 ln = listLast(list);
ed9b544e 4695
dd88747b 4696 if (ln == NULL) {
4697 addReply(c,shared.nullbulk);
4698 } else {
4699 robj *ele = listNodeValue(ln);
4700 addReplyBulk(c,ele);
4701 listDelNode(list,ln);
3ea27d37 4702 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4703 server.dirty++;
ed9b544e 4704 }
4705}
4706
4707static void lpopCommand(redisClient *c) {
4708 popGenericCommand(c,REDIS_HEAD);
4709}
4710
4711static void rpopCommand(redisClient *c) {
4712 popGenericCommand(c,REDIS_TAIL);
4713}
4714
4715static void lrangeCommand(redisClient *c) {
3305306f 4716 robj *o;
ed9b544e 4717 int start = atoi(c->argv[2]->ptr);
4718 int end = atoi(c->argv[3]->ptr);
dd88747b 4719 int llen;
4720 int rangelen, j;
4721 list *list;
4722 listNode *ln;
4723 robj *ele;
4724
4e27f268 4725 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4726 || checkType(c,o,REDIS_LIST)) return;
dd88747b 4727 list = o->ptr;
4728 llen = listLength(list);
4729
4730 /* convert negative indexes */
4731 if (start < 0) start = llen+start;
4732 if (end < 0) end = llen+end;
4733 if (start < 0) start = 0;
4734 if (end < 0) end = 0;
4735
4736 /* indexes sanity checks */
4737 if (start > end || start >= llen) {
4738 /* Out of range start or start > end result in empty list */
4739 addReply(c,shared.emptymultibulk);
4740 return;
4741 }
4742 if (end >= llen) end = llen-1;
4743 rangelen = (end-start)+1;
3305306f 4744
dd88747b 4745 /* Return the result in form of a multi-bulk reply */
4746 ln = listIndex(list, start);
4747 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4748 for (j = 0; j < rangelen; j++) {
4749 ele = listNodeValue(ln);
4750 addReplyBulk(c,ele);
4751 ln = ln->next;
ed9b544e 4752 }
4753}
4754
4755static void ltrimCommand(redisClient *c) {
3305306f 4756 robj *o;
ed9b544e 4757 int start = atoi(c->argv[2]->ptr);
4758 int end = atoi(c->argv[3]->ptr);
dd88747b 4759 int llen;
4760 int j, ltrim, rtrim;
4761 list *list;
4762 listNode *ln;
4763
4764 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4765 checkType(c,o,REDIS_LIST)) return;
4766 list = o->ptr;
4767 llen = listLength(list);
4768
4769 /* convert negative indexes */
4770 if (start < 0) start = llen+start;
4771 if (end < 0) end = llen+end;
4772 if (start < 0) start = 0;
4773 if (end < 0) end = 0;
4774
4775 /* indexes sanity checks */
4776 if (start > end || start >= llen) {
4777 /* Out of range start or start > end result in empty list */
4778 ltrim = llen;
4779 rtrim = 0;
ed9b544e 4780 } else {
dd88747b 4781 if (end >= llen) end = llen-1;
4782 ltrim = start;
4783 rtrim = llen-end-1;
4784 }
ed9b544e 4785
dd88747b 4786 /* Remove list elements to perform the trim */
4787 for (j = 0; j < ltrim; j++) {
4788 ln = listFirst(list);
4789 listDelNode(list,ln);
4790 }
4791 for (j = 0; j < rtrim; j++) {
4792 ln = listLast(list);
4793 listDelNode(list,ln);
ed9b544e 4794 }
3ea27d37 4795 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4796 server.dirty++;
4797 addReply(c,shared.ok);
ed9b544e 4798}
4799
4800static void lremCommand(redisClient *c) {
3305306f 4801 robj *o;
dd88747b 4802 list *list;
4803 listNode *ln, *next;
4804 int toremove = atoi(c->argv[2]->ptr);
4805 int removed = 0;
4806 int fromtail = 0;
a4d1ba9a 4807
dd88747b 4808 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4809 checkType(c,o,REDIS_LIST)) return;
4810 list = o->ptr;
4811
4812 if (toremove < 0) {
4813 toremove = -toremove;
4814 fromtail = 1;
4815 }
4816 ln = fromtail ? list->tail : list->head;
4817 while (ln) {
4818 robj *ele = listNodeValue(ln);
4819
4820 next = fromtail ? ln->prev : ln->next;
4821 if (compareStringObjects(ele,c->argv[3]) == 0) {
4822 listDelNode(list,ln);
4823 server.dirty++;
4824 removed++;
4825 if (toremove && removed == toremove) break;
ed9b544e 4826 }
dd88747b 4827 ln = next;
ed9b544e 4828 }
3ea27d37 4829 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4830 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 4831}
4832
12f9d551 4833/* This is the semantic of this command:
0f5f7e9a 4834 * RPOPLPUSH srclist dstlist:
12f9d551 4835 * IF LLEN(srclist) > 0
4836 * element = RPOP srclist
4837 * LPUSH dstlist element
4838 * RETURN element
4839 * ELSE
4840 * RETURN nil
4841 * END
4842 * END
4843 *
4844 * The idea is to be able to get an element from a list in a reliable way
4845 * since the element is not just returned but pushed against another list
4846 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4847 */
0f5f7e9a 4848static void rpoplpushcommand(redisClient *c) {
12f9d551 4849 robj *sobj;
dd88747b 4850 list *srclist;
4851 listNode *ln;
4852
4853 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4854 checkType(c,sobj,REDIS_LIST)) return;
4855 srclist = sobj->ptr;
4856 ln = listLast(srclist);
12f9d551 4857
dd88747b 4858 if (ln == NULL) {
12f9d551 4859 addReply(c,shared.nullbulk);
4860 } else {
dd88747b 4861 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4862 robj *ele = listNodeValue(ln);
4863 list *dstlist;
e20fb74f 4864
dd88747b 4865 if (dobj && dobj->type != REDIS_LIST) {
4866 addReply(c,shared.wrongtypeerr);
4867 return;
4868 }
12f9d551 4869
dd88747b 4870 /* Add the element to the target list (unless it's directly
4871 * passed to some BLPOP-ing client */
4872 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4873 if (dobj == NULL) {
4874 /* Create the list if the key does not exist */
4875 dobj = createListObject();
4876 dictAdd(c->db->dict,c->argv[2],dobj);
4877 incrRefCount(c->argv[2]);
12f9d551 4878 }
dd88747b 4879 dstlist = dobj->ptr;
4880 listAddNodeHead(dstlist,ele);
4881 incrRefCount(ele);
12f9d551 4882 }
dd88747b 4883
4884 /* Send the element to the client as reply as well */
4885 addReplyBulk(c,ele);
4886
4887 /* Finally remove the element from the source list */
4888 listDelNode(srclist,ln);
3ea27d37 4889 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4890 server.dirty++;
12f9d551 4891 }
4892}
4893
ed9b544e 4894/* ==================================== Sets ================================ */
4895
4896static void saddCommand(redisClient *c) {
ed9b544e 4897 robj *set;
4898
3305306f 4899 set = lookupKeyWrite(c->db,c->argv[1]);
4900 if (set == NULL) {
ed9b544e 4901 set = createSetObject();
3305306f 4902 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 4903 incrRefCount(c->argv[1]);
4904 } else {
ed9b544e 4905 if (set->type != REDIS_SET) {
c937aa89 4906 addReply(c,shared.wrongtypeerr);
ed9b544e 4907 return;
4908 }
4909 }
4910 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4911 incrRefCount(c->argv[2]);
4912 server.dirty++;
c937aa89 4913 addReply(c,shared.cone);
ed9b544e 4914 } else {
c937aa89 4915 addReply(c,shared.czero);
ed9b544e 4916 }
4917}
4918
4919static void sremCommand(redisClient *c) {
3305306f 4920 robj *set;
ed9b544e 4921
dd88747b 4922 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4923 checkType(c,set,REDIS_SET)) return;
4924
4925 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4926 server.dirty++;
4927 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 4928 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4929 addReply(c,shared.cone);
ed9b544e 4930 } else {
dd88747b 4931 addReply(c,shared.czero);
ed9b544e 4932 }
4933}
4934
a4460ef4 4935static void smoveCommand(redisClient *c) {
4936 robj *srcset, *dstset;
4937
4938 srcset = lookupKeyWrite(c->db,c->argv[1]);
4939 dstset = lookupKeyWrite(c->db,c->argv[2]);
4940
4941 /* If the source key does not exist return 0, if it's of the wrong type
4942 * raise an error */
4943 if (srcset == NULL || srcset->type != REDIS_SET) {
4944 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4945 return;
4946 }
4947 /* Error if the destination key is not a set as well */
4948 if (dstset && dstset->type != REDIS_SET) {
4949 addReply(c,shared.wrongtypeerr);
4950 return;
4951 }
4952 /* Remove the element from the source set */
4953 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4954 /* Key not found in the src set! return zero */
4955 addReply(c,shared.czero);
4956 return;
4957 }
3ea27d37 4958 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4959 deleteKey(c->db,c->argv[1]);
a4460ef4 4960 server.dirty++;
4961 /* Add the element to the destination set */
4962 if (!dstset) {
4963 dstset = createSetObject();
4964 dictAdd(c->db->dict,c->argv[2],dstset);
4965 incrRefCount(c->argv[2]);
4966 }
4967 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4968 incrRefCount(c->argv[3]);
4969 addReply(c,shared.cone);
4970}
4971
ed9b544e 4972static void sismemberCommand(redisClient *c) {
3305306f 4973 robj *set;
ed9b544e 4974
dd88747b 4975 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4976 checkType(c,set,REDIS_SET)) return;
4977
4978 if (dictFind(set->ptr,c->argv[2]))
4979 addReply(c,shared.cone);
4980 else
c937aa89 4981 addReply(c,shared.czero);
ed9b544e 4982}
4983
4984static void scardCommand(redisClient *c) {
3305306f 4985 robj *o;
ed9b544e 4986 dict *s;
dd88747b 4987
4988 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4989 checkType(c,o,REDIS_SET)) return;
e0a62c7f 4990
dd88747b 4991 s = o->ptr;
4992 addReplyUlong(c,dictSize(s));
ed9b544e 4993}
4994
12fea928 4995static void spopCommand(redisClient *c) {
4996 robj *set;
4997 dictEntry *de;
4998
dd88747b 4999 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5000 checkType(c,set,REDIS_SET)) return;
5001
5002 de = dictGetRandomKey(set->ptr);
5003 if (de == NULL) {
12fea928 5004 addReply(c,shared.nullbulk);
5005 } else {
dd88747b 5006 robj *ele = dictGetEntryKey(de);
12fea928 5007
dd88747b 5008 addReplyBulk(c,ele);
5009 dictDelete(set->ptr,ele);
5010 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 5011 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5012 server.dirty++;
12fea928 5013 }
5014}
5015
2abb95a9 5016static void srandmemberCommand(redisClient *c) {
5017 robj *set;
5018 dictEntry *de;
5019
dd88747b 5020 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5021 checkType(c,set,REDIS_SET)) return;
5022
5023 de = dictGetRandomKey(set->ptr);
5024 if (de == NULL) {
2abb95a9 5025 addReply(c,shared.nullbulk);
5026 } else {
dd88747b 5027 robj *ele = dictGetEntryKey(de);
2abb95a9 5028
dd88747b 5029 addReplyBulk(c,ele);
2abb95a9 5030 }
5031}
5032
ed9b544e 5033static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
5034 dict **d1 = (void*) s1, **d2 = (void*) s2;
5035
3305306f 5036 return dictSize(*d1)-dictSize(*d2);
ed9b544e 5037}
5038
682ac724 5039static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 5040 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5041 dictIterator *di;
5042 dictEntry *de;
5043 robj *lenobj = NULL, *dstset = NULL;
682ac724 5044 unsigned long j, cardinality = 0;
ed9b544e 5045
ed9b544e 5046 for (j = 0; j < setsnum; j++) {
5047 robj *setobj;
3305306f 5048
5049 setobj = dstkey ?
5050 lookupKeyWrite(c->db,setskeys[j]) :
5051 lookupKeyRead(c->db,setskeys[j]);
5052 if (!setobj) {
ed9b544e 5053 zfree(dv);
5faa6025 5054 if (dstkey) {
fdcaae84 5055 if (deleteKey(c->db,dstkey))
5056 server.dirty++;
0d36ded0 5057 addReply(c,shared.czero);
5faa6025 5058 } else {
4e27f268 5059 addReply(c,shared.emptymultibulk);
5faa6025 5060 }
ed9b544e 5061 return;
5062 }
ed9b544e 5063 if (setobj->type != REDIS_SET) {
5064 zfree(dv);
c937aa89 5065 addReply(c,shared.wrongtypeerr);
ed9b544e 5066 return;
5067 }
5068 dv[j] = setobj->ptr;
5069 }
5070 /* Sort sets from the smallest to largest, this will improve our
5071 * algorithm's performace */
5072 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5073
5074 /* The first thing we should output is the total number of elements...
5075 * since this is a multi-bulk write, but at this stage we don't know
5076 * the intersection set size, so we use a trick, append an empty object
5077 * to the output list and save the pointer to later modify it with the
5078 * right length */
5079 if (!dstkey) {
5080 lenobj = createObject(REDIS_STRING,NULL);
5081 addReply(c,lenobj);
5082 decrRefCount(lenobj);
5083 } else {
5084 /* If we have a target key where to store the resulting set
5085 * create this key with an empty set inside */
5086 dstset = createSetObject();
ed9b544e 5087 }
5088
5089 /* Iterate all the elements of the first (smallest) set, and test
5090 * the element against all the other sets, if at least one set does
5091 * not include the element it is discarded */
5092 di = dictGetIterator(dv[0]);
ed9b544e 5093
5094 while((de = dictNext(di)) != NULL) {
5095 robj *ele;
5096
5097 for (j = 1; j < setsnum; j++)
5098 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5099 if (j != setsnum)
5100 continue; /* at least one set does not contain the member */
5101 ele = dictGetEntryKey(de);
5102 if (!dstkey) {
dd88747b 5103 addReplyBulk(c,ele);
ed9b544e 5104 cardinality++;
5105 } else {
5106 dictAdd(dstset->ptr,ele,NULL);
5107 incrRefCount(ele);
5108 }
5109 }
5110 dictReleaseIterator(di);
5111
83cdfe18 5112 if (dstkey) {
3ea27d37 5113 /* Store the resulting set into the target, if the intersection
5114 * is not an empty set. */
83cdfe18 5115 deleteKey(c->db,dstkey);
3ea27d37 5116 if (dictSize((dict*)dstset->ptr) > 0) {
5117 dictAdd(c->db->dict,dstkey,dstset);
5118 incrRefCount(dstkey);
d36c4e97 5119 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5120 } else {
5121 decrRefCount(dstset);
d36c4e97 5122 addReply(c,shared.czero);
3ea27d37 5123 }
40d224a9 5124 server.dirty++;
d36c4e97 5125 } else {
5126 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 5127 }
ed9b544e 5128 zfree(dv);
5129}
5130
5131static void sinterCommand(redisClient *c) {
5132 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5133}
5134
5135static void sinterstoreCommand(redisClient *c) {
5136 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5137}
5138
/* Set operation selectors (sunionDiffGenericCommand() takes one as 'op') */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
f4f56e1d 5142
5143static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 5144 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5145 dictIterator *di;
5146 dictEntry *de;
f4f56e1d 5147 robj *dstset = NULL;
40d224a9 5148 int j, cardinality = 0;
5149
40d224a9 5150 for (j = 0; j < setsnum; j++) {
5151 robj *setobj;
5152
5153 setobj = dstkey ?
5154 lookupKeyWrite(c->db,setskeys[j]) :
5155 lookupKeyRead(c->db,setskeys[j]);
5156 if (!setobj) {
5157 dv[j] = NULL;
5158 continue;
5159 }
5160 if (setobj->type != REDIS_SET) {
5161 zfree(dv);
5162 addReply(c,shared.wrongtypeerr);
5163 return;
5164 }
5165 dv[j] = setobj->ptr;
5166 }
5167
5168 /* We need a temp set object to store our union. If the dstkey
5169 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5170 * this set object will be the resulting object to set into the target key*/
5171 dstset = createSetObject();
5172
40d224a9 5173 /* Iterate all the elements of all the sets, add every element a single
5174 * time to the result set */
5175 for (j = 0; j < setsnum; j++) {
51829ed3 5176 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 5177 if (!dv[j]) continue; /* non existing keys are like empty sets */
5178
5179 di = dictGetIterator(dv[j]);
40d224a9 5180
5181 while((de = dictNext(di)) != NULL) {
5182 robj *ele;
5183
5184 /* dictAdd will not add the same element multiple times */
5185 ele = dictGetEntryKey(de);
f4f56e1d 5186 if (op == REDIS_OP_UNION || j == 0) {
5187 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5188 incrRefCount(ele);
40d224a9 5189 cardinality++;
5190 }
f4f56e1d 5191 } else if (op == REDIS_OP_DIFF) {
5192 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5193 cardinality--;
5194 }
40d224a9 5195 }
5196 }
5197 dictReleaseIterator(di);
51829ed3 5198
d36c4e97 5199 /* result set is empty? Exit asap. */
5200 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5201 }
5202
f4f56e1d 5203 /* Output the content of the resulting set, if not in STORE mode */
5204 if (!dstkey) {
5205 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5206 di = dictGetIterator(dstset->ptr);
f4f56e1d 5207 while((de = dictNext(di)) != NULL) {
5208 robj *ele;
5209
5210 ele = dictGetEntryKey(de);
dd88747b 5211 addReplyBulk(c,ele);
f4f56e1d 5212 }
5213 dictReleaseIterator(di);
d36c4e97 5214 decrRefCount(dstset);
83cdfe18
AG
5215 } else {
5216 /* If we have a target key where to store the resulting set
5217 * create this key with the result set inside */
5218 deleteKey(c->db,dstkey);
3ea27d37 5219 if (dictSize((dict*)dstset->ptr) > 0) {
5220 dictAdd(c->db->dict,dstkey,dstset);
5221 incrRefCount(dstkey);
d36c4e97 5222 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5223 } else {
5224 decrRefCount(dstset);
d36c4e97 5225 addReply(c,shared.czero);
3ea27d37 5226 }
40d224a9 5227 server.dirty++;
5228 }
5229 zfree(dv);
5230}
5231
5232static void sunionCommand(redisClient *c) {
f4f56e1d 5233 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 5234}
5235
5236static void sunionstoreCommand(redisClient *c) {
f4f56e1d 5237 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5238}
5239
5240static void sdiffCommand(redisClient *c) {
5241 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5242}
5243
5244static void sdiffstoreCommand(redisClient *c) {
5245 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 5246}
5247
6b47e12e 5248/* ==================================== ZSets =============================== */
5249
5250/* ZSETs are ordered sets using two data structures to hold the same elements
5251 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5252 * data structure.
5253 *
5254 * The elements are added to a hash table mapping Redis objects to scores.
5255 * At the same time the elements are added to a skip list mapping scores
5256 * to Redis objects (so objects are sorted by scores in this "view"). */
5257
5258/* This skiplist implementation is almost a C translation of the original
5259 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5260 * Alternative to Balanced Trees", modified in three ways:
5261 * a) this implementation allows for repeated values.
5262 * b) the comparison is not just by key (our 'score') but by satellite data.
5263 * c) there is a back pointer, so it's a doubly linked list with the back
5264 * pointers being only at "level 1". This allows to traverse the list
5265 * from tail to head, useful for ZREVRANGE. */
5266
5267static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5268 zskiplistNode *zn = zmalloc(sizeof(*zn));
5269
5270 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2b37892e
PN
5271 if (level > 0)
5272 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
6b47e12e 5273 zn->score = score;
5274 zn->obj = obj;
5275 return zn;
5276}
5277
5278static zskiplist *zslCreate(void) {
5279 int j;
5280 zskiplist *zsl;
e0a62c7f 5281
6b47e12e 5282 zsl = zmalloc(sizeof(*zsl));
5283 zsl->level = 1;
cc812361 5284 zsl->length = 0;
6b47e12e 5285 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5286 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5287 zsl->header->forward[j] = NULL;
94e543b5 5288
5289 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5290 if (j < ZSKIPLIST_MAXLEVEL-1)
5291 zsl->header->span[j] = 0;
69d95c3e 5292 }
e3870fab 5293 zsl->header->backward = NULL;
5294 zsl->tail = NULL;
6b47e12e 5295 return zsl;
5296}
5297
fd8ccf44 5298static void zslFreeNode(zskiplistNode *node) {
5299 decrRefCount(node->obj);
ad807e6f 5300 zfree(node->forward);
69d95c3e 5301 zfree(node->span);
fd8ccf44 5302 zfree(node);
5303}
5304
5305static void zslFree(zskiplist *zsl) {
ad807e6f 5306 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5307
ad807e6f 5308 zfree(zsl->header->forward);
69d95c3e 5309 zfree(zsl->header->span);
ad807e6f 5310 zfree(zsl->header);
fd8ccf44 5311 while(node) {
599379dd 5312 next = node->forward[0];
fd8ccf44 5313 zslFreeNode(node);
5314 node = next;
5315 }
ad807e6f 5316 zfree(zsl);
fd8ccf44 5317}
5318
6b47e12e 5319static int zslRandomLevel(void) {
5320 int level = 1;
5321 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5322 level += 1;
10c2baa5 5323 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5324}
5325
5326static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5327 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5328 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5329 int i, level;
5330
5331 x = zsl->header;
5332 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5333 /* store rank that is crossed to reach the insert position */
5334 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5335
9d60e6e4 5336 while (x->forward[i] &&
5337 (x->forward[i]->score < score ||
5338 (x->forward[i]->score == score &&
69d95c3e 5339 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5340 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5341 x = x->forward[i];
69d95c3e 5342 }
6b47e12e 5343 update[i] = x;
5344 }
6b47e12e 5345 /* we assume the key is not already inside, since we allow duplicated
5346 * scores, and the re-insertion of score and redis object should never
5347 * happpen since the caller of zslInsert() should test in the hash table
5348 * if the element is already inside or not. */
5349 level = zslRandomLevel();
5350 if (level > zsl->level) {
69d95c3e 5351 for (i = zsl->level; i < level; i++) {
2b37892e 5352 rank[i] = 0;
6b47e12e 5353 update[i] = zsl->header;
2b37892e 5354 update[i]->span[i-1] = zsl->length;
69d95c3e 5355 }
6b47e12e 5356 zsl->level = level;
5357 }
5358 x = zslCreateNode(level,score,obj);
5359 for (i = 0; i < level; i++) {
5360 x->forward[i] = update[i]->forward[i];
5361 update[i]->forward[i] = x;
69d95c3e
PN
5362
5363 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5364 if (i > 0) {
5365 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5366 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5367 }
6b47e12e 5368 }
69d95c3e
PN
5369
5370 /* increment span for untouched levels */
5371 for (i = level; i < zsl->level; i++) {
2b37892e 5372 update[i]->span[i-1]++;
69d95c3e
PN
5373 }
5374
bb975144 5375 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5376 if (x->forward[0])
5377 x->forward[0]->backward = x;
5378 else
5379 zsl->tail = x;
cc812361 5380 zsl->length++;
6b47e12e 5381}
5382
84105336
PN
5383/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5384void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5385 int i;
5386 for (i = 0; i < zsl->level; i++) {
5387 if (update[i]->forward[i] == x) {
5388 if (i > 0) {
5389 update[i]->span[i-1] += x->span[i-1] - 1;
5390 }
5391 update[i]->forward[i] = x->forward[i];
5392 } else {
5393 /* invariant: i > 0, because update[0]->forward[0]
5394 * is always equal to x */
5395 update[i]->span[i-1] -= 1;
5396 }
5397 }
5398 if (x->forward[0]) {
5399 x->forward[0]->backward = x->backward;
5400 } else {
5401 zsl->tail = x->backward;
5402 }
5403 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5404 zsl->level--;
5405 zsl->length--;
5406}
5407
50c55df5 5408/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5409static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5410 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5411 int i;
5412
5413 x = zsl->header;
5414 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5415 while (x->forward[i] &&
5416 (x->forward[i]->score < score ||
5417 (x->forward[i]->score == score &&
5418 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5419 x = x->forward[i];
5420 update[i] = x;
5421 }
5422 /* We may have multiple elements with the same score, what we need
5423 * is to find the element with both the right score and object. */
5424 x = x->forward[0];
50c55df5 5425 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
84105336 5426 zslDeleteNode(zsl, x, update);
9d60e6e4 5427 zslFreeNode(x);
9d60e6e4 5428 return 1;
5429 } else {
5430 return 0; /* not found */
e197b441 5431 }
5432 return 0; /* not found */
fd8ccf44 5433}
5434
1807985b 5435/* Delete all the elements with score between min and max from the skiplist.
5436 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5437 * Note that this function takes the reference to the hash table view of the
5438 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5439static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5440 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5441 unsigned long removed = 0;
5442 int i;
5443
5444 x = zsl->header;
5445 for (i = zsl->level-1; i >= 0; i--) {
5446 while (x->forward[i] && x->forward[i]->score < min)
5447 x = x->forward[i];
5448 update[i] = x;
5449 }
5450 /* We may have multiple elements with the same score, what we need
5451 * is to find the element with both the right score and object. */
5452 x = x->forward[0];
5453 while (x && x->score <= max) {
84105336
PN
5454 zskiplistNode *next = x->forward[0];
5455 zslDeleteNode(zsl, x, update);
1807985b 5456 dictDelete(dict,x->obj);
5457 zslFreeNode(x);
1807985b 5458 removed++;
5459 x = next;
5460 }
5461 return removed; /* not found */
5462}
1807985b 5463
9212eafd 5464/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5465 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5466static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5467 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5468 unsigned long traversed = 0, removed = 0;
5469 int i;
5470
9212eafd
PN
5471 x = zsl->header;
5472 for (i = zsl->level-1; i >= 0; i--) {
5473 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5474 traversed += i > 0 ? x->span[i-1] : 1;
5475 x = x->forward[i];
1807985b 5476 }
9212eafd
PN
5477 update[i] = x;
5478 }
5479
5480 traversed++;
5481 x = x->forward[0];
5482 while (x && traversed <= end) {
84105336
PN
5483 zskiplistNode *next = x->forward[0];
5484 zslDeleteNode(zsl, x, update);
1807985b 5485 dictDelete(dict,x->obj);
5486 zslFreeNode(x);
1807985b 5487 removed++;
9212eafd 5488 traversed++;
1807985b 5489 x = next;
5490 }
9212eafd 5491 return removed;
1807985b 5492}
5493
50c55df5 5494/* Find the first node having a score equal or greater than the specified one.
5495 * Returns NULL if there is no match. */
5496static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5497 zskiplistNode *x;
5498 int i;
5499
5500 x = zsl->header;
5501 for (i = zsl->level-1; i >= 0; i--) {
5502 while (x->forward[i] && x->forward[i]->score < score)
5503 x = x->forward[i];
5504 }
5505 /* We may have multiple elements with the same score, what we need
5506 * is to find the element with both the right score and object. */
5507 return x->forward[0];
5508}
5509
27b0ccca
PN
5510/* Find the rank for an element by both score and key.
5511 * Returns 0 when the element cannot be found, rank otherwise.
5512 * Note that the rank is 1-based due to the span of zsl->header to the
5513 * first element. */
5514static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5515 zskiplistNode *x;
5516 unsigned long rank = 0;
5517 int i;
5518
5519 x = zsl->header;
5520 for (i = zsl->level-1; i >= 0; i--) {
5521 while (x->forward[i] &&
5522 (x->forward[i]->score < score ||
5523 (x->forward[i]->score == score &&
5524 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5525 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5526 x = x->forward[i];
5527 }
5528
5529 /* x might be equal to zsl->header, so test if obj is non-NULL */
5530 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5531 return rank;
5532 }
5533 }
5534 return 0;
5535}
5536
e74825c2
PN
5537/* Finds an element by its rank. The rank argument needs to be 1-based. */
5538zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5539 zskiplistNode *x;
5540 unsigned long traversed = 0;
5541 int i;
5542
5543 x = zsl->header;
5544 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 5545 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5546 {
a50ea45c 5547 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5548 x = x->forward[i];
5549 }
e74825c2
PN
5550 if (traversed == rank) {
5551 return x;
5552 }
5553 }
5554 return NULL;
5555}
5556
fd8ccf44 5557/* The actual Z-commands implementations */
5558
7db723ad 5559/* This generic command implements both ZADD and ZINCRBY.
e2665397 5560 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5561 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5562static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5563 robj *zsetobj;
5564 zset *zs;
5565 double *score;
5566
e2665397 5567 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5568 if (zsetobj == NULL) {
5569 zsetobj = createZsetObject();
e2665397 5570 dictAdd(c->db->dict,key,zsetobj);
5571 incrRefCount(key);
fd8ccf44 5572 } else {
5573 if (zsetobj->type != REDIS_ZSET) {
5574 addReply(c,shared.wrongtypeerr);
5575 return;
5576 }
5577 }
fd8ccf44 5578 zs = zsetobj->ptr;
e2665397 5579
7db723ad 5580 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5581 * needs to handle the two different conditions. It's all about setting
5582 * '*score', that is, the new score to set, to the right value. */
5583 score = zmalloc(sizeof(double));
5584 if (doincrement) {
5585 dictEntry *de;
5586
5587 /* Read the old score. If the element was not present starts from 0 */
5588 de = dictFind(zs->dict,ele);
5589 if (de) {
5590 double *oldscore = dictGetEntryVal(de);
5591 *score = *oldscore + scoreval;
5592 } else {
5593 *score = scoreval;
5594 }
5595 } else {
5596 *score = scoreval;
5597 }
5598
5599 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5600 * to both ZADD and ZINCRBY... */
e2665397 5601 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5602 /* case 1: New element */
e2665397 5603 incrRefCount(ele); /* added to hash */
5604 zslInsert(zs->zsl,*score,ele);
5605 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5606 server.dirty++;
e2665397 5607 if (doincrement)
e2665397 5608 addReplyDouble(c,*score);
91d71bfc 5609 else
5610 addReply(c,shared.cone);
fd8ccf44 5611 } else {
5612 dictEntry *de;
5613 double *oldscore;
e0a62c7f 5614
fd8ccf44 5615 /* case 2: Score update operation */
e2665397 5616 de = dictFind(zs->dict,ele);
dfc5e96c 5617 redisAssert(de != NULL);
fd8ccf44 5618 oldscore = dictGetEntryVal(de);
5619 if (*score != *oldscore) {
5620 int deleted;
5621
e2665397 5622 /* Remove and insert the element in the skip list with new score */
5623 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5624 redisAssert(deleted != 0);
e2665397 5625 zslInsert(zs->zsl,*score,ele);
5626 incrRefCount(ele);
5627 /* Update the score in the hash table */
5628 dictReplace(zs->dict,ele,score);
fd8ccf44 5629 server.dirty++;
2161a965 5630 } else {
5631 zfree(score);
fd8ccf44 5632 }
e2665397 5633 if (doincrement)
5634 addReplyDouble(c,*score);
5635 else
5636 addReply(c,shared.czero);
fd8ccf44 5637 }
5638}
5639
e2665397 5640static void zaddCommand(redisClient *c) {
5641 double scoreval;
5642
bd79a6bd 5643 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5644 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5645}
5646
7db723ad 5647static void zincrbyCommand(redisClient *c) {
e2665397 5648 double scoreval;
5649
bd79a6bd 5650 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5651 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5652}
5653
1b7106e7 5654static void zremCommand(redisClient *c) {
5655 robj *zsetobj;
5656 zset *zs;
dd88747b 5657 dictEntry *de;
5658 double *oldscore;
5659 int deleted;
1b7106e7 5660
dd88747b 5661 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5662 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 5663
dd88747b 5664 zs = zsetobj->ptr;
5665 de = dictFind(zs->dict,c->argv[2]);
5666 if (de == NULL) {
5667 addReply(c,shared.czero);
5668 return;
1b7106e7 5669 }
dd88747b 5670 /* Delete from the skiplist */
5671 oldscore = dictGetEntryVal(de);
5672 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5673 redisAssert(deleted != 0);
5674
5675 /* Delete from the hash table */
5676 dictDelete(zs->dict,c->argv[2]);
5677 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5678 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5679 server.dirty++;
5680 addReply(c,shared.cone);
1b7106e7 5681}
5682
1807985b 5683static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
5684 double min;
5685 double max;
dd88747b 5686 long deleted;
1807985b 5687 robj *zsetobj;
5688 zset *zs;
5689
bd79a6bd
PN
5690 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5691 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
bbe025e0 5692
dd88747b 5693 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5694 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 5695
dd88747b 5696 zs = zsetobj->ptr;
5697 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5698 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5699 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5700 server.dirty += deleted;
5701 addReplyLong(c,deleted);
1807985b 5702}
5703
9212eafd 5704static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
5705 long start;
5706 long end;
dd88747b 5707 int llen;
5708 long deleted;
9212eafd
PN
5709 robj *zsetobj;
5710 zset *zs;
5711
bd79a6bd
PN
5712 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5713 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 5714
dd88747b 5715 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5716 checkType(c,zsetobj,REDIS_ZSET)) return;
5717 zs = zsetobj->ptr;
5718 llen = zs->zsl->length;
9212eafd 5719
dd88747b 5720 /* convert negative indexes */
5721 if (start < 0) start = llen+start;
5722 if (end < 0) end = llen+end;
5723 if (start < 0) start = 0;
5724 if (end < 0) end = 0;
9212eafd 5725
dd88747b 5726 /* indexes sanity checks */
5727 if (start > end || start >= llen) {
5728 addReply(c,shared.czero);
5729 return;
9212eafd 5730 }
dd88747b 5731 if (end >= llen) end = llen-1;
5732
5733 /* increment start and end because zsl*Rank functions
5734 * use 1-based rank */
5735 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5736 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5737 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5738 server.dirty += deleted;
5739 addReplyLong(c, deleted);
9212eafd
PN
5740}
5741
8f92e768
PN
5742typedef struct {
5743 dict *dict;
5744 double weight;
5745} zsetopsrc;
5746
5747static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5748 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5749 unsigned long size1, size2;
5750 size1 = d1->dict ? dictSize(d1->dict) : 0;
5751 size2 = d2->dict ? dictSize(d2->dict) : 0;
5752 return size1 - size2;
5753}
5754
d2764cd6
PN
/* How the scores of the same element found in several sources are merged. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Merge 'val' into '*target' according to 'aggregate': add it for
 * REDIS_AGGR_SUM, keep the smaller value for REDIS_AGGR_MIN, keep the
 * larger value for REDIS_AGGR_MAX. Any other aggregate type is a panic. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisPanic("Unknown ZUNION/INTER aggregate type");
        break;
    }
}
5771
2830ca53 5772static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
8f92e768 5773 int i, j, zsetnum;
d2764cd6 5774 int aggregate = REDIS_AGGR_SUM;
8f92e768 5775 zsetopsrc *src;
2830ca53
PN
5776 robj *dstobj;
5777 zset *dstzset;
b287c9bb
PN
5778 dictIterator *di;
5779 dictEntry *de;
5780
2830ca53
PN
5781 /* expect zsetnum input keys to be given */
5782 zsetnum = atoi(c->argv[2]->ptr);
5783 if (zsetnum < 1) {
5784 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5785 return;
b287c9bb 5786 }
2830ca53
PN
5787
5788 /* test if the expected number of keys would overflow */
5789 if (3+zsetnum > c->argc) {
b287c9bb
PN
5790 addReply(c,shared.syntaxerr);
5791 return;
5792 }
5793
2830ca53 5794 /* read keys to be used for input */
b9eed483 5795 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
2830ca53 5796 for (i = 0, j = 3; i < zsetnum; i++, j++) {
b287c9bb
PN
5797 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5798 if (!zsetobj) {
8f92e768 5799 src[i].dict = NULL;
b287c9bb
PN
5800 } else {
5801 if (zsetobj->type != REDIS_ZSET) {
8f92e768 5802 zfree(src);
b287c9bb
PN
5803 addReply(c,shared.wrongtypeerr);
5804 return;
5805 }
8f92e768 5806 src[i].dict = ((zset*)zsetobj->ptr)->dict;
b287c9bb 5807 }
2830ca53
PN
5808
5809 /* default all weights to 1 */
8f92e768 5810 src[i].weight = 1.0;
b287c9bb
PN
5811 }
5812
2830ca53
PN
5813 /* parse optional extra arguments */
5814 if (j < c->argc) {
d2764cd6 5815 int remaining = c->argc - j;
b287c9bb 5816
2830ca53 5817 while (remaining) {
d2764cd6 5818 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 5819 j++; remaining--;
2830ca53 5820 for (i = 0; i < zsetnum; i++, j++, remaining--) {
bd79a6bd 5821 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
bbe025e0 5822 return;
2830ca53 5823 }
d2764cd6
PN
5824 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5825 j++; remaining--;
5826 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5827 aggregate = REDIS_AGGR_SUM;
5828 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5829 aggregate = REDIS_AGGR_MIN;
5830 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5831 aggregate = REDIS_AGGR_MAX;
5832 } else {
5833 zfree(src);
5834 addReply(c,shared.syntaxerr);
5835 return;
5836 }
5837 j++; remaining--;
2830ca53 5838 } else {
8f92e768 5839 zfree(src);
2830ca53
PN
5840 addReply(c,shared.syntaxerr);
5841 return;
5842 }
5843 }
5844 }
b287c9bb 5845
d2764cd6
PN
5846 /* sort sets from the smallest to largest, this will improve our
5847 * algorithm's performance */
5848 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5849
2830ca53
PN
5850 dstobj = createZsetObject();
5851 dstzset = dstobj->ptr;
5852
5853 if (op == REDIS_OP_INTER) {
8f92e768
PN
5854 /* skip going over all entries if the smallest zset is NULL or empty */
5855 if (src[0].dict && dictSize(src[0].dict) > 0) {
5856 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5857 * from small to large, all src[i > 0].dict are non-empty too */
5858 di = dictGetIterator(src[0].dict);
2830ca53 5859 while((de = dictNext(di)) != NULL) {
d2764cd6
PN
5860 double *score = zmalloc(sizeof(double)), value;
5861 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5862
d2764cd6
PN
5863 for (j = 1; j < zsetnum; j++) {
5864 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5865 if (other) {
d2764cd6
PN
5866 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5867 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5868 } else {
5869 break;
5870 }
5871 }
b287c9bb 5872
2830ca53 5873 /* skip entry when not present in every source dict */
8f92e768 5874 if (j != zsetnum) {
2830ca53
PN
5875 zfree(score);
5876 } else {
5877 robj *o = dictGetEntryKey(de);
5878 dictAdd(dstzset->dict,o,score);
5879 incrRefCount(o); /* added to dictionary */
5880 zslInsert(dstzset->zsl,*score,o);
5881 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
5882 }
5883 }
2830ca53
PN
5884 dictReleaseIterator(di);
5885 }
5886 } else if (op == REDIS_OP_UNION) {
5887 for (i = 0; i < zsetnum; i++) {
8f92e768 5888 if (!src[i].dict) continue;
2830ca53 5889
8f92e768 5890 di = dictGetIterator(src[i].dict);
2830ca53
PN
5891 while((de = dictNext(di)) != NULL) {
5892 /* skip key when already processed */
5893 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5894
d2764cd6
PN
5895 double *score = zmalloc(sizeof(double)), value;
5896 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5897
d2764cd6
PN
5898 /* because the zsets are sorted by size, its only possible
5899 * for sets at larger indices to hold this entry */
5900 for (j = (i+1); j < zsetnum; j++) {
5901 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5902 if (other) {
d2764cd6
PN
5903 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5904 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5905 }
5906 }
b287c9bb 5907
2830ca53
PN
5908 robj *o = dictGetEntryKey(de);
5909 dictAdd(dstzset->dict,o,score);
5910 incrRefCount(o); /* added to dictionary */
5911 zslInsert(dstzset->zsl,*score,o);
5912 incrRefCount(o); /* added to skiplist */
5913 }
5914 dictReleaseIterator(di);
b287c9bb 5915 }
2830ca53
PN
5916 } else {
5917 /* unknown operator */
5918 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
5919 }
5920
5921 deleteKey(c->db,dstkey);
3ea27d37 5922 if (dstzset->zsl->length) {
5923 dictAdd(c->db->dict,dstkey,dstobj);
5924 incrRefCount(dstkey);
5925 addReplyLong(c, dstzset->zsl->length);
5926 server.dirty++;
5927 } else {
8bca8773 5928 decrRefCount(dstobj);
3ea27d37 5929 addReply(c, shared.czero);
5930 }
8f92e768 5931 zfree(src);
b287c9bb
PN
5932}
5933
2830ca53
PN
5934static void zunionCommand(redisClient *c) {
5935 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
5936}
5937
2830ca53
PN
5938static void zinterCommand(redisClient *c) {
5939 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
5940}
5941
e3870fab 5942static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 5943 robj *o;
bbe025e0
AM
5944 long start;
5945 long end;
752da584 5946 int withscores = 0;
dd88747b 5947 int llen;
5948 int rangelen, j;
5949 zset *zsetobj;
5950 zskiplist *zsl;
5951 zskiplistNode *ln;
5952 robj *ele;
752da584 5953
bd79a6bd
PN
5954 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5955 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 5956
752da584 5957 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5958 withscores = 1;
5959 } else if (c->argc >= 5) {
5960 addReply(c,shared.syntaxerr);
5961 return;
5962 }
cc812361 5963
4e27f268 5964 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5965 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 5966 zsetobj = o->ptr;
5967 zsl = zsetobj->zsl;
5968 llen = zsl->length;
cc812361 5969
dd88747b 5970 /* convert negative indexes */
5971 if (start < 0) start = llen+start;
5972 if (end < 0) end = llen+end;
5973 if (start < 0) start = 0;
5974 if (end < 0) end = 0;
cc812361 5975
dd88747b 5976 /* indexes sanity checks */
5977 if (start > end || start >= llen) {
5978 /* Out of range start or start > end result in empty list */
5979 addReply(c,shared.emptymultibulk);
5980 return;
5981 }
5982 if (end >= llen) end = llen-1;
5983 rangelen = (end-start)+1;
cc812361 5984
dd88747b 5985 /* check if starting point is trivial, before searching
5986 * the element in log(N) time */
5987 if (reverse) {
5988 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5989 } else {
5990 ln = start == 0 ?
5991 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5992 }
cc812361 5993
dd88747b 5994 /* Return the result in form of a multi-bulk reply */
5995 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5996 withscores ? (rangelen*2) : rangelen));
5997 for (j = 0; j < rangelen; j++) {
5998 ele = ln->obj;
5999 addReplyBulk(c,ele);
6000 if (withscores)
6001 addReplyDouble(c,ln->score);
6002 ln = reverse ? ln->backward : ln->forward[0];
cc812361 6003 }
6004}
6005
e3870fab 6006static void zrangeCommand(redisClient *c) {
6007 zrangeGenericCommand(c,0);
6008}
6009
6010static void zrevrangeCommand(redisClient *c) {
6011 zrangeGenericCommand(c,1);
6012}
6013
f44dd428 6014/* This command implements both ZRANGEBYSCORE and ZCOUNT.
6015 * If justcount is non-zero, just the count is returned. */
6016static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 6017 robj *o;
f44dd428 6018 double min, max;
6019 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 6020 int offset = 0, limit = -1;
0500ef27
SH
6021 int withscores = 0;
6022 int badsyntax = 0;
6023
f44dd428 6024 /* Parse the min-max interval. If one of the values is prefixed
6025 * by the "(" character, it's considered "open". For instance
6026 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
6027 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
6028 if (((char*)c->argv[2]->ptr)[0] == '(') {
6029 min = strtod((char*)c->argv[2]->ptr+1,NULL);
6030 minex = 1;
6031 } else {
6032 min = strtod(c->argv[2]->ptr,NULL);
6033 }
6034 if (((char*)c->argv[3]->ptr)[0] == '(') {
6035 max = strtod((char*)c->argv[3]->ptr+1,NULL);
6036 maxex = 1;
6037 } else {
6038 max = strtod(c->argv[3]->ptr,NULL);
6039 }
6040
6041 /* Parse "WITHSCORES": note that if the command was called with
6042 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
6043 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 6044 if (c->argc == 5 || c->argc == 8) {
3a3978b1 6045 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
6046 withscores = 1;
6047 else
6048 badsyntax = 1;
0500ef27 6049 }
3a3978b1 6050 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 6051 badsyntax = 1;
0500ef27 6052 if (badsyntax) {
454d4e43 6053 addReplySds(c,
6054 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 6055 return;
0500ef27
SH
6056 }
6057
f44dd428 6058 /* Parse "LIMIT" */
0500ef27 6059 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 6060 addReply(c,shared.syntaxerr);
6061 return;
0500ef27 6062 } else if (c->argc == (7 + withscores)) {
80181f78 6063 offset = atoi(c->argv[5]->ptr);
6064 limit = atoi(c->argv[6]->ptr);
0b13687c 6065 if (offset < 0) offset = 0;
80181f78 6066 }
50c55df5 6067
f44dd428 6068 /* Ok, lookup the key and get the range */
50c55df5 6069 o = lookupKeyRead(c->db,c->argv[1]);
6070 if (o == NULL) {
4e27f268 6071 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6072 } else {
6073 if (o->type != REDIS_ZSET) {
6074 addReply(c,shared.wrongtypeerr);
6075 } else {
6076 zset *zsetobj = o->ptr;
6077 zskiplist *zsl = zsetobj->zsl;
6078 zskiplistNode *ln;
f44dd428 6079 robj *ele, *lenobj = NULL;
6080 unsigned long rangelen = 0;
50c55df5 6081
f44dd428 6082 /* Get the first node with the score >= min, or with
6083 * score > min if 'minex' is true. */
50c55df5 6084 ln = zslFirstWithScore(zsl,min);
f44dd428 6085 while (minex && ln && ln->score == min) ln = ln->forward[0];
6086
50c55df5 6087 if (ln == NULL) {
6088 /* No element matching the speciifed interval */
f44dd428 6089 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6090 return;
6091 }
6092
6093 /* We don't know in advance how many matching elements there
6094 * are in the list, so we push this object that will represent
6095 * the multi-bulk length in the output buffer, and will "fix"
6096 * it later */
f44dd428 6097 if (!justcount) {
6098 lenobj = createObject(REDIS_STRING,NULL);
6099 addReply(c,lenobj);
6100 decrRefCount(lenobj);
6101 }
50c55df5 6102
f44dd428 6103 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 6104 if (offset) {
6105 offset--;
6106 ln = ln->forward[0];
6107 continue;
6108 }
6109 if (limit == 0) break;
f44dd428 6110 if (!justcount) {
6111 ele = ln->obj;
dd88747b 6112 addReplyBulk(c,ele);
f44dd428 6113 if (withscores)
6114 addReplyDouble(c,ln->score);
6115 }
50c55df5 6116 ln = ln->forward[0];
6117 rangelen++;
80181f78 6118 if (limit > 0) limit--;
50c55df5 6119 }
f44dd428 6120 if (justcount) {
6121 addReplyLong(c,(long)rangelen);
6122 } else {
6123 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6124 withscores ? (rangelen*2) : rangelen);
6125 }
50c55df5 6126 }
6127 }
6128}
6129
f44dd428 6130static void zrangebyscoreCommand(redisClient *c) {
6131 genericZrangebyscoreCommand(c,0);
6132}
6133
6134static void zcountCommand(redisClient *c) {
6135 genericZrangebyscoreCommand(c,1);
6136}
6137
3c41331e 6138static void zcardCommand(redisClient *c) {
e197b441 6139 robj *o;
6140 zset *zs;
dd88747b 6141
6142 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6143 checkType(c,o,REDIS_ZSET)) return;
6144
6145 zs = o->ptr;
6146 addReplyUlong(c,zs->zsl->length);
e197b441 6147}
6148
6e333bbe 6149static void zscoreCommand(redisClient *c) {
6150 robj *o;
6151 zset *zs;
dd88747b 6152 dictEntry *de;
6153
6154 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6155 checkType(c,o,REDIS_ZSET)) return;
6156
6157 zs = o->ptr;
6158 de = dictFind(zs->dict,c->argv[2]);
6159 if (!de) {
96d8b4ee 6160 addReply(c,shared.nullbulk);
6e333bbe 6161 } else {
dd88747b 6162 double *score = dictGetEntryVal(de);
6e333bbe 6163
dd88747b 6164 addReplyDouble(c,*score);
6e333bbe 6165 }
6166}
6167
798d9e55 6168static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 6169 robj *o;
dd88747b 6170 zset *zs;
6171 zskiplist *zsl;
6172 dictEntry *de;
6173 unsigned long rank;
6174 double *score;
6175
6176 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6177 checkType(c,o,REDIS_ZSET)) return;
6178
6179 zs = o->ptr;
6180 zsl = zs->zsl;
6181 de = dictFind(zs->dict,c->argv[2]);
6182 if (!de) {
69d95c3e
PN
6183 addReply(c,shared.nullbulk);
6184 return;
6185 }
69d95c3e 6186
dd88747b 6187 score = dictGetEntryVal(de);
6188 rank = zslGetRank(zsl, *score, c->argv[2]);
6189 if (rank) {
6190 if (reverse) {
6191 addReplyLong(c, zsl->length - rank);
27b0ccca 6192 } else {
dd88747b 6193 addReplyLong(c, rank-1);
69d95c3e 6194 }
dd88747b 6195 } else {
6196 addReply(c,shared.nullbulk);
978c2c94 6197 }
6198}
6199
798d9e55
PN
6200static void zrankCommand(redisClient *c) {
6201 zrankGenericCommand(c, 0);
6202}
6203
6204static void zrevrankCommand(redisClient *c) {
6205 zrankGenericCommand(c, 1);
6206}
6207
7fb16bac
PN
6208/* ========================= Hashes utility functions ======================= */
/* Bit flags selecting which part of a hash entry is wanted. They can be
 * OR-ed together (see the HGETALL implementation). */
#define REDIS_HASH_KEY (1<<0)
#define REDIS_HASH_VALUE (1<<1)
978c2c94 6211
7fb16bac
PN
6212/* Check the length of a number of objects to see if we need to convert a
6213 * zipmap to a real hash. Note that we only check string encoded objects
6214 * as their string length can be queried in constant time. */
6215static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6216 int i;
6217 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
978c2c94 6218
7fb16bac
PN
6219 for (i = start; i <= end; i++) {
6220 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6221 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6222 {
6223 convertToRealHash(subject);
978c2c94 6224 return;
6225 }
6226 }
7fb16bac 6227}
bae2c7ec 6228
97224de7
PN
6229/* Encode given objects in-place when the hash uses a dict. */
6230static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6231 if (subject->encoding == REDIS_ENCODING_HT) {
3f973463
PN
6232 if (o1) *o1 = tryObjectEncoding(*o1);
6233 if (o2) *o2 = tryObjectEncoding(*o2);
97224de7
PN
6234 }
6235}
6236
7fb16bac 6237/* Get the value from a hash identified by key. Returns either a string
a3f3af86
PN
6238 * object or NULL if the value cannot be found. The refcount of the object
6239 * is always increased by 1 when the value was found. */
7fb16bac
PN
6240static robj *hashGet(robj *o, robj *key) {
6241 robj *value = NULL;
978c2c94 6242 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7fb16bac
PN
6243 unsigned char *v;
6244 unsigned int vlen;
6245 key = getDecodedObject(key);
6246 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6247 value = createStringObject((char*)v,vlen);
6248 }
6249 decrRefCount(key);
6250 } else {
6251 dictEntry *de = dictFind(o->ptr,key);
6252 if (de != NULL) {
6253 value = dictGetEntryVal(de);
a3f3af86 6254 incrRefCount(value);
7fb16bac
PN
6255 }
6256 }
6257 return value;
6258}
978c2c94 6259
7fb16bac
PN
6260/* Test if the key exists in the given hash. Returns 1 if the key
6261 * exists and 0 when it doesn't. */
6262static int hashExists(robj *o, robj *key) {
6263 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6264 key = getDecodedObject(key);
6265 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6266 decrRefCount(key);
6267 return 1;
6268 }
6269 decrRefCount(key);
6270 } else {
6271 if (dictFind(o->ptr,key) != NULL) {
6272 return 1;
6273 }
6274 }
6275 return 0;
6276}
bae2c7ec 6277
7fb16bac
PN
6278/* Add an element, discard the old if the key already exists.
6279 * Return 0 on insert and 1 on update. */
feb8d7e6 6280static int hashSet(robj *o, robj *key, robj *value) {
7fb16bac
PN
6281 int update = 0;
6282 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6283 key = getDecodedObject(key);
6284 value = getDecodedObject(value);
6285 o->ptr = zipmapSet(o->ptr,
6286 key->ptr,sdslen(key->ptr),
6287 value->ptr,sdslen(value->ptr), &update);
6288 decrRefCount(key);
6289 decrRefCount(value);
6290
6291 /* Check if the zipmap needs to be upgraded to a real hash table */
6292 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
bae2c7ec 6293 convertToRealHash(o);
978c2c94 6294 } else {
7fb16bac
PN
6295 if (dictReplace(o->ptr,key,value)) {
6296 /* Insert */
6297 incrRefCount(key);
978c2c94 6298 } else {
7fb16bac 6299 /* Update */
978c2c94 6300 update = 1;
6301 }
7fb16bac 6302 incrRefCount(value);
978c2c94 6303 }
7fb16bac 6304 return update;
978c2c94 6305}
6306
7fb16bac
PN
6307/* Delete an element from a hash.
6308 * Return 1 on deleted and 0 on not found. */
6309static int hashDelete(robj *o, robj *key) {
6310 int deleted = 0;
6311 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6312 key = getDecodedObject(key);
6313 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6314 decrRefCount(key);
6315 } else {
6316 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6317 /* Always check if the dictionary needs a resize after a delete. */
6318 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
d33278d1 6319 }
7fb16bac
PN
6320 return deleted;
6321}
d33278d1 6322
7fb16bac 6323/* Return the number of elements in a hash. */
c811bb38 6324static unsigned long hashLength(robj *o) {
7fb16bac
PN
6325 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6326 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6327}
6328
6329/* Structure to hold hash iteration abstration. Note that iteration over
6330 * hashes involves both fields and values. Because it is possible that
6331 * not both are required, store pointers in the iterator to avoid
6332 * unnecessary memory allocation for fields/values. */
6333typedef struct {
6334 int encoding;
6335 unsigned char *zi;
6336 unsigned char *zk, *zv;
6337 unsigned int zklen, zvlen;
6338
6339 dictIterator *di;
6340 dictEntry *de;
6341} hashIterator;
6342
c44d3b56
PN
6343static hashIterator *hashInitIterator(robj *subject) {
6344 hashIterator *hi = zmalloc(sizeof(hashIterator));
7fb16bac
PN
6345 hi->encoding = subject->encoding;
6346 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6347 hi->zi = zipmapRewind(subject->ptr);
6348 } else if (hi->encoding == REDIS_ENCODING_HT) {
6349 hi->di = dictGetIterator(subject->ptr);
d33278d1 6350 } else {
7fb16bac 6351 redisAssert(NULL);
d33278d1 6352 }
c44d3b56 6353 return hi;
7fb16bac 6354}
d33278d1 6355
7fb16bac
PN
6356static void hashReleaseIterator(hashIterator *hi) {
6357 if (hi->encoding == REDIS_ENCODING_HT) {
6358 dictReleaseIterator(hi->di);
d33278d1 6359 }
c44d3b56 6360 zfree(hi);
7fb16bac 6361}
d33278d1 6362
7fb16bac
PN
6363/* Move to the next entry in the hash. Return REDIS_OK when the next entry
6364 * could be found and REDIS_ERR when the iterator reaches the end. */
c811bb38 6365static int hashNext(hashIterator *hi) {
7fb16bac
PN
6366 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6367 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6368 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6369 } else {
6370 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6371 }
6372 return REDIS_OK;
6373}
d33278d1 6374
0c390abc 6375/* Get key or value object at current iteration position.
a3f3af86 6376 * This increases the refcount of the field object by 1. */
c811bb38 6377static robj *hashCurrent(hashIterator *hi, int what) {
7fb16bac
PN
6378 robj *o;
6379 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6380 if (what & REDIS_HASH_KEY) {
6381 o = createStringObject((char*)hi->zk,hi->zklen);
6382 } else {
6383 o = createStringObject((char*)hi->zv,hi->zvlen);
d33278d1 6384 }
d33278d1 6385 } else {
7fb16bac
PN
6386 if (what & REDIS_HASH_KEY) {
6387 o = dictGetEntryKey(hi->de);
6388 } else {
6389 o = dictGetEntryVal(hi->de);
d33278d1 6390 }
a3f3af86 6391 incrRefCount(o);
d33278d1 6392 }
7fb16bac 6393 return o;
d33278d1
PN
6394}
6395
7fb16bac
PN
6396static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6397 robj *o = lookupKeyWrite(c->db,key);
01426b05
PN
6398 if (o == NULL) {
6399 o = createHashObject();
7fb16bac
PN
6400 dictAdd(c->db->dict,key,o);
6401 incrRefCount(key);
01426b05
PN
6402 } else {
6403 if (o->type != REDIS_HASH) {
6404 addReply(c,shared.wrongtypeerr);
7fb16bac 6405 return NULL;
01426b05
PN
6406 }
6407 }
7fb16bac
PN
6408 return o;
6409}
01426b05 6410
7fb16bac
PN
6411/* ============================= Hash commands ============================== */
6412static void hsetCommand(redisClient *c) {
6e9e463f 6413 int update;
7fb16bac 6414 robj *o;
bbe025e0 6415
7fb16bac
PN
6416 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6417 hashTryConversion(o,c->argv,2,3);
97224de7 6418 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6419 update = hashSet(o,c->argv[2],c->argv[3]);
6e9e463f 6420 addReply(c, update ? shared.czero : shared.cone);
7fb16bac
PN
6421 server.dirty++;
6422}
01426b05 6423
1f1c7695
PN
6424static void hsetnxCommand(redisClient *c) {
6425 robj *o;
6426 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6427 hashTryConversion(o,c->argv,2,3);
6428
6429 if (hashExists(o, c->argv[2])) {
6430 addReply(c, shared.czero);
01426b05 6431 } else {
97224de7 6432 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6433 hashSet(o,c->argv[2],c->argv[3]);
1f1c7695
PN
6434 addReply(c, shared.cone);
6435 server.dirty++;
6436 }
6437}
01426b05 6438
7fb16bac
PN
6439static void hmsetCommand(redisClient *c) {
6440 int i;
6441 robj *o;
01426b05 6442
7fb16bac
PN
6443 if ((c->argc % 2) == 1) {
6444 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6445 return;
6446 }
01426b05 6447
7fb16bac
PN
6448 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6449 hashTryConversion(o,c->argv,2,c->argc-1);
6450 for (i = 2; i < c->argc; i += 2) {
97224de7 6451 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
feb8d7e6 6452 hashSet(o,c->argv[i],c->argv[i+1]);
7fb16bac
PN
6453 }
6454 addReply(c, shared.ok);
edc2f63a 6455 server.dirty++;
7fb16bac
PN
6456}
6457
6458static void hincrbyCommand(redisClient *c) {
6459 long long value, incr;
6460 robj *o, *current, *new;
6461
bd79a6bd 6462 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7fb16bac
PN
6463 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6464 if ((current = hashGet(o,c->argv[2])) != NULL) {
946342c1
PN
6465 if (getLongLongFromObjectOrReply(c,current,&value,
6466 "hash value is not an integer") != REDIS_OK) {
6467 decrRefCount(current);
6468 return;
6469 }
a3f3af86 6470 decrRefCount(current);
7fb16bac
PN
6471 } else {
6472 value = 0;
01426b05
PN
6473 }
6474
7fb16bac 6475 value += incr;
3f973463
PN
6476 new = createStringObjectFromLongLong(value);
6477 hashTryObjectEncoding(o,&c->argv[2],NULL);
feb8d7e6 6478 hashSet(o,c->argv[2],new);
7fb16bac
PN
6479 decrRefCount(new);
6480 addReplyLongLong(c,value);
01426b05 6481 server.dirty++;
01426b05
PN
6482}
6483
978c2c94 6484static void hgetCommand(redisClient *c) {
7fb16bac 6485 robj *o, *value;
dd88747b 6486 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6487 checkType(c,o,REDIS_HASH)) return;
6488
7fb16bac
PN
6489 if ((value = hashGet(o,c->argv[2])) != NULL) {
6490 addReplyBulk(c,value);
a3f3af86 6491 decrRefCount(value);
dd88747b 6492 } else {
7fb16bac 6493 addReply(c,shared.nullbulk);
69d95c3e 6494 }
69d95c3e
PN
6495}
6496
09aeb579
PN
6497static void hmgetCommand(redisClient *c) {
6498 int i;
7fb16bac
PN
6499 robj *o, *value;
6500 o = lookupKeyRead(c->db,c->argv[1]);
6501 if (o != NULL && o->type != REDIS_HASH) {
6502 addReply(c,shared.wrongtypeerr);
09aeb579
PN
6503 }
6504
7fb16bac
PN
6505 /* Note the check for o != NULL happens inside the loop. This is
6506 * done because objects that cannot be found are considered to be
6507 * an empty hash. The reply should then be a series of NULLs. */
09aeb579 6508 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7fb16bac
PN
6509 for (i = 2; i < c->argc; i++) {
6510 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6511 addReplyBulk(c,value);
a3f3af86 6512 decrRefCount(value);
7fb16bac
PN
6513 } else {
6514 addReply(c,shared.nullbulk);
09aeb579
PN
6515 }
6516 }
6517}
6518
07efaf74 6519static void hdelCommand(redisClient *c) {
dd88747b 6520 robj *o;
dd88747b 6521 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6522 checkType(c,o,REDIS_HASH)) return;
07efaf74 6523
7fb16bac
PN
6524 if (hashDelete(o,c->argv[2])) {
6525 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6526 addReply(c,shared.cone);
6527 server.dirty++;
dd88747b 6528 } else {
7fb16bac 6529 addReply(c,shared.czero);
07efaf74 6530 }
6531}
6532
92b27fe9 6533static void hlenCommand(redisClient *c) {
6534 robj *o;
dd88747b 6535 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 6536 checkType(c,o,REDIS_HASH)) return;
6537
7fb16bac 6538 addReplyUlong(c,hashLength(o));
92b27fe9 6539}
6540
78409a0f 6541static void genericHgetallCommand(redisClient *c, int flags) {
7fb16bac 6542 robj *o, *lenobj, *obj;
78409a0f 6543 unsigned long count = 0;
c44d3b56 6544 hashIterator *hi;
78409a0f 6545
4e27f268 6546 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 6547 || checkType(c,o,REDIS_HASH)) return;
6548
6549 lenobj = createObject(REDIS_STRING,NULL);
6550 addReply(c,lenobj);
6551 decrRefCount(lenobj);
6552
c44d3b56
PN
6553 hi = hashInitIterator(o);
6554 while (hashNext(hi) != REDIS_ERR) {
7fb16bac 6555 if (flags & REDIS_HASH_KEY) {
c44d3b56 6556 obj = hashCurrent(hi,REDIS_HASH_KEY);
7fb16bac 6557 addReplyBulk(c,obj);
a3f3af86 6558 decrRefCount(obj);
7fb16bac 6559 count++;
78409a0f 6560 }
7fb16bac 6561 if (flags & REDIS_HASH_VALUE) {
c44d3b56 6562 obj = hashCurrent(hi,REDIS_HASH_VALUE);
7fb16bac 6563 addReplyBulk(c,obj);
a3f3af86 6564 decrRefCount(obj);
7fb16bac 6565 count++;
78409a0f 6566 }
78409a0f 6567 }
c44d3b56 6568 hashReleaseIterator(hi);
7fb16bac 6569
78409a0f 6570 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6571}
6572
6573static void hkeysCommand(redisClient *c) {
7fb16bac 6574 genericHgetallCommand(c,REDIS_HASH_KEY);
78409a0f 6575}
6576
6577static void hvalsCommand(redisClient *c) {
7fb16bac 6578 genericHgetallCommand(c,REDIS_HASH_VALUE);
78409a0f 6579}
6580
6581static void hgetallCommand(redisClient *c) {
7fb16bac 6582 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
78409a0f 6583}
6584
a86f14b1 6585static void hexistsCommand(redisClient *c) {
6586 robj *o;
a86f14b1 6587 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6588 checkType(c,o,REDIS_HASH)) return;
6589
7fb16bac 6590 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
a86f14b1 6591}
6592
ada386b2 6593static void convertToRealHash(robj *o) {
6594 unsigned char *key, *val, *p, *zm = o->ptr;
6595 unsigned int klen, vlen;
6596 dict *dict = dictCreate(&hashDictType,NULL);
6597
6598 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6599 p = zipmapRewind(zm);
6600 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6601 robj *keyobj, *valobj;
6602
6603 keyobj = createStringObject((char*)key,klen);
6604 valobj = createStringObject((char*)val,vlen);
05df7621 6605 keyobj = tryObjectEncoding(keyobj);
6606 valobj = tryObjectEncoding(valobj);
ada386b2 6607 dictAdd(dict,keyobj,valobj);
6608 }
6609 o->encoding = REDIS_ENCODING_HT;
6610 o->ptr = dict;
6611 zfree(zm);
6612}
6613
6b47e12e 6614/* ========================= Non type-specific commands ==================== */
6615
ed9b544e 6616static void flushdbCommand(redisClient *c) {
ca37e9cd 6617 server.dirty += dictSize(c->db->dict);
3305306f 6618 dictEmpty(c->db->dict);
6619 dictEmpty(c->db->expires);
ed9b544e 6620 addReply(c,shared.ok);
ed9b544e 6621}
6622
6623static void flushallCommand(redisClient *c) {
ca37e9cd 6624 server.dirty += emptyDb();
ed9b544e 6625 addReply(c,shared.ok);
500ece7c 6626 if (server.bgsavechildpid != -1) {
6627 kill(server.bgsavechildpid,SIGKILL);
6628 rdbRemoveTempFile(server.bgsavechildpid);
6629 }
f78fd11b 6630 rdbSave(server.dbfilename);
ca37e9cd 6631 server.dirty++;
ed9b544e 6632}
6633
56906eef 6634static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 6635 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 6636 so->type = type;
6637 so->pattern = pattern;
6638 return so;
6639}
6640
6641/* Return the value associated to the key with a name obtained
55017f9d
PN
6642 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6643 * The returned object will always have its refcount increased by 1
6644 * when it is non-NULL. */
56906eef 6645static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6d7d1370 6646 char *p, *f;
ed9b544e 6647 sds spat, ssub;
6d7d1370
PN
6648 robj keyobj, fieldobj, *o;
6649 int prefixlen, sublen, postfixlen, fieldlen;
ed9b544e 6650 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6651 struct {
f1017b3f 6652 long len;
6653 long free;
ed9b544e 6654 char buf[REDIS_SORTKEY_MAX+1];
6d7d1370 6655 } keyname, fieldname;
ed9b544e 6656
28173a49 6657 /* If the pattern is "#" return the substitution object itself in order
6658 * to implement the "SORT ... GET #" feature. */
6659 spat = pattern->ptr;
6660 if (spat[0] == '#' && spat[1] == '\0') {
55017f9d 6661 incrRefCount(subst);
28173a49 6662 return subst;
6663 }
6664
6665 /* The substitution object may be specially encoded. If so we create
9d65a1bb 6666 * a decoded object on the fly. Otherwise getDecodedObject will just
6667 * increment the ref count, that we'll decrement later. */
6668 subst = getDecodedObject(subst);
942a3961 6669
ed9b544e 6670 ssub = subst->ptr;
6671 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6672 p = strchr(spat,'*');
ed5a857a 6673 if (!p) {
6674 decrRefCount(subst);
6675 return NULL;
6676 }
ed9b544e 6677
6d7d1370
PN
6678 /* Find out if we're dealing with a hash dereference. */
6679 if ((f = strstr(p+1, "->")) != NULL) {
6680 fieldlen = sdslen(spat)-(f-spat);
6681 /* this also copies \0 character */
6682 memcpy(fieldname.buf,f+2,fieldlen-1);
6683 fieldname.len = fieldlen-2;
6684 } else {
6685 fieldlen = 0;
6686 }
6687
ed9b544e 6688 prefixlen = p-spat;
6689 sublen = sdslen(ssub);
6d7d1370 6690 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
ed9b544e 6691 memcpy(keyname.buf,spat,prefixlen);
6692 memcpy(keyname.buf+prefixlen,ssub,sublen);
6693 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6694 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6695 keyname.len = prefixlen+sublen+postfixlen;
942a3961 6696 decrRefCount(subst);
6697
6d7d1370
PN
6698 /* Lookup substituted key */
6699 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6700 o = lookupKeyRead(db,&keyobj);
55017f9d
PN
6701 if (o == NULL) return NULL;
6702
6703 if (fieldlen > 0) {
6704 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6d7d1370 6705
705dad38
PN
6706 /* Retrieve value from hash by the field name. This operation
6707 * already increases the refcount of the returned object. */
6d7d1370
PN
6708 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6709 o = hashGet(o, &fieldobj);
705dad38 6710 } else {
55017f9d 6711 if (o->type != REDIS_STRING) return NULL;
b6f07345 6712
705dad38
PN
6713 /* Every object that this function returns needs to have its refcount
6714 * increased. sortCommand decreases it again. */
6715 incrRefCount(o);
6d7d1370
PN
6716 }
6717
6718 return o;
ed9b544e 6719}
6720
6721/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6722 * the additional parameter is not standard but a BSD-specific we have to
6723 * pass sorting parameters via the global 'server' structure */
6724static int sortCompare(const void *s1, const void *s2) {
6725 const redisSortObject *so1 = s1, *so2 = s2;
6726 int cmp;
6727
6728 if (!server.sort_alpha) {
6729 /* Numeric sorting. Here it's trivial as we precomputed scores */
6730 if (so1->u.score > so2->u.score) {
6731 cmp = 1;
6732 } else if (so1->u.score < so2->u.score) {
6733 cmp = -1;
6734 } else {
6735 cmp = 0;
6736 }
6737 } else {
6738 /* Alphanumeric sorting */
6739 if (server.sort_bypattern) {
6740 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6741 /* At least one compare object is NULL */
6742 if (so1->u.cmpobj == so2->u.cmpobj)
6743 cmp = 0;
6744 else if (so1->u.cmpobj == NULL)
6745 cmp = -1;
6746 else
6747 cmp = 1;
6748 } else {
6749 /* We have both the objects, use strcoll */
6750 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6751 }
6752 } else {
08ee9b57 6753 /* Compare elements directly. */
6754 cmp = compareStringObjects(so1->obj,so2->obj);
ed9b544e 6755 }
6756 }
6757 return server.sort_desc ? -cmp : cmp;
6758}
6759
6760/* The SORT command is the most complex command in Redis. Warning: this code
6761 * is optimized for speed and a bit less for readability */
6762static void sortCommand(redisClient *c) {
ed9b544e 6763 list *operations;
6764 int outputlen = 0;
6765 int desc = 0, alpha = 0;
6766 int limit_start = 0, limit_count = -1, start, end;
6767 int j, dontsort = 0, vectorlen;
6768 int getop = 0; /* GET operation counter */
443c6409 6769 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 6770 redisSortObject *vector; /* Resulting vector to sort */
6771
6772 /* Lookup the key to sort. It must be of the right types */
3305306f 6773 sortval = lookupKeyRead(c->db,c->argv[1]);
6774 if (sortval == NULL) {
4e27f268 6775 addReply(c,shared.emptymultibulk);
ed9b544e 6776 return;
6777 }
a5eb649b 6778 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6779 sortval->type != REDIS_ZSET)
6780 {
c937aa89 6781 addReply(c,shared.wrongtypeerr);
ed9b544e 6782 return;
6783 }
6784
6785 /* Create a list of operations to perform for every sorted element.
6786 * Operations can be GET/DEL/INCR/DECR */
6787 operations = listCreate();
092dac2a 6788 listSetFreeMethod(operations,zfree);
ed9b544e 6789 j = 2;
6790
6791 /* Now we need to protect sortval incrementing its count, in the future
6792 * SORT may have options able to overwrite/delete keys during the sorting
6793 * and the sorted key itself may get destroied */
6794 incrRefCount(sortval);
6795
6796 /* The SORT command has an SQL-alike syntax, parse it */
6797 while(j < c->argc) {
6798 int leftargs = c->argc-j-1;
6799 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6800 desc = 0;
6801 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6802 desc = 1;
6803 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6804 alpha = 1;
6805 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6806 limit_start = atoi(c->argv[j+1]->ptr);
6807 limit_count = atoi(c->argv[j+2]->ptr);
6808 j+=2;
443c6409 6809 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6810 storekey = c->argv[j+1];
6811 j++;
ed9b544e 6812 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6813 sortby = c->argv[j+1];
6814 /* If the BY pattern does not contain '*', i.e. it is constant,
6815 * we don't need to sort nor to lookup the weight keys. */
6816 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6817 j++;
6818 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6819 listAddNodeTail(operations,createSortOperation(
6820 REDIS_SORT_GET,c->argv[j+1]));
6821 getop++;
6822 j++;
ed9b544e 6823 } else {
6824 decrRefCount(sortval);
6825 listRelease(operations);
c937aa89 6826 addReply(c,shared.syntaxerr);
ed9b544e 6827 return;
6828 }
6829 j++;
6830 }
6831
6832 /* Load the sorting vector with all the objects to sort */
a5eb649b 6833 switch(sortval->type) {
6834 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6835 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6836 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
f83c6cb5 6837 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
a5eb649b 6838 }
ed9b544e 6839 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 6840 j = 0;
a5eb649b 6841
ed9b544e 6842 if (sortval->type == REDIS_LIST) {
6843 list *list = sortval->ptr;
6208b3a7 6844 listNode *ln;
c7df85a4 6845 listIter li;
6208b3a7 6846
c7df85a4 6847 listRewind(list,&li);
6848 while((ln = listNext(&li))) {
ed9b544e 6849 robj *ele = ln->value;
6850 vector[j].obj = ele;
6851 vector[j].u.score = 0;
6852 vector[j].u.cmpobj = NULL;
ed9b544e 6853 j++;
6854 }
6855 } else {
a5eb649b 6856 dict *set;
ed9b544e 6857 dictIterator *di;
6858 dictEntry *setele;
6859
a5eb649b 6860 if (sortval->type == REDIS_SET) {
6861 set = sortval->ptr;
6862 } else {
6863 zset *zs = sortval->ptr;
6864 set = zs->dict;
6865 }
6866
ed9b544e 6867 di = dictGetIterator(set);
ed9b544e 6868 while((setele = dictNext(di)) != NULL) {
6869 vector[j].obj = dictGetEntryKey(setele);
6870 vector[j].u.score = 0;
6871 vector[j].u.cmpobj = NULL;
6872 j++;
6873 }
6874 dictReleaseIterator(di);
6875 }
dfc5e96c 6876 redisAssert(j == vectorlen);
ed9b544e 6877
6878 /* Now it's time to load the right scores in the sorting vector */
6879 if (dontsort == 0) {
6880 for (j = 0; j < vectorlen; j++) {
6d7d1370 6881 robj *byval;
ed9b544e 6882 if (sortby) {
6d7d1370 6883 /* lookup value to sort by */
3305306f 6884 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
705dad38 6885 if (!byval) continue;
ed9b544e 6886 } else {
6d7d1370
PN
6887 /* use object itself to sort by */
6888 byval = vector[j].obj;
6889 }
6890
6891 if (alpha) {
08ee9b57 6892 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6d7d1370
PN
6893 } else {
6894 if (byval->encoding == REDIS_ENCODING_RAW) {
6895 vector[j].u.score = strtod(byval->ptr,NULL);
16fa22f1 6896 } else if (byval->encoding == REDIS_ENCODING_INT) {
6d7d1370
PN
6897 /* Don't need to decode the object if it's
6898 * integer-encoded (the only encoding supported) so
6899 * far. We can just cast it */
16fa22f1
PN
6900 vector[j].u.score = (long)byval->ptr;
6901 } else {
6902 redisAssert(1 != 1);
942a3961 6903 }
ed9b544e 6904 }
6d7d1370 6905
705dad38
PN
6906 /* when the object was retrieved using lookupKeyByPattern,
6907 * its refcount needs to be decreased. */
6908 if (sortby) {
6909 decrRefCount(byval);
ed9b544e 6910 }
6911 }
6912 }
6913
6914 /* We are ready to sort the vector... perform a bit of sanity check
6915 * on the LIMIT option too. We'll use a partial version of quicksort. */
6916 start = (limit_start < 0) ? 0 : limit_start;
6917 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6918 if (start >= vectorlen) {
6919 start = vectorlen-1;
6920 end = vectorlen-2;
6921 }
6922 if (end >= vectorlen) end = vectorlen-1;
6923
6924 if (dontsort == 0) {
6925 server.sort_desc = desc;
6926 server.sort_alpha = alpha;
6927 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 6928 if (sortby && (start != 0 || end != vectorlen-1))
6929 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6930 else
6931 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 6932 }
6933
6934 /* Send command output to the output buffer, performing the specified
6935 * GET/DEL/INCR/DECR operations if any. */
6936 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 6937 if (storekey == NULL) {
6938 /* STORE option not specified, sent the sorting result to client */
6939 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6940 for (j = start; j <= end; j++) {
6941 listNode *ln;
c7df85a4 6942 listIter li;
6943
dd88747b 6944 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 6945 listRewind(operations,&li);
6946 while((ln = listNext(&li))) {
443c6409 6947 redisSortOperation *sop = ln->value;
6948 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6949 vector[j].obj);
6950
6951 if (sop->type == REDIS_SORT_GET) {
55017f9d 6952 if (!val) {
443c6409 6953 addReply(c,shared.nullbulk);
6954 } else {
dd88747b 6955 addReplyBulk(c,val);
55017f9d 6956 decrRefCount(val);
443c6409 6957 }
6958 } else {
dfc5e96c 6959 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 6960 }
6961 }
ed9b544e 6962 }
443c6409 6963 } else {
6964 robj *listObject = createListObject();
6965 list *listPtr = (list*) listObject->ptr;
6966
6967 /* STORE option specified, set the sorting result as a List object */
6968 for (j = start; j <= end; j++) {
6969 listNode *ln;
c7df85a4 6970 listIter li;
6971
443c6409 6972 if (!getop) {
6973 listAddNodeTail(listPtr,vector[j].obj);
6974 incrRefCount(vector[j].obj);
6975 }
c7df85a4 6976 listRewind(operations,&li);
6977 while((ln = listNext(&li))) {
443c6409 6978 redisSortOperation *sop = ln->value;
6979 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6980 vector[j].obj);
6981
6982 if (sop->type == REDIS_SORT_GET) {
55017f9d 6983 if (!val) {
443c6409 6984 listAddNodeTail(listPtr,createStringObject("",0));
6985 } else {
55017f9d
PN
6986 /* We should do a incrRefCount on val because it is
6987 * added to the list, but also a decrRefCount because
6988 * it is returned by lookupKeyByPattern. This results
6989 * in doing nothing at all. */
443c6409 6990 listAddNodeTail(listPtr,val);
443c6409 6991 }
ed9b544e 6992 } else {
dfc5e96c 6993 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 6994 }
ed9b544e 6995 }
ed9b544e 6996 }
121796f7 6997 if (dictReplace(c->db->dict,storekey,listObject)) {
6998 incrRefCount(storekey);
6999 }
443c6409 7000 /* Note: we add 1 because the DB is dirty anyway since even if the
7001 * SORT result is empty a new key is set and maybe the old content
7002 * replaced. */
7003 server.dirty += 1+outputlen;
7004 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 7005 }
7006
7007 /* Cleanup */
7008 decrRefCount(sortval);
7009 listRelease(operations);
7010 for (j = 0; j < vectorlen; j++) {
16fa22f1 7011 if (alpha && vector[j].u.cmpobj)
ed9b544e 7012 decrRefCount(vector[j].u.cmpobj);
7013 }
7014 zfree(vector);
7015}
7016
ec6c7a1d 7017/* Convert an amount of bytes into a human readable string in the form
7018 * of 100B, 2G, 100M, 4K, and so forth. */
7019static void bytesToHuman(char *s, unsigned long long n) {
7020 double d;
7021
7022 if (n < 1024) {
7023 /* Bytes */
7024 sprintf(s,"%lluB",n);
7025 return;
7026 } else if (n < (1024*1024)) {
7027 d = (double)n/(1024);
7028 sprintf(s,"%.2fK",d);
7029 } else if (n < (1024LL*1024*1024)) {
7030 d = (double)n/(1024*1024);
7031 sprintf(s,"%.2fM",d);
7032 } else if (n < (1024LL*1024*1024*1024)) {
7033 d = (double)n/(1024LL*1024*1024);
b72f6a4b 7034 sprintf(s,"%.2fG",d);
ec6c7a1d 7035 }
7036}
7037
1c85b79f 7038/* Create the string returned by the INFO command. This is decoupled
7039 * by the INFO command itself as we need to report the same information
7040 * on memory corruption problems. */
7041static sds genRedisInfoString(void) {
ed9b544e 7042 sds info;
7043 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 7044 int j;
ec6c7a1d 7045 char hmem[64];
55a8298f 7046
b72f6a4b 7047 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 7048 info = sdscatprintf(sdsempty(),
7049 "redis_version:%s\r\n"
f1017b3f 7050 "arch_bits:%s\r\n"
7a932b74 7051 "multiplexing_api:%s\r\n"
0d7170a4 7052 "process_id:%ld\r\n"
682ac724 7053 "uptime_in_seconds:%ld\r\n"
7054 "uptime_in_days:%ld\r\n"
ed9b544e 7055 "connected_clients:%d\r\n"
7056 "connected_slaves:%d\r\n"
f86a74e9 7057 "blocked_clients:%d\r\n"
5fba9f71 7058 "used_memory:%zu\r\n"
ec6c7a1d 7059 "used_memory_human:%s\r\n"
ed9b544e 7060 "changes_since_last_save:%lld\r\n"
be2bb6b0 7061 "bgsave_in_progress:%d\r\n"
682ac724 7062 "last_save_time:%ld\r\n"
b3fad521 7063 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 7064 "total_connections_received:%lld\r\n"
7065 "total_commands_processed:%lld\r\n"
2a6a2ed1 7066 "expired_keys:%lld\r\n"
55a8298f 7067 "hash_max_zipmap_entries:%ld\r\n"
7068 "hash_max_zipmap_value:%ld\r\n"
ffc6b7f8 7069 "pubsub_channels:%ld\r\n"
7070 "pubsub_patterns:%u\r\n"
7d98e08c 7071 "vm_enabled:%d\r\n"
a0f643ea 7072 "role:%s\r\n"
ed9b544e 7073 ,REDIS_VERSION,
f1017b3f 7074 (sizeof(long) == 8) ? "64" : "32",
7a932b74 7075 aeGetApiName(),
0d7170a4 7076 (long) getpid(),
a0f643ea 7077 uptime,
7078 uptime/(3600*24),
ed9b544e 7079 listLength(server.clients)-listLength(server.slaves),
7080 listLength(server.slaves),
d5d55fc3 7081 server.blpop_blocked_clients,
b72f6a4b 7082 zmalloc_used_memory(),
ec6c7a1d 7083 hmem,
ed9b544e 7084 server.dirty,
9d65a1bb 7085 server.bgsavechildpid != -1,
ed9b544e 7086 server.lastsave,
b3fad521 7087 server.bgrewritechildpid != -1,
ed9b544e 7088 server.stat_numconnections,
7089 server.stat_numcommands,
2a6a2ed1 7090 server.stat_expiredkeys,
55a8298f 7091 server.hash_max_zipmap_entries,
7092 server.hash_max_zipmap_value,
ffc6b7f8 7093 dictSize(server.pubsub_channels),
7094 listLength(server.pubsub_patterns),
7d98e08c 7095 server.vm_enabled != 0,
a0f643ea 7096 server.masterhost == NULL ? "master" : "slave"
ed9b544e 7097 );
a0f643ea 7098 if (server.masterhost) {
7099 info = sdscatprintf(info,
7100 "master_host:%s\r\n"
7101 "master_port:%d\r\n"
7102 "master_link_status:%s\r\n"
7103 "master_last_io_seconds_ago:%d\r\n"
7104 ,server.masterhost,
7105 server.masterport,
7106 (server.replstate == REDIS_REPL_CONNECTED) ?
7107 "up" : "down",
f72b934d 7108 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 7109 );
7110 }
7d98e08c 7111 if (server.vm_enabled) {
1064ef87 7112 lockThreadedIO();
7d98e08c 7113 info = sdscatprintf(info,
7114 "vm_conf_max_memory:%llu\r\n"
7115 "vm_conf_page_size:%llu\r\n"
7116 "vm_conf_pages:%llu\r\n"
7117 "vm_stats_used_pages:%llu\r\n"
7118 "vm_stats_swapped_objects:%llu\r\n"
7119 "vm_stats_swappin_count:%llu\r\n"
7120 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 7121 "vm_stats_io_newjobs_len:%lu\r\n"
7122 "vm_stats_io_processing_len:%lu\r\n"
7123 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 7124 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 7125 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 7126 ,(unsigned long long) server.vm_max_memory,
7127 (unsigned long long) server.vm_page_size,
7128 (unsigned long long) server.vm_pages,
7129 (unsigned long long) server.vm_stats_used_pages,
7130 (unsigned long long) server.vm_stats_swapped_objects,
7131 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 7132 (unsigned long long) server.vm_stats_swapouts,
7133 (unsigned long) listLength(server.io_newjobs),
7134 (unsigned long) listLength(server.io_processing),
7135 (unsigned long) listLength(server.io_processed),
d5d55fc3 7136 (unsigned long) server.io_active_threads,
7137 (unsigned long) server.vm_blocked_clients
7d98e08c 7138 );
1064ef87 7139 unlockThreadedIO();
7d98e08c 7140 }
c3cb078d 7141 for (j = 0; j < server.dbnum; j++) {
7142 long long keys, vkeys;
7143
7144 keys = dictSize(server.db[j].dict);
7145 vkeys = dictSize(server.db[j].expires);
7146 if (keys || vkeys) {
9d65a1bb 7147 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 7148 j, keys, vkeys);
7149 }
7150 }
1c85b79f 7151 return info;
7152}
7153
7154static void infoCommand(redisClient *c) {
7155 sds info = genRedisInfoString();
83c6a618 7156 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7157 (unsigned long)sdslen(info)));
ed9b544e 7158 addReplySds(c,info);
70003d28 7159 addReply(c,shared.crlf);
ed9b544e 7160}
7161
3305306f 7162static void monitorCommand(redisClient *c) {
7163 /* ignore MONITOR if aleady slave or in monitor mode */
7164 if (c->flags & REDIS_SLAVE) return;
7165
7166 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7167 c->slaveseldb = 0;
6b47e12e 7168 listAddNodeTail(server.monitors,c);
3305306f 7169 addReply(c,shared.ok);
7170}
7171
7172/* ================================= Expire ================================= */
7173static int removeExpire(redisDb *db, robj *key) {
7174 if (dictDelete(db->expires,key) == DICT_OK) {
7175 return 1;
7176 } else {
7177 return 0;
7178 }
7179}
7180
7181static int setExpire(redisDb *db, robj *key, time_t when) {
7182 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7183 return 0;
7184 } else {
7185 incrRefCount(key);
7186 return 1;
7187 }
7188}
7189
bb32ede5 7190/* Return the expire time of the specified key, or -1 if no expire
7191 * is associated with this key (i.e. the key is non volatile) */
7192static time_t getExpire(redisDb *db, robj *key) {
7193 dictEntry *de;
7194
7195 /* No expire? return ASAP */
7196 if (dictSize(db->expires) == 0 ||
7197 (de = dictFind(db->expires,key)) == NULL) return -1;
7198
7199 return (time_t) dictGetEntryVal(de);
7200}
7201
3305306f 7202static int expireIfNeeded(redisDb *db, robj *key) {
7203 time_t when;
7204 dictEntry *de;
7205
7206 /* No expire? return ASAP */
7207 if (dictSize(db->expires) == 0 ||
7208 (de = dictFind(db->expires,key)) == NULL) return 0;
7209
7210 /* Lookup the expire */
7211 when = (time_t) dictGetEntryVal(de);
7212 if (time(NULL) <= when) return 0;
7213
7214 /* Delete the key */
7215 dictDelete(db->expires,key);
2a6a2ed1 7216 server.stat_expiredkeys++;
3305306f 7217 return dictDelete(db->dict,key) == DICT_OK;
7218}
7219
7220static int deleteIfVolatile(redisDb *db, robj *key) {
7221 dictEntry *de;
7222
7223 /* No expire? return ASAP */
7224 if (dictSize(db->expires) == 0 ||
7225 (de = dictFind(db->expires,key)) == NULL) return 0;
7226
7227 /* Delete the key */
0c66a471 7228 server.dirty++;
2a6a2ed1 7229 server.stat_expiredkeys++;
3305306f 7230 dictDelete(db->expires,key);
7231 return dictDelete(db->dict,key) == DICT_OK;
7232}
7233
bbe025e0 7234static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 7235 dictEntry *de;
bbe025e0
AM
7236 time_t seconds;
7237
bd79a6bd 7238 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
bbe025e0
AM
7239
7240 seconds -= offset;
3305306f 7241
802e8373 7242 de = dictFind(c->db->dict,key);
3305306f 7243 if (de == NULL) {
7244 addReply(c,shared.czero);
7245 return;
7246 }
d4dd6556 7247 if (seconds <= 0) {
43e5ccdf 7248 if (deleteKey(c->db,key)) server.dirty++;
7249 addReply(c, shared.cone);
3305306f 7250 return;
7251 } else {
7252 time_t when = time(NULL)+seconds;
802e8373 7253 if (setExpire(c->db,key,when)) {
3305306f 7254 addReply(c,shared.cone);
77423026 7255 server.dirty++;
7256 } else {
3305306f 7257 addReply(c,shared.czero);
77423026 7258 }
3305306f 7259 return;
7260 }
7261}
7262
802e8373 7263static void expireCommand(redisClient *c) {
bbe025e0 7264 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 7265}
7266
7267static void expireatCommand(redisClient *c) {
bbe025e0 7268 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 7269}
7270
fd88489a 7271static void ttlCommand(redisClient *c) {
7272 time_t expire;
7273 int ttl = -1;
7274
7275 expire = getExpire(c->db,c->argv[1]);
7276 if (expire != -1) {
7277 ttl = (int) (expire-time(NULL));
7278 if (ttl < 0) ttl = -1;
7279 }
7280 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7281}
7282
6e469882 7283/* ================================ MULTI/EXEC ============================== */
7284
7285/* Client state initialization for MULTI/EXEC */
7286static void initClientMultiState(redisClient *c) {
7287 c->mstate.commands = NULL;
7288 c->mstate.count = 0;
7289}
7290
7291/* Release all the resources associated with MULTI/EXEC state */
7292static void freeClientMultiState(redisClient *c) {
7293 int j;
7294
7295 for (j = 0; j < c->mstate.count; j++) {
7296 int i;
7297 multiCmd *mc = c->mstate.commands+j;
7298
7299 for (i = 0; i < mc->argc; i++)
7300 decrRefCount(mc->argv[i]);
7301 zfree(mc->argv);
7302 }
7303 zfree(c->mstate.commands);
7304}
7305
7306/* Add a new command into the MULTI commands queue */
7307static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7308 multiCmd *mc;
7309 int j;
7310
7311 c->mstate.commands = zrealloc(c->mstate.commands,
7312 sizeof(multiCmd)*(c->mstate.count+1));
7313 mc = c->mstate.commands+c->mstate.count;
7314 mc->cmd = cmd;
7315 mc->argc = c->argc;
7316 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7317 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7318 for (j = 0; j < c->argc; j++)
7319 incrRefCount(mc->argv[j]);
7320 c->mstate.count++;
7321}
7322
7323static void multiCommand(redisClient *c) {
7324 c->flags |= REDIS_MULTI;
36c548f0 7325 addReply(c,shared.ok);
6e469882 7326}
7327
18b6cb76
DJ
7328static void discardCommand(redisClient *c) {
7329 if (!(c->flags & REDIS_MULTI)) {
7330 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7331 return;
7332 }
7333
7334 freeClientMultiState(c);
7335 initClientMultiState(c);
7336 c->flags &= (~REDIS_MULTI);
7337 addReply(c,shared.ok);
7338}
7339
66c8853f 7340/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7341 * implememntation for more information. */
7342static void execCommandReplicateMulti(redisClient *c) {
7343 struct redisCommand *cmd;
7344 robj *multistring = createStringObject("MULTI",5);
7345
7346 cmd = lookupCommand("multi");
7347 if (server.appendonly)
7348 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7349 if (listLength(server.slaves))
7350 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7351 decrRefCount(multistring);
7352}
7353
6e469882 7354static void execCommand(redisClient *c) {
7355 int j;
7356 robj **orig_argv;
7357 int orig_argc;
7358
7359 if (!(c->flags & REDIS_MULTI)) {
7360 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7361 return;
7362 }
7363
66c8853f 7364 /* Replicate a MULTI request now that we are sure the block is executed.
7365 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7366 * both the AOF and the replication link will have the same consistency
7367 * and atomicity guarantees. */
7368 execCommandReplicateMulti(c);
7369
7370 /* Exec all the queued commands */
6e469882 7371 orig_argv = c->argv;
7372 orig_argc = c->argc;
7373 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7374 for (j = 0; j < c->mstate.count; j++) {
7375 c->argc = c->mstate.commands[j].argc;
7376 c->argv = c->mstate.commands[j].argv;
7377 call(c,c->mstate.commands[j].cmd);
7378 }
7379 c->argv = orig_argv;
7380 c->argc = orig_argc;
7381 freeClientMultiState(c);
7382 initClientMultiState(c);
7383 c->flags &= (~REDIS_MULTI);
66c8853f 7384 /* Make sure the EXEC command is always replicated / AOF, since we
7385 * always send the MULTI command (we can't know beforehand if the
7386 * next operations will contain at least a modification to the DB). */
7387 server.dirty++;
6e469882 7388}
7389
4409877e 7390/* =========================== Blocking Operations ========================= */
7391
7392/* Currently Redis blocking operations support is limited to list POP ops,
7393 * so the current implementation is not fully generic, but it is also not
7394 * completely specific so it will not require a rewrite to support new
7395 * kind of blocking operations in the future.
7396 *
7397 * Still it's important to note that list blocking operations can be already
7398 * used as a notification mechanism in order to implement other blocking
7399 * operations at application level, so there must be a very strong evidence
7400 * of usefulness and generality before new blocking operations are implemented.
7401 *
7402 * This is how the current blocking POP works, we use BLPOP as example:
7403 * - If the user calls BLPOP and the key exists and contains a non empty list
7404 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
7405 * if there is no need to block.
7406 * - If instead BLPOP is called and the key does not exists or the list is
7407 * empty we need to block. In order to do so we remove the notification for
7408 * new data to read in the client socket (so that we'll not serve new
7409 * requests if the blocking request is not served). Also we put the client
95242ab5 7410 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
4409877e 7411 * blocking for this keys.
7412 * - If a PUSH operation against a key with blocked clients waiting is
7413 * performed, we serve the first in the list: basically instead to push
7414 * the new element inside the list we return it to the (first / oldest)
7415 * blocking client, unblock the client, and remove it from the list.
7416 *
7417 * The above comment and the source code should be enough in order to understand
7418 * the implementation and modify / fix it later.
7419 */
7420
7421/* Set a client in blocking mode for the specified key, with the specified
7422 * timeout */
b177fd30 7423static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 7424 dictEntry *de;
7425 list *l;
b177fd30 7426 int j;
4409877e 7427
b177fd30 7428 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7429 c->blockingkeysnum = numkeys;
4409877e 7430 c->blockingto = timeout;
b177fd30 7431 for (j = 0; j < numkeys; j++) {
7432 /* Add the key in the client structure, to map clients -> keys */
7433 c->blockingkeys[j] = keys[j];
7434 incrRefCount(keys[j]);
4409877e 7435
b177fd30 7436 /* And in the other "side", to map keys -> clients */
7437 de = dictFind(c->db->blockingkeys,keys[j]);
7438 if (de == NULL) {
7439 int retval;
7440
7441 /* For every key we take a list of clients blocked for it */
7442 l = listCreate();
7443 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7444 incrRefCount(keys[j]);
7445 assert(retval == DICT_OK);
7446 } else {
7447 l = dictGetEntryVal(de);
7448 }
7449 listAddNodeTail(l,c);
4409877e 7450 }
b177fd30 7451 /* Mark the client as a blocked client */
4409877e 7452 c->flags |= REDIS_BLOCKED;
d5d55fc3 7453 server.blpop_blocked_clients++;
4409877e 7454}
7455
7456/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 7457static void unblockClientWaitingData(redisClient *c) {
4409877e 7458 dictEntry *de;
7459 list *l;
b177fd30 7460 int j;
4409877e 7461
b177fd30 7462 assert(c->blockingkeys != NULL);
7463 /* The client may wait for multiple keys, so unblock it for every key. */
7464 for (j = 0; j < c->blockingkeysnum; j++) {
7465 /* Remove this client from the list of clients waiting for this key. */
7466 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7467 assert(de != NULL);
7468 l = dictGetEntryVal(de);
7469 listDelNode(l,listSearchKey(l,c));
7470 /* If the list is empty we need to remove it to avoid wasting memory */
7471 if (listLength(l) == 0)
7472 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7473 decrRefCount(c->blockingkeys[j]);
7474 }
7475 /* Cleanup the client structure */
7476 zfree(c->blockingkeys);
7477 c->blockingkeys = NULL;
4409877e 7478 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 7479 server.blpop_blocked_clients--;
5921aa36 7480 /* We want to process data if there is some command waiting
b0d8747d 7481 * in the input buffer. Note that this is safe even if
7482 * unblockClientWaitingData() gets called from freeClient() because
7483 * freeClient() will be smart enough to call this function
7484 * *after* c->querybuf was set to NULL. */
4409877e 7485 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7486}
7487
7488/* This should be called from any function PUSHing into lists.
7489 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7490 * 'ele' is the element pushed.
7491 *
7492 * If the function returns 0 there was no client waiting for a list push
7493 * against this key.
7494 *
7495 * If the function returns 1 there was a client waiting for a list push
7496 * against this key, the element was passed to this client thus it's not
7497 * needed to actually add it to the list and the caller should return asap. */
7498static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7499 struct dictEntry *de;
7500 redisClient *receiver;
7501 list *l;
7502 listNode *ln;
7503
7504 de = dictFind(c->db->blockingkeys,key);
7505 if (de == NULL) return 0;
7506 l = dictGetEntryVal(de);
7507 ln = listFirst(l);
7508 assert(ln != NULL);
7509 receiver = ln->value;
4409877e 7510
b177fd30 7511 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 7512 addReplyBulk(receiver,key);
7513 addReplyBulk(receiver,ele);
b0d8747d 7514 unblockClientWaitingData(receiver);
4409877e 7515 return 1;
7516}
7517
7518/* Blocking RPOP/LPOP */
7519static void blockingPopGenericCommand(redisClient *c, int where) {
7520 robj *o;
7521 time_t timeout;
b177fd30 7522 int j;
4409877e 7523
b177fd30 7524 for (j = 1; j < c->argc-1; j++) {
7525 o = lookupKeyWrite(c->db,c->argv[j]);
7526 if (o != NULL) {
7527 if (o->type != REDIS_LIST) {
7528 addReply(c,shared.wrongtypeerr);
4409877e 7529 return;
b177fd30 7530 } else {
7531 list *list = o->ptr;
7532 if (listLength(list) != 0) {
7533 /* If the list contains elements fall back to the usual
7534 * non-blocking POP operation */
7535 robj *argv[2], **orig_argv;
7536 int orig_argc;
e0a62c7f 7537
b177fd30 7538 /* We need to alter the command arguments before to call
7539 * popGenericCommand() as the command takes a single key. */
7540 orig_argv = c->argv;
7541 orig_argc = c->argc;
7542 argv[1] = c->argv[j];
7543 c->argv = argv;
7544 c->argc = 2;
7545
7546 /* Also the return value is different, we need to output
7547 * the multi bulk reply header and the key name. The
7548 * "real" command will add the last element (the value)
7549 * for us. If this souds like an hack to you it's just
7550 * because it is... */
7551 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 7552 addReplyBulk(c,argv[1]);
b177fd30 7553 popGenericCommand(c,where);
7554
7555 /* Fix the client structure with the original stuff */
7556 c->argv = orig_argv;
7557 c->argc = orig_argc;
7558 return;
7559 }
4409877e 7560 }
7561 }
7562 }
7563 /* If the list is empty or the key does not exists we must block */
b177fd30 7564 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 7565 if (timeout > 0) timeout += time(NULL);
b177fd30 7566 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 7567}
7568
7569static void blpopCommand(redisClient *c) {
7570 blockingPopGenericCommand(c,REDIS_HEAD);
7571}
7572
7573static void brpopCommand(redisClient *c) {
7574 blockingPopGenericCommand(c,REDIS_TAIL);
7575}
7576
ed9b544e 7577/* =============================== Replication ============================= */
7578
a4d1ba9a 7579static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7580 ssize_t nwritten, ret = size;
7581 time_t start = time(NULL);
7582
7583 timeout++;
7584 while(size) {
7585 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7586 nwritten = write(fd,ptr,size);
7587 if (nwritten == -1) return -1;
7588 ptr += nwritten;
7589 size -= nwritten;
7590 }
7591 if ((time(NULL)-start) > timeout) {
7592 errno = ETIMEDOUT;
7593 return -1;
7594 }
7595 }
7596 return ret;
7597}
7598
a4d1ba9a 7599static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7600 ssize_t nread, totread = 0;
7601 time_t start = time(NULL);
7602
7603 timeout++;
7604 while(size) {
7605 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7606 nread = read(fd,ptr,size);
7607 if (nread == -1) return -1;
7608 ptr += nread;
7609 size -= nread;
7610 totread += nread;
7611 }
7612 if ((time(NULL)-start) > timeout) {
7613 errno = ETIMEDOUT;
7614 return -1;
7615 }
7616 }
7617 return totread;
7618}
7619
/* Read a line from 'fd' one byte at a time using syncRead(), storing it
 * in the 'size' bytes buffer pointed by 'ptr' as a null terminated
 * string. The trailing "\n" (or "\r\n") is not stored.
 *
 * 'timeout' is passed to every single syncRead() call.
 *
 * Returns the number of bytes read before the newline (a trailing "\r",
 * if any, is counted even if it is removed from the string). If the
 * buffer is filled before a newline is found the function stops reading
 * and returns the truncated, still null terminated, content.
 * Returns -1 if syncRead() fails, or, with errno set to EINVAL, if
 * 'size' is not positive.
 *
 * The remaining room is decremented for every stored byte, so the
 * function never writes more than 'size' bytes into 'ptr'. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) {
        errno = EINVAL;
        return -1;
    }
    *ptr = '\0';
    size--; /* Reserve one byte for the null term */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        size--;
    }
    return nread;
}
7640
7641static void syncCommand(redisClient *c) {
40d224a9 7642 /* ignore SYNC if aleady slave or in monitor mode */
7643 if (c->flags & REDIS_SLAVE) return;
7644
7645 /* SYNC can't be issued when the server has pending data to send to
7646 * the client about already issued commands. We need a fresh reply
7647 * buffer registering the differences between the BGSAVE and the current
7648 * dataset, so that we can copy to other slaves if needed. */
7649 if (listLength(c->reply) != 0) {
7650 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7651 return;
7652 }
7653
7654 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7655 /* Here we need to check if there is a background saving operation
7656 * in progress, or if it is required to start one */
9d65a1bb 7657 if (server.bgsavechildpid != -1) {
40d224a9 7658 /* Ok a background save is in progress. Let's check if it is a good
7659 * one for replication, i.e. if there is another slave that is
7660 * registering differences since the server forked to save */
7661 redisClient *slave;
7662 listNode *ln;
c7df85a4 7663 listIter li;
40d224a9 7664
c7df85a4 7665 listRewind(server.slaves,&li);
7666 while((ln = listNext(&li))) {
40d224a9 7667 slave = ln->value;
7668 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 7669 }
7670 if (ln) {
7671 /* Perfect, the server is already registering differences for
7672 * another slave. Set the right state, and copy the buffer. */
7673 listRelease(c->reply);
7674 c->reply = listDup(slave->reply);
40d224a9 7675 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7676 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7677 } else {
7678 /* No way, we need to wait for the next BGSAVE in order to
7679 * register differences */
7680 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7681 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7682 }
7683 } else {
7684 /* Ok we don't have a BGSAVE in progress, let's start one */
7685 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7686 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7687 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7688 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7689 return;
7690 }
7691 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7692 }
6208b3a7 7693 c->repldbfd = -1;
40d224a9 7694 c->flags |= REDIS_SLAVE;
7695 c->slaveseldb = 0;
6b47e12e 7696 listAddNodeTail(server.slaves,c);
40d224a9 7697 return;
7698}
7699
6208b3a7 7700static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7701 redisClient *slave = privdata;
7702 REDIS_NOTUSED(el);
7703 REDIS_NOTUSED(mask);
7704 char buf[REDIS_IOBUF_LEN];
7705 ssize_t nwritten, buflen;
7706
7707 if (slave->repldboff == 0) {
7708 /* Write the bulk write count before to transfer the DB. In theory here
7709 * we don't know how much room there is in the output buffer of the
7710 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7711 * operations) will never be smaller than the few bytes we need. */
7712 sds bulkcount;
7713
7714 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7715 slave->repldbsize);
7716 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7717 {
7718 sdsfree(bulkcount);
7719 freeClient(slave);
7720 return;
7721 }
7722 sdsfree(bulkcount);
7723 }
7724 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7725 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7726 if (buflen <= 0) {
7727 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7728 (buflen == 0) ? "premature EOF" : strerror(errno));
7729 freeClient(slave);
7730 return;
7731 }
7732 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 7733 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 7734 strerror(errno));
7735 freeClient(slave);
7736 return;
7737 }
7738 slave->repldboff += nwritten;
7739 if (slave->repldboff == slave->repldbsize) {
7740 close(slave->repldbfd);
7741 slave->repldbfd = -1;
7742 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7743 slave->replstate = REDIS_REPL_ONLINE;
7744 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 7745 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 7746 freeClient(slave);
7747 return;
7748 }
7749 addReplySds(slave,sdsempty());
7750 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7751 }
7752}
ed9b544e 7753
a3b21203 7754/* This function is called at the end of every backgrond saving.
7755 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7756 * otherwise REDIS_ERR is passed to the function.
7757 *
7758 * The goal of this function is to handle slaves waiting for a successful
7759 * background saving in order to perform non-blocking synchronization. */
7760static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 7761 listNode *ln;
7762 int startbgsave = 0;
c7df85a4 7763 listIter li;
ed9b544e 7764
c7df85a4 7765 listRewind(server.slaves,&li);
7766 while((ln = listNext(&li))) {
6208b3a7 7767 redisClient *slave = ln->value;
ed9b544e 7768
6208b3a7 7769 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7770 startbgsave = 1;
7771 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7772 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 7773 struct redis_stat buf;
e0a62c7f 7774
6208b3a7 7775 if (bgsaveerr != REDIS_OK) {
7776 freeClient(slave);
7777 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7778 continue;
7779 }
7780 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 7781 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 7782 freeClient(slave);
7783 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7784 continue;
7785 }
7786 slave->repldboff = 0;
7787 slave->repldbsize = buf.st_size;
7788 slave->replstate = REDIS_REPL_SEND_BULK;
7789 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 7790 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 7791 freeClient(slave);
7792 continue;
7793 }
7794 }
ed9b544e 7795 }
6208b3a7 7796 if (startbgsave) {
7797 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 7798 listIter li;
7799
7800 listRewind(server.slaves,&li);
6208b3a7 7801 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 7802 while((ln = listNext(&li))) {
6208b3a7 7803 redisClient *slave = ln->value;
ed9b544e 7804
6208b3a7 7805 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7806 freeClient(slave);
7807 }
7808 }
7809 }
ed9b544e 7810}
7811
7812static int syncWithMaster(void) {
d0ccebcf 7813 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 7814 long dumpsize;
ed9b544e 7815 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 7816 int dfd, maxtries = 5;
ed9b544e 7817
7818 if (fd == -1) {
7819 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7820 strerror(errno));
7821 return REDIS_ERR;
7822 }
d0ccebcf 7823
7824 /* AUTH with the master if required. */
7825 if(server.masterauth) {
7826 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7827 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7828 close(fd);
7829 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7830 strerror(errno));
7831 return REDIS_ERR;
7832 }
7833 /* Read the AUTH result. */
7834 if (syncReadLine(fd,buf,1024,3600) == -1) {
7835 close(fd);
7836 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7837 strerror(errno));
7838 return REDIS_ERR;
7839 }
7840 if (buf[0] != '+') {
7841 close(fd);
7842 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7843 return REDIS_ERR;
7844 }
7845 }
7846
ed9b544e 7847 /* Issue the SYNC command */
7848 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7849 close(fd);
7850 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7851 strerror(errno));
7852 return REDIS_ERR;
7853 }
7854 /* Read the bulk write count */
8c4d91fc 7855 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 7856 close(fd);
7857 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7858 strerror(errno));
7859 return REDIS_ERR;
7860 }
4aa701c1 7861 if (buf[0] != '$') {
7862 close(fd);
7863 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7864 return REDIS_ERR;
7865 }
18e61fa2 7866 dumpsize = strtol(buf+1,NULL,10);
7867 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 7868 /* Read the bulk write data on a temp file */
8c5abee8 7869 while(maxtries--) {
7870 snprintf(tmpfile,256,
7871 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7872 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7873 if (dfd != -1) break;
5de9ad7c 7874 sleep(1);
8c5abee8 7875 }
ed9b544e 7876 if (dfd == -1) {
7877 close(fd);
7878 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7879 return REDIS_ERR;
7880 }
7881 while(dumpsize) {
7882 int nread, nwritten;
7883
7884 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7885 if (nread == -1) {
7886 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7887 strerror(errno));
7888 close(fd);
7889 close(dfd);
7890 return REDIS_ERR;
7891 }
7892 nwritten = write(dfd,buf,nread);
7893 if (nwritten == -1) {
7894 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7895 close(fd);
7896 close(dfd);
7897 return REDIS_ERR;
7898 }
7899 dumpsize -= nread;
7900 }
7901 close(dfd);
7902 if (rename(tmpfile,server.dbfilename) == -1) {
7903 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7904 unlink(tmpfile);
7905 close(fd);
7906 return REDIS_ERR;
7907 }
7908 emptyDb();
f78fd11b 7909 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 7910 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7911 close(fd);
7912 return REDIS_ERR;
7913 }
7914 server.master = createClient(fd);
7915 server.master->flags |= REDIS_MASTER;
179b3952 7916 server.master->authenticated = 1;
ed9b544e 7917 server.replstate = REDIS_REPL_CONNECTED;
7918 return REDIS_OK;
7919}
7920
321b0e13 7921static void slaveofCommand(redisClient *c) {
7922 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7923 !strcasecmp(c->argv[2]->ptr,"one")) {
7924 if (server.masterhost) {
7925 sdsfree(server.masterhost);
7926 server.masterhost = NULL;
7927 if (server.master) freeClient(server.master);
7928 server.replstate = REDIS_REPL_NONE;
7929 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7930 }
7931 } else {
7932 sdsfree(server.masterhost);
7933 server.masterhost = sdsdup(c->argv[1]->ptr);
7934 server.masterport = atoi(c->argv[2]->ptr);
7935 if (server.master) freeClient(server.master);
7936 server.replstate = REDIS_REPL_CONNECT;
7937 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7938 server.masterhost, server.masterport);
7939 }
7940 addReply(c,shared.ok);
7941}
7942
3fd78bcd 7943/* ============================ Maxmemory directive ======================== */
7944
a5819310 7945/* Try to free one object form the pre-allocated objects free list.
7946 * This is useful under low mem conditions as by default we take 1 million
7947 * free objects allocated. On success REDIS_OK is returned, otherwise
7948 * REDIS_ERR. */
7949static int tryFreeOneObjectFromFreelist(void) {
f870935d 7950 robj *o;
7951
a5819310 7952 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7953 if (listLength(server.objfreelist)) {
7954 listNode *head = listFirst(server.objfreelist);
7955 o = listNodeValue(head);
7956 listDelNode(server.objfreelist,head);
7957 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7958 zfree(o);
7959 return REDIS_OK;
7960 } else {
7961 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7962 return REDIS_ERR;
7963 }
f870935d 7964}
7965
3fd78bcd 7966/* This function gets called when 'maxmemory' is set on the config file to limit
7967 * the max memory used by the server, and we are out of memory.
7968 * This function will try to, in order:
7969 *
7970 * - Free objects from the free list
7971 * - Try to remove keys with an EXPIRE set
7972 *
7973 * It is not possible to free enough memory to reach used-memory < maxmemory
7974 * the server will start refusing commands that will enlarge even more the
7975 * memory usage.
7976 */
7977static void freeMemoryIfNeeded(void) {
7978 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 7979 int j, k, freed = 0;
7980
7981 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7982 for (j = 0; j < server.dbnum; j++) {
7983 int minttl = -1;
7984 robj *minkey = NULL;
7985 struct dictEntry *de;
7986
7987 if (dictSize(server.db[j].expires)) {
7988 freed = 1;
7989 /* From a sample of three keys drop the one nearest to
7990 * the natural expire */
7991 for (k = 0; k < 3; k++) {
7992 time_t t;
7993
7994 de = dictGetRandomKey(server.db[j].expires);
7995 t = (time_t) dictGetEntryVal(de);
7996 if (minttl == -1 || t < minttl) {
7997 minkey = dictGetEntryKey(de);
7998 minttl = t;
3fd78bcd 7999 }
3fd78bcd 8000 }
a5819310 8001 deleteKey(server.db+j,minkey);
3fd78bcd 8002 }
3fd78bcd 8003 }
a5819310 8004 if (!freed) return; /* nothing to free... */
3fd78bcd 8005 }
8006}
8007
f80dff62 8008/* ============================== Append Only file ========================== */
8009
8010static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
8011 sds buf = sdsempty();
8012 int j;
8013 ssize_t nwritten;
8014 time_t now;
8015 robj *tmpargv[3];
8016
8017 /* The DB this command was targetting is not the same as the last command
8018 * we appendend. To issue a SELECT command is needed. */
8019 if (dictid != server.appendseldb) {
8020 char seldb[64];
8021
8022 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 8023 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 8024 (unsigned long)strlen(seldb),seldb);
f80dff62 8025 server.appendseldb = dictid;
8026 }
8027
8028 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
8029 * EXPIREs into EXPIREATs calls */
8030 if (cmd->proc == expireCommand) {
8031 long when;
8032
8033 tmpargv[0] = createStringObject("EXPIREAT",8);
8034 tmpargv[1] = argv[1];
8035 incrRefCount(argv[1]);
8036 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
8037 tmpargv[2] = createObject(REDIS_STRING,
8038 sdscatprintf(sdsempty(),"%ld",when));
8039 argv = tmpargv;
8040 }
8041
8042 /* Append the actual command */
8043 buf = sdscatprintf(buf,"*%d\r\n",argc);
8044 for (j = 0; j < argc; j++) {
8045 robj *o = argv[j];
8046
9d65a1bb 8047 o = getDecodedObject(o);
83c6a618 8048 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
f80dff62 8049 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
8050 buf = sdscatlen(buf,"\r\n",2);
9d65a1bb 8051 decrRefCount(o);
f80dff62 8052 }
8053
8054 /* Free the objects from the modified argv for EXPIREAT */
8055 if (cmd->proc == expireCommand) {
8056 for (j = 0; j < 3; j++)
8057 decrRefCount(argv[j]);
8058 }
8059
8060 /* We want to perform a single write. This should be guaranteed atomic
8061 * at least if the filesystem we are writing is a real physical one.
8062 * While this will save us against the server being killed I don't think
8063 * there is much to do about the whole server stopping for power problems
8064 * or alike */
8065 nwritten = write(server.appendfd,buf,sdslen(buf));
8066 if (nwritten != (signed)sdslen(buf)) {
8067 /* Ooops, we are in troubles. The best thing to do for now is
8068 * to simply exit instead to give the illusion that everything is
8069 * working as expected. */
8070 if (nwritten == -1) {
8071 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8072 } else {
8073 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8074 }
8075 exit(1);
8076 }
85a83172 8077 /* If a background append only file rewriting is in progress we want to
8078 * accumulate the differences between the child DB and the current one
8079 * in a buffer, so that when the child process will do its work we
8080 * can append the differences to the new append only file. */
8081 if (server.bgrewritechildpid != -1)
8082 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8083
8084 sdsfree(buf);
f80dff62 8085 now = time(NULL);
8086 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8087 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8088 now-server.lastfsync > 1))
8089 {
10ce1276 8090 /* aof_fsync is defined as fdatasync() for Linux in order to avoid
8091 * flushing metadata. */
8092 aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
f80dff62 8093 server.lastfsync = now;
8094 }
8095}
8096
8097/* In Redis commands are always executed in the context of a client, so in
8098 * order to load the append only file we need to create a fake client. */
8099static struct redisClient *createFakeClient(void) {
8100 struct redisClient *c = zmalloc(sizeof(*c));
8101
8102 selectDb(c,0);
8103 c->fd = -1;
8104 c->querybuf = sdsempty();
8105 c->argc = 0;
8106 c->argv = NULL;
8107 c->flags = 0;
9387d17d 8108 /* We set the fake client as a slave waiting for the synchronization
8109 * so that Redis will not try to send replies to this client. */
8110 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 8111 c->reply = listCreate();
8112 listSetFreeMethod(c->reply,decrRefCount);
8113 listSetDupMethod(c->reply,dupClientReplyValue);
8063b99d 8114 initClientMultiState(c);
f80dff62 8115 return c;
8116}
8117
8118static void freeFakeClient(struct redisClient *c) {
8119 sdsfree(c->querybuf);
8120 listRelease(c->reply);
8063b99d 8121 freeClientMultiState(c);
f80dff62 8122 zfree(c);
8123}
8124
8125/* Replay the append log file. On error REDIS_OK is returned. On non fatal
8126 * error (the append only file is zero-length) REDIS_ERR is returned. On
8127 * fatal error an error message is logged and the program exists. */
8128int loadAppendOnlyFile(char *filename) {
8129 struct redisClient *fakeClient;
8130 FILE *fp = fopen(filename,"r");
8131 struct redis_stat sb;
b492cf00 8132 unsigned long long loadedkeys = 0;
8063b99d 8133 int appendonly = server.appendonly;
f80dff62 8134
8135 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8136 return REDIS_ERR;
8137
8138 if (fp == NULL) {
8139 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8140 exit(1);
8141 }
8142
8063b99d
PN
8143 /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
8144 * to the same file we're about to read. */
8145 server.appendonly = 0;
8146
f80dff62 8147 fakeClient = createFakeClient();
8148 while(1) {
8149 int argc, j;
8150 unsigned long len;
8151 robj **argv;
8152 char buf[128];
8153 sds argsds;
8154 struct redisCommand *cmd;
8155
8156 if (fgets(buf,sizeof(buf),fp) == NULL) {
8157 if (feof(fp))
8158 break;
8159 else
8160 goto readerr;
8161 }
8162 if (buf[0] != '*') goto fmterr;
8163 argc = atoi(buf+1);
8164 argv = zmalloc(sizeof(robj*)*argc);
8165 for (j = 0; j < argc; j++) {
8166 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8167 if (buf[0] != '$') goto fmterr;
8168 len = strtol(buf+1,NULL,10);
8169 argsds = sdsnewlen(NULL,len);
0f151ef1 8170 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 8171 argv[j] = createObject(REDIS_STRING,argsds);
8172 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8173 }
8174
8175 /* Command lookup */
8176 cmd = lookupCommand(argv[0]->ptr);
8177 if (!cmd) {
8178 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8179 exit(1);
8180 }
bdcb92f2 8181 /* Try object encoding */
f80dff62 8182 if (cmd->flags & REDIS_CMD_BULK)
05df7621 8183 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 8184 /* Run the command in the context of a fake client */
8185 fakeClient->argc = argc;
8186 fakeClient->argv = argv;
8187 cmd->proc(fakeClient);
8188 /* Discard the reply objects list from the fake client */
8189 while(listLength(fakeClient->reply))
8190 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8191 /* Clean up, ready for the next command */
8192 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8193 zfree(argv);
b492cf00 8194 /* Handle swapping while loading big datasets when VM is on */
8195 loadedkeys++;
8196 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8197 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 8198 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 8199 }
8200 }
f80dff62 8201 }
8063b99d
PN
8202
8203 /* This point can only be reached when EOF is reached without errors.
8204 * If the client is in the middle of a MULTI/EXEC, log error and quit. */
8205 if (fakeClient->flags & REDIS_MULTI) goto readerr;
8206
f80dff62 8207 fclose(fp);
8208 freeFakeClient(fakeClient);
8063b99d 8209 server.appendonly = appendonly;
f80dff62 8210 return REDIS_OK;
8211
8212readerr:
8213 if (feof(fp)) {
8214 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8215 } else {
8216 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8217 }
8218 exit(1);
8219fmterr:
8220 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8221 exit(1);
8222}
8223
9d65a1bb 8224/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
9c8e3cee 8225static int fwriteBulkObject(FILE *fp, robj *obj) {
9d65a1bb 8226 char buf[128];
b9bc0eef 8227 int decrrc = 0;
8228
f2d9f50f 8229 /* Avoid the incr/decr ref count business if possible to help
8230 * copy-on-write (we are often in a child process when this function
8231 * is called).
8232 * Also makes sure that key objects don't get incrRefCount-ed when VM
8233 * is enabled */
8234 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 8235 obj = getDecodedObject(obj);
8236 decrrc = 1;
8237 }
9d65a1bb 8238 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8239 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 8240 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8241 goto err;
9d65a1bb 8242 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 8243 if (decrrc) decrRefCount(obj);
9d65a1bb 8244 return 1;
8245err:
b9bc0eef 8246 if (decrrc) decrRefCount(obj);
9d65a1bb 8247 return 0;
8248}
8249
/* Write a binary-safe string into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes of payload (may contain NUL bytes; not read at
 * all when 'len' is zero). Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: the conversion specifier must be %lu, not %ld. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8261
/* Write a double value in the bulk format $<count>\r\n<payload>\r\n, where
 * the payload is the value formatted with "%.17g".
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    int plen, hlen;

    /* The payload buffer already carries the trailing CRLF, which is not
     * counted in the announced length. */
    plen = snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    hlen = snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,(size_t)hlen,1,fp) == 0) return 0;
    if (fwrite(payload,(size_t)plen,1,fp) == 0) return 0;
    return 1;
}
8272
/* Write a long value in the bulk format $<count>\r\n<payload>\r\n, where
 * the payload is the decimal representation of the value.
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    int plen, hlen;

    /* The payload buffer already carries the trailing CRLF, which is not
     * counted in the announced length. */
    plen = snprintf(payload,sizeof(payload),"%ld\r\n",l);
    hlen = snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,(size_t)hlen,1,fp) == 0) return 0;
    if (fwrite(payload,(size_t)plen,1,fp) == 0) return 0;
    return 1;
}
8283
8284/* Write a sequence of commands able to fully rebuild the dataset into
8285 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8286static int rewriteAppendOnlyFile(char *filename) {
8287 dictIterator *di = NULL;
8288 dictEntry *de;
8289 FILE *fp;
8290 char tmpfile[256];
8291 int j;
8292 time_t now = time(NULL);
8293
8294 /* Note that we have to use a different temp name here compared to the
8295 * one used by rewriteAppendOnlyFileBackground() function. */
8296 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8297 fp = fopen(tmpfile,"w");
8298 if (!fp) {
8299 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8300 return REDIS_ERR;
8301 }
8302 for (j = 0; j < server.dbnum; j++) {
8303 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8304 redisDb *db = server.db+j;
8305 dict *d = db->dict;
8306 if (dictSize(d) == 0) continue;
8307 di = dictGetIterator(d);
8308 if (!di) {
8309 fclose(fp);
8310 return REDIS_ERR;
8311 }
8312
8313 /* SELECT the new DB */
8314 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 8315 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 8316
8317 /* Iterate this DB writing every entry */
8318 while((de = dictNext(di)) != NULL) {
e7546c63 8319 robj *key, *o;
8320 time_t expiretime;
8321 int swapped;
8322
8323 key = dictGetEntryKey(de);
b9bc0eef 8324 /* If the value for this key is swapped, load a preview in memory.
8325 * We use a "swapped" flag to remember if we need to free the
8326 * value object instead to just increment the ref count anyway
8327 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 8328 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8329 key->storage == REDIS_VM_SWAPPING) {
e7546c63 8330 o = dictGetEntryVal(de);
8331 swapped = 0;
8332 } else {
8333 o = vmPreviewObject(key);
e7546c63 8334 swapped = 1;
8335 }
8336 expiretime = getExpire(db,key);
9d65a1bb 8337
8338 /* Save the key and associated value */
9d65a1bb 8339 if (o->type == REDIS_STRING) {
8340 /* Emit a SET command */
8341 char cmd[]="*3\r\n$3\r\nSET\r\n";
8342 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8343 /* Key and value */
9c8e3cee 8344 if (fwriteBulkObject(fp,key) == 0) goto werr;
8345 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 8346 } else if (o->type == REDIS_LIST) {
8347 /* Emit the RPUSHes needed to rebuild the list */
8348 list *list = o->ptr;
8349 listNode *ln;
c7df85a4 8350 listIter li;
9d65a1bb 8351
c7df85a4 8352 listRewind(list,&li);
8353 while((ln = listNext(&li))) {
9d65a1bb 8354 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8355 robj *eleobj = listNodeValue(ln);
8356
8357 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8358 if (fwriteBulkObject(fp,key) == 0) goto werr;
8359 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8360 }
8361 } else if (o->type == REDIS_SET) {
8362 /* Emit the SADDs needed to rebuild the set */
8363 dict *set = o->ptr;
8364 dictIterator *di = dictGetIterator(set);
8365 dictEntry *de;
8366
8367 while((de = dictNext(di)) != NULL) {
8368 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8369 robj *eleobj = dictGetEntryKey(de);
8370
8371 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8372 if (fwriteBulkObject(fp,key) == 0) goto werr;
8373 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8374 }
8375 dictReleaseIterator(di);
8376 } else if (o->type == REDIS_ZSET) {
8377 /* Emit the ZADDs needed to rebuild the sorted set */
8378 zset *zs = o->ptr;
8379 dictIterator *di = dictGetIterator(zs->dict);
8380 dictEntry *de;
8381
8382 while((de = dictNext(di)) != NULL) {
8383 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8384 robj *eleobj = dictGetEntryKey(de);
8385 double *score = dictGetEntryVal(de);
8386
8387 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8388 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8389 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 8390 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8391 }
8392 dictReleaseIterator(di);
9c8e3cee 8393 } else if (o->type == REDIS_HASH) {
8394 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8395
8396 /* Emit the HSETs needed to rebuild the hash */
8397 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8398 unsigned char *p = zipmapRewind(o->ptr);
8399 unsigned char *field, *val;
8400 unsigned int flen, vlen;
8401
8402 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8403 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8404 if (fwriteBulkObject(fp,key) == 0) goto werr;
8405 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8406 return -1;
8407 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8408 return -1;
8409 }
8410 } else {
8411 dictIterator *di = dictGetIterator(o->ptr);
8412 dictEntry *de;
8413
8414 while((de = dictNext(di)) != NULL) {
8415 robj *field = dictGetEntryKey(de);
8416 robj *val = dictGetEntryVal(de);
8417
8418 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8419 if (fwriteBulkObject(fp,key) == 0) goto werr;
8420 if (fwriteBulkObject(fp,field) == -1) return -1;
8421 if (fwriteBulkObject(fp,val) == -1) return -1;
8422 }
8423 dictReleaseIterator(di);
8424 }
9d65a1bb 8425 } else {
f83c6cb5 8426 redisPanic("Unknown object type");
9d65a1bb 8427 }
8428 /* Save the expire time */
8429 if (expiretime != -1) {
e96e4fbf 8430 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 8431 /* If this key is already expired skip it */
8432 if (expiretime < now) continue;
8433 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8434 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8435 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8436 }
b9bc0eef 8437 if (swapped) decrRefCount(o);
9d65a1bb 8438 }
8439 dictReleaseIterator(di);
8440 }
8441
8442 /* Make sure data will not remain on the OS's output buffers */
8443 fflush(fp);
8444 fsync(fileno(fp));
8445 fclose(fp);
e0a62c7f 8446
9d65a1bb 8447 /* Use RENAME to make sure the DB file is changed atomically only
8448 * if the generate DB file is ok. */
8449 if (rename(tmpfile,filename) == -1) {
8450 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8451 unlink(tmpfile);
8452 return REDIS_ERR;
8453 }
8454 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8455 return REDIS_OK;
8456
8457werr:
8458 fclose(fp);
8459 unlink(tmpfile);
e96e4fbf 8460 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 8461 if (di) dictReleaseIterator(di);
8462 return REDIS_ERR;
8463}
8464
8465/* This is how rewriting of the append only file in background works:
8466 *
8467 * 1) The user calls BGREWRITEAOF
8468 * 2) Redis calls this function, that forks():
8469 * 2a) the child rewrite the append only file in a temp file.
8470 * 2b) the parent accumulates differences in server.bgrewritebuf.
8471 * 3) When the child finished '2a' exists.
8472 * 4) The parent will trap the exit code, if it's OK, will append the
8473 * data accumulated into server.bgrewritebuf into the temp file, and
8474 * finally will rename(2) the temp file in the actual file name.
8475 * The the new file is reopened as the new append only file. Profit!
8476 */
8477static int rewriteAppendOnlyFileBackground(void) {
8478 pid_t childpid;
8479
8480 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 8481 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 8482 if ((childpid = fork()) == 0) {
8483 /* Child */
8484 char tmpfile[256];
9d65a1bb 8485
054e426d 8486 if (server.vm_enabled) vmReopenSwapFile();
8487 close(server.fd);
9d65a1bb 8488 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8489 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 8490 _exit(0);
9d65a1bb 8491 } else {
478c2c6f 8492 _exit(1);
9d65a1bb 8493 }
8494 } else {
8495 /* Parent */
8496 if (childpid == -1) {
8497 redisLog(REDIS_WARNING,
8498 "Can't rewrite append only file in background: fork: %s",
8499 strerror(errno));
8500 return REDIS_ERR;
8501 }
8502 redisLog(REDIS_NOTICE,
8503 "Background append only file rewriting started by pid %d",childpid);
8504 server.bgrewritechildpid = childpid;
884d4b39 8505 updateDictResizePolicy();
85a83172 8506 /* We set appendseldb to -1 in order to force the next call to the
8507 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8508 * accumulated by the parent into server.bgrewritebuf will start
8509 * with a SELECT statement and it will be safe to merge. */
8510 server.appendseldb = -1;
9d65a1bb 8511 return REDIS_OK;
8512 }
8513 return REDIS_OK; /* unreached */
8514}
8515
8516static void bgrewriteaofCommand(redisClient *c) {
8517 if (server.bgrewritechildpid != -1) {
8518 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8519 return;
8520 }
8521 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 8522 char *status = "+Background append only file rewriting started\r\n";
8523 addReplySds(c,sdsnew(status));
9d65a1bb 8524 } else {
8525 addReply(c,shared.err);
8526 }
8527}
8528
/* Remove the temp file that the background append only file rewriting
 * child with pid 'childpid' writes to. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8535
/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
 * - Threaded Virtual Memory I/O
 * The two parts are not fully decoupled, but functions are split among two
 * different sections of the source code (delimited by comments) in order to
 * make more clear what functionality is about the blocking VM and what about
 * the threaded (not blocking) VM.
 *
 * Redis VM design:
 *
 * Redis VM is a blocking VM (one that blocks reading swapped values from
 * disk into memory when a value swapped out is needed in memory) that is made
 * unblocking by trying to examine the command argument vector in order to
 * load in background values that will likely be needed in order to exec
 * the command. The command is executed only once all the relevant keys
 * are loaded into memory.
 *
 * This is basically almost as simple as a blocking VM, but almost as parallel
 * as a fully non-blocking VM.
 */
8556
8557/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 8558
8559/* substitute the first occurrence of '%p' with the process pid in the
8560 * swap file name. */
8561static void expandVmSwapFilename(void) {
8562 char *p = strstr(server.vm_swap_file,"%p");
8563 sds new;
e0a62c7f 8564
054e426d 8565 if (!p) return;
8566 new = sdsempty();
8567 *p = '\0';
8568 new = sdscat(new,server.vm_swap_file);
8569 new = sdscatprintf(new,"%ld",(long) getpid());
8570 new = sdscat(new,p+2);
8571 zfree(server.vm_swap_file);
8572 server.vm_swap_file = new;
8573}
8574
75680a3c 8575static void vmInit(void) {
8576 off_t totsize;
996cb5f7 8577 int pipefds[2];
bcaa7a4f 8578 size_t stacksize;
75680a3c 8579
4ad37480 8580 if (server.vm_max_threads != 0)
8581 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8582
054e426d 8583 expandVmSwapFilename();
8584 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
6fa987e3 8585 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8586 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8587 }
75680a3c 8588 if (server.vm_fp == NULL) {
6fa987e3 8589 redisLog(REDIS_WARNING,
8590 "Impossible to open the swap file: %s. Exiting.",
8591 strerror(errno));
75680a3c 8592 exit(1);
8593 }
8594 server.vm_fd = fileno(server.vm_fp);
8595 server.vm_next_page = 0;
8596 server.vm_near_pages = 0;
7d98e08c 8597 server.vm_stats_used_pages = 0;
8598 server.vm_stats_swapped_objects = 0;
8599 server.vm_stats_swapouts = 0;
8600 server.vm_stats_swapins = 0;
75680a3c 8601 totsize = server.vm_pages*server.vm_page_size;
8602 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8603 if (ftruncate(server.vm_fd,totsize) == -1) {
8604 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8605 strerror(errno));
8606 exit(1);
8607 } else {
8608 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8609 }
7d30035d 8610 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 8611 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 8612 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 8613 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 8614
996cb5f7 8615 /* Initialize threaded I/O (used by Virtual Memory) */
8616 server.io_newjobs = listCreate();
8617 server.io_processing = listCreate();
8618 server.io_processed = listCreate();
d5d55fc3 8619 server.io_ready_clients = listCreate();
92f8e882 8620 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 8621 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8622 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 8623 server.io_active_threads = 0;
996cb5f7 8624 if (pipe(pipefds) == -1) {
8625 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8626 ,strerror(errno));
8627 exit(1);
8628 }
8629 server.io_ready_pipe_read = pipefds[0];
8630 server.io_ready_pipe_write = pipefds[1];
8631 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 8632 /* LZF requires a lot of stack */
8633 pthread_attr_init(&server.io_threads_attr);
8634 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8635 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8636 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 8637 /* Listen for events in the threaded I/O pipe */
8638 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8639 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8640 oom("creating file event");
75680a3c 8641}
8642
06224fec 8643/* Mark the page as used */
8644static void vmMarkPageUsed(off_t page) {
8645 off_t byte = page/8;
8646 int bit = page&7;
970e10bb 8647 redisAssert(vmFreePage(page) == 1);
06224fec 8648 server.vm_bitmap[byte] |= 1<<bit;
8649}
8650
8651/* Mark N contiguous pages as used, with 'page' being the first. */
8652static void vmMarkPagesUsed(off_t page, off_t count) {
8653 off_t j;
8654
8655 for (j = 0; j < count; j++)
7d30035d 8656 vmMarkPageUsed(page+j);
7d98e08c 8657 server.vm_stats_used_pages += count;
7c775e09 8658 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8659 (long long)count, (long long)page);
06224fec 8660}
8661
8662/* Mark the page as free */
8663static void vmMarkPageFree(off_t page) {
8664 off_t byte = page/8;
8665 int bit = page&7;
970e10bb 8666 redisAssert(vmFreePage(page) == 0);
06224fec 8667 server.vm_bitmap[byte] &= ~(1<<bit);
8668}
8669
8670/* Mark N contiguous pages as free, with 'page' being the first. */
8671static void vmMarkPagesFree(off_t page, off_t count) {
8672 off_t j;
8673
8674 for (j = 0; j < count; j++)
7d30035d 8675 vmMarkPageFree(page+j);
7d98e08c 8676 server.vm_stats_used_pages -= count;
7c775e09 8677 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8678 (long long)count, (long long)page);
06224fec 8679}
8680
8681/* Test if the page is free */
8682static int vmFreePage(off_t page) {
8683 off_t byte = page/8;
8684 int bit = page&7;
7d30035d 8685 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 8686}
8687
8688/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 8689 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 8690 * REDIS_ERR is returned.
06224fec 8691 *
8692 * This function uses a simple algorithm: we try to allocate
8693 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8694 * again from the start of the swap file searching for free spaces.
8695 *
8696 * If it looks pretty clear that there are no free pages near our offset
8697 * we try to find less populated places doing a forward jump of
8698 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8699 * without hurry, and then we jump again and so forth...
e0a62c7f 8700 *
06224fec 8701 * This function can be improved using a free list to avoid to guess
8702 * too much, since we could collect data about freed pages.
8703 *
8704 * note: I implemented this function just after watching an episode of
8705 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8706 */
c7df85a4 8707static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 8708 off_t base, offset = 0, since_jump = 0, numfree = 0;
8709
8710 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8711 server.vm_near_pages = 0;
8712 server.vm_next_page = 0;
8713 }
8714 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8715 base = server.vm_next_page;
8716
8717 while(offset < server.vm_pages) {
8718 off_t this = base+offset;
8719
8720 /* If we overflow, restart from page zero */
8721 if (this >= server.vm_pages) {
8722 this -= server.vm_pages;
8723 if (this == 0) {
8724 /* Just overflowed, what we found on tail is no longer
8725 * interesting, as it's no longer contiguous. */
8726 numfree = 0;
8727 }
8728 }
8729 if (vmFreePage(this)) {
8730 /* This is a free page */
8731 numfree++;
8732 /* Already got N free pages? Return to the caller, with success */
8733 if (numfree == n) {
7d30035d 8734 *first = this-(n-1);
8735 server.vm_next_page = this+1;
7c775e09 8736 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 8737 return REDIS_OK;
06224fec 8738 }
8739 } else {
8740 /* The current one is not a free page */
8741 numfree = 0;
8742 }
8743
8744 /* Fast-forward if the current page is not free and we already
8745 * searched enough near this place. */
8746 since_jump++;
8747 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8748 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8749 since_jump = 0;
8750 /* Note that even if we rewind after the jump, we are don't need
8751 * to make sure numfree is set to zero as we only jump *if* it
8752 * is set to zero. */
8753 } else {
8754 /* Otherwise just check the next page */
8755 offset++;
8756 }
8757 }
3a66edc7 8758 return REDIS_ERR;
8759}
8760
a5819310 8761/* Write the specified object at the specified page of the swap file */
8762static int vmWriteObjectOnSwap(robj *o, off_t page) {
8763 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8764 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8765 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8766 redisLog(REDIS_WARNING,
9ebed7cf 8767 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 8768 strerror(errno));
8769 return REDIS_ERR;
8770 }
8771 rdbSaveObject(server.vm_fp,o);
ba76a8f9 8772 fflush(server.vm_fp);
a5819310 8773 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8774 return REDIS_OK;
8775}
8776
3a66edc7 8777/* Swap the 'val' object relative to 'key' into disk. Store all the information
8778 * needed to later retrieve the object into the key object.
8779 * If we can't find enough contiguous empty pages to swap the object on disk
8780 * REDIS_ERR is returned. */
a69a0c9c 8781static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 8782 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 8783 off_t page;
8784
8785 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 8786 assert(key->refcount == 1);
3a66edc7 8787 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 8788 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 8789 key->vm.page = page;
8790 key->vm.usedpages = pages;
8791 key->storage = REDIS_VM_SWAPPED;
d894161b 8792 key->vtype = val->type;
3a66edc7 8793 decrRefCount(val); /* Deallocate the object from memory. */
8794 vmMarkPagesUsed(page,pages);
7d30035d 8795 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8796 (unsigned char*) key->ptr,
8797 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 8798 server.vm_stats_swapped_objects++;
8799 server.vm_stats_swapouts++;
3a66edc7 8800 return REDIS_OK;
8801}
8802
a5819310 8803static robj *vmReadObjectFromSwap(off_t page, int type) {
8804 robj *o;
3a66edc7 8805
a5819310 8806 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8807 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 8808 redisLog(REDIS_WARNING,
d5d55fc3 8809 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 8810 strerror(errno));
478c2c6f 8811 _exit(1);
3a66edc7 8812 }
a5819310 8813 o = rdbLoadObject(type,server.vm_fp);
8814 if (o == NULL) {
d5d55fc3 8815 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 8816 _exit(1);
3a66edc7 8817 }
a5819310 8818 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8819 return o;
8820}
8821
8822/* Load the value object relative to the 'key' object from swap to memory.
8823 * The newly allocated object is returned.
8824 *
8825 * If preview is true the unserialized object is returned to the caller but
8826 * no changes are made to the key object, nor the pages are marked as freed */
8827static robj *vmGenericLoadObject(robj *key, int preview) {
8828 robj *val;
8829
d5d55fc3 8830 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 8831 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 8832 if (!preview) {
8833 key->storage = REDIS_VM_MEMORY;
8834 key->vm.atime = server.unixtime;
8835 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8836 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8837 (unsigned char*) key->ptr);
7d98e08c 8838 server.vm_stats_swapped_objects--;
38aba9a1 8839 } else {
8840 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8841 (unsigned char*) key->ptr);
7e69548d 8842 }
7d98e08c 8843 server.vm_stats_swapins++;
3a66edc7 8844 return val;
06224fec 8845}
8846
7e69548d 8847/* Plain object loading, from swap to memory */
8848static robj *vmLoadObject(robj *key) {
996cb5f7 8849 /* If we are loading the object in background, stop it, we
8850 * need to load this object synchronously ASAP. */
8851 if (key->storage == REDIS_VM_LOADING)
8852 vmCancelThreadedIOJob(key);
7e69548d 8853 return vmGenericLoadObject(key,0);
8854}
8855
8856/* Just load the value on disk, without to modify the key.
8857 * This is useful when we want to perform some operation on the value
8858 * without to really bring it from swap to memory, like while saving the
8859 * dataset or rewriting the append only log. */
8860static robj *vmPreviewObject(robj *key) {
8861 return vmGenericLoadObject(key,1);
8862}
8863
4ef8de8a 8864/* How a good candidate is this object for swapping?
8865 * The better candidate it is, the greater the returned value.
8866 *
8867 * Currently we try to perform a fast estimation of the object size in
8868 * memory, and combine it with aging informations.
8869 *
8870 * Basically swappability = idle-time * log(estimated size)
8871 *
8872 * Bigger objects are preferred over smaller objects, but not
8873 * proportionally, this is why we use the logarithm. This algorithm is
8874 * just a first try and will probably be tuned later. */
8875static double computeObjectSwappability(robj *o) {
8876 time_t age = server.unixtime - o->vm.atime;
8877 long asize = 0;
8878 list *l;
8879 dict *d;
8880 struct dictEntry *de;
8881 int z;
8882
8883 if (age <= 0) return 0;
8884 switch(o->type) {
8885 case REDIS_STRING:
8886 if (o->encoding != REDIS_ENCODING_RAW) {
8887 asize = sizeof(*o);
8888 } else {
8889 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8890 }
8891 break;
8892 case REDIS_LIST:
8893 l = o->ptr;
8894 listNode *ln = listFirst(l);
8895
8896 asize = sizeof(list);
8897 if (ln) {
8898 robj *ele = ln->value;
8899 long elesize;
8900
8901 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8902 (sizeof(*o)+sdslen(ele->ptr)) :
8903 sizeof(*o);
8904 asize += (sizeof(listNode)+elesize)*listLength(l);
8905 }
8906 break;
8907 case REDIS_SET:
8908 case REDIS_ZSET:
8909 z = (o->type == REDIS_ZSET);
8910 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8911
8912 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8913 if (z) asize += sizeof(zset)-sizeof(dict);
8914 if (dictSize(d)) {
8915 long elesize;
8916 robj *ele;
8917
8918 de = dictGetRandomKey(d);
8919 ele = dictGetEntryKey(de);
8920 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8921 (sizeof(*o)+sdslen(ele->ptr)) :
8922 sizeof(*o);
8923 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8924 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8925 }
8926 break;
a97b9060 8927 case REDIS_HASH:
8928 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8929 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8930 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8931 unsigned int klen, vlen;
8932 unsigned char *key, *val;
8933
8934 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8935 klen = 0;
8936 vlen = 0;
8937 }
8938 asize = len*(klen+vlen+3);
8939 } else if (o->encoding == REDIS_ENCODING_HT) {
8940 d = o->ptr;
8941 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8942 if (dictSize(d)) {
8943 long elesize;
8944 robj *ele;
8945
8946 de = dictGetRandomKey(d);
8947 ele = dictGetEntryKey(de);
8948 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8949 (sizeof(*o)+sdslen(ele->ptr)) :
8950 sizeof(*o);
8951 ele = dictGetEntryVal(de);
8952 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8953 (sizeof(*o)+sdslen(ele->ptr)) :
8954 sizeof(*o);
8955 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8956 }
8957 }
8958 break;
4ef8de8a 8959 }
c8c72447 8960 return (double)age*log(1+asize);
4ef8de8a 8961}
8962
8963/* Try to swap an object that's a good candidate for swapping.
8964 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 8965 * to swap any object at all.
8966 *
8967 * If 'usethreaded' is true, Redis will try to swap the object in background
8968 * using I/O threads. */
8969static int vmSwapOneObject(int usethreads) {
4ef8de8a 8970 int j, i;
8971 struct dictEntry *best = NULL;
8972 double best_swappability = 0;
b9bc0eef 8973 redisDb *best_db = NULL;
4ef8de8a 8974 robj *key, *val;
8975
8976 for (j = 0; j < server.dbnum; j++) {
8977 redisDb *db = server.db+j;
b72f6a4b 8978 /* Why maxtries is set to 100?
8979 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8980 * are swappable objects */
b0d8747d 8981 int maxtries = 100;
4ef8de8a 8982
8983 if (dictSize(db->dict) == 0) continue;
8984 for (i = 0; i < 5; i++) {
8985 dictEntry *de;
8986 double swappability;
8987
e3cadb8a 8988 if (maxtries) maxtries--;
4ef8de8a 8989 de = dictGetRandomKey(db->dict);
8990 key = dictGetEntryKey(de);
8991 val = dictGetEntryVal(de);
1064ef87 8992 /* Only swap objects that are currently in memory.
8993 *
8994 * Also don't swap shared objects if threaded VM is on, as we
8995 * try to ensure that the main thread does not touch the
8996 * object while the I/O thread is using it, but we can't
8997 * control other keys without adding additional mutex. */
8998 if (key->storage != REDIS_VM_MEMORY ||
8999 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 9000 if (maxtries) i--; /* don't count this try */
9001 continue;
9002 }
4ef8de8a 9003 swappability = computeObjectSwappability(val);
9004 if (!best || swappability > best_swappability) {
9005 best = de;
9006 best_swappability = swappability;
b9bc0eef 9007 best_db = db;
4ef8de8a 9008 }
9009 }
9010 }
7c775e09 9011 if (best == NULL) return REDIS_ERR;
4ef8de8a 9012 key = dictGetEntryKey(best);
9013 val = dictGetEntryVal(best);
9014
e3cadb8a 9015 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 9016 key->ptr, best_swappability);
9017
9018 /* Unshare the key if needed */
9019 if (key->refcount > 1) {
9020 robj *newkey = dupStringObject(key);
9021 decrRefCount(key);
9022 key = dictGetEntryKey(best) = newkey;
9023 }
9024 /* Swap it */
a69a0c9c 9025 if (usethreads) {
b9bc0eef 9026 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 9027 return REDIS_OK;
9028 } else {
a69a0c9c 9029 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
9030 dictGetEntryVal(best) = NULL;
9031 return REDIS_OK;
9032 } else {
9033 return REDIS_ERR;
9034 }
4ef8de8a 9035 }
9036}
9037
/* Swap one object out synchronously. Thin wrapper around
 * vmSwapOneObject(0): returns whatever it returns.
 *
 * Declared with an explicit (void) parameter list so that the definition
 * also provides a prototype, like the other no-argument VM functions. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
9041
/* Swap one object out using the threaded I/O path. Thin wrapper around
 * vmSwapOneObject(1): returns whatever it returns.
 *
 * Declared with an explicit (void) parameter list so that the definition
 * also provides a prototype, like the other no-argument VM functions. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
9045
7e69548d 9046/* Return true if it's safe to swap out objects in a given moment.
9047 * Basically we don't want to swap objects out while there is a BGSAVE
9048 * or a BGAEOREWRITE running in backgroud. */
9049static int vmCanSwapOut(void) {
9050 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
9051}
9052
1b03836c 9053/* Delete a key if swapped. Returns 1 if the key was found, was swapped
9054 * and was deleted. Otherwise 0 is returned. */
9055static int deleteIfSwapped(redisDb *db, robj *key) {
9056 dictEntry *de;
9057 robj *foundkey;
9058
9059 if ((de = dictFind(db->dict,key)) == NULL) return 0;
9060 foundkey = dictGetEntryKey(de);
9061 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
9062 deleteKey(db,key);
9063 return 1;
9064}
9065
996cb5f7 9066/* =================== Virtual Memory - Threaded I/O ======================= */
9067
b9bc0eef 9068static void freeIOJob(iojob *j) {
d5d55fc3 9069 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
9070 j->type == REDIS_IOJOB_DO_SWAP ||
9071 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 9072 decrRefCount(j->val);
78ebe4c8 9073 /* We don't decrRefCount the j->key field as we did't incremented
9074 * the count creating IO Jobs. This is because the key field here is
9075 * just used as an indentifier and if a key is removed the Job should
9076 * never be touched again. */
b9bc0eef 9077 zfree(j);
9078}
9079
996cb5f7 9080/* Every time a thread finished a Job, it writes a byte into the write side
9081 * of an unix pipe in order to "awake" the main thread, and this function
9082 * is called. */
9083static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
9084 int mask)
9085{
9086 char buf[1];
b0d8747d 9087 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 9088 REDIS_NOTUSED(el);
9089 REDIS_NOTUSED(mask);
9090 REDIS_NOTUSED(privdata);
9091
9092 /* For every byte we read in the read side of the pipe, there is one
9093 * I/O job completed to process. */
9094 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 9095 iojob *j;
9096 listNode *ln;
9097 robj *key;
9098 struct dictEntry *de;
9099
996cb5f7 9100 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 9101
9102 /* Get the processed element (the oldest one) */
9103 lockThreadedIO();
1064ef87 9104 assert(listLength(server.io_processed) != 0);
f6c0bba8 9105 if (toprocess == -1) {
9106 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
9107 if (toprocess <= 0) toprocess = 1;
9108 }
b9bc0eef 9109 ln = listFirst(server.io_processed);
9110 j = ln->value;
9111 listDelNode(server.io_processed,ln);
9112 unlockThreadedIO();
9113 /* If this job is marked as canceled, just ignore it */
9114 if (j->canceled) {
9115 freeIOJob(j);
9116 continue;
9117 }
9118 /* Post process it in the main thread, as there are things we
9119 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 9120 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 9121 de = dictFind(j->db->dict,j->key);
9122 assert(de != NULL);
9123 key = dictGetEntryKey(de);
9124 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9125 redisDb *db;
9126
b9bc0eef 9127 /* Key loaded, bring it at home */
9128 key->storage = REDIS_VM_MEMORY;
9129 key->vm.atime = server.unixtime;
9130 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
9131 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
9132 (unsigned char*) key->ptr);
9133 server.vm_stats_swapped_objects--;
9134 server.vm_stats_swapins++;
d5d55fc3 9135 dictGetEntryVal(de) = j->val;
9136 incrRefCount(j->val);
9137 db = j->db;
b9bc0eef 9138 freeIOJob(j);
d5d55fc3 9139 /* Handle clients waiting for this key to be loaded. */
9140 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 9141 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9142 /* Now we know the amount of pages required to swap this object.
9143 * Let's find some space for it, and queue this task again
9144 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 9145 if (!vmCanSwapOut() ||
9146 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
9147 {
9148 /* Ooops... no space or we can't swap as there is
9149 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 9150 freeIOJob(j);
054e426d 9151 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 9152 } else {
c7df85a4 9153 /* Note that we need to mark this pages as used now,
9154 * if the job will be canceled, we'll mark them as freed
9155 * again. */
9156 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 9157 j->type = REDIS_IOJOB_DO_SWAP;
9158 lockThreadedIO();
9159 queueIOJob(j);
9160 unlockThreadedIO();
9161 }
9162 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
9163 robj *val;
9164
9165 /* Key swapped. We can finally free some memory. */
6c96ba7d 9166 if (key->storage != REDIS_VM_SWAPPING) {
9167 printf("key->storage: %d\n",key->storage);
9168 printf("key->name: %s\n",(char*)key->ptr);
9169 printf("key->refcount: %d\n",key->refcount);
9170 printf("val: %p\n",(void*)j->val);
9171 printf("val->type: %d\n",j->val->type);
9172 printf("val->ptr: %s\n",(char*)j->val->ptr);
9173 }
9174 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 9175 val = dictGetEntryVal(de);
9176 key->vm.page = j->page;
9177 key->vm.usedpages = j->pages;
9178 key->storage = REDIS_VM_SWAPPED;
9179 key->vtype = j->val->type;
9180 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 9181 dictGetEntryVal(de) = NULL;
b9bc0eef 9182 redisLog(REDIS_DEBUG,
9183 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
9184 (unsigned char*) key->ptr,
9185 (unsigned long long) j->page, (unsigned long long) j->pages);
9186 server.vm_stats_swapped_objects++;
9187 server.vm_stats_swapouts++;
9188 freeIOJob(j);
f11b8647 9189 /* Put a few more swap requests in queue if we are still
9190 * out of memory */
b0d8747d 9191 if (trytoswap && vmCanSwapOut() &&
9192 zmalloc_used_memory() > server.vm_max_memory)
9193 {
f11b8647 9194 int more = 1;
9195 while(more) {
9196 lockThreadedIO();
9197 more = listLength(server.io_newjobs) <
9198 (unsigned) server.vm_max_threads;
9199 unlockThreadedIO();
9200 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 9201 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
9202 trytoswap = 0;
9203 break;
9204 }
f11b8647 9205 }
9206 }
b9bc0eef 9207 }
c953f24b 9208 processed++;
f6c0bba8 9209 if (processed == toprocess) return;
996cb5f7 9210 }
9211 if (retval < 0 && errno != EAGAIN) {
9212 redisLog(REDIS_WARNING,
9213 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
9214 strerror(errno));
9215 }
9216}
9217
9218static void lockThreadedIO(void) {
9219 pthread_mutex_lock(&server.io_mutex);
9220}
9221
9222static void unlockThreadedIO(void) {
9223 pthread_mutex_unlock(&server.io_mutex);
9224}
9225
9226/* Remove the specified object from the threaded I/O queue if still not
9227 * processed, otherwise make sure to flag it as canceled. */
9228static void vmCancelThreadedIOJob(robj *o) {
9229 list *lists[3] = {
6c96ba7d 9230 server.io_newjobs, /* 0 */
9231 server.io_processing, /* 1 */
9232 server.io_processed /* 2 */
996cb5f7 9233 };
9234 int i;
9235
9236 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 9237again:
996cb5f7 9238 lockThreadedIO();
9239 /* Search for a matching key in one of the queues */
9240 for (i = 0; i < 3; i++) {
9241 listNode *ln;
c7df85a4 9242 listIter li;
996cb5f7 9243
c7df85a4 9244 listRewind(lists[i],&li);
9245 while ((ln = listNext(&li)) != NULL) {
996cb5f7 9246 iojob *job = ln->value;
9247
6c96ba7d 9248 if (job->canceled) continue; /* Skip this, already canceled. */
78ebe4c8 9249 if (job->key == o) {
970e10bb 9250 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9251 (void*)job, (char*)o->ptr, job->type, i);
427a2153 9252 /* Mark the pages as free since the swap didn't happened
9253 * or happened but is now discarded. */
970e10bb 9254 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 9255 vmMarkPagesFree(job->page,job->pages);
9256 /* Cancel the job. It depends on the list the job is
9257 * living in. */
996cb5f7 9258 switch(i) {
9259 case 0: /* io_newjobs */
6c96ba7d 9260 /* If the job was yet not processed the best thing to do
996cb5f7 9261 * is to remove it from the queue at all */
6c96ba7d 9262 freeIOJob(job);
996cb5f7 9263 listDelNode(lists[i],ln);
9264 break;
9265 case 1: /* io_processing */
d5d55fc3 9266 /* Oh Shi- the thread is messing with the Job:
9267 *
9268 * Probably it's accessing the object if this is a
9269 * PREPARE_SWAP or DO_SWAP job.
9270 * If it's a LOAD job it may be reading from disk and
9271 * if we don't wait for the job to terminate before to
9272 * cancel it, maybe in a few microseconds data can be
9273 * corrupted in this pages. So the short story is:
9274 *
9275 * Better to wait for the job to move into the
9276 * next queue (processed)... */
9277
9278 /* We try again and again until the job is completed. */
9279 unlockThreadedIO();
9280 /* But let's wait some time for the I/O thread
9281 * to finish with this job. After all this condition
9282 * should be very rare. */
9283 usleep(1);
9284 goto again;
996cb5f7 9285 case 2: /* io_processed */
2e111efe 9286 /* The job was already processed, that's easy...
9287 * just mark it as canceled so that we'll ignore it
9288 * when processing completed jobs. */
996cb5f7 9289 job->canceled = 1;
9290 break;
9291 }
c7df85a4 9292 /* Finally we have to adjust the storage type of the object
9293 * in order to "UNDO" the operaiton. */
996cb5f7 9294 if (o->storage == REDIS_VM_LOADING)
9295 o->storage = REDIS_VM_SWAPPED;
9296 else if (o->storage == REDIS_VM_SWAPPING)
9297 o->storage = REDIS_VM_MEMORY;
9298 unlockThreadedIO();
9299 return;
9300 }
9301 }
9302 }
9303 unlockThreadedIO();
9304 assert(1 != 1); /* We should never reach this */
9305}
9306
b9bc0eef 9307static void *IOThreadEntryPoint(void *arg) {
9308 iojob *j;
9309 listNode *ln;
9310 REDIS_NOTUSED(arg);
9311
9312 pthread_detach(pthread_self());
9313 while(1) {
9314 /* Get a new job to process */
9315 lockThreadedIO();
9316 if (listLength(server.io_newjobs) == 0) {
9317 /* No new jobs in queue, exit. */
9ebed7cf 9318 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9319 (long) pthread_self());
b9bc0eef 9320 server.io_active_threads--;
9321 unlockThreadedIO();
9322 return NULL;
9323 }
9324 ln = listFirst(server.io_newjobs);
9325 j = ln->value;
9326 listDelNode(server.io_newjobs,ln);
9327 /* Add the job in the processing queue */
9328 j->thread = pthread_self();
9329 listAddNodeTail(server.io_processing,j);
9330 ln = listLast(server.io_processing); /* We use ln later to remove it */
9331 unlockThreadedIO();
9ebed7cf 9332 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9333 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 9334
9335 /* Process the Job */
9336 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9337 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 9338 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9339 FILE *fp = fopen("/dev/null","w+");
9340 j->pages = rdbSavedObjectPages(j->val,fp);
9341 fclose(fp);
9342 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 9343 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9344 j->canceled = 1;
b9bc0eef 9345 }
9346
9347 /* Done: insert the job into the processed queue */
9ebed7cf 9348 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9349 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 9350 lockThreadedIO();
9351 listDelNode(server.io_processing,ln);
9352 listAddNodeTail(server.io_processed,j);
9353 unlockThreadedIO();
e0a62c7f 9354
b9bc0eef 9355 /* Signal the main thread there is new stuff to process */
9356 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9357 }
9358 return NULL; /* never reached */
9359}
9360
9361static void spawnIOThread(void) {
9362 pthread_t thread;
478c2c6f 9363 sigset_t mask, omask;
a97b9060 9364 int err;
b9bc0eef 9365
478c2c6f 9366 sigemptyset(&mask);
9367 sigaddset(&mask,SIGCHLD);
9368 sigaddset(&mask,SIGHUP);
9369 sigaddset(&mask,SIGPIPE);
9370 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 9371 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9372 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9373 strerror(err));
9374 usleep(1000000);
9375 }
478c2c6f 9376 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 9377 server.io_active_threads++;
9378}
9379
4ee9488d 9380/* We need to wait for the last thread to exit before we are able to
9381 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 9382static void waitEmptyIOJobsQueue(void) {
4ee9488d 9383 while(1) {
76b7233a 9384 int io_processed_len;
9385
4ee9488d 9386 lockThreadedIO();
054e426d 9387 if (listLength(server.io_newjobs) == 0 &&
9388 listLength(server.io_processing) == 0 &&
9389 server.io_active_threads == 0)
9390 {
4ee9488d 9391 unlockThreadedIO();
9392 return;
9393 }
76b7233a 9394 /* While waiting for empty jobs queue condition we post-process some
9395 * finshed job, as I/O threads may be hanging trying to write against
9396 * the io_ready_pipe_write FD but there are so much pending jobs that
9397 * it's blocking. */
9398 io_processed_len = listLength(server.io_processed);
4ee9488d 9399 unlockThreadedIO();
76b7233a 9400 if (io_processed_len) {
9401 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9402 usleep(1000); /* 1 millisecond */
9403 } else {
9404 usleep(10000); /* 10 milliseconds */
9405 }
4ee9488d 9406 }
9407}
9408
054e426d 9409static void vmReopenSwapFile(void) {
478c2c6f 9410 /* Note: we don't close the old one as we are in the child process
9411 * and don't want to mess at all with the original file object. */
054e426d 9412 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9413 if (server.vm_fp == NULL) {
9414 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9415 server.vm_swap_file);
478c2c6f 9416 _exit(1);
054e426d 9417 }
9418 server.vm_fd = fileno(server.vm_fp);
9419}
9420
b9bc0eef 9421/* This function must be called while with threaded IO locked */
9422static void queueIOJob(iojob *j) {
6c96ba7d 9423 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9424 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 9425 listAddNodeTail(server.io_newjobs,j);
9426 if (server.io_active_threads < server.vm_max_threads)
9427 spawnIOThread();
9428}
9429
9430static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9431 iojob *j;
e0a62c7f 9432
b9bc0eef 9433 assert(key->storage == REDIS_VM_MEMORY);
9434 assert(key->refcount == 1);
9435
9436 j = zmalloc(sizeof(*j));
9437 j->type = REDIS_IOJOB_PREPARE_SWAP;
9438 j->db = db;
78ebe4c8 9439 j->key = key;
b9bc0eef 9440 j->val = val;
9441 incrRefCount(val);
9442 j->canceled = 0;
9443 j->thread = (pthread_t) -1;
f11b8647 9444 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 9445
9446 lockThreadedIO();
9447 queueIOJob(j);
9448 unlockThreadedIO();
9449 return REDIS_OK;
9450}
9451
b0d8747d 9452/* ============ Virtual Memory - Blocking clients on missing keys =========== */
9453
d5d55fc3 9454/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9455 * If there is not already a job loading the key, it is craeted.
9456 * The key is added to the io_keys list in the client structure, and also
9457 * in the hash table mapping swapped keys to waiting clients, that is,
9458 * server.io_waited_keys. */
9459static int waitForSwappedKey(redisClient *c, robj *key) {
9460 struct dictEntry *de;
9461 robj *o;
9462 list *l;
9463
9464 /* If the key does not exist or is already in RAM we don't need to
9465 * block the client at all. */
9466 de = dictFind(c->db->dict,key);
9467 if (de == NULL) return 0;
9468 o = dictGetEntryKey(de);
9469 if (o->storage == REDIS_VM_MEMORY) {
9470 return 0;
9471 } else if (o->storage == REDIS_VM_SWAPPING) {
9472 /* We were swapping the key, undo it! */
9473 vmCancelThreadedIOJob(o);
9474 return 0;
9475 }
e0a62c7f 9476
d5d55fc3 9477 /* OK: the key is either swapped, or being loaded just now. */
9478
9479 /* Add the key to the list of keys this client is waiting for.
9480 * This maps clients to keys they are waiting for. */
9481 listAddNodeTail(c->io_keys,key);
9482 incrRefCount(key);
9483
9484 /* Add the client to the swapped keys => clients waiting map. */
9485 de = dictFind(c->db->io_keys,key);
9486 if (de == NULL) {
9487 int retval;
9488
9489 /* For every key we take a list of clients blocked for it */
9490 l = listCreate();
9491 retval = dictAdd(c->db->io_keys,key,l);
9492 incrRefCount(key);
9493 assert(retval == DICT_OK);
9494 } else {
9495 l = dictGetEntryVal(de);
9496 }
9497 listAddNodeTail(l,c);
9498
9499 /* Are we already loading the key from disk? If not create a job */
9500 if (o->storage == REDIS_VM_SWAPPED) {
9501 iojob *j;
9502
9503 o->storage = REDIS_VM_LOADING;
9504 j = zmalloc(sizeof(*j));
9505 j->type = REDIS_IOJOB_LOAD;
9506 j->db = c->db;
78ebe4c8 9507 j->key = o;
d5d55fc3 9508 j->key->vtype = o->vtype;
9509 j->page = o->vm.page;
9510 j->val = NULL;
9511 j->canceled = 0;
9512 j->thread = (pthread_t) -1;
9513 lockThreadedIO();
9514 queueIOJob(j);
9515 unlockThreadedIO();
9516 }
9517 return 1;
9518}
9519
76583ea4
PN
9520/* Preload keys needed for the ZUNION and ZINTER commands. */
9521static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9522 int i, num;
9523 num = atoi(c->argv[2]->ptr);
9524 for (i = 0; i < num; i++) {
9525 waitForSwappedKey(c,c->argv[3+i]);
9526 }
9527}
9528
b0d8747d 9529/* Is this client attempting to run a command against swapped keys?
d5d55fc3 9530 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 9531 *
d5d55fc3 9532 * The important idea about this function is that it can fail! If keys will
9533 * still be swapped when the client is resumed, this key lookups will
9534 * just block loading keys from disk. In practical terms this should only
9535 * happen with SORT BY command or if there is a bug in this function.
9536 *
9537 * Return 1 if the client is marked as blocked, 0 if the client can
9538 * continue as the keys it is going to access appear to be in memory. */
9539static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
7c775e09 9540 int j, last;
9541
76583ea4
PN
9542 if (cmd->vm_preload_proc != NULL) {
9543 cmd->vm_preload_proc(c);
9544 } else {
9545 if (cmd->vm_firstkey == 0) return 0;
9546 last = cmd->vm_lastkey;
9547 if (last < 0) last = c->argc+last;
9548 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9549 waitForSwappedKey(c,c->argv[j]);
9550 }
9551
d5d55fc3 9552 /* If the client was blocked for at least one key, mark it as blocked. */
9553 if (listLength(c->io_keys)) {
9554 c->flags |= REDIS_IO_WAIT;
9555 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9556 server.vm_blocked_clients++;
9557 return 1;
9558 } else {
9559 return 0;
9560 }
9561}
9562
9563/* Remove the 'key' from the list of blocked keys for a given client.
9564 *
9565 * The function returns 1 when there are no longer blocking keys after
9566 * the current one was removed (and the client can be unblocked). */
9567static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9568 list *l;
9569 listNode *ln;
9570 listIter li;
9571 struct dictEntry *de;
9572
9573 /* Remove the key from the list of keys this client is waiting for. */
9574 listRewind(c->io_keys,&li);
9575 while ((ln = listNext(&li)) != NULL) {
9576 if (compareStringObjects(ln->value,key) == 0) {
9577 listDelNode(c->io_keys,ln);
9578 break;
9579 }
9580 }
9581 assert(ln != NULL);
9582
9583 /* Remove the client form the key => waiting clients map. */
9584 de = dictFind(c->db->io_keys,key);
9585 assert(de != NULL);
9586 l = dictGetEntryVal(de);
9587 ln = listSearchKey(l,c);
9588 assert(ln != NULL);
9589 listDelNode(l,ln);
9590 if (listLength(l) == 0)
9591 dictDelete(c->db->io_keys,key);
9592
9593 return listLength(c->io_keys) == 0;
9594}
9595
9596static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9597 struct dictEntry *de;
9598 list *l;
9599 listNode *ln;
9600 int len;
9601
9602 de = dictFind(db->io_keys,key);
9603 if (!de) return;
9604
9605 l = dictGetEntryVal(de);
9606 len = listLength(l);
9607 /* Note: we can't use something like while(listLength(l)) as the list
9608 * can be freed by the calling function when we remove the last element. */
9609 while (len--) {
9610 ln = listFirst(l);
9611 redisClient *c = ln->value;
9612
9613 if (dontWaitForSwappedKey(c,key)) {
9614 /* Put the client in the list of clients ready to go as we
9615 * loaded all the keys about it. */
9616 listAddNodeTail(server.io_ready_clients,c);
9617 }
9618 }
b0d8747d 9619}
b0d8747d 9620
500ece7c 9621/* =========================== Remote Configuration ========================= */
9622
9623static void configSetCommand(redisClient *c) {
9624 robj *o = getDecodedObject(c->argv[3]);
9625 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9626 zfree(server.dbfilename);
9627 server.dbfilename = zstrdup(o->ptr);
9628 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9629 zfree(server.requirepass);
9630 server.requirepass = zstrdup(o->ptr);
9631 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9632 zfree(server.masterauth);
9633 server.masterauth = zstrdup(o->ptr);
9634 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9635 server.maxmemory = strtoll(o->ptr, NULL, 10);
9636 } else {
9637 addReplySds(c,sdscatprintf(sdsempty(),
9638 "-ERR not supported CONFIG parameter %s\r\n",
9639 (char*)c->argv[2]->ptr));
9640 decrRefCount(o);
9641 return;
9642 }
9643 decrRefCount(o);
9644 addReply(c,shared.ok);
9645}
9646
9647static void configGetCommand(redisClient *c) {
9648 robj *o = getDecodedObject(c->argv[2]);
9649 robj *lenobj = createObject(REDIS_STRING,NULL);
9650 char *pattern = o->ptr;
9651 int matches = 0;
9652
9653 addReply(c,lenobj);
9654 decrRefCount(lenobj);
9655
9656 if (stringmatch(pattern,"dbfilename",0)) {
9657 addReplyBulkCString(c,"dbfilename");
9658 addReplyBulkCString(c,server.dbfilename);
9659 matches++;
9660 }
9661 if (stringmatch(pattern,"requirepass",0)) {
9662 addReplyBulkCString(c,"requirepass");
9663 addReplyBulkCString(c,server.requirepass);
9664 matches++;
9665 }
9666 if (stringmatch(pattern,"masterauth",0)) {
9667 addReplyBulkCString(c,"masterauth");
9668 addReplyBulkCString(c,server.masterauth);
9669 matches++;
9670 }
9671 if (stringmatch(pattern,"maxmemory",0)) {
9672 char buf[128];
9673
9674 snprintf(buf,128,"%llu\n",server.maxmemory);
9675 addReplyBulkCString(c,"maxmemory");
9676 addReplyBulkCString(c,buf);
9677 matches++;
9678 }
9679 decrRefCount(o);
9680 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9681}
9682
9683static void configCommand(redisClient *c) {
9684 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9685 if (c->argc != 4) goto badarity;
9686 configSetCommand(c);
9687 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9688 if (c->argc != 3) goto badarity;
9689 configGetCommand(c);
9690 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9691 if (c->argc != 2) goto badarity;
9692 server.stat_numcommands = 0;
9693 server.stat_numconnections = 0;
9694 server.stat_expiredkeys = 0;
9695 server.stat_starttime = time(NULL);
9696 addReply(c,shared.ok);
9697 } else {
9698 addReplySds(c,sdscatprintf(sdsempty(),
9699 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9700 }
9701 return;
9702
9703badarity:
9704 addReplySds(c,sdscatprintf(sdsempty(),
9705 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9706 (char*) c->argv[1]->ptr));
9707}
9708
befec3cd 9709/* =========================== Pubsub implementation ======================== */
9710
ffc6b7f8 9711static void freePubsubPattern(void *p) {
9712 pubsubPattern *pat = p;
9713
9714 decrRefCount(pat->pattern);
9715 zfree(pat);
9716}
9717
9718static int listMatchPubsubPattern(void *a, void *b) {
9719 pubsubPattern *pa = a, *pb = b;
9720
9721 return (pa->client == pb->client) &&
9722 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9723}
9724
9725/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9726 * 0 if the client was already subscribed to that channel. */
9727static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 9728 struct dictEntry *de;
9729 list *clients = NULL;
9730 int retval = 0;
9731
ffc6b7f8 9732 /* Add the channel to the client -> channels hash table */
9733 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 9734 retval = 1;
ffc6b7f8 9735 incrRefCount(channel);
9736 /* Add the client to the channel -> list of clients hash table */
9737 de = dictFind(server.pubsub_channels,channel);
befec3cd 9738 if (de == NULL) {
9739 clients = listCreate();
ffc6b7f8 9740 dictAdd(server.pubsub_channels,channel,clients);
9741 incrRefCount(channel);
befec3cd 9742 } else {
9743 clients = dictGetEntryVal(de);
9744 }
9745 listAddNodeTail(clients,c);
9746 }
9747 /* Notify the client */
9748 addReply(c,shared.mbulk3);
9749 addReply(c,shared.subscribebulk);
ffc6b7f8 9750 addReplyBulk(c,channel);
9751 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 9752 return retval;
9753}
9754
ffc6b7f8 9755/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9756 * 0 if the client was not subscribed to the specified channel. */
9757static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 9758 struct dictEntry *de;
9759 list *clients;
9760 listNode *ln;
9761 int retval = 0;
9762
ffc6b7f8 9763 /* Remove the channel from the client -> channels hash table */
9764 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 9765 we have in the hash tables. Protect it... */
ffc6b7f8 9766 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 9767 retval = 1;
ffc6b7f8 9768 /* Remove the client from the channel -> clients list hash table */
9769 de = dictFind(server.pubsub_channels,channel);
befec3cd 9770 assert(de != NULL);
9771 clients = dictGetEntryVal(de);
9772 ln = listSearchKey(clients,c);
9773 assert(ln != NULL);
9774 listDelNode(clients,ln);
ff767a75 9775 if (listLength(clients) == 0) {
9776 /* Free the list and associated hash entry at all if this was
9777 * the latest client, so that it will be possible to abuse
ffc6b7f8 9778 * Redis PUBSUB creating millions of channels. */
9779 dictDelete(server.pubsub_channels,channel);
ff767a75 9780 }
befec3cd 9781 }
9782 /* Notify the client */
9783 if (notify) {
9784 addReply(c,shared.mbulk3);
9785 addReply(c,shared.unsubscribebulk);
ffc6b7f8 9786 addReplyBulk(c,channel);
9787 addReplyLong(c,dictSize(c->pubsub_channels)+
9788 listLength(c->pubsub_patterns));
9789
9790 }
9791 decrRefCount(channel); /* it is finally safe to release it */
9792 return retval;
9793}
9794
9795/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9796static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9797 int retval = 0;
9798
9799 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9800 retval = 1;
9801 pubsubPattern *pat;
9802 listAddNodeTail(c->pubsub_patterns,pattern);
9803 incrRefCount(pattern);
9804 pat = zmalloc(sizeof(*pat));
9805 pat->pattern = getDecodedObject(pattern);
9806 pat->client = c;
9807 listAddNodeTail(server.pubsub_patterns,pat);
9808 }
9809 /* Notify the client */
9810 addReply(c,shared.mbulk3);
9811 addReply(c,shared.psubscribebulk);
9812 addReplyBulk(c,pattern);
9813 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9814 return retval;
9815}
9816
9817/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9818 * 0 if the client was not subscribed to the specified channel. */
9819static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9820 listNode *ln;
9821 pubsubPattern pat;
9822 int retval = 0;
9823
9824 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9825 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9826 retval = 1;
9827 listDelNode(c->pubsub_patterns,ln);
9828 pat.client = c;
9829 pat.pattern = pattern;
9830 ln = listSearchKey(server.pubsub_patterns,&pat);
9831 listDelNode(server.pubsub_patterns,ln);
9832 }
9833 /* Notify the client */
9834 if (notify) {
9835 addReply(c,shared.mbulk3);
9836 addReply(c,shared.punsubscribebulk);
9837 addReplyBulk(c,pattern);
9838 addReplyLong(c,dictSize(c->pubsub_channels)+
9839 listLength(c->pubsub_patterns));
befec3cd 9840 }
ffc6b7f8 9841 decrRefCount(pattern);
befec3cd 9842 return retval;
9843}
9844
ffc6b7f8 9845/* Unsubscribe from all the channels. Return the number of channels the
9846 * client was subscribed from. */
9847static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9848 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 9849 dictEntry *de;
9850 int count = 0;
9851
9852 while((de = dictNext(di)) != NULL) {
ffc6b7f8 9853 robj *channel = dictGetEntryKey(de);
befec3cd 9854
ffc6b7f8 9855 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 9856 }
9857 dictReleaseIterator(di);
9858 return count;
9859}
9860
ffc6b7f8 9861/* Unsubscribe from all the patterns. Return the number of patterns the
9862 * client was subscribed from. */
9863static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9864 listNode *ln;
9865 listIter li;
9866 int count = 0;
9867
9868 listRewind(c->pubsub_patterns,&li);
9869 while ((ln = listNext(&li)) != NULL) {
9870 robj *pattern = ln->value;
9871
9872 count += pubsubUnsubscribePattern(c,pattern,notify);
9873 }
9874 return count;
9875}
9876
befec3cd 9877/* Publish a message */
ffc6b7f8 9878static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 9879 int receivers = 0;
9880 struct dictEntry *de;
ffc6b7f8 9881 listNode *ln;
9882 listIter li;
befec3cd 9883
ffc6b7f8 9884 /* Send to clients listening for that channel */
9885 de = dictFind(server.pubsub_channels,channel);
befec3cd 9886 if (de) {
9887 list *list = dictGetEntryVal(de);
9888 listNode *ln;
9889 listIter li;
9890
9891 listRewind(list,&li);
9892 while ((ln = listNext(&li)) != NULL) {
9893 redisClient *c = ln->value;
9894
9895 addReply(c,shared.mbulk3);
9896 addReply(c,shared.messagebulk);
ffc6b7f8 9897 addReplyBulk(c,channel);
befec3cd 9898 addReplyBulk(c,message);
9899 receivers++;
9900 }
9901 }
ffc6b7f8 9902 /* Send to clients listening to matching channels */
9903 if (listLength(server.pubsub_patterns)) {
9904 listRewind(server.pubsub_patterns,&li);
9905 channel = getDecodedObject(channel);
9906 while ((ln = listNext(&li)) != NULL) {
9907 pubsubPattern *pat = ln->value;
9908
9909 if (stringmatchlen((char*)pat->pattern->ptr,
9910 sdslen(pat->pattern->ptr),
9911 (char*)channel->ptr,
9912 sdslen(channel->ptr),0)) {
c8d0ea0e 9913 addReply(pat->client,shared.mbulk4);
9914 addReply(pat->client,shared.pmessagebulk);
9915 addReplyBulk(pat->client,pat->pattern);
ffc6b7f8 9916 addReplyBulk(pat->client,channel);
9917 addReplyBulk(pat->client,message);
9918 receivers++;
9919 }
9920 }
9921 decrRefCount(channel);
9922 }
befec3cd 9923 return receivers;
9924}
9925
9926static void subscribeCommand(redisClient *c) {
9927 int j;
9928
9929 for (j = 1; j < c->argc; j++)
ffc6b7f8 9930 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 9931}
9932
9933static void unsubscribeCommand(redisClient *c) {
9934 if (c->argc == 1) {
ffc6b7f8 9935 pubsubUnsubscribeAllChannels(c,1);
9936 return;
9937 } else {
9938 int j;
9939
9940 for (j = 1; j < c->argc; j++)
9941 pubsubUnsubscribeChannel(c,c->argv[j],1);
9942 }
9943}
9944
9945static void psubscribeCommand(redisClient *c) {
9946 int j;
9947
9948 for (j = 1; j < c->argc; j++)
9949 pubsubSubscribePattern(c,c->argv[j]);
9950}
9951
9952static void punsubscribeCommand(redisClient *c) {
9953 if (c->argc == 1) {
9954 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 9955 return;
9956 } else {
9957 int j;
9958
9959 for (j = 1; j < c->argc; j++)
ffc6b7f8 9960 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 9961 }
9962}
9963
9964static void publishCommand(redisClient *c) {
9965 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9966 addReplyLong(c,receivers);
9967}
9968
7f957c92 9969/* ================================= Debugging ============================== */
9970
9971static void debugCommand(redisClient *c) {
9972 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9973 *((char*)-1) = 'x';
210e29f7 9974 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9975 if (rdbSave(server.dbfilename) != REDIS_OK) {
9976 addReply(c,shared.err);
9977 return;
9978 }
9979 emptyDb();
9980 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9981 addReply(c,shared.err);
9982 return;
9983 }
9984 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9985 addReply(c,shared.ok);
71c2b467 9986 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9987 emptyDb();
9988 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9989 addReply(c,shared.err);
9990 return;
9991 }
9992 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9993 addReply(c,shared.ok);
333298da 9994 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9995 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9996 robj *key, *val;
9997
9998 if (!de) {
9999 addReply(c,shared.nokeyerr);
10000 return;
10001 }
10002 key = dictGetEntryKey(de);
10003 val = dictGetEntryVal(de);
59146ef3 10004 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
10005 key->storage == REDIS_VM_SWAPPING)) {
07efaf74 10006 char *strenc;
10007 char buf[128];
10008
10009 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
10010 strenc = strencoding[val->encoding];
10011 } else {
10012 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
10013 strenc = buf;
10014 }
ace06542 10015 addReplySds(c,sdscatprintf(sdsempty(),
10016 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 10017 "encoding:%s serializedlength:%lld\r\n",
682ac724 10018 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 10019 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 10020 } else {
10021 addReplySds(c,sdscatprintf(sdsempty(),
10022 "+Key at:%p refcount:%d, value swapped at: page %llu "
10023 "using %llu pages\r\n",
10024 (void*)key, key->refcount, (unsigned long long) key->vm.page,
10025 (unsigned long long) key->vm.usedpages));
10026 }
78ebe4c8 10027 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
10028 lookupKeyRead(c->db,c->argv[2]);
10029 addReply(c,shared.ok);
7d30035d 10030 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
10031 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
10032 robj *key, *val;
10033
10034 if (!server.vm_enabled) {
10035 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
10036 return;
10037 }
10038 if (!de) {
10039 addReply(c,shared.nokeyerr);
10040 return;
10041 }
10042 key = dictGetEntryKey(de);
10043 val = dictGetEntryVal(de);
4ef8de8a 10044 /* If the key is shared we want to create a copy */
10045 if (key->refcount > 1) {
10046 robj *newkey = dupStringObject(key);
10047 decrRefCount(key);
10048 key = dictGetEntryKey(de) = newkey;
10049 }
10050 /* Swap it */
7d30035d 10051 if (key->storage != REDIS_VM_MEMORY) {
10052 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 10053 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 10054 dictGetEntryVal(de) = NULL;
10055 addReply(c,shared.ok);
10056 } else {
10057 addReply(c,shared.err);
10058 }
7f957c92 10059 } else {
333298da 10060 addReplySds(c,sdsnew(
bdcb92f2 10061 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 10062 }
10063}
56906eef 10064
6c96ba7d 10065static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 10066 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
6c96ba7d 10067 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
dfc5e96c 10068#ifdef HAVE_BACKTRACE
10069 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10070 *((char*)-1) = 'x';
10071#endif
10072}
10073
c651fd9e 10074static void _redisPanic(char *msg, char *file, int line) {
10075 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
17772754 10076 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
c651fd9e 10077#ifdef HAVE_BACKTRACE
10078 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10079 *((char*)-1) = 'x';
10080#endif
10081}
10082
bcfc686d 10083/* =================================== Main! ================================ */
56906eef 10084
bcfc686d 10085#ifdef __linux__
/* Read the integer stored in /proc/sys/vm/overcommit_memory.
 *
 * Returns the value found in the first line of the file (converted with
 * atoi), or -1 if the file cannot be opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    if (fgets(line,sizeof(line),fp) != NULL) value = atoi(line);
    fclose(fp);
    return value;
}
10099
10100void linuxOvercommitMemoryWarning(void) {
10101 if (linuxOvercommitMemoryValue() == 0) {
7ccd2d0a 10102 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
bcfc686d 10103 }
10104}
10105#endif /* __linux__ */
10106
10107static void daemonize(void) {
10108 int fd;
10109 FILE *fp;
10110
10111 if (fork() != 0) exit(0); /* parent exits */
10112 setsid(); /* create a new session */
10113
10114 /* Every output goes to /dev/null. If Redis is daemonized but
10115 * the 'logfile' is set to 'stdout' in the configuration file
10116 * it will not log at all. */
10117 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10118 dup2(fd, STDIN_FILENO);
10119 dup2(fd, STDOUT_FILENO);
10120 dup2(fd, STDERR_FILENO);
10121 if (fd > STDERR_FILENO) close(fd);
10122 }
10123 /* Try to write the pid file */
10124 fp = fopen(server.pidfile,"w");
10125 if (fp) {
10126 fprintf(fp,"%d\n",getpid());
10127 fclose(fp);
56906eef 10128 }
56906eef 10129}
10130
42ab0172
AO
10131static void version() {
10132 printf("Redis server version %s\n", REDIS_VERSION);
10133 exit(0);
10134}
10135
/* Print the command line synopsis on standard error and terminate the
 * process with exit status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n", stderr);
    fputs("       ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
10141
bcfc686d 10142int main(int argc, char **argv) {
9651a787 10143 time_t start;
10144
bcfc686d 10145 initServerConfig();
10146 if (argc == 2) {
44efe66e 10147 if (strcmp(argv[1], "-v") == 0 ||
10148 strcmp(argv[1], "--version") == 0) version();
10149 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 10150 resetServerSaveParams();
10151 loadServerConfig(argv[1]);
723fb69b
AO
10152 } else if ((argc > 2)) {
10153 usage();
bcfc686d 10154 } else {
10155 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10156 }
bcfc686d 10157 if (server.daemonize) daemonize();
71c54b21 10158 initServer();
bcfc686d 10159 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10160#ifdef __linux__
10161 linuxOvercommitMemoryWarning();
10162#endif
9651a787 10163 start = time(NULL);
bcfc686d 10164 if (server.appendonly) {
10165 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 10166 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 10167 } else {
10168 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 10169 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 10170 }
bcfc686d 10171 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 10172 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 10173 aeMain(server.el);
10174 aeDeleteEventLoop(server.el);
10175 return 0;
10176}
10177
10178/* ============================= Backtrace support ========================= */
10179
10180#ifdef HAVE_BACKTRACE
10181static char *findFuncName(void *pointer, unsigned long *offset);
10182
56906eef 10183static void *getMcontextEip(ucontext_t *uc) {
10184#if defined(__FreeBSD__)
10185 return (void*) uc->uc_mcontext.mc_eip;
10186#elif defined(__dietlibc__)
10187 return (void*) uc->uc_mcontext.eip;
06db1f50 10188#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 10189 #if __x86_64__
10190 return (void*) uc->uc_mcontext->__ss.__rip;
10191 #else
56906eef 10192 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 10193 #endif
06db1f50 10194#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 10195 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 10196 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 10197 #else
10198 return (void*) uc->uc_mcontext->__ss.__eip;
e0a62c7f 10199 #endif
54bac49d 10200#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 10201 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 10202#elif defined(__ia64__) /* Linux IA64 */
10203 return (void*) uc->uc_mcontext.sc_ip;
10204#else
10205 return NULL;
56906eef 10206#endif
10207}
10208
10209static void segvHandler(int sig, siginfo_t *info, void *secret) {
10210 void *trace[100];
10211 char **messages = NULL;
10212 int i, trace_size = 0;
10213 unsigned long offset=0;
56906eef 10214 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 10215 sds infostring;
56906eef 10216 REDIS_NOTUSED(info);
10217
10218 redisLog(REDIS_WARNING,
10219 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 10220 infostring = genRedisInfoString();
10221 redisLog(REDIS_WARNING, "%s",infostring);
10222 /* It's not safe to sdsfree() the returned string under memory
10223 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 10224
56906eef 10225 trace_size = backtrace(trace, 100);
de96dbfe 10226 /* overwrite sigaction with caller's address */
b91cf5ef 10227 if (getMcontextEip(uc) != NULL) {
10228 trace[1] = getMcontextEip(uc);
10229 }
56906eef 10230 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 10231
d76412d1 10232 for (i=1; i<trace_size; ++i) {
56906eef 10233 char *fn = findFuncName(trace[i], &offset), *p;
10234
10235 p = strchr(messages[i],'+');
10236 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
10237 redisLog(REDIS_WARNING,"%s", messages[i]);
10238 } else {
10239 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10240 }
10241 }
b177fd30 10242 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 10243 _exit(0);
fe3bbfbe 10244}
56906eef 10245
10246static void setupSigSegvAction(void) {
10247 struct sigaction act;
10248
10249 sigemptyset (&act.sa_mask);
10250 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10251 * is used. Otherwise, sa_handler is used */
10252 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10253 act.sa_sigaction = segvHandler;
10254 sigaction (SIGSEGV, &act, NULL);
10255 sigaction (SIGBUS, &act, NULL);
12fea928 10256 sigaction (SIGFPE, &act, NULL);
10257 sigaction (SIGILL, &act, NULL);
10258 sigaction (SIGBUS, &act, NULL);
e65fdc78 10259 return;
56906eef 10260}
e65fdc78 10261
bcfc686d 10262#include "staticsymbols.h"
10263/* This function try to convert a pointer into a function name. It's used in
10264 * oreder to provide a backtrace under segmentation fault that's able to
10265 * display functions declared as static (otherwise the backtrace is useless). */
10266static char *findFuncName(void *pointer, unsigned long *offset){
10267 int i, ret = -1;
10268 unsigned long off, minoff = 0;
ed9b544e 10269
bcfc686d 10270 /* Try to match against the Symbol with the smallest offset */
10271 for (i=0; symsTable[i].pointer; i++) {
10272 unsigned long lp = (unsigned long) pointer;
0bc03378 10273
bcfc686d 10274 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10275 off=lp-symsTable[i].pointer;
10276 if (ret < 0 || off < minoff) {
10277 minoff=off;
10278 ret=i;
10279 }
10280 }
0bc03378 10281 }
bcfc686d 10282 if (ret == -1) return NULL;
10283 *offset = minoff;
10284 return symsTable[ret].name;
0bc03378 10285}
bcfc686d 10286#else /* HAVE_BACKTRACE */
/* Without HAVE_BACKTRACE there is no handler to install: this stub only
 * exists so that the function can be called unconditionally. */
static void setupSigSegvAction(void) {
    /* nothing to do */
}
bcfc686d 10289#endif /* HAVE_BACKTRACE */
0bc03378 10290
ed9b544e 10291
ed9b544e 10292
bcfc686d 10293/* The End */
10294
10295
ed9b544e 10296