]> git.saurik.com Git - redis.git/blame - redis.c
removed a no longer true assert in the VM code
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
cac154c5 30#define REDIS_VERSION "1.3.8"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
c9468bcf 40#define __USE_POSIX199309
54bac49d 41#define __USE_UNIX98
ed9b544e 42#include <signal.h>
fbf9bcdb 43
44#ifdef HAVE_BACKTRACE
c9468bcf 45#include <execinfo.h>
46#include <ucontext.h>
fbf9bcdb 47#endif /* HAVE_BACKTRACE */
48
ed9b544e 49#include <sys/wait.h>
50#include <errno.h>
51#include <assert.h>
52#include <ctype.h>
53#include <stdarg.h>
54#include <inttypes.h>
55#include <arpa/inet.h>
56#include <sys/stat.h>
57#include <fcntl.h>
58#include <sys/time.h>
59#include <sys/resource.h>
2895e862 60#include <sys/uio.h>
f78fd11b 61#include <limits.h>
a7866db6 62#include <math.h>
92f8e882 63#include <pthread.h>
0bc1b2f6 64
65#if defined(__sun)
5043dff3 66#include "solarisfixes.h"
67#endif
ed9b544e 68
c9468bcf 69#include "redis.h"
ed9b544e 70#include "ae.h" /* Event driven programming library */
71#include "sds.h" /* Dynamic safe strings */
72#include "anet.h" /* Networking the easy way */
73#include "dict.h" /* Hash tables */
74#include "adlist.h" /* Linked lists */
75#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 76#include "lzf.h" /* LZF compression library */
77#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
5234952b 78#include "zipmap.h"
ed9b544e 79
80/* Error codes */
81#define REDIS_OK 0
82#define REDIS_ERR -1
83
84/* Static server configuration */
85#define REDIS_SERVERPORT 6379 /* TCP port */
86#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 87#define REDIS_IOBUF_LEN 1024
ed9b544e 88#define REDIS_LOADBUF_LEN 1024
248ea310 89#define REDIS_STATIC_ARGS 8
ed9b544e 90#define REDIS_DEFAULT_DBNUM 16
91#define REDIS_CONFIGLINE_MAX 1024
92#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
1763929f 94#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* try to expire 10 keys/loop */
6f376729 95#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 96#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99#define REDIS_WRITEV_THRESHOLD 3
100/* Max number of iovecs used for each writev call */
101#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 102
103/* Hash table parameters */
104#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 105
106/* Command flags */
3fd78bcd 107#define REDIS_CMD_BULK 1 /* Bulk write command */
108#define REDIS_CMD_INLINE 2 /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
   this flag will return an error when the 'maxmemory' option is set in the
   config file and the server is using more than maxmemory bytes of memory.
   In short these commands are denied on low memory conditions. */
113#define REDIS_CMD_DENYOOM 4
4005fef1 114#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 115
116/* Object types */
117#define REDIS_STRING 0
118#define REDIS_LIST 1
119#define REDIS_SET 2
1812e024 120#define REDIS_ZSET 3
121#define REDIS_HASH 4
f78fd11b 122
/* Objects encoding. Some kinds of objects like Strings and Hashes can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of these values for this object. */
942a3961 126#define REDIS_ENCODING_RAW 0 /* Raw representation */
127#define REDIS_ENCODING_INT 1 /* Encoded as integer */
5234952b 128#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
129#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
942a3961 130
/* Human readable names of the object encodings, listed in the same order as
 * the REDIS_ENCODING_* values (RAW=0, INT=1, ZIPMAP=2, HT=3). */
static char *strencoding[] = {
    "raw",
    "int",
    "zipmap",
    "hashtable"
};
134
f78fd11b 135/* Object types only used for dumping to disk */
bb32ede5 136#define REDIS_EXPIRETIME 253
ed9b544e 137#define REDIS_SELECTDB 254
138#define REDIS_EOF 255
139
/* Defines related to the dump file format. Storing 32 bit lengths for short
 * keys requires a lot of space, so we check the most significant 2 bits of
 * the first byte to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: specially encoded object will follow. The six bits
 *           number specifies the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte, most DB keys, and many
 * values, will fit inside. */
f78fd11b 153#define REDIS_RDB_6BITLEN 0
154#define REDIS_RDB_14BITLEN 1
155#define REDIS_RDB_32BITLEN 2
17be1a4a 156#define REDIS_RDB_ENCVAL 3
f78fd11b 157#define REDIS_RDB_LENERR UINT_MAX
158
/* When a length of a string object stored on disk has the first two bits
 * set, the remaining six bits specify a special encoding for the object
 * according to the following defines: */
162#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
163#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
164#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 165#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
a4d1ba9a 166
75680a3c 167/* Virtual memory object->where field. */
168#define REDIS_VM_MEMORY 0 /* The object is on memory */
169#define REDIS_VM_SWAPPED 1 /* The object is on disk */
170#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
171#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
172
06224fec 173/* Virtual memory static configuration stuff.
174 * Check vmFindContiguousPages() to know more about this magic numbers. */
175#define REDIS_VM_MAX_NEAR_PAGES 65536
176#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 177#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 178#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
c953f24b 183#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 184
ed9b544e 185/* Client flags */
d5d55fc3 186#define REDIS_SLAVE 1 /* This client is a slave server */
187#define REDIS_MASTER 2 /* This client is a master server */
188#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
189#define REDIS_MULTI 8 /* This client is in a MULTI context */
190#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
191#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
ed9b544e 192
40d224a9 193/* Slave replication state - slave side */
ed9b544e 194#define REDIS_REPL_NONE 0 /* No active replication */
195#define REDIS_REPL_CONNECT 1 /* Must connect to master */
196#define REDIS_REPL_CONNECTED 2 /* Connected to master */
197
40d224a9 198/* Slave replication state - from the point of view of master
199 * Note that in SEND_BULK and ONLINE state the slave receives new updates
200 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
201 * to start the next background saving in order to send updates to it. */
202#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
203#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
204#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
205#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
206
ed9b544e 207/* List related stuff */
208#define REDIS_HEAD 0
209#define REDIS_TAIL 1
210
211/* Sort operations */
212#define REDIS_SORT_GET 0
443c6409 213#define REDIS_SORT_ASC 1
214#define REDIS_SORT_DESC 2
ed9b544e 215#define REDIS_SORTKEY_MAX 1024
216
217/* Log levels */
218#define REDIS_DEBUG 0
f870935d 219#define REDIS_VERBOSE 1
220#define REDIS_NOTICE 2
221#define REDIS_WARNING 3
ed9b544e 222
/* Anti-warning macro: evaluates V and discards the result, so that an
 * otherwise unused variable or parameter does not trigger a compiler warning.
 * The argument is parenthesized so that the (void) cast applies to the whole
 * expression, not just to its first operand. */
#define REDIS_NOTUSED(V) ((void)(V))
225
6b47e12e 226#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
227#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 228
48f0308a 229/* Append only defines */
230#define APPENDFSYNC_NO 0
231#define APPENDFSYNC_ALWAYS 1
232#define APPENDFSYNC_EVERYSEC 2
233
cbba7dd7 234/* Hashes related defaults */
235#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
236#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
237
/* We can print the stacktrace, so our assert is defined this way:
 * when the expression _e evaluates to false, _redisAssert() is called with
 * the stringified expression, the source file and the line number, and then
 * the process is terminated with _exit(1). When _e is true the macro is a
 * no-op expression of type void. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
static void _redisAssert(char *estr, char *file, int line);
dfc5e96c 241
ed9b544e 242/*================================= Data types ============================== */
243
244/* A redis object, that is a type able to hold a string / list / set */
75680a3c 245
/* The VM object structure: swap file location and last access time of an
 * object.
 * NOTE(review): the trailing `vm` declarator also defines a file-scope
 * variable named `vm` of this type. It looks unintentional -- confirm that
 * nothing references it before removing it. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
} vm;
252
/* The actual Redis Object */
typedef struct redisObject {
    void *ptr;              /* Pointer to the object payload */
    unsigned char type;     /* Object type, e.g. REDIS_STRING */
    unsigned char encoding; /* Internal representation, e.g.
                             * REDIS_ENCODING_RAW */
    unsigned char storage;  /* If this object is a key, where is the value?
                             * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
    unsigned char vtype; /* If this object is a key, and value is swapped out,
                          * this is the type of the swapped out object. */
    int refcount;           /* Reference count */
    /* VM fields, these are only allocated if VM is active, otherwise the
     * object allocation function will just allocate
     * sizeof(redisObject) minus sizeof(redisObjectVM), so using
     * Redis without VM active will not have any overhead. */
    struct redisObjectVM vm;
} robj;
269
/* Macro used to initialize a Redis object allocated on the stack as a raw
 * string object: refcount is set to 1, type to REDIS_STRING, encoding to
 * REDIS_ENCODING_RAW and ptr to _ptr. The 'storage' field is set to
 * REDIS_VM_MEMORY only when server.vm_enabled is true.
 *
 * _var is the object itself (an lvalue of type robj, not a pointer).
 * Both arguments are parenthesized in the expansion so that expressions
 * such as *objp or base+offset expand with the intended precedence.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * NOTE(review): the ';' after while(0) is retained on purpose, since call
 * sites not visible here may rely on it. As a consequence the macro cannot
 * be used as the unbraced body of an if that has an else branch. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0);
281
/* A single Redis database: its keyspace plus the auxiliary dictionaries
 * tracking expires and blocked clients. */
typedef struct redisDb {
    dict *dict;                 /* The keyspace for this DB */
    dict *expires;              /* Timeout of keys with a timeout set */
    dict *blockingkeys;         /* Keys with clients waiting for data (BLPOP) */
    dict *io_keys;              /* Keys with clients waiting for VM I/O */
    int id;                     /* Database identifier */
} redisDb;
289
/* Client MULTI/EXEC state */

/* One command queued inside a MULTI context: its argument vector, the
 * number of arguments and the command table entry it refers to. */
typedef struct multiCmd {
    robj **argv;
    int argc;
    struct redisCommand *cmd;
} multiCmd;

/* The whole queue of commands of a MULTI context. */
typedef struct multiState {
    multiCmd *commands;     /* Array of MULTI commands */
    int count;              /* Total number of MULTI commands */
} multiState;
301
/* With multiplexing we need to keep per-client state.
 * Clients are kept in a linked list. */
typedef struct redisClient {
    int fd;                 /* Client socket file descriptor */
    redisDb *db;            /* Database this client operates on */
    int dictid;
    sds querybuf;           /* Query buffer */
    robj **argv, **mbargv;  /* Argument vectors (mb* = multi bulk) */
    int argc, mbargc;       /* Number of elements in argv / mbargv */
    int bulklen;            /* bulk read len. -1 if not in bulk read mode */
    int multibulk;          /* multi bulk command format active */
    list *reply;            /* List of reply objects */
    int sentlen;
    time_t lastinteraction; /* time of the last interaction, used for timeout */
    int flags;              /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
    int slaveseldb;         /* slave selected db, if this client is a slave */
    int authenticated;      /* when requirepass is non-NULL */
    int replstate;          /* replication state if this is a slave */
    int repldbfd;           /* replication DB file descriptor */
    long repldboff;         /* replication DB file offset */
    off_t repldbsize;       /* replication DB file size */
    multiState mstate;      /* MULTI/EXEC state */
    robj **blockingkeys;    /* The key we are waiting to terminate a blocking
                             * operation such as BLPOP. Otherwise NULL. */
    int blockingkeysnum;    /* Number of blocking keys */
    time_t blockingto;      /* Blocking operation timeout. If UNIX current time
                             * is >= blockingto then the operation timed out. */
    list *io_keys;          /* Keys this client is waiting to be loaded from the
                             * swap file in order to continue. */
    dict *pubsub_channels;  /* channels a client is interested in (SUBSCRIBE) */
    list *pubsub_patterns;  /* patterns a client is interested in (SUBSCRIBE) */
} redisClient;
334
/* One save point: a pair of a time interval in seconds and a number of
 * changes. NOTE(review): presumably a save is triggered once both thresholds
 * are reached -- confirm against the code using server.saveparams. */
struct saveparam {
    time_t seconds;
    int changes;
};
339
/* Global server state structure */
struct redisServer {
    int port;
    int fd;
    redisDb *db;
    long long dirty;            /* changes to DB from the last save */
    list *clients;
    list *slaves, *monitors;
    char neterr[ANET_ERR_LEN];
    aeEventLoop *el;
    int cronloops;              /* number of times the cron function run */
    list *objfreelist;          /* A list of freed objects to avoid malloc() */
    time_t lastsave;            /* Unix time of last successful save */
    /* Fields used only for stats */
    time_t stat_starttime;         /* server start time */
    long long stat_numcommands;    /* number of processed commands */
    long long stat_numconnections; /* number of connections received */
    long long stat_expiredkeys;    /* number of expired keys */
    /* Configuration */
    int verbosity;
    int glueoutputbuf;
    int maxidletime;
    int dbnum;
    int daemonize;
    int appendonly;
    int appendfsync;
    time_t lastfsync;
    int appendfd;
    int appendseldb;
    char *pidfile;
    pid_t bgsavechildpid;
    pid_t bgrewritechildpid;
    sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
    struct saveparam *saveparams;
    int saveparamslen;
    char *logfile;
    char *bindaddr;
    char *dbfilename;
    char *appendfilename;
    char *requirepass;
    int shareobjects;
    int rdbcompression;
    /* Replication related */
    int isslave;
    char *masterauth;
    char *masterhost;
    int masterport;
    redisClient *master;    /* client that is master for this slave */
    int replstate;
    unsigned int maxclients;
    unsigned long long maxmemory;
    unsigned int blpop_blocked_clients;
    unsigned int vm_blocked_clients;
    /* Sort parameters - qsort_r() is only available under BSD so we
     * have to keep this state global, in order to pass it to sortCompare() */
    int sort_desc;
    int sort_alpha;
    int sort_bypattern;
    /* Virtual memory configuration */
    int vm_enabled;
    char *vm_swap_file;
    off_t vm_page_size;
    off_t vm_pages;
    unsigned long long vm_max_memory;
    /* Hashes config */
    size_t hash_max_zipmap_entries;
    size_t hash_max_zipmap_value;
    /* Virtual memory state */
    FILE *vm_fp;
    int vm_fd;
    off_t vm_next_page; /* Next probably empty page */
    off_t vm_near_pages; /* Number of pages allocated sequentially */
    unsigned char *vm_bitmap; /* Bitmap of free/used pages */
    time_t unixtime;    /* Unix time sampled every second. */
    /* Virtual memory I/O threads stuff */
    /* An I/O thread processes an element taken from the io_newjobs queue and
     * puts the result of the operation in the io_processed list. While the
     * job is being processed, it's put on the io_processing queue. */
    list *io_newjobs; /* List of VM I/O jobs yet to be processed */
    list *io_processing; /* List of VM I/O jobs being processed */
    list *io_processed; /* List of VM I/O jobs already processed */
    list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
    pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
    pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
    pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
    pthread_attr_t io_threads_attr; /* attributes for threads creation */
    int io_active_threads; /* Number of running I/O threads */
    int vm_max_threads; /* Max number of I/O threads running at the same time */
    /* Our main thread is blocked on the event loop, looking for sockets ready
     * to be read or written, so when a threaded I/O operation is ready to be
     * processed by the main thread, the I/O thread will use a unix pipe to
     * awake the main thread. The following are the two pipe FDs. */
    int io_ready_pipe_read;
    int io_ready_pipe_write;
    /* Virtual memory stats */
    unsigned long long vm_stats_used_pages;
    unsigned long long vm_stats_swapped_objects;
    unsigned long long vm_stats_swapouts;
    unsigned long long vm_stats_swapins;
    /* Pubsub */
    dict *pubsub_channels; /* Map channels to list of subscribed clients */
    list *pubsub_patterns; /* A list of pubsub_patterns */
    /* Misc */
    FILE *devnull;
};
445
/* A pattern subscription: pairs a client with one pattern object. */
typedef struct pubsubPattern {
    redisClient *client;
    robj *pattern;
} pubsubPattern;
450
/* Signature shared by every command implementation. */
typedef void redisCommandProc(redisClient *c);

/* One entry of the command table. The table is initialized positionally,
 * so the order of the fields below must not change. */
struct redisCommand {
    char *name;             /* Command name, e.g. "get" */
    redisCommandProc *proc; /* Function implementing the command */
    int arity;              /* Number of arguments; NOTE(review): negative
                             * values presumably mean "at least N" -- confirm */
    int flags;              /* Bitwise OR of REDIS_CMD_* flags */
    /* Use a function to determine which keys need to be loaded
     * in the background prior to executing this command. Takes precedence
     * over vm_firstkey and others, ignored when NULL */
    redisCommandProc *vm_preload_proc;
    /* What keys should be loaded in background when calling this command? */
    int vm_firstkey; /* The first argument that's a key (0 = no keys) */
    int vm_lastkey;  /* The last argument that's a key */
    int vm_keystep;  /* The step between first and last key */
};
466
/* Associates a function name with an address stored as an unsigned long.
 * NOTE(review): presumably used to resolve addresses to names when printing
 * stack traces -- confirm against the users of this struct. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
471
/* An element being sorted: the object itself plus the value used to compare
 * it, which is either a numeric score or another object. */
typedef struct _redisSortObject {
    robj *obj;
    union {
        double score;
        robj *cmpobj;
    } u;
} redisSortObject;
479
/* An operation attached to a sort: its type (see the REDIS_SORT_* defines)
 * and the pattern object it applies to. */
typedef struct _redisSortOperation {
    int type;
    robj *pattern;
} redisSortOperation;
484
/* ZSETs use a specialized version of Skiplists */

/* A skiplist node: the stored object with its score, an array of forward
 * pointers, a single backward pointer and an array of spans.
 * NOTE(review): forward and span are presumably per-level arrays sized at
 * node creation -- confirm in the node allocation code. */
typedef struct zskiplistNode {
    struct zskiplistNode **forward;
    struct zskiplistNode *backward;
    unsigned int *span;
    double score;
    robj *obj;
} zskiplistNode;
494
/* The skiplist itself: header and tail nodes, number of elements and the
 * current level (bounded by ZSKIPLIST_MAXLEVEL -- TODO confirm in zslInsert). */
typedef struct zskiplist {
    struct zskiplistNode *header, *tail;
    unsigned long length;
    int level;
} zskiplist;
500
/* A sorted set is represented with both a hash table and a skiplist. */
typedef struct zset {
    dict *dict;
    zskiplist *zsl;
} zset;
505
/* Our shared "common" objects */

/* Pointers to preallocated objects, collected in the single file-scope
 * instance 'shared' defined together with the struct. */
struct sharedObjectsStruct {
    robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
    *colon, *nullbulk, *nullmultibulk, *queued,
    *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
    *outofrangeerr, *plus,
    *select0, *select1, *select2, *select3, *select4,
    *select5, *select6, *select7, *select8, *select9,
    *messagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
    *psubscribebulk, *punsubscribebulk;
} shared;
518
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */
522
523static double R_Zero, R_PosInf, R_NegInf, R_Nan;
524
/* VM threaded I/O request message */
#define REDIS_IOJOB_LOAD 0          /* Load from disk to memory */
#define REDIS_IOJOB_PREPARE_SWAP 1  /* Compute needed pages */
#define REDIS_IOJOB_DO_SWAP 2       /* Swap from memory to disk */
typedef struct iojob {
    int type;   /* Request type, REDIS_IOJOB_* */
    redisDb *db;/* Redis database */
    robj *key;  /* This I/O request is about swapping this key */
    robj *val;  /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
                 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
    off_t page; /* Swap page where to read/write the object */
    off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
    int canceled; /* True if this command was canceled by blocking side of VM */
    pthread_t thread; /* ID of the thread processing this entry */
} iojob;
92f8e882 540
ed9b544e 541/*================================ Prototypes =============================== */
542
543static void freeStringObject(robj *o);
544static void freeListObject(robj *o);
545static void freeSetObject(robj *o);
546static void decrRefCount(void *o);
547static robj *createObject(int type, void *ptr);
548static void freeClient(redisClient *c);
f78fd11b 549static int rdbLoad(char *filename);
ed9b544e 550static void addReply(redisClient *c, robj *obj);
551static void addReplySds(redisClient *c, sds s);
552static void incrRefCount(robj *o);
f78fd11b 553static int rdbSaveBackground(char *filename);
ed9b544e 554static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 555static robj *dupStringObject(robj *o);
248ea310 556static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
44b38ef4 557static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 558static int syncWithMaster(void);
942a3961 559static int tryObjectEncoding(robj *o);
9d65a1bb 560static robj *getDecodedObject(robj *o);
3305306f 561static int removeExpire(redisDb *db, robj *key);
562static int expireIfNeeded(redisDb *db, robj *key);
563static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 564static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 565static int deleteKey(redisDb *db, robj *key);
bb32ede5 566static time_t getExpire(redisDb *db, robj *key);
567static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 568static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 569static void freeMemoryIfNeeded(void);
de96dbfe 570static int processCommand(redisClient *c);
56906eef 571static void setupSigSegvAction(void);
a3b21203 572static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 573static void aofRemoveTempFile(pid_t childpid);
0ea663ea 574static size_t stringObjectLen(robj *o);
638e42ac 575static void processInputBuffer(redisClient *c);
6b47e12e 576static zskiplist *zslCreate(void);
fd8ccf44 577static void zslFree(zskiplist *zsl);
2b59cfdf 578static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 579static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 580static void initClientMultiState(redisClient *c);
581static void freeClientMultiState(redisClient *c);
582static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 583static void unblockClientWaitingData(redisClient *c);
4409877e 584static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 585static void vmInit(void);
a35ddf12 586static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 587static robj *vmLoadObject(robj *key);
7e69548d 588static robj *vmPreviewObject(robj *key);
a69a0c9c 589static int vmSwapOneObjectBlocking(void);
590static int vmSwapOneObjectThreaded(void);
7e69548d 591static int vmCanSwapOut(void);
a5819310 592static int tryFreeOneObjectFromFreelist(void);
996cb5f7 593static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
594static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
595static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 596static void lockThreadedIO(void);
597static void unlockThreadedIO(void);
598static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
599static void freeIOJob(iojob *j);
600static void queueIOJob(iojob *j);
a5819310 601static int vmWriteObjectOnSwap(robj *o, off_t page);
602static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 603static void waitEmptyIOJobsQueue(void);
604static void vmReopenSwapFile(void);
970e10bb 605static int vmFreePage(off_t page);
76583ea4 606static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
d5d55fc3 607static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
608static int dontWaitForSwappedKey(redisClient *c, robj *key);
609static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
610static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
611static struct redisCommand *lookupCommand(char *name);
612static void call(redisClient *c, struct redisCommand *cmd);
613static void resetClient(redisClient *c);
ada386b2 614static void convertToRealHash(robj *o);
ffc6b7f8 615static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
616static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
617static void freePubsubPattern(void *p);
618static int listMatchPubsubPattern(void *a, void *b);
619static int compareStringObjects(robj *a, robj *b);
/* Declared with an explicit (void) parameter list: an empty list "()" is an
 * old-style declaration that leaves the parameters unspecified, so calls
 * with stray arguments would not be diagnosed. */
static void usage(void);
ed9b544e 621
abcb223e 622static void authCommand(redisClient *c);
ed9b544e 623static void pingCommand(redisClient *c);
624static void echoCommand(redisClient *c);
625static void setCommand(redisClient *c);
626static void setnxCommand(redisClient *c);
627static void getCommand(redisClient *c);
628static void delCommand(redisClient *c);
629static void existsCommand(redisClient *c);
630static void incrCommand(redisClient *c);
631static void decrCommand(redisClient *c);
632static void incrbyCommand(redisClient *c);
633static void decrbyCommand(redisClient *c);
634static void selectCommand(redisClient *c);
635static void randomkeyCommand(redisClient *c);
636static void keysCommand(redisClient *c);
637static void dbsizeCommand(redisClient *c);
638static void lastsaveCommand(redisClient *c);
639static void saveCommand(redisClient *c);
640static void bgsaveCommand(redisClient *c);
9d65a1bb 641static void bgrewriteaofCommand(redisClient *c);
ed9b544e 642static void shutdownCommand(redisClient *c);
643static void moveCommand(redisClient *c);
644static void renameCommand(redisClient *c);
645static void renamenxCommand(redisClient *c);
646static void lpushCommand(redisClient *c);
647static void rpushCommand(redisClient *c);
648static void lpopCommand(redisClient *c);
649static void rpopCommand(redisClient *c);
650static void llenCommand(redisClient *c);
651static void lindexCommand(redisClient *c);
652static void lrangeCommand(redisClient *c);
653static void ltrimCommand(redisClient *c);
654static void typeCommand(redisClient *c);
655static void lsetCommand(redisClient *c);
656static void saddCommand(redisClient *c);
657static void sremCommand(redisClient *c);
a4460ef4 658static void smoveCommand(redisClient *c);
ed9b544e 659static void sismemberCommand(redisClient *c);
660static void scardCommand(redisClient *c);
12fea928 661static void spopCommand(redisClient *c);
2abb95a9 662static void srandmemberCommand(redisClient *c);
ed9b544e 663static void sinterCommand(redisClient *c);
664static void sinterstoreCommand(redisClient *c);
40d224a9 665static void sunionCommand(redisClient *c);
666static void sunionstoreCommand(redisClient *c);
f4f56e1d 667static void sdiffCommand(redisClient *c);
668static void sdiffstoreCommand(redisClient *c);
ed9b544e 669static void syncCommand(redisClient *c);
670static void flushdbCommand(redisClient *c);
671static void flushallCommand(redisClient *c);
672static void sortCommand(redisClient *c);
673static void lremCommand(redisClient *c);
0f5f7e9a 674static void rpoplpushcommand(redisClient *c);
ed9b544e 675static void infoCommand(redisClient *c);
70003d28 676static void mgetCommand(redisClient *c);
87eca727 677static void monitorCommand(redisClient *c);
3305306f 678static void expireCommand(redisClient *c);
802e8373 679static void expireatCommand(redisClient *c);
f6b141c5 680static void getsetCommand(redisClient *c);
fd88489a 681static void ttlCommand(redisClient *c);
321b0e13 682static void slaveofCommand(redisClient *c);
7f957c92 683static void debugCommand(redisClient *c);
f6b141c5 684static void msetCommand(redisClient *c);
685static void msetnxCommand(redisClient *c);
fd8ccf44 686static void zaddCommand(redisClient *c);
7db723ad 687static void zincrbyCommand(redisClient *c);
cc812361 688static void zrangeCommand(redisClient *c);
50c55df5 689static void zrangebyscoreCommand(redisClient *c);
f44dd428 690static void zcountCommand(redisClient *c);
e3870fab 691static void zrevrangeCommand(redisClient *c);
3c41331e 692static void zcardCommand(redisClient *c);
1b7106e7 693static void zremCommand(redisClient *c);
6e333bbe 694static void zscoreCommand(redisClient *c);
1807985b 695static void zremrangebyscoreCommand(redisClient *c);
6e469882 696static void multiCommand(redisClient *c);
697static void execCommand(redisClient *c);
18b6cb76 698static void discardCommand(redisClient *c);
4409877e 699static void blpopCommand(redisClient *c);
700static void brpopCommand(redisClient *c);
4b00bebd 701static void appendCommand(redisClient *c);
39191553 702static void substrCommand(redisClient *c);
69d95c3e 703static void zrankCommand(redisClient *c);
798d9e55 704static void zrevrankCommand(redisClient *c);
978c2c94 705static void hsetCommand(redisClient *c);
706static void hgetCommand(redisClient *c);
07efaf74 707static void hdelCommand(redisClient *c);
92b27fe9 708static void hlenCommand(redisClient *c);
9212eafd 709static void zremrangebyrankCommand(redisClient *c);
2830ca53
PN
710static void zunionCommand(redisClient *c);
711static void zinterCommand(redisClient *c);
78409a0f 712static void hkeysCommand(redisClient *c);
713static void hvalsCommand(redisClient *c);
714static void hgetallCommand(redisClient *c);
a86f14b1 715static void hexistsCommand(redisClient *c);
500ece7c 716static void configCommand(redisClient *c);
01426b05 717static void hincrbyCommand(redisClient *c);
befec3cd 718static void subscribeCommand(redisClient *c);
719static void unsubscribeCommand(redisClient *c);
ffc6b7f8 720static void psubscribeCommand(redisClient *c);
721static void punsubscribeCommand(redisClient *c);
befec3cd 722static void publishCommand(redisClient *c);
f6b141c5 723
ed9b544e 724/*================================= Globals ================================= */
725
726/* Global vars */
727static struct redisServer server; /* server global state */
728static struct redisCommand cmdTable[] = {
76583ea4
PN
729 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
730 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
731 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
732 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
733 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
734 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
735 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
736 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
737 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
738 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
739 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
740 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
741 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
742 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
743 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
744 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
745 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
746 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
748 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
749 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
750 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
751 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
752 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
753 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
754 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
755 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
756 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
757 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
758 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
760 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
761 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
762 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
763 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
764 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
765 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
766 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
767 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
768 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
769 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
770 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
771 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
772 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
773 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
779 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
780 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
781 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
01426b05 782 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
783 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
784 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
785 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
786 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 789 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
790 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
791 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
792 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
793 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
794 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
795 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
796 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
797 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
800 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
801 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
802 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
803 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
804 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
805 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
806 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
807 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
808 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
809 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
810 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
811 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
812 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
813 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
958cd5f3 814 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
815 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
816 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
818 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
819 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
820 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
823 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
824 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 825 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 826 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 828 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 830 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
76583ea4 831 {NULL,NULL,0,0,NULL,0,0,0}
ed9b544e 832};
bcfc686d 833
ed9b544e 834/*============================ Utility functions ============================ */
835
/* Case folding for stringmatchlen(). tolower() is only defined for values
 * representable as unsigned char (or EOF), while bytes read through a plain
 * 'char' may be negative: those are returned untouched. */
static int stringmatchFoldCase(int ch) {
    return (ch < 0) ? ch : tolower(ch);
}

/* Glob-style pattern matching over length-delimited buffers.
 *
 * Supported syntax: '*' (any run of characters, also empty), '?' (exactly one
 * character), '[...]' (character class, with '^' negation, 'a-z' ranges and
 * '\' escapes) and '\' to take the next pattern character literally.
 *
 * pattern/patternLen: the pattern and its length in bytes.
 * string/stringLen:   the subject and its length in bytes.
 * nocase:             non zero to compare letters case-insensitively.
 *
 * Returns 1 on match, 0 otherwise.
 *
 * Neither buffer needs to be NUL terminated: no byte at or after
 * pattern+patternLen or string+stringLen is read. A construct that needs a
 * character to consume ('[...]' or a literal) never matches once the string
 * is exhausted. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*', staying inside the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try to match the rest of the pattern at every suffix. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            if (stringLen == 0)
                return 0; /* a class always consumes one character */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * pattern++ after the switch lands on the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (patternLen >= 2 && pattern[0] == '\\') {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = stringmatchFoldCase(start);
                        end = stringmatchFoldCase(end);
                        c = stringmatchFoldCase(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    /* Plain member. A trailing lone '\' also ends up here
                     * and is compared literally. */
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (stringmatchFoldCase(pattern[0]) ==
                            stringmatchFoldCase(string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* nothing left to compare against */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (stringmatchFoldCase(pattern[0]) !=
                    stringmatchFoldCase(string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String consumed: only trailing '*' may remain in the pattern. */
            while(patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
958
/* Glob-style match of two NUL terminated strings: measures both with
 * strlen() and hands them to stringmatchlen(). Returns its result. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
962
56906eef 963static void redisLog(int level, const char *fmt, ...) {
ed9b544e 964 va_list ap;
965 FILE *fp;
966
967 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
968 if (!fp) return;
969
970 va_start(ap, fmt);
971 if (level >= server.verbosity) {
6766f45e 972 char *c = ".-*#";
1904ecc1 973 char buf[64];
974 time_t now;
975
976 now = time(NULL);
6c9385e0 977 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 978 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 979 vfprintf(fp, fmt, ap);
980 fprintf(fp,"\n");
981 fflush(fp);
982 }
983 va_end(ap);
984
985 if (server.logfile) fclose(fp);
986}
987
988/*====================== Hash table type implementation ==================== */
989
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
993
/* Dict destructor callback that releases 'val' with zfree().
 * 'privdata' is not used. */
static void dictVanillaFree(void *privdata, void *val)
{
    zfree(val);
    DICT_NOTUSED(privdata);
}
999
4409877e 1000static void dictListDestructor(void *privdata, void *val)
1001{
1002 DICT_NOTUSED(privdata);
1003 listRelease((list*)val);
1004}
1005
ed9b544e 1006static int sdsDictKeyCompare(void *privdata, const void *key1,
1007 const void *key2)
1008{
1009 int l1,l2;
1010 DICT_NOTUSED(privdata);
1011
1012 l1 = sdslen((sds)key1);
1013 l2 = sdslen((sds)key2);
1014 if (l1 != l2) return 0;
1015 return memcmp(key1, key2, l1) == 0;
1016}
1017
/* Dict destructor callback for redis objects: drops one reference with
 * decrRefCount(). 'privdata' is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
1025
942a3961 1026static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1027 const void *key2)
1028{
1029 const robj *o1 = key1, *o2 = key2;
1030 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1031}
1032
942a3961 1033static unsigned int dictObjHash(const void *key) {
ed9b544e 1034 const robj *o = key;
1035 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1036}
1037
942a3961 1038static int dictEncObjKeyCompare(void *privdata, const void *key1,
1039 const void *key2)
1040{
9d65a1bb 1041 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1042 int cmp;
942a3961 1043
2a1198b4 1044 if (o1->encoding == REDIS_ENCODING_INT &&
1045 o2->encoding == REDIS_ENCODING_INT &&
db5946fc 1046 o1->ptr == o2->ptr) return 1;
2a1198b4 1047
9d65a1bb 1048 o1 = getDecodedObject(o1);
1049 o2 = getDecodedObject(o2);
1050 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1051 decrRefCount(o1);
1052 decrRefCount(o2);
1053 return cmp;
942a3961 1054}
1055
1056static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1057 robj *o = (robj*) key;
942a3961 1058
ed9e4966 1059 if (o->encoding == REDIS_ENCODING_RAW) {
1060 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1061 } else {
1062 if (o->encoding == REDIS_ENCODING_INT) {
1063 char buf[32];
1064 int len;
1065
1066 len = snprintf(buf,32,"%ld",(long)o->ptr);
1067 return dictGenHashFunction((unsigned char*)buf, len);
1068 } else {
1069 unsigned int hash;
1070
1071 o = getDecodedObject(o);
1072 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1073 decrRefCount(o);
1074 return hash;
1075 }
1076 }
942a3961 1077}
1078
f2d9f50f 1079/* Sets type and expires */
ed9b544e 1080static dictType setDictType = {
942a3961 1081 dictEncObjHash, /* hash function */
ed9b544e 1082 NULL, /* key dup */
1083 NULL, /* val dup */
942a3961 1084 dictEncObjKeyCompare, /* key compare */
ed9b544e 1085 dictRedisObjectDestructor, /* key destructor */
1086 NULL /* val destructor */
1087};
1088
f2d9f50f 1089/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1812e024 1090static dictType zsetDictType = {
1091 dictEncObjHash, /* hash function */
1092 NULL, /* key dup */
1093 NULL, /* val dup */
1094 dictEncObjKeyCompare, /* key compare */
1095 dictRedisObjectDestructor, /* key destructor */
da0a1620 1096 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1097};
1098
f2d9f50f 1099/* Db->dict */
5234952b 1100static dictType dbDictType = {
942a3961 1101 dictObjHash, /* hash function */
ed9b544e 1102 NULL, /* key dup */
1103 NULL, /* val dup */
942a3961 1104 dictObjKeyCompare, /* key compare */
ed9b544e 1105 dictRedisObjectDestructor, /* key destructor */
1106 dictRedisObjectDestructor /* val destructor */
1107};
1108
f2d9f50f 1109/* Db->expires */
1110static dictType keyptrDictType = {
1111 dictObjHash, /* hash function */
1112 NULL, /* key dup */
1113 NULL, /* val dup */
1114 dictObjKeyCompare, /* key compare */
1115 dictRedisObjectDestructor, /* key destructor */
1116 NULL /* val destructor */
1117};
1118
5234952b 1119/* Hash type hash table (note that small hashes are represented with zimpaps) */
1120static dictType hashDictType = {
1121 dictEncObjHash, /* hash function */
1122 NULL, /* key dup */
1123 NULL, /* val dup */
1124 dictEncObjKeyCompare, /* key compare */
1125 dictRedisObjectDestructor, /* key destructor */
1126 dictRedisObjectDestructor /* val destructor */
1127};
1128
4409877e 1129/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1130 * lists as values. It's used for blocking operations (BLPOP) and to
1131 * map swapped keys to a list of clients waiting for this keys to be loaded. */
4409877e 1132static dictType keylistDictType = {
1133 dictObjHash, /* hash function */
1134 NULL, /* key dup */
1135 NULL, /* val dup */
1136 dictObjKeyCompare, /* key compare */
1137 dictRedisObjectDestructor, /* key destructor */
1138 dictListDestructor /* val destructor */
1139};
1140
42ab0172
AO
1141static void version();
1142
ed9b544e 1143/* ========================= Random utility functions ======================= */
1144
1145/* Redis generally does not try to recover from out of memory conditions
1146 * when allocating objects or strings, it is not clear if it will be possible
1147 * to report this condition to the client since the networking layer itself
1148 * is based on heap allocation for send buffers, so we simply abort.
1149 * At least the code will be simpler to read... */
1150static void oom(const char *msg) {
71c54b21 1151 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1152 sleep(1);
1153 abort();
1154}
1155
1156/* ====================== Redis server networking stuff ===================== */
56906eef 1157static void closeTimedoutClients(void) {
ed9b544e 1158 redisClient *c;
ed9b544e 1159 listNode *ln;
1160 time_t now = time(NULL);
c7df85a4 1161 listIter li;
ed9b544e 1162
c7df85a4 1163 listRewind(server.clients,&li);
1164 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1165 c = listNodeValue(ln);
f86a74e9 1166 if (server.maxidletime &&
1167 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1168 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1169 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1170 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1171 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1172 {
f870935d 1173 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1174 freeClient(c);
f86a74e9 1175 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1176 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1177 addReply(c,shared.nullmultibulk);
b0d8747d 1178 unblockClientWaitingData(c);
f86a74e9 1179 }
ed9b544e 1180 }
1181 }
ed9b544e 1182}
1183
12fea928 1184static int htNeedsResize(dict *dict) {
1185 long long size, used;
1186
1187 size = dictSlots(dict);
1188 used = dictSize(dict);
1189 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1190 (used*100/size < REDIS_HT_MINFILL));
1191}
1192
0bc03378 1193/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1194 * we resize the hash table to save memory */
56906eef 1195static void tryResizeHashTables(void) {
0bc03378 1196 int j;
1197
1198 for (j = 0; j < server.dbnum; j++) {
12fea928 1199 if (htNeedsResize(server.db[j].dict)) {
f870935d 1200 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
0bc03378 1201 dictResize(server.db[j].dict);
f870935d 1202 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
0bc03378 1203 }
12fea928 1204 if (htNeedsResize(server.db[j].expires))
1205 dictResize(server.db[j].expires);
0bc03378 1206 }
1207}
1208
9d65a1bb 1209/* A background saving child (BGSAVE) terminated its work. Handle this. */
1210void backgroundSaveDoneHandler(int statloc) {
1211 int exitcode = WEXITSTATUS(statloc);
1212 int bysignal = WIFSIGNALED(statloc);
1213
1214 if (!bysignal && exitcode == 0) {
1215 redisLog(REDIS_NOTICE,
1216 "Background saving terminated with success");
1217 server.dirty = 0;
1218 server.lastsave = time(NULL);
1219 } else if (!bysignal && exitcode != 0) {
1220 redisLog(REDIS_WARNING, "Background saving error");
1221 } else {
1222 redisLog(REDIS_WARNING,
454eea7c 1223 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1224 rdbRemoveTempFile(server.bgsavechildpid);
1225 }
1226 server.bgsavechildpid = -1;
1227 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1228 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1229 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1230}
1231
1232/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1233 * Handle this. */
1234void backgroundRewriteDoneHandler(int statloc) {
1235 int exitcode = WEXITSTATUS(statloc);
1236 int bysignal = WIFSIGNALED(statloc);
1237
1238 if (!bysignal && exitcode == 0) {
1239 int fd;
1240 char tmpfile[256];
1241
1242 redisLog(REDIS_NOTICE,
1243 "Background append only file rewriting terminated with success");
1244 /* Now it's time to flush the differences accumulated by the parent */
1245 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1246 fd = open(tmpfile,O_WRONLY|O_APPEND);
1247 if (fd == -1) {
1248 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1249 goto cleanup;
1250 }
1251 /* Flush our data... */
1252 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1253 (signed) sdslen(server.bgrewritebuf)) {
1254 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1255 close(fd);
1256 goto cleanup;
1257 }
b32627cd 1258 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1259 /* Now our work is to rename the temp file into the stable file. And
1260 * switch the file descriptor used by the server for append only. */
1261 if (rename(tmpfile,server.appendfilename) == -1) {
1262 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1263 close(fd);
1264 goto cleanup;
1265 }
1266 /* Mission completed... almost */
1267 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1268 if (server.appendfd != -1) {
1269 /* If append only is actually enabled... */
1270 close(server.appendfd);
1271 server.appendfd = fd;
1272 fsync(fd);
85a83172 1273 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1274 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1275 } else {
1276 /* If append only is disabled we just generate a dump in this
1277 * format. Why not? */
1278 close(fd);
1279 }
1280 } else if (!bysignal && exitcode != 0) {
1281 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1282 } else {
1283 redisLog(REDIS_WARNING,
454eea7c 1284 "Background append only file rewriting terminated by signal %d",
1285 WTERMSIG(statloc));
9d65a1bb 1286 }
1287cleanup:
1288 sdsfree(server.bgrewritebuf);
1289 server.bgrewritebuf = sdsempty();
1290 aofRemoveTempFile(server.bgrewritechildpid);
1291 server.bgrewritechildpid = -1;
1292}
1293
884d4b39 1294/* This function is called once a background process of some kind terminates,
1295 * as we want to avoid resizing the hash tables when there is a child in order
1296 * to play well with copy-on-write (otherwise when a resize happens lots of
1297 * memory pages are copied). The goal of this function is to update the ability
1298 * for dict.c to resize the hash tables accordingly to the fact we have o not
1299 * running childs. */
1300static void updateDictResizePolicy(void) {
1301 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1302 dictEnableResize();
1303 else
1304 dictDisableResize();
1305}
1306
56906eef 1307static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1308 int j, loops = server.cronloops++;
ed9b544e 1309 REDIS_NOTUSED(eventLoop);
1310 REDIS_NOTUSED(id);
1311 REDIS_NOTUSED(clientData);
1312
3a66edc7 1313 /* We take a cached value of the unix time in the global state because
1314 * with virtual memory and aging there is to store the current time
1315 * in objects at every object access, and accuracy is not needed.
1316 * To access a global var is faster than calling time(NULL) */
1317 server.unixtime = time(NULL);
1318
0bc03378 1319 /* Show some info about non-empty databases */
ed9b544e 1320 for (j = 0; j < server.dbnum; j++) {
dec423d9 1321 long long size, used, vkeys;
94754ccc 1322
3305306f 1323 size = dictSlots(server.db[j].dict);
1324 used = dictSize(server.db[j].dict);
94754ccc 1325 vkeys = dictSize(server.db[j].expires);
1763929f 1326 if (!(loops % 50) && (used || vkeys)) {
f870935d 1327 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1328 /* dictPrintStats(server.dict); */
ed9b544e 1329 }
ed9b544e 1330 }
1331
0bc03378 1332 /* We don't want to resize the hash tables while a bacground saving
1333 * is in progress: the saving child is created using fork() that is
1334 * implemented with a copy-on-write semantic in most modern systems, so
1335 * if we resize the HT while there is the saving child at work actually
1336 * a lot of memory movements in the parent will cause a lot of pages
1337 * copied. */
884d4b39 1338 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1 &&
1339 !(loops % 10))
1340 {
1341 tryResizeHashTables();
1342 }
0bc03378 1343
ed9b544e 1344 /* Show information about connected clients */
1763929f 1345 if (!(loops % 50)) {
bdcb92f2 1346 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1347 listLength(server.clients)-listLength(server.slaves),
1348 listLength(server.slaves),
bdcb92f2 1349 zmalloc_used_memory());
ed9b544e 1350 }
1351
1352 /* Close connections of timedout clients */
1763929f 1353 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1354 closeTimedoutClients();
1355
9d65a1bb 1356 /* Check if a background saving or AOF rewrite in progress terminated */
1357 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1358 int statloc;
9d65a1bb 1359 pid_t pid;
1360
1361 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1362 if (pid == server.bgsavechildpid) {
1363 backgroundSaveDoneHandler(statloc);
ed9b544e 1364 } else {
9d65a1bb 1365 backgroundRewriteDoneHandler(statloc);
ed9b544e 1366 }
884d4b39 1367 updateDictResizePolicy();
ed9b544e 1368 }
1369 } else {
1370 /* If there is not a background saving in progress check if
1371 * we have to save now */
1372 time_t now = time(NULL);
1373 for (j = 0; j < server.saveparamslen; j++) {
1374 struct saveparam *sp = server.saveparams+j;
1375
1376 if (server.dirty >= sp->changes &&
1377 now-server.lastsave > sp->seconds) {
1378 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1379 sp->changes, sp->seconds);
f78fd11b 1380 rdbSaveBackground(server.dbfilename);
ed9b544e 1381 break;
1382 }
1383 }
1384 }
94754ccc 1385
f2324293 1386 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1387 * will use few CPU cycles if there are few expiring keys, otherwise
1388 * it will get more aggressive to avoid that too much memory is used by
1389 * keys that can be removed from the keyspace. */
94754ccc 1390 for (j = 0; j < server.dbnum; j++) {
f2324293 1391 int expired;
94754ccc 1392 redisDb *db = server.db+j;
94754ccc 1393
f2324293 1394 /* Continue to expire if at the end of the cycle more than 25%
1395 * of the keys were expired. */
1396 do {
4ef8de8a 1397 long num = dictSize(db->expires);
94754ccc 1398 time_t now = time(NULL);
1399
f2324293 1400 expired = 0;
94754ccc 1401 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1402 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1403 while (num--) {
1404 dictEntry *de;
1405 time_t t;
1406
1407 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1408 t = (time_t) dictGetEntryVal(de);
1409 if (now > t) {
1410 deleteKey(db,dictGetEntryKey(de));
f2324293 1411 expired++;
2a6a2ed1 1412 server.stat_expiredkeys++;
94754ccc 1413 }
1414 }
f2324293 1415 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1416 }
1417
4ef8de8a 1418 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1419 * is enbled. Try to free objects from the free list first. */
7e69548d 1420 if (vmCanSwapOut()) {
1421 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1422 server.vm_max_memory)
1423 {
72e9fd40 1424 int retval;
1425
a5819310 1426 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1427 retval = (server.vm_max_threads == 0) ?
1428 vmSwapOneObjectBlocking() :
1429 vmSwapOneObjectThreaded();
1763929f 1430 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1431 zmalloc_used_memory() >
1432 (server.vm_max_memory+server.vm_max_memory/10))
1433 {
1434 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1435 }
72e9fd40 1436 /* Note that when using threade I/O we free just one object,
1437 * because anyway when the I/O thread in charge to swap this
1438 * object out will finish, the handler of completed jobs
1439 * will try to swap more objects if we are still out of memory. */
1440 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1441 }
1442 }
1443
ed9b544e 1444 /* Check if we should connect to a MASTER */
1763929f 1445 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1446 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1447 if (syncWithMaster() == REDIS_OK) {
1448 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1449 }
1450 }
1763929f 1451 return 100;
ed9b544e 1452}
1453
d5d55fc3 1454/* This function gets called every time Redis is entering the
1455 * main loop of the event driven library, that is, before to sleep
1456 * for ready file descriptors. */
1457static void beforeSleep(struct aeEventLoop *eventLoop) {
1458 REDIS_NOTUSED(eventLoop);
1459
1460 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1461 listIter li;
1462 listNode *ln;
1463
1464 listRewind(server.io_ready_clients,&li);
1465 while((ln = listNext(&li))) {
1466 redisClient *c = ln->value;
1467 struct redisCommand *cmd;
1468
1469 /* Resume the client. */
1470 listDelNode(server.io_ready_clients,ln);
1471 c->flags &= (~REDIS_IO_WAIT);
1472 server.vm_blocked_clients--;
1473 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1474 readQueryFromClient, c);
1475 cmd = lookupCommand(c->argv[0]->ptr);
1476 assert(cmd != NULL);
1477 call(c,cmd);
1478 resetClient(c);
1479 /* There may be more data to process in the input buffer. */
1480 if (c->querybuf && sdslen(c->querybuf) > 0)
1481 processInputBuffer(c);
1482 }
1483 }
1484}
1485
ed9b544e 1486static void createSharedObjects(void) {
1487 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1488 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1489 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1490 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1491 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1492 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1493 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1494 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1495 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1496 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1497 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1498 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1499 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1500 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1501 "-ERR no such key\r\n"));
ed9b544e 1502 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1503 "-ERR syntax error\r\n"));
c937aa89 1504 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1505 "-ERR source and destination objects are the same\r\n"));
1506 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1507 "-ERR index out of range\r\n"));
ed9b544e 1508 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1509 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1510 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1511 shared.select0 = createStringObject("select 0\r\n",10);
1512 shared.select1 = createStringObject("select 1\r\n",10);
1513 shared.select2 = createStringObject("select 2\r\n",10);
1514 shared.select3 = createStringObject("select 3\r\n",10);
1515 shared.select4 = createStringObject("select 4\r\n",10);
1516 shared.select5 = createStringObject("select 5\r\n",10);
1517 shared.select6 = createStringObject("select 6\r\n",10);
1518 shared.select7 = createStringObject("select 7\r\n",10);
1519 shared.select8 = createStringObject("select 8\r\n",10);
1520 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1521 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1522 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1523 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1524 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1525 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1526 shared.mbulk3 = createStringObject("*3\r\n",4);
ed9b544e 1527}
1528
1529static void appendServerSaveParams(time_t seconds, int changes) {
1530 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1531 server.saveparams[server.saveparamslen].seconds = seconds;
1532 server.saveparams[server.saveparamslen].changes = changes;
1533 server.saveparamslen++;
1534}
1535
bcfc686d 1536static void resetServerSaveParams() {
ed9b544e 1537 zfree(server.saveparams);
1538 server.saveparams = NULL;
1539 server.saveparamslen = 0;
1540}
1541
1542static void initServerConfig() {
1543 server.dbnum = REDIS_DEFAULT_DBNUM;
1544 server.port = REDIS_SERVERPORT;
f870935d 1545 server.verbosity = REDIS_VERBOSE;
ed9b544e 1546 server.maxidletime = REDIS_MAXIDLETIME;
1547 server.saveparams = NULL;
1548 server.logfile = NULL; /* NULL = log on standard output */
1549 server.bindaddr = NULL;
1550 server.glueoutputbuf = 1;
1551 server.daemonize = 0;
44b38ef4 1552 server.appendonly = 0;
4e141d5a 1553 server.appendfsync = APPENDFSYNC_ALWAYS;
48f0308a 1554 server.lastfsync = time(NULL);
44b38ef4 1555 server.appendfd = -1;
1556 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1557 server.pidfile = zstrdup("/var/run/redis.pid");
1558 server.dbfilename = zstrdup("dump.rdb");
1559 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1560 server.requirepass = NULL;
10c43610 1561 server.shareobjects = 0;
b0553789 1562 server.rdbcompression = 1;
285add55 1563 server.maxclients = 0;
d5d55fc3 1564 server.blpop_blocked_clients = 0;
3fd78bcd 1565 server.maxmemory = 0;
75680a3c 1566 server.vm_enabled = 0;
054e426d 1567 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1568 server.vm_page_size = 256; /* 256 bytes per page */
1569 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1570 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1571 server.vm_max_threads = 4;
d5d55fc3 1572 server.vm_blocked_clients = 0;
cbba7dd7 1573 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1574 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
75680a3c 1575
bcfc686d 1576 resetServerSaveParams();
ed9b544e 1577
1578 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1579 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1580 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1581 /* Replication related */
1582 server.isslave = 0;
d0ccebcf 1583 server.masterauth = NULL;
ed9b544e 1584 server.masterhost = NULL;
1585 server.masterport = 6379;
1586 server.master = NULL;
1587 server.replstate = REDIS_REPL_NONE;
a7866db6 1588
1589 /* Double constants initialization */
1590 R_Zero = 0.0;
1591 R_PosInf = 1.0/R_Zero;
1592 R_NegInf = -1.0/R_Zero;
1593 R_Nan = R_Zero/R_Zero;
ed9b544e 1594}
1595
1596static void initServer() {
1597 int j;
1598
1599 signal(SIGHUP, SIG_IGN);
1600 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1601 setupSigSegvAction();
ed9b544e 1602
b9bc0eef 1603 server.devnull = fopen("/dev/null","w");
1604 if (server.devnull == NULL) {
1605 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1606 exit(1);
1607 }
ed9b544e 1608 server.clients = listCreate();
1609 server.slaves = listCreate();
87eca727 1610 server.monitors = listCreate();
ed9b544e 1611 server.objfreelist = listCreate();
1612 createSharedObjects();
1613 server.el = aeCreateEventLoop();
3305306f 1614 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1615 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1616 if (server.fd == -1) {
1617 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1618 exit(1);
1619 }
3305306f 1620 for (j = 0; j < server.dbnum; j++) {
5234952b 1621 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1622 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
4409877e 1623 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1624 if (server.vm_enabled)
1625 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1626 server.db[j].id = j;
1627 }
ffc6b7f8 1628 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1629 server.pubsub_patterns = listCreate();
1630 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1631 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1632 server.cronloops = 0;
9f3c422c 1633 server.bgsavechildpid = -1;
9d65a1bb 1634 server.bgrewritechildpid = -1;
1635 server.bgrewritebuf = sdsempty();
ed9b544e 1636 server.lastsave = time(NULL);
1637 server.dirty = 0;
ed9b544e 1638 server.stat_numcommands = 0;
1639 server.stat_numconnections = 0;
2a6a2ed1 1640 server.stat_expiredkeys = 0;
ed9b544e 1641 server.stat_starttime = time(NULL);
3a66edc7 1642 server.unixtime = time(NULL);
d8f8b666 1643 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1644 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1645 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1646
1647 if (server.appendonly) {
71eba477 1648 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1649 if (server.appendfd == -1) {
1650 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1651 strerror(errno));
1652 exit(1);
1653 }
1654 }
75680a3c 1655
1656 if (server.vm_enabled) vmInit();
ed9b544e 1657}
1658
1659/* Empty the whole database */
ca37e9cd 1660static long long emptyDb() {
ed9b544e 1661 int j;
ca37e9cd 1662 long long removed = 0;
ed9b544e 1663
3305306f 1664 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1665 removed += dictSize(server.db[j].dict);
3305306f 1666 dictEmpty(server.db[j].dict);
1667 dictEmpty(server.db[j].expires);
1668 }
ca37e9cd 1669 return removed;
ed9b544e 1670}
1671
/* Map a case-insensitive "yes" / "no" string to 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1677
ed9b544e 1678/* I agree, this is a very rudimental way to load a configuration...
1679 will improve later if the config gets more complex */
1680static void loadServerConfig(char *filename) {
c9a111ac 1681 FILE *fp;
ed9b544e 1682 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1683 int linenum = 0;
1684 sds line = NULL;
6bccf64a
AO
1685 char *errormsg = "Fatal error, can't open config file '%s'";
1686 char *errorbuf = zmalloc(sizeof(char)*(strlen(errormsg)+strlen(filename)));
1687 sprintf(errorbuf, errormsg, filename);
c9a111ac 1688
1689 if (filename[0] == '-' && filename[1] == '\0')
1690 fp = stdin;
1691 else {
1692 if ((fp = fopen(filename,"r")) == NULL) {
6bccf64a 1693 redisLog(REDIS_WARNING, errorbuf);
c9a111ac 1694 exit(1);
1695 }
ed9b544e 1696 }
c9a111ac 1697
ed9b544e 1698 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1699 sds *argv;
1700 int argc, j;
1701
1702 linenum++;
1703 line = sdsnew(buf);
1704 line = sdstrim(line," \t\r\n");
1705
1706 /* Skip comments and blank lines*/
1707 if (line[0] == '#' || line[0] == '\0') {
1708 sdsfree(line);
1709 continue;
1710 }
1711
1712 /* Split into arguments */
1713 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1714 sdstolower(argv[0]);
1715
1716 /* Execute config directives */
bb0b03a3 1717 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1718 server.maxidletime = atoi(argv[1]);
0150db36 1719 if (server.maxidletime < 0) {
ed9b544e 1720 err = "Invalid timeout value"; goto loaderr;
1721 }
bb0b03a3 1722 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1723 server.port = atoi(argv[1]);
1724 if (server.port < 1 || server.port > 65535) {
1725 err = "Invalid port"; goto loaderr;
1726 }
bb0b03a3 1727 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1728 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1729 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1730 int seconds = atoi(argv[1]);
1731 int changes = atoi(argv[2]);
1732 if (seconds < 1 || changes < 0) {
1733 err = "Invalid save parameters"; goto loaderr;
1734 }
1735 appendServerSaveParams(seconds,changes);
bb0b03a3 1736 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1737 if (chdir(argv[1]) == -1) {
1738 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1739 argv[1], strerror(errno));
1740 exit(1);
1741 }
bb0b03a3 1742 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1743 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1744 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1745 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1746 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1747 else {
1748 err = "Invalid log level. Must be one of debug, notice, warning";
1749 goto loaderr;
1750 }
bb0b03a3 1751 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1752 FILE *logfp;
ed9b544e 1753
1754 server.logfile = zstrdup(argv[1]);
bb0b03a3 1755 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1756 zfree(server.logfile);
1757 server.logfile = NULL;
1758 }
1759 if (server.logfile) {
1760 /* Test if we are able to open the file. The server will not
1761 * be able to abort just for this problem later... */
c9a111ac 1762 logfp = fopen(server.logfile,"a");
1763 if (logfp == NULL) {
ed9b544e 1764 err = sdscatprintf(sdsempty(),
1765 "Can't open the log file: %s", strerror(errno));
1766 goto loaderr;
1767 }
c9a111ac 1768 fclose(logfp);
ed9b544e 1769 }
bb0b03a3 1770 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1771 server.dbnum = atoi(argv[1]);
1772 if (server.dbnum < 1) {
1773 err = "Invalid number of databases"; goto loaderr;
1774 }
b3f83f12
JZ
1775 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1776 loadServerConfig(argv[1]);
285add55 1777 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1778 server.maxclients = atoi(argv[1]);
3fd78bcd 1779 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
d4465900 1780 server.maxmemory = strtoll(argv[1], NULL, 10);
bb0b03a3 1781 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1782 server.masterhost = sdsnew(argv[1]);
1783 server.masterport = atoi(argv[2]);
1784 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1785 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1786 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1787 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1788 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1789 err = "argument must be 'yes' or 'no'"; goto loaderr;
1790 }
bb0b03a3 1791 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
85dd2f3a 1792 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
10c43610 1793 err = "argument must be 'yes' or 'no'"; goto loaderr;
1794 }
121f70cf 1795 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1796 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1797 err = "argument must be 'yes' or 'no'"; goto loaderr;
1798 }
bb0b03a3 1799 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1800 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1801 err = "argument must be 'yes' or 'no'"; goto loaderr;
1802 }
44b38ef4 1803 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1804 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1805 err = "argument must be 'yes' or 'no'"; goto loaderr;
1806 }
48f0308a 1807 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1808 if (!strcasecmp(argv[1],"no")) {
48f0308a 1809 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1810 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1811 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1812 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1813 server.appendfsync = APPENDFSYNC_EVERYSEC;
1814 } else {
1815 err = "argument must be 'no', 'always' or 'everysec'";
1816 goto loaderr;
1817 }
bb0b03a3 1818 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1819 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1820 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 1821 zfree(server.pidfile);
054e426d 1822 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1823 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 1824 zfree(server.dbfilename);
054e426d 1825 server.dbfilename = zstrdup(argv[1]);
75680a3c 1826 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1827 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1828 err = "argument must be 'yes' or 'no'"; goto loaderr;
1829 }
054e426d 1830 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1831 zfree(server.vm_swap_file);
054e426d 1832 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1833 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1834 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1835 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1836 server.vm_page_size = strtoll(argv[1], NULL, 10);
1837 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1838 server.vm_pages = strtoll(argv[1], NULL, 10);
92f8e882 1839 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1840 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 1841 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1842 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1843 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1844 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1845 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1846 server.vm_max_threads = strtoll(argv[1], NULL, 10);
ed9b544e 1847 } else {
1848 err = "Bad directive or wrong number of arguments"; goto loaderr;
1849 }
1850 for (j = 0; j < argc; j++)
1851 sdsfree(argv[j]);
1852 zfree(argv);
1853 sdsfree(line);
1854 }
c9a111ac 1855 if (fp != stdin) fclose(fp);
ed9b544e 1856 return;
1857
1858loaderr:
1859 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1860 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1861 fprintf(stderr, ">>> '%s'\n", line);
1862 fprintf(stderr, "%s\n", err);
1863 exit(1);
1864}
1865
1866static void freeClientArgv(redisClient *c) {
1867 int j;
1868
1869 for (j = 0; j < c->argc; j++)
1870 decrRefCount(c->argv[j]);
e8a74421 1871 for (j = 0; j < c->mbargc; j++)
1872 decrRefCount(c->mbargv[j]);
ed9b544e 1873 c->argc = 0;
e8a74421 1874 c->mbargc = 0;
ed9b544e 1875}
1876
1877static void freeClient(redisClient *c) {
1878 listNode *ln;
1879
4409877e 1880 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 1881 * call, we have to set querybuf to NULL *before* to call
1882 * unblockClientWaitingData() to avoid processInputBuffer() will get
1883 * called. Also it is important to remove the file events after
1884 * this, because this call adds the READABLE event. */
4409877e 1885 sdsfree(c->querybuf);
1886 c->querybuf = NULL;
1887 if (c->flags & REDIS_BLOCKED)
b0d8747d 1888 unblockClientWaitingData(c);
4409877e 1889
ffc6b7f8 1890 /* Unsubscribe from all the pubsub channels */
1891 pubsubUnsubscribeAllChannels(c,0);
1892 pubsubUnsubscribeAllPatterns(c,0);
1893 dictRelease(c->pubsub_channels);
1894 listRelease(c->pubsub_patterns);
befec3cd 1895 /* Obvious cleanup */
ed9b544e 1896 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1897 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 1898 listRelease(c->reply);
1899 freeClientArgv(c);
1900 close(c->fd);
92f8e882 1901 /* Remove from the list of clients */
ed9b544e 1902 ln = listSearchKey(server.clients,c);
dfc5e96c 1903 redisAssert(ln != NULL);
ed9b544e 1904 listDelNode(server.clients,ln);
d5d55fc3 1905 /* Remove from the list of clients waiting for swapped keys */
1906 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1907 ln = listSearchKey(server.io_ready_clients,c);
1908 if (ln) {
1909 listDelNode(server.io_ready_clients,ln);
1910 server.vm_blocked_clients--;
1911 }
1912 }
1913 while (server.vm_enabled && listLength(c->io_keys)) {
1914 ln = listFirst(c->io_keys);
1915 dontWaitForSwappedKey(c,ln->value);
92f8e882 1916 }
b3e3d0d7 1917 listRelease(c->io_keys);
befec3cd 1918 /* Master/slave cleanup */
ed9b544e 1919 if (c->flags & REDIS_SLAVE) {
6208b3a7 1920 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1921 close(c->repldbfd);
87eca727 1922 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1923 ln = listSearchKey(l,c);
dfc5e96c 1924 redisAssert(ln != NULL);
87eca727 1925 listDelNode(l,ln);
ed9b544e 1926 }
1927 if (c->flags & REDIS_MASTER) {
1928 server.master = NULL;
1929 server.replstate = REDIS_REPL_CONNECT;
1930 }
befec3cd 1931 /* Release memory */
93ea3759 1932 zfree(c->argv);
e8a74421 1933 zfree(c->mbargv);
6e469882 1934 freeClientMultiState(c);
ed9b544e 1935 zfree(c);
1936}
1937
cc30e368 1938#define GLUEREPLY_UP_TO (1024)
ed9b544e 1939static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 1940 int copylen = 0;
1941 char buf[GLUEREPLY_UP_TO];
6208b3a7 1942 listNode *ln;
c7df85a4 1943 listIter li;
ed9b544e 1944 robj *o;
1945
c7df85a4 1946 listRewind(c->reply,&li);
1947 while((ln = listNext(&li))) {
c28b42ac 1948 int objlen;
1949
ed9b544e 1950 o = ln->value;
c28b42ac 1951 objlen = sdslen(o->ptr);
1952 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1953 memcpy(buf+copylen,o->ptr,objlen);
1954 copylen += objlen;
ed9b544e 1955 listDelNode(c->reply,ln);
c28b42ac 1956 } else {
1957 if (copylen == 0) return;
1958 break;
ed9b544e 1959 }
ed9b544e 1960 }
c28b42ac 1961 /* Now the output buffer is empty, add the new single element */
1962 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1963 listAddNodeHead(c->reply,o);
ed9b544e 1964}
1965
1966static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1967 redisClient *c = privdata;
1968 int nwritten = 0, totwritten = 0, objlen;
1969 robj *o;
1970 REDIS_NOTUSED(el);
1971 REDIS_NOTUSED(mask);
1972
2895e862 1973 /* Use writev() if we have enough buffers to send */
7ea870c0 1974 if (!server.glueoutputbuf &&
1975 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1976 !(c->flags & REDIS_MASTER))
2895e862 1977 {
1978 sendReplyToClientWritev(el, fd, privdata, mask);
1979 return;
1980 }
2895e862 1981
ed9b544e 1982 while(listLength(c->reply)) {
c28b42ac 1983 if (server.glueoutputbuf && listLength(c->reply) > 1)
1984 glueReplyBuffersIfNeeded(c);
1985
ed9b544e 1986 o = listNodeValue(listFirst(c->reply));
1987 objlen = sdslen(o->ptr);
1988
1989 if (objlen == 0) {
1990 listDelNode(c->reply,listFirst(c->reply));
1991 continue;
1992 }
1993
1994 if (c->flags & REDIS_MASTER) {
6f376729 1995 /* Don't reply to a master */
ed9b544e 1996 nwritten = objlen - c->sentlen;
1997 } else {
a4d1ba9a 1998 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 1999 if (nwritten <= 0) break;
2000 }
2001 c->sentlen += nwritten;
2002 totwritten += nwritten;
2003 /* If we fully sent the object on head go to the next one */
2004 if (c->sentlen == objlen) {
2005 listDelNode(c->reply,listFirst(c->reply));
2006 c->sentlen = 0;
2007 }
6f376729 2008 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2009 * bytes, in a single threaded server it's a good idea to serve
6f376729 2010 * other clients as well, even if a very large request comes from
2011 * super fast link that is always able to accept data (in real world
12f9d551 2012 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2013 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2014 }
2015 if (nwritten == -1) {
2016 if (errno == EAGAIN) {
2017 nwritten = 0;
2018 } else {
f870935d 2019 redisLog(REDIS_VERBOSE,
ed9b544e 2020 "Error writing to client: %s", strerror(errno));
2021 freeClient(c);
2022 return;
2023 }
2024 }
2025 if (totwritten > 0) c->lastinteraction = time(NULL);
2026 if (listLength(c->reply) == 0) {
2027 c->sentlen = 0;
2028 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2029 }
2030}
2031
2895e862 2032static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2033{
2034 redisClient *c = privdata;
2035 int nwritten = 0, totwritten = 0, objlen, willwrite;
2036 robj *o;
2037 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2038 int offset, ion = 0;
2039 REDIS_NOTUSED(el);
2040 REDIS_NOTUSED(mask);
2041
2042 listNode *node;
2043 while (listLength(c->reply)) {
2044 offset = c->sentlen;
2045 ion = 0;
2046 willwrite = 0;
2047
2048 /* fill-in the iov[] array */
2049 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2050 o = listNodeValue(node);
2051 objlen = sdslen(o->ptr);
2052
2053 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2054 break;
2055
2056 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2057 break; /* no more iovecs */
2058
2059 iov[ion].iov_base = ((char*)o->ptr) + offset;
2060 iov[ion].iov_len = objlen - offset;
2061 willwrite += objlen - offset;
2062 offset = 0; /* just for the first item */
2063 ion++;
2064 }
2065
2066 if(willwrite == 0)
2067 break;
2068
2069 /* write all collected blocks at once */
2070 if((nwritten = writev(fd, iov, ion)) < 0) {
2071 if (errno != EAGAIN) {
f870935d 2072 redisLog(REDIS_VERBOSE,
2895e862 2073 "Error writing to client: %s", strerror(errno));
2074 freeClient(c);
2075 return;
2076 }
2077 break;
2078 }
2079
2080 totwritten += nwritten;
2081 offset = c->sentlen;
2082
2083 /* remove written robjs from c->reply */
2084 while (nwritten && listLength(c->reply)) {
2085 o = listNodeValue(listFirst(c->reply));
2086 objlen = sdslen(o->ptr);
2087
2088 if(nwritten >= objlen - offset) {
2089 listDelNode(c->reply, listFirst(c->reply));
2090 nwritten -= objlen - offset;
2091 c->sentlen = 0;
2092 } else {
2093 /* partial write */
2094 c->sentlen += nwritten;
2095 break;
2096 }
2097 offset = 0;
2098 }
2099 }
2100
2101 if (totwritten > 0)
2102 c->lastinteraction = time(NULL);
2103
2104 if (listLength(c->reply) == 0) {
2105 c->sentlen = 0;
2106 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2107 }
2108}
2109
ed9b544e 2110static struct redisCommand *lookupCommand(char *name) {
2111 int j = 0;
2112 while(cmdTable[j].name != NULL) {
bb0b03a3 2113 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 2114 j++;
2115 }
2116 return NULL;
2117}
2118
2119/* resetClient prepare the client to process the next command */
2120static void resetClient(redisClient *c) {
2121 freeClientArgv(c);
2122 c->bulklen = -1;
e8a74421 2123 c->multibulk = 0;
ed9b544e 2124}
2125
6e469882 2126/* Call() is the core of Redis execution of a command */
2127static void call(redisClient *c, struct redisCommand *cmd) {
2128 long long dirty;
2129
2130 dirty = server.dirty;
2131 cmd->proc(c);
4005fef1 2132 dirty = server.dirty-dirty;
2133
2134 if (server.appendonly && dirty)
6e469882 2135 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2136 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2137 listLength(server.slaves))
248ea310 2138 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2139 if (listLength(server.monitors))
248ea310 2140 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2141 server.stat_numcommands++;
2142}
2143
ed9b544e 2144/* If this function gets called we already read a whole
2145 * command, arguments are in the client argv/argc fields.
2146 * processCommand() execute the command or prepare the
2147 * server for a bulk read from the client.
2148 *
2149 * If 1 is returned the client is still alive and valid and
2150 * other operations can be performed by the caller. Otherwise
2151 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2152static int processCommand(redisClient *c) {
2153 struct redisCommand *cmd;
ed9b544e 2154
3fd78bcd 2155 /* Free some memory if needed (maxmemory setting) */
2156 if (server.maxmemory) freeMemoryIfNeeded();
2157
e8a74421 2158 /* Handle the multi bulk command type. This is an alternative protocol
2159 * supported by Redis in order to receive commands that are composed of
2160 * multiple binary-safe "bulk" arguments. The latency of processing is
2161 * a bit higher but this allows things like multi-sets, so if this
2162 * protocol is used only for MSET and similar commands this is a big win. */
/* First case: a single argument starting with '*' announces the number of
 * bulk arguments that follow. A count <= 0 just resets the client. */
2163 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2164 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2165 if (c->multibulk <= 0) {
2166 resetClient(c);
2167 return 1;
2168 } else {
2169 decrRefCount(c->argv[c->argc-1]);
2170 c->argc--;
2171 return 1;
2172 }
2173 } else if (c->multibulk) {
/* Second case: a multi bulk command is in progress. With bulklen == -1 the
 * "$<len>" header of the next argument is expected, otherwise argv[0] holds
 * the argument payload. */
2174 if (c->bulklen == -1) {
2175 if (((char*)c->argv[0]->ptr)[0] != '$') {
2176 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2177 resetClient(c);
2178 return 1;
2179 } else {
2180 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2181 decrRefCount(c->argv[0]);
2182 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2183 c->argc--;
2184 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2185 resetClient(c);
2186 return 1;
2187 }
2188 c->argc--;
2189 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2190 return 1;
2191 }
2192 } else {
/* Move the argument just read into the mbargv accumulator and decrement
 * the number of arguments still expected. */
2193 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2194 c->mbargv[c->mbargc] = c->argv[0];
2195 c->mbargc++;
2196 c->argc--;
2197 c->multibulk--;
2198 if (c->multibulk == 0) {
2199 robj **auxargv;
2200 int auxargc;
2201
2202 /* Here we need to swap the multi-bulk argc/argv with the
2203 * normal argc/argv of the client structure. */
2204 auxargv = c->argv;
2205 c->argv = c->mbargv;
2206 c->mbargv = auxargv;
2207
2208 auxargc = c->argc;
2209 c->argc = c->mbargc;
2210 c->mbargc = auxargc;
2211
2212 /* We need to set bulklen to something different than -1
2213 * in order for the code below to process the command without
2214 * to try to read the last argument of a bulk command as
2215 * a special argument. */
2216 c->bulklen = 0;
2217 /* continue below and process the command */
2218 } else {
2219 c->bulklen = -1;
2220 return 1;
2221 }
2222 }
2223 }
2224 /* -- end of multi bulk commands processing -- */
2225
ed9b544e 2226 /* The QUIT command is handled as a special case. Normal command
2227 * procs are unable to close the client connection safely */
bb0b03a3 2228 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2229 freeClient(c);
2230 return 0;
2231 }
d5d55fc3 2232
2233 /* Now lookup the command and check ASAP about trivial error conditions
2234 * such wrong arity, bad command name and so forth. */
ed9b544e 2235 cmd = lookupCommand(c->argv[0]->ptr);
2236 if (!cmd) {
2c14807b 2237 addReplySds(c,
2238 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2239 (char*)c->argv[0]->ptr));
ed9b544e 2240 resetClient(c);
2241 return 1;
/* Arity check: a positive arity must match argc exactly, a negative arity
 * means that at least -arity arguments are required. */
2242 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2243 (c->argc < -cmd->arity)) {
454d4e43 2244 addReplySds(c,
2245 sdscatprintf(sdsempty(),
2246 "-ERR wrong number of arguments for '%s' command\r\n",
2247 cmd->name));
ed9b544e 2248 resetClient(c);
2249 return 1;
2250 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2251 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2252 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2253
2254 decrRefCount(c->argv[c->argc-1]);
2255 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2256 c->argc--;
2257 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2258 resetClient(c);
2259 return 1;
2260 }
2261 c->argc--;
2262 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2263 /* It is possible that the bulk read is already in the
8d0490e7 2264 * buffer. Check this condition and handle it accordingly.
2265 * This is just a fast path, alternative to call processInputBuffer().
2266 * It's a good idea since the code is small and this condition
2267 * happens most of the times. */
ed9b544e 2268 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2269 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2270 c->argc++;
2271 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2272 } else {
d5d55fc3 2273 /* Otherwise return... there is to read the last argument
2274 * from the socket. */
ed9b544e 2275 return 1;
2276 }
2277 }
942a3961 2278 /* Let's try to encode the bulk object to save space. */
2279 if (cmd->flags & REDIS_CMD_BULK)
2280 tryObjectEncoding(c->argv[c->argc-1]);
2281
e63943a4 2282 /* Check if the user is authenticated */
2283 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2284 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2285 resetClient(c);
2286 return 1;
2287 }
2288
b61a28fe 2289 /* Handle the maxmemory directive */
2290 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2291 zmalloc_used_memory() > server.maxmemory)
2292 {
2293 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2294 resetClient(c);
2295 return 1;
2296 }
2297
d6cc8867 2298 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
/* NOTE(review): only pubsub_channels is tested here, so a client holding
 * just pattern subscriptions does not seem to be restricted - confirm
 * whether that is intended. */
ffc6b7f8 2299 if (dictSize(c->pubsub_channels) > 0 &&
2300 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2301 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2302 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2303 resetClient(c);
2304 return 1;
2305 }
2306
ed9b544e 2307 /* Exec the command */
/* Inside a MULTI context every command except EXEC and DISCARD is queued
 * and acknowledged with +QUEUED instead of being executed. */
18b6cb76 2308 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
6e469882 2309 queueMultiCommand(c,cmd);
2310 addReply(c,shared.queued);
2311 } else {
/* With VM and I/O threads enabled, a non-zero return from
 * blockClientOnSwappedKeys() makes this function return without calling
 * the command and without resetting the client. */
d5d55fc3 2312 if (server.vm_enabled && server.vm_max_threads > 0 &&
2313 blockClientOnSwappedKeys(cmd,c)) return 1;
6e469882 2314 call(c,cmd);
2315 }
ed9b544e 2316
2317 /* Prepare the client for the next command */
ed9b544e 2318 resetClient(c);
2319 return 1;
2320}
2321
248ea310 2322static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2323 listNode *ln;
c7df85a4 2324 listIter li;
ed9b544e 2325 int outc = 0, j;
93ea3759 2326 robj **outv;
248ea310 2327 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2328 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2329 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2330 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2331 robj *lenobj;
93ea3759 2332
2333 if (argc <= REDIS_STATIC_ARGS) {
2334 outv = static_outv;
2335 } else {
248ea310 2336 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2337 }
248ea310 2338
2339 lenobj = createObject(REDIS_STRING,
2340 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2341 lenobj->refcount = 0;
2342 outv[outc++] = lenobj;
ed9b544e 2343 for (j = 0; j < argc; j++) {
248ea310 2344 lenobj = createObject(REDIS_STRING,
2345 sdscatprintf(sdsempty(),"$%lu\r\n",
2346 (unsigned long) stringObjectLen(argv[j])));
2347 lenobj->refcount = 0;
2348 outv[outc++] = lenobj;
ed9b544e 2349 outv[outc++] = argv[j];
248ea310 2350 outv[outc++] = shared.crlf;
ed9b544e 2351 }
ed9b544e 2352
40d224a9 2353 /* Increment all the refcounts at start and decrement at end in order to
2354 * be sure to free objects if there is no slave in a replication state
2355 * able to be feed with commands */
2356 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2357 listRewind(slaves,&li);
2358 while((ln = listNext(&li))) {
ed9b544e 2359 redisClient *slave = ln->value;
40d224a9 2360
2361 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2362 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2363
2364 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2365 if (slave->slaveseldb != dictid) {
2366 robj *selectcmd;
2367
2368 switch(dictid) {
2369 case 0: selectcmd = shared.select0; break;
2370 case 1: selectcmd = shared.select1; break;
2371 case 2: selectcmd = shared.select2; break;
2372 case 3: selectcmd = shared.select3; break;
2373 case 4: selectcmd = shared.select4; break;
2374 case 5: selectcmd = shared.select5; break;
2375 case 6: selectcmd = shared.select6; break;
2376 case 7: selectcmd = shared.select7; break;
2377 case 8: selectcmd = shared.select8; break;
2378 case 9: selectcmd = shared.select9; break;
2379 default:
2380 selectcmd = createObject(REDIS_STRING,
2381 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2382 selectcmd->refcount = 0;
2383 break;
2384 }
2385 addReply(slave,selectcmd);
2386 slave->slaveseldb = dictid;
2387 }
2388 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2389 }
40d224a9 2390 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2391 if (outv != static_outv) zfree(outv);
ed9b544e 2392}
2393
638e42ac 2394static void processInputBuffer(redisClient *c) {
ed9b544e 2395again:
4409877e 2396 /* Before to process the input buffer, make sure the client is not
2397 * waitig for a blocking operation such as BLPOP. Note that the first
2398 * iteration the client is never blocked, otherwise the processInputBuffer
2399 * would not be called at all, but after the execution of the first commands
2400 * in the input buffer the client may be blocked, and the "goto again"
2401 * will try to reiterate. The following line will make it return asap. */
92f8e882 2402 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2403 if (c->bulklen == -1) {
2404 /* Read the first line of the query */
2405 char *p = strchr(c->querybuf,'\n');
2406 size_t querylen;
644fafa3 2407
ed9b544e 2408 if (p) {
2409 sds query, *argv;
2410 int argc, j;
2411
2412 query = c->querybuf;
2413 c->querybuf = sdsempty();
2414 querylen = 1+(p-(query));
2415 if (sdslen(query) > querylen) {
2416 /* leave data after the first line of the query in the buffer */
2417 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2418 }
2419 *p = '\0'; /* remove "\n" */
2420 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2421 sdsupdatelen(query);
2422
2423 /* Now we can split the query in arguments */
ed9b544e 2424 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2425 sdsfree(query);
2426
2427 if (c->argv) zfree(c->argv);
2428 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2429
2430 for (j = 0; j < argc; j++) {
ed9b544e 2431 if (sdslen(argv[j])) {
2432 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2433 c->argc++;
2434 } else {
2435 sdsfree(argv[j]);
2436 }
2437 }
2438 zfree(argv);
7c49733c 2439 if (c->argc) {
2440 /* Execute the command. If the client is still valid
2441 * after processCommand() return and there is something
2442 * on the query buffer try to process the next command. */
2443 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2444 } else {
2445 /* Nothing to process, argc == 0. Just process the query
2446 * buffer if it's not empty or return to the caller */
2447 if (sdslen(c->querybuf)) goto again;
2448 }
ed9b544e 2449 return;
644fafa3 2450 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2451 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2452 freeClient(c);
2453 return;
2454 }
2455 } else {
2456 /* Bulk read handling. Note that if we are at this point
2457 the client already sent a command terminated with a newline,
2458 we are reading the bulk data that is actually the last
2459 argument of the command. */
2460 int qbl = sdslen(c->querybuf);
2461
2462 if (c->bulklen <= qbl) {
2463 /* Copy everything but the final CRLF as final argument */
2464 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2465 c->argc++;
2466 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2467 /* Process the command. If the client is still valid after
2468 * the processing and there is more data in the buffer
2469 * try to parse it. */
2470 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2471 return;
2472 }
2473 }
2474}
2475
638e42ac 2476static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2477 redisClient *c = (redisClient*) privdata;
2478 char buf[REDIS_IOBUF_LEN];
2479 int nread;
2480 REDIS_NOTUSED(el);
2481 REDIS_NOTUSED(mask);
2482
2483 nread = read(fd, buf, REDIS_IOBUF_LEN);
2484 if (nread == -1) {
2485 if (errno == EAGAIN) {
2486 nread = 0;
2487 } else {
f870935d 2488 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2489 freeClient(c);
2490 return;
2491 }
2492 } else if (nread == 0) {
f870935d 2493 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2494 freeClient(c);
2495 return;
2496 }
2497 if (nread) {
2498 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2499 c->lastinteraction = time(NULL);
2500 } else {
2501 return;
2502 }
168ac5c6 2503 processInputBuffer(c);
638e42ac 2504}
2505
ed9b544e 2506static int selectDb(redisClient *c, int id) {
2507 if (id < 0 || id >= server.dbnum)
2508 return REDIS_ERR;
3305306f 2509 c->db = &server.db[id];
ed9b544e 2510 return REDIS_OK;
2511}
2512
40d224a9 2513static void *dupClientReplyValue(void *o) {
2514 incrRefCount((robj*)o);
12d090d2 2515 return o;
40d224a9 2516}
2517
/* List match method: returns non-zero when the two string objects compare
 * as equal according to compareStringObjects(), zero otherwise. */
static int listMatchObjects(void *a, void *b) {
    int cmp = compareStringObjects(a,b);

    return cmp == 0;
}
2521
ed9b544e 2522static redisClient *createClient(int fd) {
2523 redisClient *c = zmalloc(sizeof(*c));
2524
2525 anetNonBlock(NULL,fd);
2526 anetTcpNoDelay(NULL,fd);
2527 if (!c) return NULL;
2528 selectDb(c,0);
2529 c->fd = fd;
2530 c->querybuf = sdsempty();
2531 c->argc = 0;
93ea3759 2532 c->argv = NULL;
ed9b544e 2533 c->bulklen = -1;
e8a74421 2534 c->multibulk = 0;
2535 c->mbargc = 0;
2536 c->mbargv = NULL;
ed9b544e 2537 c->sentlen = 0;
2538 c->flags = 0;
2539 c->lastinteraction = time(NULL);
abcb223e 2540 c->authenticated = 0;
40d224a9 2541 c->replstate = REDIS_REPL_NONE;
6b47e12e 2542 c->reply = listCreate();
ed9b544e 2543 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2544 listSetDupMethod(c->reply,dupClientReplyValue);
92f8e882 2545 c->blockingkeys = NULL;
2546 c->blockingkeysnum = 0;
2547 c->io_keys = listCreate();
2548 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2549 c->pubsub_channels = dictCreate(&setDictType,NULL);
2550 c->pubsub_patterns = listCreate();
2551 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2552 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2553 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2554 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2555 freeClient(c);
2556 return NULL;
2557 }
6b47e12e 2558 listAddNodeTail(server.clients,c);
6e469882 2559 initClientMultiState(c);
ed9b544e 2560 return c;
2561}
2562
2563static void addReply(redisClient *c, robj *obj) {
2564 if (listLength(c->reply) == 0 &&
6208b3a7 2565 (c->replstate == REDIS_REPL_NONE ||
2566 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2567 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2568 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2569
2570 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2571 obj = dupStringObject(obj);
2572 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2573 }
9d65a1bb 2574 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2575}
2576
2577static void addReplySds(redisClient *c, sds s) {
2578 robj *o = createObject(REDIS_STRING,s);
2579 addReply(c,o);
2580 decrRefCount(o);
2581}
2582
e2665397 2583static void addReplyDouble(redisClient *c, double d) {
2584 char buf[128];
2585
2586 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2587 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2588 (unsigned long) strlen(buf),buf));
e2665397 2589}
2590
f44dd428 2591static void addReplyLong(redisClient *c, long l) {
2592 char buf[128];
2593 size_t len;
2594
dd88747b 2595 if (l == 0) {
2596 addReply(c,shared.czero);
2597 return;
2598 } else if (l == 1) {
2599 addReply(c,shared.cone);
2600 return;
2601 }
f44dd428 2602 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2603 addReplySds(c,sdsnewlen(buf,len));
2604}
2605
aa7c2934
PN
2606static void addReplyLongLong(redisClient *c, long long ll) {
2607 char buf[128];
2608 size_t len;
2609
2610 if (ll == 0) {
2611 addReply(c,shared.czero);
2612 return;
2613 } else if (ll == 1) {
2614 addReply(c,shared.cone);
2615 return;
2616 }
2617 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2618 addReplySds(c,sdsnewlen(buf,len));
2619}
2620
92b27fe9 2621static void addReplyUlong(redisClient *c, unsigned long ul) {
2622 char buf[128];
2623 size_t len;
2624
dd88747b 2625 if (ul == 0) {
2626 addReply(c,shared.czero);
2627 return;
2628 } else if (ul == 1) {
2629 addReply(c,shared.cone);
2630 return;
2631 }
92b27fe9 2632 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2633 addReplySds(c,sdsnewlen(buf,len));
2634}
2635
942a3961 2636static void addReplyBulkLen(redisClient *c, robj *obj) {
2637 size_t len;
2638
2639 if (obj->encoding == REDIS_ENCODING_RAW) {
2640 len = sdslen(obj->ptr);
2641 } else {
2642 long n = (long)obj->ptr;
2643
e054afda 2644 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2645 len = 1;
2646 if (n < 0) {
2647 len++;
2648 n = -n;
2649 }
2650 while((n = n/10) != 0) {
2651 len++;
2652 }
2653 }
83c6a618 2654 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
942a3961 2655}
2656
dd88747b 2657static void addReplyBulk(redisClient *c, robj *obj) {
2658 addReplyBulkLen(c,obj);
2659 addReply(c,obj);
2660 addReply(c,shared.crlf);
2661}
2662
500ece7c 2663/* In the CONFIG command we need to add vanilla C string as bulk replies */
2664static void addReplyBulkCString(redisClient *c, char *s) {
2665 if (s == NULL) {
2666 addReply(c,shared.nullbulk);
2667 } else {
2668 robj *o = createStringObject(s,strlen(s));
2669 addReplyBulk(c,o);
2670 decrRefCount(o);
2671 }
2672}
2673
ed9b544e 2674static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2675 int cport, cfd;
2676 char cip[128];
285add55 2677 redisClient *c;
ed9b544e 2678 REDIS_NOTUSED(el);
2679 REDIS_NOTUSED(mask);
2680 REDIS_NOTUSED(privdata);
2681
2682 cfd = anetAccept(server.neterr, fd, cip, &cport);
2683 if (cfd == AE_ERR) {
f870935d 2684 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2685 return;
2686 }
f870935d 2687 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2688 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2689 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2690 close(cfd); /* May be already closed, just ingore errors */
2691 return;
2692 }
285add55 2693 /* If maxclient directive is set and this is one client more... close the
2694 * connection. Note that we create the client instead to check before
2695 * for this condition, since now the socket is already set in nonblocking
2696 * mode and we can send an error for free using the Kernel I/O */
2697 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2698 char *err = "-ERR max number of clients reached\r\n";
2699
2700 /* That's a best effort error message, don't check write errors */
fee803ba 2701 if (write(c->fd,err,strlen(err)) == -1) {
2702 /* Nothing to do, Just to avoid the warning... */
2703 }
285add55 2704 freeClient(c);
2705 return;
2706 }
ed9b544e 2707 server.stat_numconnections++;
2708}
2709
2710/* ======================= Redis objects implementation ===================== */
2711
2712static robj *createObject(int type, void *ptr) {
2713 robj *o;
2714
a5819310 2715 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2716 if (listLength(server.objfreelist)) {
2717 listNode *head = listFirst(server.objfreelist);
2718 o = listNodeValue(head);
2719 listDelNode(server.objfreelist,head);
a5819310 2720 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2721 } else {
75680a3c 2722 if (server.vm_enabled) {
a5819310 2723 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2724 o = zmalloc(sizeof(*o));
2725 } else {
2726 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2727 }
ed9b544e 2728 }
ed9b544e 2729 o->type = type;
942a3961 2730 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2731 o->ptr = ptr;
2732 o->refcount = 1;
3a66edc7 2733 if (server.vm_enabled) {
1064ef87 2734 /* Note that this code may run in the context of an I/O thread
2735 * and accessing to server.unixtime in theory is an error
2736 * (no locks). But in practice this is safe, and even if we read
2737 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2738 o->vm.atime = server.unixtime;
2739 o->storage = REDIS_VM_MEMORY;
2740 }
ed9b544e 2741 return o;
2742}
2743
2744static robj *createStringObject(char *ptr, size_t len) {
2745 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2746}
2747
4ef8de8a 2748static robj *dupStringObject(robj *o) {
b9bc0eef 2749 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2750 return createStringObject(o->ptr,sdslen(o->ptr));
2751}
2752
ed9b544e 2753static robj *createListObject(void) {
2754 list *l = listCreate();
2755
ed9b544e 2756 listSetFreeMethod(l,decrRefCount);
2757 return createObject(REDIS_LIST,l);
2758}
2759
2760static robj *createSetObject(void) {
2761 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2762 return createObject(REDIS_SET,d);
2763}
2764
5234952b 2765static robj *createHashObject(void) {
2766 /* All the Hashes start as zipmaps. Will be automatically converted
2767 * into hash tables if there are enough elements or big elements
2768 * inside. */
2769 unsigned char *zm = zipmapNew();
2770 robj *o = createObject(REDIS_HASH,zm);
2771 o->encoding = REDIS_ENCODING_ZIPMAP;
2772 return o;
2773}
2774
1812e024 2775static robj *createZsetObject(void) {
6b47e12e 2776 zset *zs = zmalloc(sizeof(*zs));
2777
2778 zs->dict = dictCreate(&zsetDictType,NULL);
2779 zs->zsl = zslCreate();
2780 return createObject(REDIS_ZSET,zs);
1812e024 2781}
2782
ed9b544e 2783static void freeStringObject(robj *o) {
942a3961 2784 if (o->encoding == REDIS_ENCODING_RAW) {
2785 sdsfree(o->ptr);
2786 }
ed9b544e 2787}
2788
2789static void freeListObject(robj *o) {
2790 listRelease((list*) o->ptr);
2791}
2792
2793static void freeSetObject(robj *o) {
2794 dictRelease((dict*) o->ptr);
2795}
2796
fd8ccf44 2797static void freeZsetObject(robj *o) {
2798 zset *zs = o->ptr;
2799
2800 dictRelease(zs->dict);
2801 zslFree(zs->zsl);
2802 zfree(zs);
2803}
2804
ed9b544e 2805static void freeHashObject(robj *o) {
cbba7dd7 2806 switch (o->encoding) {
2807 case REDIS_ENCODING_HT:
2808 dictRelease((dict*) o->ptr);
2809 break;
2810 case REDIS_ENCODING_ZIPMAP:
2811 zfree(o->ptr);
2812 break;
2813 default:
2814 redisAssert(0);
2815 break;
2816 }
ed9b544e 2817}
2818
2819static void incrRefCount(robj *o) {
2820 o->refcount++;
2821}
2822
2823static void decrRefCount(void *obj) {
2824 robj *o = obj;
94754ccc 2825
970e10bb 2826 /* Object is a key of a swapped out value, or in the process of being
2827 * loaded. */
996cb5f7 2828 if (server.vm_enabled &&
2829 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2830 {
996cb5f7 2831 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 2832 redisAssert(o->type == REDIS_STRING);
a35ddf12 2833 freeStringObject(o);
2834 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 2835 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 2836 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2837 !listAddNodeHead(server.objfreelist,o))
2838 zfree(o);
a5819310 2839 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 2840 server.vm_stats_swapped_objects--;
a35ddf12 2841 return;
2842 }
996cb5f7 2843 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 2844 if (--(o->refcount) == 0) {
996cb5f7 2845 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2846 vmCancelThreadedIOJob(obj);
ed9b544e 2847 switch(o->type) {
2848 case REDIS_STRING: freeStringObject(o); break;
2849 case REDIS_LIST: freeListObject(o); break;
2850 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 2851 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 2852 case REDIS_HASH: freeHashObject(o); break;
78409a0f 2853 default: redisAssert(0); break;
ed9b544e 2854 }
a5819310 2855 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2856 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2857 !listAddNodeHead(server.objfreelist,o))
2858 zfree(o);
a5819310 2859 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2860 }
2861}
2862
942a3961 2863static robj *lookupKey(redisDb *db, robj *key) {
2864 dictEntry *de = dictFind(db->dict,key);
3a66edc7 2865 if (de) {
55cf8433 2866 robj *key = dictGetEntryKey(de);
2867 robj *val = dictGetEntryVal(de);
3a66edc7 2868
55cf8433 2869 if (server.vm_enabled) {
996cb5f7 2870 if (key->storage == REDIS_VM_MEMORY ||
2871 key->storage == REDIS_VM_SWAPPING)
2872 {
2873 /* If we were swapping the object out, stop it, this key
2874 * was requested. */
2875 if (key->storage == REDIS_VM_SWAPPING)
2876 vmCancelThreadedIOJob(key);
55cf8433 2877 /* Update the access time of the key for the aging algorithm. */
2878 key->vm.atime = server.unixtime;
2879 } else {
d5d55fc3 2880 int notify = (key->storage == REDIS_VM_LOADING);
2881
55cf8433 2882 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 2883 redisAssert(val == NULL);
55cf8433 2884 val = vmLoadObject(key);
2885 dictGetEntryVal(de) = val;
d5d55fc3 2886
2887 /* Clients blocked by the VM subsystem may be waiting for
2888 * this key... */
2889 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 2890 }
2891 }
2892 return val;
3a66edc7 2893 } else {
2894 return NULL;
2895 }
942a3961 2896}
2897
2898static robj *lookupKeyRead(redisDb *db, robj *key) {
2899 expireIfNeeded(db,key);
2900 return lookupKey(db,key);
2901}
2902
2903static robj *lookupKeyWrite(redisDb *db, robj *key) {
2904 deleteIfVolatile(db,key);
2905 return lookupKey(db,key);
2906}
2907
92b27fe9 2908static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2909 robj *o = lookupKeyRead(c->db, key);
2910 if (!o) addReply(c,reply);
2911 return o;
2912}
2913
2914static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2915 robj *o = lookupKeyWrite(c->db, key);
2916 if (!o) addReply(c,reply);
2917 return o;
2918}
2919
2920static int checkType(redisClient *c, robj *o, int type) {
2921 if (o->type != type) {
2922 addReply(c,shared.wrongtypeerr);
2923 return 1;
2924 }
2925 return 0;
2926}
2927
942a3961 2928static int deleteKey(redisDb *db, robj *key) {
2929 int retval;
2930
2931 /* We need to protect key from destruction: after the first dictDelete()
2932 * it may happen that 'key' is no longer valid if we don't increment
2933 * it's count. This may happen when we get the object reference directly
2934 * from the hash table with dictRandomKey() or dict iterators */
2935 incrRefCount(key);
2936 if (dictSize(db->expires)) dictDelete(db->expires,key);
2937 retval = dictDelete(db->dict,key);
2938 decrRefCount(key);
2939
2940 return retval == DICT_OK;
2941}
2942
724a51b1 2943/* Check if the nul-terminated string 's' can be represented by a long
2944 * (that is, is a number that fits into long without any other space or
2945 * character before or after the digits).
2946 *
2947 * If so, the function returns REDIS_OK and *longval is set to the value
2948 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 2949static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 2950 char buf[32], *endptr;
2951 long value;
2952 int slen;
2953
2954 value = strtol(s, &endptr, 10);
2955 if (endptr[0] != '\0') return REDIS_ERR;
2956 slen = snprintf(buf,32,"%ld",value);
2957
2958 /* If the number converted back into a string is not identical
2959 * then it's not possible to encode the string as integer */
f69f2cba 2960 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 2961 if (longval) *longval = value;
2962 return REDIS_OK;
2963}
2964
942a3961 2965/* Try to encode a string object in order to save space */
2966static int tryObjectEncoding(robj *o) {
2967 long value;
942a3961 2968 sds s = o->ptr;
3305306f 2969
942a3961 2970 if (o->encoding != REDIS_ENCODING_RAW)
2971 return REDIS_ERR; /* Already encoded */
3305306f 2972
942a3961 2973 /* It's not save to encode shared objects: shared objects can be shared
2974 * everywhere in the "object space" of Redis. Encoded objects can only
2975 * appear as "values" (and not, for instance, as keys) */
2976 if (o->refcount > 1) return REDIS_ERR;
3305306f 2977
942a3961 2978 /* Currently we try to encode only strings */
dfc5e96c 2979 redisAssert(o->type == REDIS_STRING);
94754ccc 2980
724a51b1 2981 /* Check if we can represent this string as a long integer */
2982 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return REDIS_ERR;
942a3961 2983
2984 /* Ok, this object can be encoded */
2985 o->encoding = REDIS_ENCODING_INT;
2986 sdsfree(o->ptr);
2987 o->ptr = (void*) value;
2988 return REDIS_OK;
2989}
2990
9d65a1bb 2991/* Get a decoded version of an encoded object (returned as a new object).
2992 * If the object is already raw-encoded just increment the ref count. */
2993static robj *getDecodedObject(robj *o) {
942a3961 2994 robj *dec;
2995
9d65a1bb 2996 if (o->encoding == REDIS_ENCODING_RAW) {
2997 incrRefCount(o);
2998 return o;
2999 }
942a3961 3000 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3001 char buf[32];
3002
3003 snprintf(buf,32,"%ld",(long)o->ptr);
3004 dec = createStringObject(buf,strlen(buf));
3005 return dec;
3006 } else {
dfc5e96c 3007 redisAssert(1 != 1);
942a3961 3008 }
3305306f 3009}
3010
d7f43c08 3011/* Compare two string objects via strcmp() or alike.
3012 * Note that the objects may be integer-encoded. In such a case we
3013 * use snprintf() to get a string representation of the numbers on the stack
1fd9bc8a 3014 * and compare the strings, it's much faster than calling getDecodedObject().
3015 *
3016 * Important note: if objects are not integer encoded, but binary-safe strings,
3017 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3018 * binary safe. */
724a51b1 3019static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3020 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3021 char bufa[128], bufb[128], *astr, *bstr;
3022 int bothsds = 1;
724a51b1 3023
e197b441 3024 if (a == b) return 0;
d7f43c08 3025 if (a->encoding != REDIS_ENCODING_RAW) {
3026 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3027 astr = bufa;
3028 bothsds = 0;
724a51b1 3029 } else {
d7f43c08 3030 astr = a->ptr;
724a51b1 3031 }
d7f43c08 3032 if (b->encoding != REDIS_ENCODING_RAW) {
3033 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3034 bstr = bufb;
3035 bothsds = 0;
3036 } else {
3037 bstr = b->ptr;
3038 }
3039 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3040}
3041
0ea663ea 3042static size_t stringObjectLen(robj *o) {
dfc5e96c 3043 redisAssert(o->type == REDIS_STRING);
0ea663ea 3044 if (o->encoding == REDIS_ENCODING_RAW) {
3045 return sdslen(o->ptr);
3046 } else {
3047 char buf[32];
3048
3049 return snprintf(buf,32,"%ld",(long)o->ptr);
3050 }
3051}
3052
06233c45 3053/*============================ RDB saving/loading =========================== */
ed9b544e 3054
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 on error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
}
3059
/* Write the time 't' to 'fp' as a 32 bit signed integer in host byte order.
 * Returns 0 on success, -1 on error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t seconds = (int32_t) t;

    return (fwrite(&seconds,4,1,fp) == 0) ? -1 : 0;
}
3065
e3566d4b 3066/* check rdbLoadLen() comments for more info */
f78fd11b 3067static int rdbSaveLen(FILE *fp, uint32_t len) {
3068 unsigned char buf[2];
3069
3070 if (len < (1<<6)) {
3071 /* Save a 6 bit len */
10c43610 3072 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3073 if (fwrite(buf,1,1,fp) == 0) return -1;
3074 } else if (len < (1<<14)) {
3075 /* Save a 14 bit len */
10c43610 3076 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3077 buf[1] = len&0xFF;
17be1a4a 3078 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3079 } else {
3080 /* Save a 32 bit len */
10c43610 3081 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3082 if (fwrite(buf,1,1,fp) == 0) return -1;
3083 len = htonl(len);
3084 if (fwrite(&len,4,1,fp) == 0) return -1;
3085 }
3086 return 0;
3087}
3088
e3566d4b 3089/* String objects in the form "2391" "-100" without any space and with a
3090 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3091 * encoded as integers to save space */
b1befe6a 3092static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
e3566d4b 3093 long long value;
3094 char *endptr, buf[32];
3095
3096 /* Check if it's possible to encode this value as a number */
3097 value = strtoll(s, &endptr, 10);
3098 if (endptr[0] != '\0') return 0;
3099 snprintf(buf,32,"%lld",value);
3100
3101 /* If the number converted back into a string is not identical
3102 * then it's not possible to encode the string as integer */
b1befe6a 3103 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
e3566d4b 3104
3105 /* Finally check if it fits in our ranges */
3106 if (value >= -(1<<7) && value <= (1<<7)-1) {
3107 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3108 enc[1] = value&0xFF;
3109 return 2;
3110 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3111 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3112 enc[1] = value&0xFF;
3113 enc[2] = (value>>8)&0xFF;
3114 return 3;
3115 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3116 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3117 enc[1] = value&0xFF;
3118 enc[2] = (value>>8)&0xFF;
3119 enc[3] = (value>>16)&0xFF;
3120 enc[4] = (value>>24)&0xFF;
3121 return 5;
3122 } else {
3123 return 0;
3124 }
3125}
3126
b1befe6a 3127static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3128 size_t comprlen, outlen;
774e3047 3129 unsigned char byte;
3130 void *out;
3131
3132 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3133 if (len <= 4) return 0;
3134 outlen = len-4;
3a2694c4 3135 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3136 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3137 if (comprlen == 0) {
88e85998 3138 zfree(out);
774e3047 3139 return 0;
3140 }
3141 /* Data compressed! Let's save it on disk */
3142 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3143 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3144 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3145 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3146 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3147 zfree(out);
774e3047 3148 return comprlen;
3149
3150writeerr:
88e85998 3151 zfree(out);
774e3047 3152 return -1;
3153}
3154
e3566d4b 3155/* Save a string objet as [len][data] on disk. If the object is a string
3156 * representation of an integer value we try to safe it in a special form */
b1befe6a 3157static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3158 int enclen;
10c43610 3159
774e3047 3160 /* Try integer encoding */
e3566d4b 3161 if (len <= 11) {
3162 unsigned char buf[5];
b1befe6a 3163 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3164 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3165 return 0;
3166 }
3167 }
774e3047 3168
3169 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3170 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3171 if (server.rdbcompression && len > 20) {
774e3047 3172 int retval;
3173
b1befe6a 3174 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3175 if (retval == -1) return -1;
3176 if (retval > 0) return 0;
3177 /* retval == 0 means data can't be compressed, save the old way */
3178 }
3179
3180 /* Store verbatim */
10c43610 3181 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3182 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3183 return 0;
3184}
3185
942a3961 3186/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3187static int rdbSaveStringObject(FILE *fp, robj *obj) {
3188 int retval;
942a3961 3189
f2d9f50f 3190 /* Avoid incr/decr ref count business when possible.
3191 * This plays well with copy-on-write given that we are probably
3192 * in a child process (BGSAVE). Also this makes sure key objects
3193 * of swapped objects are not incRefCount-ed (an assert does not allow
3194 * this in order to avoid bugs) */
3195 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3196 obj = getDecodedObject(obj);
b1befe6a 3197 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3198 decrRefCount(obj);
3199 } else {
b1befe6a 3200 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3201 }
9d65a1bb 3202 return retval;
942a3961 3203}
3204
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1; /* the prefix byte is always written */

    if (isnan(val)) {
        buf[0] = 253;
    } else if (isfinite(val)) {
        /* Length prefixed "%.17g" representation. */
        snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen((char*)buf+1);
        len += buf[0];
    } else {
        buf[0] = (val < 0) ? 255 : 254;
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
3231
06233c45 3232/* Save a Redis object. */
3233static int rdbSaveObject(FILE *fp, robj *o) {
3234 if (o->type == REDIS_STRING) {
3235 /* Save a string value */
3236 if (rdbSaveStringObject(fp,o) == -1) return -1;
3237 } else if (o->type == REDIS_LIST) {
3238 /* Save a list value */
3239 list *list = o->ptr;
c7df85a4 3240 listIter li;
06233c45 3241 listNode *ln;
3242
06233c45 3243 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3244 listRewind(list,&li);
3245 while((ln = listNext(&li))) {
06233c45 3246 robj *eleobj = listNodeValue(ln);
3247
3248 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3249 }
3250 } else if (o->type == REDIS_SET) {
3251 /* Save a set value */
3252 dict *set = o->ptr;
3253 dictIterator *di = dictGetIterator(set);
3254 dictEntry *de;
3255
3256 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3257 while((de = dictNext(di)) != NULL) {
3258 robj *eleobj = dictGetEntryKey(de);
3259
3260 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3261 }
3262 dictReleaseIterator(di);
3263 } else if (o->type == REDIS_ZSET) {
3264 /* Save a set value */
3265 zset *zs = o->ptr;
3266 dictIterator *di = dictGetIterator(zs->dict);
3267 dictEntry *de;
3268
3269 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3270 while((de = dictNext(di)) != NULL) {
3271 robj *eleobj = dictGetEntryKey(de);
3272 double *score = dictGetEntryVal(de);
3273
3274 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3275 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3276 }
3277 dictReleaseIterator(di);
b1befe6a 3278 } else if (o->type == REDIS_HASH) {
3279 /* Save a hash value */
3280 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3281 unsigned char *p = zipmapRewind(o->ptr);
3282 unsigned int count = zipmapLen(o->ptr);
3283 unsigned char *key, *val;
3284 unsigned int klen, vlen;
3285
3286 if (rdbSaveLen(fp,count) == -1) return -1;
3287 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3288 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3289 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3290 }
3291 } else {
3292 dictIterator *di = dictGetIterator(o->ptr);
3293 dictEntry *de;
3294
3295 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3296 while((de = dictNext(di)) != NULL) {
3297 robj *key = dictGetEntryKey(de);
3298 robj *val = dictGetEntryVal(de);
3299
3300 if (rdbSaveStringObject(fp,key) == -1) return -1;
3301 if (rdbSaveStringObject(fp,val) == -1) return -1;
3302 }
3303 dictReleaseIterator(di);
3304 }
06233c45 3305 } else {
78409a0f 3306 redisAssert(0);
06233c45 3307 }
3308 return 0;
3309}
3310
3311/* Return the length the object will have on disk if saved with
3312 * the rdbSaveObject() function. Currently we use a trick to get
3313 * this length with very little changes to the code. In the future
3314 * we could switch to a faster solution. */
b9bc0eef 3315static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3316 if (fp == NULL) fp = server.devnull;
06233c45 3317 rewind(fp);
3318 assert(rdbSaveObject(fp,o) != 1);
3319 return ftello(fp);
3320}
3321
06224fec 3322/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3323static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3324 off_t bytes = rdbSavedObjectLen(o,fp);
06224fec 3325
3326 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3327}
3328
ed9b544e 3329/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3330static int rdbSave(char *filename) {
ed9b544e 3331 dictIterator *di = NULL;
3332 dictEntry *de;
ed9b544e 3333 FILE *fp;
3334 char tmpfile[256];
3335 int j;
bb32ede5 3336 time_t now = time(NULL);
ed9b544e 3337
2316bb3b 3338 /* Wait for I/O therads to terminate, just in case this is a
3339 * foreground-saving, to avoid seeking the swap file descriptor at the
3340 * same time. */
3341 if (server.vm_enabled)
3342 waitEmptyIOJobsQueue();
3343
a3b21203 3344 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3345 fp = fopen(tmpfile,"w");
3346 if (!fp) {
3347 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3348 return REDIS_ERR;
3349 }
f78fd11b 3350 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3351 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3352 redisDb *db = server.db+j;
3353 dict *d = db->dict;
3305306f 3354 if (dictSize(d) == 0) continue;
ed9b544e 3355 di = dictGetIterator(d);
3356 if (!di) {
3357 fclose(fp);
3358 return REDIS_ERR;
3359 }
3360
3361 /* Write the SELECT DB opcode */
f78fd11b 3362 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3363 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3364
3365 /* Iterate this DB writing every entry */
3366 while((de = dictNext(di)) != NULL) {
3367 robj *key = dictGetEntryKey(de);
3368 robj *o = dictGetEntryVal(de);
bb32ede5 3369 time_t expiretime = getExpire(db,key);
3370
3371 /* Save the expire time */
3372 if (expiretime != -1) {
3373 /* If this key is already expired skip it */
3374 if (expiretime < now) continue;
3375 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3376 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3377 }
7e69548d 3378 /* Save the key and associated value. This requires special
3379 * handling if the value is swapped out. */
996cb5f7 3380 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3381 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3382 /* Save type, key, value */
3383 if (rdbSaveType(fp,o->type) == -1) goto werr;
3384 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3385 if (rdbSaveObject(fp,o) == -1) goto werr;
3386 } else {
996cb5f7 3387 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3388 robj *po;
7e69548d 3389 /* Get a preview of the object in memory */
3390 po = vmPreviewObject(key);
7e69548d 3391 /* Save type, key, value */
3392 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3393 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3394 if (rdbSaveObject(fp,po) == -1) goto werr;
3395 /* Remove the loaded object from memory */
3396 decrRefCount(po);
7e69548d 3397 }
ed9b544e 3398 }
3399 dictReleaseIterator(di);
3400 }
3401 /* EOF opcode */
f78fd11b 3402 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3403
3404 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3405 fflush(fp);
3406 fsync(fileno(fp));
3407 fclose(fp);
3408
3409 /* Use RENAME to make sure the DB file is changed atomically only
3410 * if the generate DB file is ok. */
3411 if (rename(tmpfile,filename) == -1) {
325d1eb4 3412 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3413 unlink(tmpfile);
3414 return REDIS_ERR;
3415 }
3416 redisLog(REDIS_NOTICE,"DB saved on disk");
3417 server.dirty = 0;
3418 server.lastsave = time(NULL);
3419 return REDIS_OK;
3420
3421werr:
3422 fclose(fp);
3423 unlink(tmpfile);
3424 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3425 if (di) dictReleaseIterator(di);
3426 return REDIS_ERR;
3427}
3428
f78fd11b 3429static int rdbSaveBackground(char *filename) {
ed9b544e 3430 pid_t childpid;
3431
9d65a1bb 3432 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3433 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3434 if ((childpid = fork()) == 0) {
3435 /* Child */
054e426d 3436 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3437 close(server.fd);
f78fd11b 3438 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3439 _exit(0);
ed9b544e 3440 } else {
478c2c6f 3441 _exit(1);
ed9b544e 3442 }
3443 } else {
3444 /* Parent */
5a7c647e 3445 if (childpid == -1) {
3446 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3447 strerror(errno));
3448 return REDIS_ERR;
3449 }
ed9b544e 3450 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3451 server.bgsavechildpid = childpid;
884d4b39 3452 updateDictResizePolicy();
ed9b544e 3453 return REDIS_OK;
3454 }
3455 return REDIS_OK; /* unreached */
3456}
3457
/* Remove the temporary dump file "temp-<pid>.rdb" that a saving process
 * with pid 'childpid' writes to. The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb",(int)childpid);
    unlink(path);
}
3464
/* Read a single type/opcode byte from 'fp'.
 * Returns the byte value (0-255), or -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char opcode;

    return (fread(&opcode,1,1,fp) == 0) ? -1 : opcode;
}
3470
/* Read a 4 byte time value from 'fp', stored in the host representation of
 * an int32_t, and return it as a time_t. Returns -1 on a short read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,sizeof(raw),1,fp) != 1) return -1;
    return (time_t) raw;
}
3476
e3566d4b 3477/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3478 * of this file for a description of how this are stored on disk.
3479 *
3480 * isencoded is set to 1 if the readed length is not actually a length but
3481 * an "encoding type", check the above comments for more info */
c78a8ccc 3482static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3483 unsigned char buf[2];
3484 uint32_t len;
c78a8ccc 3485 int type;
f78fd11b 3486
e3566d4b 3487 if (isencoded) *isencoded = 0;
c78a8ccc 3488 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3489 type = (buf[0]&0xC0)>>6;
3490 if (type == REDIS_RDB_6BITLEN) {
3491 /* Read a 6 bit len */
3492 return buf[0]&0x3F;
3493 } else if (type == REDIS_RDB_ENCVAL) {
3494 /* Read a 6 bit len encoding type */
3495 if (isencoded) *isencoded = 1;
3496 return buf[0]&0x3F;
3497 } else if (type == REDIS_RDB_14BITLEN) {
3498 /* Read a 14 bit len */
3499 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3500 return ((buf[0]&0x3F)<<8)|buf[1];
3501 } else {
3502 /* Read a 32 bit len */
f78fd11b 3503 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3504 return ntohl(len);
f78fd11b 3505 }
f78fd11b 3506}
3507
e3566d4b 3508static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3509 unsigned char enc[4];
3510 long long val;
3511
3512 if (enctype == REDIS_RDB_ENC_INT8) {
3513 if (fread(enc,1,1,fp) == 0) return NULL;
3514 val = (signed char)enc[0];
3515 } else if (enctype == REDIS_RDB_ENC_INT16) {
3516 uint16_t v;
3517 if (fread(enc,2,1,fp) == 0) return NULL;
3518 v = enc[0]|(enc[1]<<8);
3519 val = (int16_t)v;
3520 } else if (enctype == REDIS_RDB_ENC_INT32) {
3521 uint32_t v;
3522 if (fread(enc,4,1,fp) == 0) return NULL;
3523 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3524 val = (int32_t)v;
3525 } else {
3526 val = 0; /* anti-warning */
78409a0f 3527 redisAssert(0);
e3566d4b 3528 }
3529 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3530}
3531
c78a8ccc 3532static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3533 unsigned int len, clen;
3534 unsigned char *c = NULL;
3535 sds val = NULL;
3536
c78a8ccc 3537 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3538 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3539 if ((c = zmalloc(clen)) == NULL) goto err;
3540 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3541 if (fread(c,clen,1,fp) == 0) goto err;
3542 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3543 zfree(c);
88e85998 3544 return createObject(REDIS_STRING,val);
3545err:
3546 zfree(c);
3547 sdsfree(val);
3548 return NULL;
3549}
3550
c78a8ccc 3551static robj *rdbLoadStringObject(FILE*fp) {
e3566d4b 3552 int isencoded;
3553 uint32_t len;
f78fd11b 3554 sds val;
3555
c78a8ccc 3556 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3557 if (isencoded) {
3558 switch(len) {
3559 case REDIS_RDB_ENC_INT8:
3560 case REDIS_RDB_ENC_INT16:
3561 case REDIS_RDB_ENC_INT32:
bdcb92f2 3562 return rdbLoadIntegerObject(fp,len);
88e85998 3563 case REDIS_RDB_ENC_LZF:
bdcb92f2 3564 return rdbLoadLzfStringObject(fp);
e3566d4b 3565 default:
78409a0f 3566 redisAssert(0);
e3566d4b 3567 }
3568 }
3569
f78fd11b 3570 if (len == REDIS_RDB_LENERR) return NULL;
3571 val = sdsnewlen(NULL,len);
3572 if (len && fread(val,len,1,fp) == 0) {
3573 sdsfree(val);
3574 return NULL;
3575 }
bdcb92f2 3576 return createObject(REDIS_STRING,val);
f78fd11b 3577}
3578
a7866db6 3579/* For information about double serialization check rdbSaveDoubleValue() */
3580static int rdbLoadDoubleValue(FILE *fp, double *val) {
3581 char buf[128];
3582 unsigned char len;
3583
3584 if (fread(&len,1,1,fp) == 0) return -1;
3585 switch(len) {
3586 case 255: *val = R_NegInf; return 0;
3587 case 254: *val = R_PosInf; return 0;
3588 case 253: *val = R_Nan; return 0;
3589 default:
3590 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3591 buf[len] = '\0';
a7866db6 3592 sscanf(buf, "%lg", val);
3593 return 0;
3594 }
3595}
3596
c78a8ccc 3597/* Load a Redis object of the specified type from the specified file.
3598 * On success a newly allocated object is returned, otherwise NULL. */
3599static robj *rdbLoadObject(int type, FILE *fp) {
3600 robj *o;
3601
bcd11906 3602 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 3603 if (type == REDIS_STRING) {
3604 /* Read string value */
3605 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
3606 tryObjectEncoding(o);
3607 } else if (type == REDIS_LIST || type == REDIS_SET) {
3608 /* Read list/set value */
3609 uint32_t listlen;
3610
3611 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3612 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 3613 /* It's faster to expand the dict to the right size asap in order
3614 * to avoid rehashing */
3615 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3616 dictExpand(o->ptr,listlen);
c78a8ccc 3617 /* Load every single element of the list/set */
3618 while(listlen--) {
3619 robj *ele;
3620
3621 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3622 tryObjectEncoding(ele);
3623 if (type == REDIS_LIST) {
3624 listAddNodeTail((list*)o->ptr,ele);
3625 } else {
3626 dictAdd((dict*)o->ptr,ele,NULL);
3627 }
3628 }
3629 } else if (type == REDIS_ZSET) {
3630 /* Read list/set value */
ada386b2 3631 size_t zsetlen;
c78a8ccc 3632 zset *zs;
3633
3634 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3635 o = createZsetObject();
3636 zs = o->ptr;
3637 /* Load every single element of the list/set */
3638 while(zsetlen--) {
3639 robj *ele;
3640 double *score = zmalloc(sizeof(double));
3641
3642 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
3643 tryObjectEncoding(ele);
3644 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3645 dictAdd(zs->dict,ele,score);
3646 zslInsert(zs->zsl,*score,ele);
3647 incrRefCount(ele); /* added to skiplist */
3648 }
ada386b2 3649 } else if (type == REDIS_HASH) {
3650 size_t hashlen;
3651
3652 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3653 o = createHashObject();
3654 /* Too many entries? Use an hash table. */
3655 if (hashlen > server.hash_max_zipmap_entries)
3656 convertToRealHash(o);
3657 /* Load every key/value, then set it into the zipmap or hash
3658 * table, as needed. */
3659 while(hashlen--) {
3660 robj *key, *val;
3661
3662 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3663 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3664 /* If we are using a zipmap and there are too big values
3665 * the object is converted to real hash table encoding. */
3666 if (o->encoding != REDIS_ENCODING_HT &&
3667 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3668 sdslen(val->ptr) > server.hash_max_zipmap_value))
3669 {
3670 convertToRealHash(o);
3671 }
3672
3673 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3674 unsigned char *zm = o->ptr;
3675
3676 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3677 val->ptr,sdslen(val->ptr),NULL);
3678 o->ptr = zm;
3679 decrRefCount(key);
3680 decrRefCount(val);
3681 } else {
3682 tryObjectEncoding(key);
3683 tryObjectEncoding(val);
3684 dictAdd((dict*)o->ptr,key,val);
ada386b2 3685 }
3686 }
c78a8ccc 3687 } else {
78409a0f 3688 redisAssert(0);
c78a8ccc 3689 }
3690 return o;
3691}
3692
f78fd11b 3693static int rdbLoad(char *filename) {
ed9b544e 3694 FILE *fp;
f78fd11b 3695 robj *keyobj = NULL;
3696 uint32_t dbid;
bb32ede5 3697 int type, retval, rdbver;
3305306f 3698 dict *d = server.db[0].dict;
bb32ede5 3699 redisDb *db = server.db+0;
f78fd11b 3700 char buf[1024];
bb32ede5 3701 time_t expiretime = -1, now = time(NULL);
b492cf00 3702 long long loadedkeys = 0;
bb32ede5 3703
ed9b544e 3704 fp = fopen(filename,"r");
3705 if (!fp) return REDIS_ERR;
3706 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 3707 buf[9] = '\0';
3708 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 3709 fclose(fp);
3710 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3711 return REDIS_ERR;
3712 }
f78fd11b 3713 rdbver = atoi(buf+5);
c78a8ccc 3714 if (rdbver != 1) {
f78fd11b 3715 fclose(fp);
3716 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3717 return REDIS_ERR;
3718 }
ed9b544e 3719 while(1) {
3720 robj *o;
3721
3722 /* Read type. */
f78fd11b 3723 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 3724 if (type == REDIS_EXPIRETIME) {
3725 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3726 /* We read the time so we need to read the object type again */
3727 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3728 }
ed9b544e 3729 if (type == REDIS_EOF) break;
3730 /* Handle SELECT DB opcode as a special case */
3731 if (type == REDIS_SELECTDB) {
c78a8ccc 3732 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 3733 goto eoferr;
ed9b544e 3734 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 3735 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 3736 exit(1);
3737 }
bb32ede5 3738 db = server.db+dbid;
3739 d = db->dict;
ed9b544e 3740 continue;
3741 }
3742 /* Read key */
c78a8ccc 3743 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3744 /* Read value */
3745 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
ed9b544e 3746 /* Add the new object in the hash table */
f78fd11b 3747 retval = dictAdd(d,keyobj,o);
ed9b544e 3748 if (retval == DICT_ERR) {
f78fd11b 3749 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
ed9b544e 3750 exit(1);
3751 }
bb32ede5 3752 /* Set the expire time if needed */
3753 if (expiretime != -1) {
3754 setExpire(db,keyobj,expiretime);
3755 /* Delete this key if already expired */
3756 if (expiretime < now) deleteKey(db,keyobj);
3757 expiretime = -1;
3758 }
f78fd11b 3759 keyobj = o = NULL;
b492cf00 3760 /* Handle swapping while loading big datasets when VM is on */
3761 loadedkeys++;
3762 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3763 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 3764 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 3765 }
3766 }
ed9b544e 3767 }
3768 fclose(fp);
3769 return REDIS_OK;
3770
3771eoferr: /* unexpected end of file is handled here with a fatal exit */
e3566d4b 3772 if (keyobj) decrRefCount(keyobj);
f80dff62 3773 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 3774 exit(1);
3775 return REDIS_ERR; /* Just to avoid warning */
3776}
3777
3778/*================================== Commands =============================== */
3779
abcb223e 3780static void authCommand(redisClient *c) {
2e77c2ee 3781 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
3782 c->authenticated = 1;
3783 addReply(c,shared.ok);
3784 } else {
3785 c->authenticated = 0;
fa4c0aba 3786 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
3787 }
3788}
3789
ed9b544e 3790static void pingCommand(redisClient *c) {
3791 addReply(c,shared.pong);
3792}
3793
3794static void echoCommand(redisClient *c) {
dd88747b 3795 addReplyBulk(c,c->argv[1]);
ed9b544e 3796}
3797
3798/*=================================== Strings =============================== */
3799
3800static void setGenericCommand(redisClient *c, int nx) {
3801 int retval;
3802
333fd216 3803 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3305306f 3804 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3805 if (retval == DICT_ERR) {
3806 if (!nx) {
1b03836c 3807 /* If the key is about a swapped value, we want a new key object
3808 * to overwrite the old. So we delete the old key in the database.
3809 * This will also make sure that swap pages about the old object
3810 * will be marked as free. */
ddfaca9d 3811 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
1b03836c 3812 incrRefCount(c->argv[1]);
3305306f 3813 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3814 incrRefCount(c->argv[2]);
3815 } else {
c937aa89 3816 addReply(c,shared.czero);
ed9b544e 3817 return;
3818 }
3819 } else {
3820 incrRefCount(c->argv[1]);
3821 incrRefCount(c->argv[2]);
3822 }
3823 server.dirty++;
3305306f 3824 removeExpire(c->db,c->argv[1]);
c937aa89 3825 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 3826}
3827
3828static void setCommand(redisClient *c) {
a4d1ba9a 3829 setGenericCommand(c,0);
ed9b544e 3830}
3831
3832static void setnxCommand(redisClient *c) {
a4d1ba9a 3833 setGenericCommand(c,1);
ed9b544e 3834}
3835
322fc7d8 3836static int getGenericCommand(redisClient *c) {
dd88747b 3837 robj *o;
3838
3839 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 3840 return REDIS_OK;
dd88747b 3841
3842 if (o->type != REDIS_STRING) {
3843 addReply(c,shared.wrongtypeerr);
3844 return REDIS_ERR;
ed9b544e 3845 } else {
dd88747b 3846 addReplyBulk(c,o);
3847 return REDIS_OK;
ed9b544e 3848 }
3849}
3850
322fc7d8 3851static void getCommand(redisClient *c) {
3852 getGenericCommand(c);
3853}
3854
f6b141c5 3855static void getsetCommand(redisClient *c) {
322fc7d8 3856 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 3857 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3858 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3859 } else {
3860 incrRefCount(c->argv[1]);
3861 }
3862 incrRefCount(c->argv[2]);
3863 server.dirty++;
3864 removeExpire(c->db,c->argv[1]);
3865}
3866
70003d28 3867static void mgetCommand(redisClient *c) {
70003d28 3868 int j;
3869
c937aa89 3870 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 3871 for (j = 1; j < c->argc; j++) {
3305306f 3872 robj *o = lookupKeyRead(c->db,c->argv[j]);
3873 if (o == NULL) {
c937aa89 3874 addReply(c,shared.nullbulk);
70003d28 3875 } else {
70003d28 3876 if (o->type != REDIS_STRING) {
c937aa89 3877 addReply(c,shared.nullbulk);
70003d28 3878 } else {
dd88747b 3879 addReplyBulk(c,o);
70003d28 3880 }
3881 }
3882 }
3883}
3884
6c446631 3885static void msetGenericCommand(redisClient *c, int nx) {
906573e7 3886 int j, busykeys = 0;
6c446631 3887
3888 if ((c->argc % 2) == 0) {
454d4e43 3889 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 3890 return;
3891 }
3892 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3893 * set nothing at all if at least one already key exists. */
3894 if (nx) {
3895 for (j = 1; j < c->argc; j += 2) {
906573e7 3896 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3897 busykeys++;
6c446631 3898 }
3899 }
3900 }
906573e7 3901 if (busykeys) {
3902 addReply(c, shared.czero);
3903 return;
3904 }
6c446631 3905
3906 for (j = 1; j < c->argc; j += 2) {
3907 int retval;
3908
17511391 3909 tryObjectEncoding(c->argv[j+1]);
6c446631 3910 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3911 if (retval == DICT_ERR) {
3912 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3913 incrRefCount(c->argv[j+1]);
3914 } else {
3915 incrRefCount(c->argv[j]);
3916 incrRefCount(c->argv[j+1]);
3917 }
3918 removeExpire(c->db,c->argv[j]);
3919 }
3920 server.dirty += (c->argc-1)/2;
3921 addReply(c, nx ? shared.cone : shared.ok);
3922}
3923
3924static void msetCommand(redisClient *c) {
3925 msetGenericCommand(c,0);
3926}
3927
3928static void msetnxCommand(redisClient *c) {
3929 msetGenericCommand(c,1);
3930}
3931
d68ed120 3932static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 3933 long long value;
3934 int retval;
3935 robj *o;
3936
3305306f 3937 o = lookupKeyWrite(c->db,c->argv[1]);
3938 if (o == NULL) {
ed9b544e 3939 value = 0;
3940 } else {
ed9b544e 3941 if (o->type != REDIS_STRING) {
3942 value = 0;
3943 } else {
3944 char *eptr;
3945
942a3961 3946 if (o->encoding == REDIS_ENCODING_RAW)
3947 value = strtoll(o->ptr, &eptr, 10);
3948 else if (o->encoding == REDIS_ENCODING_INT)
3949 value = (long)o->ptr;
3950 else
dfc5e96c 3951 redisAssert(1 != 1);
ed9b544e 3952 }
3953 }
3954
3955 value += incr;
3956 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
942a3961 3957 tryObjectEncoding(o);
3305306f 3958 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 3959 if (retval == DICT_ERR) {
3305306f 3960 dictReplace(c->db->dict,c->argv[1],o);
3961 removeExpire(c->db,c->argv[1]);
ed9b544e 3962 } else {
3963 incrRefCount(c->argv[1]);
3964 }
3965 server.dirty++;
c937aa89 3966 addReply(c,shared.colon);
ed9b544e 3967 addReply(c,o);
3968 addReply(c,shared.crlf);
3969}
3970
3971static void incrCommand(redisClient *c) {
a4d1ba9a 3972 incrDecrCommand(c,1);
ed9b544e 3973}
3974
3975static void decrCommand(redisClient *c) {
a4d1ba9a 3976 incrDecrCommand(c,-1);
ed9b544e 3977}
3978
3979static void incrbyCommand(redisClient *c) {
d68ed120 3980 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3981 incrDecrCommand(c,incr);
ed9b544e 3982}
3983
3984static void decrbyCommand(redisClient *c) {
d68ed120 3985 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3986 incrDecrCommand(c,-incr);
ed9b544e 3987}
3988
4b00bebd 3989static void appendCommand(redisClient *c) {
3990 int retval;
3991 size_t totlen;
3992 robj *o;
3993
3994 o = lookupKeyWrite(c->db,c->argv[1]);
3995 if (o == NULL) {
3996 /* Create the key */
3997 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
3998 incrRefCount(c->argv[1]);
3999 incrRefCount(c->argv[2]);
4000 totlen = stringObjectLen(c->argv[2]);
4001 } else {
4002 dictEntry *de;
4003
4004 de = dictFind(c->db->dict,c->argv[1]);
4005 assert(de != NULL);
4006
4007 o = dictGetEntryVal(de);
4008 if (o->type != REDIS_STRING) {
4009 addReply(c,shared.wrongtypeerr);
4010 return;
4011 }
4012 /* If the object is specially encoded or shared we have to make
4013 * a copy */
4014 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4015 robj *decoded = getDecodedObject(o);
4016
4017 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4018 decrRefCount(decoded);
4019 dictReplace(c->db->dict,c->argv[1],o);
4020 }
4021 /* APPEND! */
4022 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4023 o->ptr = sdscatlen(o->ptr,
4024 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4025 } else {
4026 o->ptr = sdscatprintf(o->ptr, "%ld",
4027 (unsigned long) c->argv[2]->ptr);
4028 }
4029 totlen = sdslen(o->ptr);
4030 }
4031 server.dirty++;
4032 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4033}
4034
39191553 4035static void substrCommand(redisClient *c) {
4036 robj *o;
4037 long start = atoi(c->argv[2]->ptr);
4038 long end = atoi(c->argv[3]->ptr);
dd88747b 4039 size_t rangelen, strlen;
4040 sds range;
39191553 4041
dd88747b 4042 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4043 checkType(c,o,REDIS_STRING)) return;
39191553 4044
dd88747b 4045 o = getDecodedObject(o);
4046 strlen = sdslen(o->ptr);
8fe7fad7 4047
dd88747b 4048 /* convert negative indexes */
4049 if (start < 0) start = strlen+start;
4050 if (end < 0) end = strlen+end;
4051 if (start < 0) start = 0;
4052 if (end < 0) end = 0;
39191553 4053
dd88747b 4054 /* indexes sanity checks */
4055 if (start > end || (size_t)start >= strlen) {
4056 /* Out of range start or start > end result in null reply */
4057 addReply(c,shared.nullbulk);
4058 decrRefCount(o);
4059 return;
39191553 4060 }
dd88747b 4061 if ((size_t)end >= strlen) end = strlen-1;
4062 rangelen = (end-start)+1;
4063
4064 /* Return the result */
4065 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4066 range = sdsnewlen((char*)o->ptr+start,rangelen);
4067 addReplySds(c,range);
4068 addReply(c,shared.crlf);
4069 decrRefCount(o);
39191553 4070}
4071
ed9b544e 4072/* ========================= Type agnostic commands ========================= */
4073
4074static void delCommand(redisClient *c) {
5109cdff 4075 int deleted = 0, j;
4076
4077 for (j = 1; j < c->argc; j++) {
4078 if (deleteKey(c->db,c->argv[j])) {
4079 server.dirty++;
4080 deleted++;
4081 }
4082 }
dd88747b 4083 addReplyLong(c,deleted);
ed9b544e 4084}
4085
4086static void existsCommand(redisClient *c) {
3305306f 4087 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
ed9b544e 4088}
4089
4090static void selectCommand(redisClient *c) {
4091 int id = atoi(c->argv[1]->ptr);
4092
4093 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4094 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4095 } else {
4096 addReply(c,shared.ok);
4097 }
4098}
4099
4100static void randomkeyCommand(redisClient *c) {
4101 dictEntry *de;
3305306f 4102
4103 while(1) {
4104 de = dictGetRandomKey(c->db->dict);
ce7bef07 4105 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4106 }
ed9b544e 4107 if (de == NULL) {
ce7bef07 4108 addReply(c,shared.plus);
ed9b544e 4109 addReply(c,shared.crlf);
4110 } else {
c937aa89 4111 addReply(c,shared.plus);
ed9b544e 4112 addReply(c,dictGetEntryKey(de));
4113 addReply(c,shared.crlf);
4114 }
4115}
4116
4117static void keysCommand(redisClient *c) {
4118 dictIterator *di;
4119 dictEntry *de;
4120 sds pattern = c->argv[1]->ptr;
4121 int plen = sdslen(pattern);
a3f9eec2 4122 unsigned long numkeys = 0;
ed9b544e 4123 robj *lenobj = createObject(REDIS_STRING,NULL);
4124
3305306f 4125 di = dictGetIterator(c->db->dict);
ed9b544e 4126 addReply(c,lenobj);
4127 decrRefCount(lenobj);
4128 while((de = dictNext(di)) != NULL) {
4129 robj *keyobj = dictGetEntryKey(de);
3305306f 4130
ed9b544e 4131 sds key = keyobj->ptr;
4132 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4133 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4134 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4135 addReplyBulk(c,keyobj);
3305306f 4136 numkeys++;
3305306f 4137 }
ed9b544e 4138 }
4139 }
4140 dictReleaseIterator(di);
a3f9eec2 4141 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4142}
4143
4144static void dbsizeCommand(redisClient *c) {
4145 addReplySds(c,
3305306f 4146 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4147}
4148
4149static void lastsaveCommand(redisClient *c) {
4150 addReplySds(c,
c937aa89 4151 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4152}
4153
4154static void typeCommand(redisClient *c) {
3305306f 4155 robj *o;
ed9b544e 4156 char *type;
3305306f 4157
4158 o = lookupKeyRead(c->db,c->argv[1]);
4159 if (o == NULL) {
c937aa89 4160 type = "+none";
ed9b544e 4161 } else {
ed9b544e 4162 switch(o->type) {
c937aa89 4163 case REDIS_STRING: type = "+string"; break;
4164 case REDIS_LIST: type = "+list"; break;
4165 case REDIS_SET: type = "+set"; break;
412a8bce 4166 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4167 case REDIS_HASH: type = "+hash"; break;
4168 default: type = "+unknown"; break;
ed9b544e 4169 }
4170 }
4171 addReplySds(c,sdsnew(type));
4172 addReply(c,shared.crlf);
4173}
4174
4175static void saveCommand(redisClient *c) {
9d65a1bb 4176 if (server.bgsavechildpid != -1) {
05557f6d 4177 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4178 return;
4179 }
f78fd11b 4180 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4181 addReply(c,shared.ok);
4182 } else {
4183 addReply(c,shared.err);
4184 }
4185}
4186
4187static void bgsaveCommand(redisClient *c) {
9d65a1bb 4188 if (server.bgsavechildpid != -1) {
ed9b544e 4189 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4190 return;
4191 }
f78fd11b 4192 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4193 char *status = "+Background saving started\r\n";
4194 addReplySds(c,sdsnew(status));
ed9b544e 4195 } else {
4196 addReply(c,shared.err);
4197 }
4198}
4199
4200static void shutdownCommand(redisClient *c) {
4201 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
a3b21203 4202 /* Kill the saving child if there is a background saving in progress.
4203 We want to avoid race conditions, for instance our saving child may
4204 overwrite the synchronous saving did by SHUTDOWN. */
9d65a1bb 4205 if (server.bgsavechildpid != -1) {
9f3c422c 4206 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4207 kill(server.bgsavechildpid,SIGKILL);
a3b21203 4208 rdbRemoveTempFile(server.bgsavechildpid);
9f3c422c 4209 }
ac945e2d 4210 if (server.appendonly) {
4211 /* Append only file: fsync() the AOF and exit */
4212 fsync(server.appendfd);
054e426d 4213 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4214 exit(0);
ed9b544e 4215 } else {
ac945e2d 4216 /* Snapshotting. Perform a SYNC SAVE and exit */
4217 if (rdbSave(server.dbfilename) == REDIS_OK) {
4218 if (server.daemonize)
4219 unlink(server.pidfile);
4220 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4221 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
054e426d 4222 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4223 exit(0);
4224 } else {
dd88747b 4225 /* Ooops.. error saving! The best we can do is to continue
4226 * operating. Note that if there was a background saving process,
4227 * in the next cron() Redis will be notified that the background
4228 * saving aborted, handling special stuff like slaves pending for
4229 * synchronization... */
ac945e2d 4230 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
dd88747b 4231 addReplySds(c,
4232 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
ac945e2d 4233 }
ed9b544e 4234 }
4235}
4236
4237static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4238 robj *o;
4239
4240 /* To use the same key as src and dst is probably an error */
4241 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4242 addReply(c,shared.sameobjecterr);
ed9b544e 4243 return;
4244 }
4245
dd88747b 4246 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4247 return;
dd88747b 4248
ed9b544e 4249 incrRefCount(o);
3305306f 4250 deleteIfVolatile(c->db,c->argv[2]);
4251 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4252 if (nx) {
4253 decrRefCount(o);
c937aa89 4254 addReply(c,shared.czero);
ed9b544e 4255 return;
4256 }
3305306f 4257 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4258 } else {
4259 incrRefCount(c->argv[2]);
4260 }
3305306f 4261 deleteKey(c->db,c->argv[1]);
ed9b544e 4262 server.dirty++;
c937aa89 4263 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4264}
4265
4266static void renameCommand(redisClient *c) {
4267 renameGenericCommand(c,0);
4268}
4269
4270static void renamenxCommand(redisClient *c) {
4271 renameGenericCommand(c,1);
4272}
4273
4274static void moveCommand(redisClient *c) {
3305306f 4275 robj *o;
4276 redisDb *src, *dst;
ed9b544e 4277 int srcid;
4278
4279 /* Obtain source and target DB pointers */
3305306f 4280 src = c->db;
4281 srcid = c->db->id;
ed9b544e 4282 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4283 addReply(c,shared.outofrangeerr);
ed9b544e 4284 return;
4285 }
3305306f 4286 dst = c->db;
4287 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4288
4289 /* If the user is moving using as target the same
4290 * DB as the source DB it is probably an error. */
4291 if (src == dst) {
c937aa89 4292 addReply(c,shared.sameobjecterr);
ed9b544e 4293 return;
4294 }
4295
4296 /* Check if the element exists and get a reference */
3305306f 4297 o = lookupKeyWrite(c->db,c->argv[1]);
4298 if (!o) {
c937aa89 4299 addReply(c,shared.czero);
ed9b544e 4300 return;
4301 }
4302
4303 /* Try to add the element to the target DB */
3305306f 4304 deleteIfVolatile(dst,c->argv[1]);
4305 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4306 addReply(c,shared.czero);
ed9b544e 4307 return;
4308 }
3305306f 4309 incrRefCount(c->argv[1]);
ed9b544e 4310 incrRefCount(o);
4311
4312 /* OK! key moved, free the entry in the source DB */
3305306f 4313 deleteKey(src,c->argv[1]);
ed9b544e 4314 server.dirty++;
c937aa89 4315 addReply(c,shared.cone);
ed9b544e 4316}
4317
4318/* =================================== Lists ================================ */
4319static void pushGenericCommand(redisClient *c, int where) {
4320 robj *lobj;
ed9b544e 4321 list *list;
3305306f 4322
4323 lobj = lookupKeyWrite(c->db,c->argv[1]);
4324 if (lobj == NULL) {
95242ab5 4325 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4326 addReply(c,shared.cone);
95242ab5 4327 return;
4328 }
ed9b544e 4329 lobj = createListObject();
4330 list = lobj->ptr;
4331 if (where == REDIS_HEAD) {
6b47e12e 4332 listAddNodeHead(list,c->argv[2]);
ed9b544e 4333 } else {
6b47e12e 4334 listAddNodeTail(list,c->argv[2]);
ed9b544e 4335 }
3305306f 4336 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4337 incrRefCount(c->argv[1]);
4338 incrRefCount(c->argv[2]);
4339 } else {
ed9b544e 4340 if (lobj->type != REDIS_LIST) {
4341 addReply(c,shared.wrongtypeerr);
4342 return;
4343 }
95242ab5 4344 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4345 addReply(c,shared.cone);
95242ab5 4346 return;
4347 }
ed9b544e 4348 list = lobj->ptr;
4349 if (where == REDIS_HEAD) {
6b47e12e 4350 listAddNodeHead(list,c->argv[2]);
ed9b544e 4351 } else {
6b47e12e 4352 listAddNodeTail(list,c->argv[2]);
ed9b544e 4353 }
4354 incrRefCount(c->argv[2]);
4355 }
4356 server.dirty++;
520b5a33 4357 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
ed9b544e 4358}
4359
4360static void lpushCommand(redisClient *c) {
4361 pushGenericCommand(c,REDIS_HEAD);
4362}
4363
4364static void rpushCommand(redisClient *c) {
4365 pushGenericCommand(c,REDIS_TAIL);
4366}
4367
4368static void llenCommand(redisClient *c) {
3305306f 4369 robj *o;
ed9b544e 4370 list *l;
dd88747b 4371
4372 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4373 checkType(c,o,REDIS_LIST)) return;
ed9b544e 4374
dd88747b 4375 l = o->ptr;
4376 addReplyUlong(c,listLength(l));
ed9b544e 4377}
4378
4379static void lindexCommand(redisClient *c) {
3305306f 4380 robj *o;
ed9b544e 4381 int index = atoi(c->argv[2]->ptr);
dd88747b 4382 list *list;
4383 listNode *ln;
4384
4385 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4386 checkType(c,o,REDIS_LIST)) return;
4387 list = o->ptr;
4388
4389 ln = listIndex(list, index);
4390 if (ln == NULL) {
c937aa89 4391 addReply(c,shared.nullbulk);
ed9b544e 4392 } else {
dd88747b 4393 robj *ele = listNodeValue(ln);
4394 addReplyBulk(c,ele);
ed9b544e 4395 }
4396}
4397
4398static void lsetCommand(redisClient *c) {
3305306f 4399 robj *o;
ed9b544e 4400 int index = atoi(c->argv[2]->ptr);
dd88747b 4401 list *list;
4402 listNode *ln;
4403
4404 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4405 checkType(c,o,REDIS_LIST)) return;
4406 list = o->ptr;
4407
4408 ln = listIndex(list, index);
4409 if (ln == NULL) {
4410 addReply(c,shared.outofrangeerr);
ed9b544e 4411 } else {
dd88747b 4412 robj *ele = listNodeValue(ln);
ed9b544e 4413
dd88747b 4414 decrRefCount(ele);
4415 listNodeValue(ln) = c->argv[3];
4416 incrRefCount(c->argv[3]);
4417 addReply(c,shared.ok);
4418 server.dirty++;
ed9b544e 4419 }
4420}
4421
4422static void popGenericCommand(redisClient *c, int where) {
3305306f 4423 robj *o;
dd88747b 4424 list *list;
4425 listNode *ln;
3305306f 4426
dd88747b 4427 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4428 checkType(c,o,REDIS_LIST)) return;
4429 list = o->ptr;
ed9b544e 4430
dd88747b 4431 if (where == REDIS_HEAD)
4432 ln = listFirst(list);
4433 else
4434 ln = listLast(list);
ed9b544e 4435
dd88747b 4436 if (ln == NULL) {
4437 addReply(c,shared.nullbulk);
4438 } else {
4439 robj *ele = listNodeValue(ln);
4440 addReplyBulk(c,ele);
4441 listDelNode(list,ln);
3ea27d37 4442 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4443 server.dirty++;
ed9b544e 4444 }
4445}
4446
4447static void lpopCommand(redisClient *c) {
4448 popGenericCommand(c,REDIS_HEAD);
4449}
4450
4451static void rpopCommand(redisClient *c) {
4452 popGenericCommand(c,REDIS_TAIL);
4453}
4454
4455static void lrangeCommand(redisClient *c) {
3305306f 4456 robj *o;
ed9b544e 4457 int start = atoi(c->argv[2]->ptr);
4458 int end = atoi(c->argv[3]->ptr);
dd88747b 4459 int llen;
4460 int rangelen, j;
4461 list *list;
4462 listNode *ln;
4463 robj *ele;
4464
4465 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
4466 checkType(c,o,REDIS_LIST)) return;
4467 list = o->ptr;
4468 llen = listLength(list);
4469
4470 /* convert negative indexes */
4471 if (start < 0) start = llen+start;
4472 if (end < 0) end = llen+end;
4473 if (start < 0) start = 0;
4474 if (end < 0) end = 0;
4475
4476 /* indexes sanity checks */
4477 if (start > end || start >= llen) {
4478 /* Out of range start or start > end result in empty list */
4479 addReply(c,shared.emptymultibulk);
4480 return;
4481 }
4482 if (end >= llen) end = llen-1;
4483 rangelen = (end-start)+1;
3305306f 4484
dd88747b 4485 /* Return the result in form of a multi-bulk reply */
4486 ln = listIndex(list, start);
4487 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4488 for (j = 0; j < rangelen; j++) {
4489 ele = listNodeValue(ln);
4490 addReplyBulk(c,ele);
4491 ln = ln->next;
ed9b544e 4492 }
4493}
4494
4495static void ltrimCommand(redisClient *c) {
3305306f 4496 robj *o;
ed9b544e 4497 int start = atoi(c->argv[2]->ptr);
4498 int end = atoi(c->argv[3]->ptr);
dd88747b 4499 int llen;
4500 int j, ltrim, rtrim;
4501 list *list;
4502 listNode *ln;
4503
4504 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4505 checkType(c,o,REDIS_LIST)) return;
4506 list = o->ptr;
4507 llen = listLength(list);
4508
4509 /* convert negative indexes */
4510 if (start < 0) start = llen+start;
4511 if (end < 0) end = llen+end;
4512 if (start < 0) start = 0;
4513 if (end < 0) end = 0;
4514
4515 /* indexes sanity checks */
4516 if (start > end || start >= llen) {
4517 /* Out of range start or start > end result in empty list */
4518 ltrim = llen;
4519 rtrim = 0;
ed9b544e 4520 } else {
dd88747b 4521 if (end >= llen) end = llen-1;
4522 ltrim = start;
4523 rtrim = llen-end-1;
4524 }
ed9b544e 4525
dd88747b 4526 /* Remove list elements to perform the trim */
4527 for (j = 0; j < ltrim; j++) {
4528 ln = listFirst(list);
4529 listDelNode(list,ln);
4530 }
4531 for (j = 0; j < rtrim; j++) {
4532 ln = listLast(list);
4533 listDelNode(list,ln);
ed9b544e 4534 }
3ea27d37 4535 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4536 server.dirty++;
4537 addReply(c,shared.ok);
ed9b544e 4538}
4539
4540static void lremCommand(redisClient *c) {
3305306f 4541 robj *o;
dd88747b 4542 list *list;
4543 listNode *ln, *next;
4544 int toremove = atoi(c->argv[2]->ptr);
4545 int removed = 0;
4546 int fromtail = 0;
a4d1ba9a 4547
dd88747b 4548 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4549 checkType(c,o,REDIS_LIST)) return;
4550 list = o->ptr;
4551
4552 if (toremove < 0) {
4553 toremove = -toremove;
4554 fromtail = 1;
4555 }
4556 ln = fromtail ? list->tail : list->head;
4557 while (ln) {
4558 robj *ele = listNodeValue(ln);
4559
4560 next = fromtail ? ln->prev : ln->next;
4561 if (compareStringObjects(ele,c->argv[3]) == 0) {
4562 listDelNode(list,ln);
4563 server.dirty++;
4564 removed++;
4565 if (toremove && removed == toremove) break;
ed9b544e 4566 }
dd88747b 4567 ln = next;
ed9b544e 4568 }
3ea27d37 4569 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4570 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 4571}
4572
12f9d551 4573/* This is the semantic of this command:
0f5f7e9a 4574 * RPOPLPUSH srclist dstlist:
12f9d551 4575 * IF LLEN(srclist) > 0
4576 * element = RPOP srclist
4577 * LPUSH dstlist element
4578 * RETURN element
4579 * ELSE
4580 * RETURN nil
4581 * END
4582 * END
4583 *
4584 * The idea is to be able to get an element from a list in a reliable way
4585 * since the element is not just returned but pushed against another list
4586 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4587 */
0f5f7e9a 4588static void rpoplpushcommand(redisClient *c) {
12f9d551 4589 robj *sobj;
dd88747b 4590 list *srclist;
4591 listNode *ln;
4592
4593 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4594 checkType(c,sobj,REDIS_LIST)) return;
4595 srclist = sobj->ptr;
4596 ln = listLast(srclist);
12f9d551 4597
dd88747b 4598 if (ln == NULL) {
12f9d551 4599 addReply(c,shared.nullbulk);
4600 } else {
dd88747b 4601 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4602 robj *ele = listNodeValue(ln);
4603 list *dstlist;
e20fb74f 4604
dd88747b 4605 if (dobj && dobj->type != REDIS_LIST) {
4606 addReply(c,shared.wrongtypeerr);
4607 return;
4608 }
12f9d551 4609
dd88747b 4610 /* Add the element to the target list (unless it's directly
4611 * passed to some BLPOP-ing client */
4612 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4613 if (dobj == NULL) {
4614 /* Create the list if the key does not exist */
4615 dobj = createListObject();
4616 dictAdd(c->db->dict,c->argv[2],dobj);
4617 incrRefCount(c->argv[2]);
12f9d551 4618 }
dd88747b 4619 dstlist = dobj->ptr;
4620 listAddNodeHead(dstlist,ele);
4621 incrRefCount(ele);
12f9d551 4622 }
dd88747b 4623
4624 /* Send the element to the client as reply as well */
4625 addReplyBulk(c,ele);
4626
4627 /* Finally remove the element from the source list */
4628 listDelNode(srclist,ln);
3ea27d37 4629 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4630 server.dirty++;
12f9d551 4631 }
4632}
4633
ed9b544e 4634/* ==================================== Sets ================================ */
4635
4636static void saddCommand(redisClient *c) {
ed9b544e 4637 robj *set;
4638
3305306f 4639 set = lookupKeyWrite(c->db,c->argv[1]);
4640 if (set == NULL) {
ed9b544e 4641 set = createSetObject();
3305306f 4642 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 4643 incrRefCount(c->argv[1]);
4644 } else {
ed9b544e 4645 if (set->type != REDIS_SET) {
c937aa89 4646 addReply(c,shared.wrongtypeerr);
ed9b544e 4647 return;
4648 }
4649 }
4650 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4651 incrRefCount(c->argv[2]);
4652 server.dirty++;
c937aa89 4653 addReply(c,shared.cone);
ed9b544e 4654 } else {
c937aa89 4655 addReply(c,shared.czero);
ed9b544e 4656 }
4657}
4658
4659static void sremCommand(redisClient *c) {
3305306f 4660 robj *set;
ed9b544e 4661
dd88747b 4662 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4663 checkType(c,set,REDIS_SET)) return;
4664
4665 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4666 server.dirty++;
4667 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 4668 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4669 addReply(c,shared.cone);
ed9b544e 4670 } else {
dd88747b 4671 addReply(c,shared.czero);
ed9b544e 4672 }
4673}
4674
a4460ef4 4675static void smoveCommand(redisClient *c) {
4676 robj *srcset, *dstset;
4677
4678 srcset = lookupKeyWrite(c->db,c->argv[1]);
4679 dstset = lookupKeyWrite(c->db,c->argv[2]);
4680
4681 /* If the source key does not exist return 0, if it's of the wrong type
4682 * raise an error */
4683 if (srcset == NULL || srcset->type != REDIS_SET) {
4684 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4685 return;
4686 }
4687 /* Error if the destination key is not a set as well */
4688 if (dstset && dstset->type != REDIS_SET) {
4689 addReply(c,shared.wrongtypeerr);
4690 return;
4691 }
4692 /* Remove the element from the source set */
4693 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4694 /* Key not found in the src set! return zero */
4695 addReply(c,shared.czero);
4696 return;
4697 }
3ea27d37 4698 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4699 deleteKey(c->db,c->argv[1]);
a4460ef4 4700 server.dirty++;
4701 /* Add the element to the destination set */
4702 if (!dstset) {
4703 dstset = createSetObject();
4704 dictAdd(c->db->dict,c->argv[2],dstset);
4705 incrRefCount(c->argv[2]);
4706 }
4707 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4708 incrRefCount(c->argv[3]);
4709 addReply(c,shared.cone);
4710}
4711
ed9b544e 4712static void sismemberCommand(redisClient *c) {
3305306f 4713 robj *set;
ed9b544e 4714
dd88747b 4715 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4716 checkType(c,set,REDIS_SET)) return;
4717
4718 if (dictFind(set->ptr,c->argv[2]))
4719 addReply(c,shared.cone);
4720 else
c937aa89 4721 addReply(c,shared.czero);
ed9b544e 4722}
4723
4724static void scardCommand(redisClient *c) {
3305306f 4725 robj *o;
ed9b544e 4726 dict *s;
dd88747b 4727
4728 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4729 checkType(c,o,REDIS_SET)) return;
ed9b544e 4730
dd88747b 4731 s = o->ptr;
4732 addReplyUlong(c,dictSize(s));
ed9b544e 4733}
4734
12fea928 4735static void spopCommand(redisClient *c) {
4736 robj *set;
4737 dictEntry *de;
4738
dd88747b 4739 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4740 checkType(c,set,REDIS_SET)) return;
4741
4742 de = dictGetRandomKey(set->ptr);
4743 if (de == NULL) {
12fea928 4744 addReply(c,shared.nullbulk);
4745 } else {
dd88747b 4746 robj *ele = dictGetEntryKey(de);
12fea928 4747
dd88747b 4748 addReplyBulk(c,ele);
4749 dictDelete(set->ptr,ele);
4750 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 4751 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4752 server.dirty++;
12fea928 4753 }
4754}
4755
2abb95a9 4756static void srandmemberCommand(redisClient *c) {
4757 robj *set;
4758 dictEntry *de;
4759
dd88747b 4760 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4761 checkType(c,set,REDIS_SET)) return;
4762
4763 de = dictGetRandomKey(set->ptr);
4764 if (de == NULL) {
2abb95a9 4765 addReply(c,shared.nullbulk);
4766 } else {
dd88747b 4767 robj *ele = dictGetEntryKey(de);
2abb95a9 4768
dd88747b 4769 addReplyBulk(c,ele);
2abb95a9 4770 }
4771}
4772
ed9b544e 4773static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4774 dict **d1 = (void*) s1, **d2 = (void*) s2;
4775
3305306f 4776 return dictSize(*d1)-dictSize(*d2);
ed9b544e 4777}
4778
682ac724 4779static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 4780 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4781 dictIterator *di;
4782 dictEntry *de;
4783 robj *lenobj = NULL, *dstset = NULL;
682ac724 4784 unsigned long j, cardinality = 0;
ed9b544e 4785
ed9b544e 4786 for (j = 0; j < setsnum; j++) {
4787 robj *setobj;
3305306f 4788
4789 setobj = dstkey ?
4790 lookupKeyWrite(c->db,setskeys[j]) :
4791 lookupKeyRead(c->db,setskeys[j]);
4792 if (!setobj) {
ed9b544e 4793 zfree(dv);
5faa6025 4794 if (dstkey) {
fdcaae84 4795 if (deleteKey(c->db,dstkey))
4796 server.dirty++;
0d36ded0 4797 addReply(c,shared.czero);
5faa6025 4798 } else {
4799 addReply(c,shared.nullmultibulk);
4800 }
ed9b544e 4801 return;
4802 }
ed9b544e 4803 if (setobj->type != REDIS_SET) {
4804 zfree(dv);
c937aa89 4805 addReply(c,shared.wrongtypeerr);
ed9b544e 4806 return;
4807 }
4808 dv[j] = setobj->ptr;
4809 }
4810 /* Sort sets from the smallest to largest, this will improve our
4811 * algorithm's performace */
4812 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4813
4814 /* The first thing we should output is the total number of elements...
4815 * since this is a multi-bulk write, but at this stage we don't know
4816 * the intersection set size, so we use a trick, append an empty object
4817 * to the output list and save the pointer to later modify it with the
4818 * right length */
4819 if (!dstkey) {
4820 lenobj = createObject(REDIS_STRING,NULL);
4821 addReply(c,lenobj);
4822 decrRefCount(lenobj);
4823 } else {
4824 /* If we have a target key where to store the resulting set
4825 * create this key with an empty set inside */
4826 dstset = createSetObject();
ed9b544e 4827 }
4828
4829 /* Iterate all the elements of the first (smallest) set, and test
4830 * the element against all the other sets, if at least one set does
4831 * not include the element it is discarded */
4832 di = dictGetIterator(dv[0]);
ed9b544e 4833
4834 while((de = dictNext(di)) != NULL) {
4835 robj *ele;
4836
4837 for (j = 1; j < setsnum; j++)
4838 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4839 if (j != setsnum)
4840 continue; /* at least one set does not contain the member */
4841 ele = dictGetEntryKey(de);
4842 if (!dstkey) {
dd88747b 4843 addReplyBulk(c,ele);
ed9b544e 4844 cardinality++;
4845 } else {
4846 dictAdd(dstset->ptr,ele,NULL);
4847 incrRefCount(ele);
4848 }
4849 }
4850 dictReleaseIterator(di);
4851
83cdfe18 4852 if (dstkey) {
3ea27d37 4853 /* Store the resulting set into the target, if the intersection
4854 * is not an empty set. */
83cdfe18 4855 deleteKey(c->db,dstkey);
3ea27d37 4856 if (dictSize((dict*)dstset->ptr) > 0) {
4857 dictAdd(c->db->dict,dstkey,dstset);
4858 incrRefCount(dstkey);
d36c4e97 4859 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 4860 } else {
4861 decrRefCount(dstset);
d36c4e97 4862 addReply(c,shared.czero);
3ea27d37 4863 }
40d224a9 4864 server.dirty++;
d36c4e97 4865 } else {
4866 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 4867 }
ed9b544e 4868 zfree(dv);
4869}
4870
4871static void sinterCommand(redisClient *c) {
4872 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4873}
4874
4875static void sinterstoreCommand(redisClient *c) {
4876 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4877}
4878
f4f56e1d 4879#define REDIS_OP_UNION 0
4880#define REDIS_OP_DIFF 1
2830ca53 4881#define REDIS_OP_INTER 2
f4f56e1d 4882
4883static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 4884 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4885 dictIterator *di;
4886 dictEntry *de;
f4f56e1d 4887 robj *dstset = NULL;
40d224a9 4888 int j, cardinality = 0;
4889
40d224a9 4890 for (j = 0; j < setsnum; j++) {
4891 robj *setobj;
4892
4893 setobj = dstkey ?
4894 lookupKeyWrite(c->db,setskeys[j]) :
4895 lookupKeyRead(c->db,setskeys[j]);
4896 if (!setobj) {
4897 dv[j] = NULL;
4898 continue;
4899 }
4900 if (setobj->type != REDIS_SET) {
4901 zfree(dv);
4902 addReply(c,shared.wrongtypeerr);
4903 return;
4904 }
4905 dv[j] = setobj->ptr;
4906 }
4907
4908 /* We need a temp set object to store our union. If the dstkey
4909 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4910 * this set object will be the resulting object to set into the target key*/
4911 dstset = createSetObject();
4912
40d224a9 4913 /* Iterate all the elements of all the sets, add every element a single
4914 * time to the result set */
4915 for (j = 0; j < setsnum; j++) {
51829ed3 4916 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 4917 if (!dv[j]) continue; /* non existing keys are like empty sets */
4918
4919 di = dictGetIterator(dv[j]);
40d224a9 4920
4921 while((de = dictNext(di)) != NULL) {
4922 robj *ele;
4923
4924 /* dictAdd will not add the same element multiple times */
4925 ele = dictGetEntryKey(de);
f4f56e1d 4926 if (op == REDIS_OP_UNION || j == 0) {
4927 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4928 incrRefCount(ele);
40d224a9 4929 cardinality++;
4930 }
f4f56e1d 4931 } else if (op == REDIS_OP_DIFF) {
4932 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4933 cardinality--;
4934 }
40d224a9 4935 }
4936 }
4937 dictReleaseIterator(di);
51829ed3 4938
d36c4e97 4939 /* result set is empty? Exit asap. */
4940 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 4941 }
4942
f4f56e1d 4943 /* Output the content of the resulting set, if not in STORE mode */
4944 if (!dstkey) {
4945 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4946 di = dictGetIterator(dstset->ptr);
f4f56e1d 4947 while((de = dictNext(di)) != NULL) {
4948 robj *ele;
4949
4950 ele = dictGetEntryKey(de);
dd88747b 4951 addReplyBulk(c,ele);
f4f56e1d 4952 }
4953 dictReleaseIterator(di);
d36c4e97 4954 decrRefCount(dstset);
83cdfe18
AG
4955 } else {
4956 /* If we have a target key where to store the resulting set
4957 * create this key with the result set inside */
4958 deleteKey(c->db,dstkey);
3ea27d37 4959 if (dictSize((dict*)dstset->ptr) > 0) {
4960 dictAdd(c->db->dict,dstkey,dstset);
4961 incrRefCount(dstkey);
d36c4e97 4962 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 4963 } else {
4964 decrRefCount(dstset);
d36c4e97 4965 addReply(c,shared.czero);
3ea27d37 4966 }
40d224a9 4967 server.dirty++;
4968 }
4969 zfree(dv);
4970}
4971
4972static void sunionCommand(redisClient *c) {
f4f56e1d 4973 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 4974}
4975
4976static void sunionstoreCommand(redisClient *c) {
f4f56e1d 4977 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4978}
4979
4980static void sdiffCommand(redisClient *c) {
4981 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4982}
4983
4984static void sdiffstoreCommand(redisClient *c) {
4985 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 4986}
4987
6b47e12e 4988/* ==================================== ZSets =============================== */
4989
4990/* ZSETs are ordered sets using two data structures to hold the same elements
4991 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
4992 * data structure.
4993 *
4994 * The elements are added to an hash table mapping Redis objects to scores.
4995 * At the same time the elements are added to a skip list mapping scores
4996 * to Redis objects (so objects are sorted by scores in this "view"). */
4997
4998/* This skiplist implementation is almost a C translation of the original
4999 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5000 * Alternative to Balanced Trees", modified in three ways:
5001 * a) this implementation allows for repeated values.
5002 * b) the comparison is not just by key (our 'score') but by satellite data.
5003 * c) there is a back pointer, so it's a doubly linked list with the back
5004 * pointers being only at "level 1". This allows to traverse the list
5005 * from tail to head, useful for ZREVRANGE. */
5006
5007static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5008 zskiplistNode *zn = zmalloc(sizeof(*zn));
5009
5010 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2b37892e
PN
5011 if (level > 0)
5012 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
6b47e12e 5013 zn->score = score;
5014 zn->obj = obj;
5015 return zn;
5016}
5017
5018static zskiplist *zslCreate(void) {
5019 int j;
5020 zskiplist *zsl;
5021
5022 zsl = zmalloc(sizeof(*zsl));
5023 zsl->level = 1;
cc812361 5024 zsl->length = 0;
6b47e12e 5025 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5026 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5027 zsl->header->forward[j] = NULL;
94e543b5 5028
5029 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5030 if (j < ZSKIPLIST_MAXLEVEL-1)
5031 zsl->header->span[j] = 0;
69d95c3e 5032 }
e3870fab 5033 zsl->header->backward = NULL;
5034 zsl->tail = NULL;
6b47e12e 5035 return zsl;
5036}
5037
fd8ccf44 5038static void zslFreeNode(zskiplistNode *node) {
5039 decrRefCount(node->obj);
ad807e6f 5040 zfree(node->forward);
69d95c3e 5041 zfree(node->span);
fd8ccf44 5042 zfree(node);
5043}
5044
5045static void zslFree(zskiplist *zsl) {
ad807e6f 5046 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5047
ad807e6f 5048 zfree(zsl->header->forward);
69d95c3e 5049 zfree(zsl->header->span);
ad807e6f 5050 zfree(zsl->header);
fd8ccf44 5051 while(node) {
599379dd 5052 next = node->forward[0];
fd8ccf44 5053 zslFreeNode(node);
5054 node = next;
5055 }
ad807e6f 5056 zfree(zsl);
fd8ccf44 5057}
5058
6b47e12e 5059static int zslRandomLevel(void) {
5060 int level = 1;
5061 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5062 level += 1;
10c2baa5 5063 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5064}
5065
5066static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5067 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5068 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5069 int i, level;
5070
5071 x = zsl->header;
5072 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5073 /* store rank that is crossed to reach the insert position */
5074 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5075
9d60e6e4 5076 while (x->forward[i] &&
5077 (x->forward[i]->score < score ||
5078 (x->forward[i]->score == score &&
69d95c3e 5079 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5080 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5081 x = x->forward[i];
69d95c3e 5082 }
6b47e12e 5083 update[i] = x;
5084 }
6b47e12e 5085 /* we assume the key is not already inside, since we allow duplicated
5086 * scores, and the re-insertion of score and redis object should never
5087 * happpen since the caller of zslInsert() should test in the hash table
5088 * if the element is already inside or not. */
5089 level = zslRandomLevel();
5090 if (level > zsl->level) {
69d95c3e 5091 for (i = zsl->level; i < level; i++) {
2b37892e 5092 rank[i] = 0;
6b47e12e 5093 update[i] = zsl->header;
2b37892e 5094 update[i]->span[i-1] = zsl->length;
69d95c3e 5095 }
6b47e12e 5096 zsl->level = level;
5097 }
5098 x = zslCreateNode(level,score,obj);
5099 for (i = 0; i < level; i++) {
5100 x->forward[i] = update[i]->forward[i];
5101 update[i]->forward[i] = x;
69d95c3e
PN
5102
5103 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5104 if (i > 0) {
5105 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5106 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5107 }
6b47e12e 5108 }
69d95c3e
PN
5109
5110 /* increment span for untouched levels */
5111 for (i = level; i < zsl->level; i++) {
2b37892e 5112 update[i]->span[i-1]++;
69d95c3e
PN
5113 }
5114
bb975144 5115 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5116 if (x->forward[0])
5117 x->forward[0]->backward = x;
5118 else
5119 zsl->tail = x;
cc812361 5120 zsl->length++;
6b47e12e 5121}
5122
84105336
PN
5123/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5124void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5125 int i;
5126 for (i = 0; i < zsl->level; i++) {
5127 if (update[i]->forward[i] == x) {
5128 if (i > 0) {
5129 update[i]->span[i-1] += x->span[i-1] - 1;
5130 }
5131 update[i]->forward[i] = x->forward[i];
5132 } else {
5133 /* invariant: i > 0, because update[0]->forward[0]
5134 * is always equal to x */
5135 update[i]->span[i-1] -= 1;
5136 }
5137 }
5138 if (x->forward[0]) {
5139 x->forward[0]->backward = x->backward;
5140 } else {
5141 zsl->tail = x->backward;
5142 }
5143 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5144 zsl->level--;
5145 zsl->length--;
5146}
5147
50c55df5 5148/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5149static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5150 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5151 int i;
5152
5153 x = zsl->header;
5154 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5155 while (x->forward[i] &&
5156 (x->forward[i]->score < score ||
5157 (x->forward[i]->score == score &&
5158 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5159 x = x->forward[i];
5160 update[i] = x;
5161 }
5162 /* We may have multiple elements with the same score, what we need
5163 * is to find the element with both the right score and object. */
5164 x = x->forward[0];
50c55df5 5165 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
84105336 5166 zslDeleteNode(zsl, x, update);
9d60e6e4 5167 zslFreeNode(x);
9d60e6e4 5168 return 1;
5169 } else {
5170 return 0; /* not found */
e197b441 5171 }
5172 return 0; /* not found */
fd8ccf44 5173}
5174
1807985b 5175/* Delete all the elements with score between min and max from the skiplist.
5176 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5177 * Note that this function takes the reference to the hash table view of the
5178 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5179static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5180 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5181 unsigned long removed = 0;
5182 int i;
5183
5184 x = zsl->header;
5185 for (i = zsl->level-1; i >= 0; i--) {
5186 while (x->forward[i] && x->forward[i]->score < min)
5187 x = x->forward[i];
5188 update[i] = x;
5189 }
5190 /* We may have multiple elements with the same score, what we need
5191 * is to find the element with both the right score and object. */
5192 x = x->forward[0];
5193 while (x && x->score <= max) {
84105336
PN
5194 zskiplistNode *next = x->forward[0];
5195 zslDeleteNode(zsl, x, update);
1807985b 5196 dictDelete(dict,x->obj);
5197 zslFreeNode(x);
1807985b 5198 removed++;
5199 x = next;
5200 }
5201 return removed; /* not found */
5202}
1807985b 5203
9212eafd 5204/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5205 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5206static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5207 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5208 unsigned long traversed = 0, removed = 0;
5209 int i;
5210
9212eafd
PN
5211 x = zsl->header;
5212 for (i = zsl->level-1; i >= 0; i--) {
5213 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5214 traversed += i > 0 ? x->span[i-1] : 1;
5215 x = x->forward[i];
1807985b 5216 }
9212eafd
PN
5217 update[i] = x;
5218 }
5219
5220 traversed++;
5221 x = x->forward[0];
5222 while (x && traversed <= end) {
84105336
PN
5223 zskiplistNode *next = x->forward[0];
5224 zslDeleteNode(zsl, x, update);
1807985b 5225 dictDelete(dict,x->obj);
5226 zslFreeNode(x);
1807985b 5227 removed++;
9212eafd 5228 traversed++;
1807985b 5229 x = next;
5230 }
9212eafd 5231 return removed;
1807985b 5232}
5233
50c55df5 5234/* Find the first node having a score equal or greater than the specified one.
5235 * Returns NULL if there is no match. */
5236static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5237 zskiplistNode *x;
5238 int i;
5239
5240 x = zsl->header;
5241 for (i = zsl->level-1; i >= 0; i--) {
5242 while (x->forward[i] && x->forward[i]->score < score)
5243 x = x->forward[i];
5244 }
5245 /* We may have multiple elements with the same score, what we need
5246 * is to find the element with both the right score and object. */
5247 return x->forward[0];
5248}
5249
27b0ccca
PN
5250/* Find the rank for an element by both score and key.
5251 * Returns 0 when the element cannot be found, rank otherwise.
5252 * Note that the rank is 1-based due to the span of zsl->header to the
5253 * first element. */
5254static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5255 zskiplistNode *x;
5256 unsigned long rank = 0;
5257 int i;
5258
5259 x = zsl->header;
5260 for (i = zsl->level-1; i >= 0; i--) {
5261 while (x->forward[i] &&
5262 (x->forward[i]->score < score ||
5263 (x->forward[i]->score == score &&
5264 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5265 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5266 x = x->forward[i];
5267 }
5268
5269 /* x might be equal to zsl->header, so test if obj is non-NULL */
5270 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5271 return rank;
5272 }
5273 }
5274 return 0;
5275}
5276
e74825c2
PN
5277/* Finds an element by its rank. The rank argument needs to be 1-based. */
5278zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5279 zskiplistNode *x;
5280 unsigned long traversed = 0;
5281 int i;
5282
5283 x = zsl->header;
5284 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 5285 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5286 {
a50ea45c 5287 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5288 x = x->forward[i];
5289 }
e74825c2
PN
5290 if (traversed == rank) {
5291 return x;
5292 }
5293 }
5294 return NULL;
5295}
5296
fd8ccf44 5297/* The actual Z-commands implementations */
5298
7db723ad 5299/* This generic command implements both ZADD and ZINCRBY.
e2665397 5300 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5301 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5302static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5303 robj *zsetobj;
5304 zset *zs;
5305 double *score;
5306
e2665397 5307 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5308 if (zsetobj == NULL) {
5309 zsetobj = createZsetObject();
e2665397 5310 dictAdd(c->db->dict,key,zsetobj);
5311 incrRefCount(key);
fd8ccf44 5312 } else {
5313 if (zsetobj->type != REDIS_ZSET) {
5314 addReply(c,shared.wrongtypeerr);
5315 return;
5316 }
5317 }
fd8ccf44 5318 zs = zsetobj->ptr;
e2665397 5319
7db723ad 5320 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5321 * needs to handle the two different conditions. It's all about setting
5322 * '*score', that is, the new score to set, to the right value. */
5323 score = zmalloc(sizeof(double));
5324 if (doincrement) {
5325 dictEntry *de;
5326
5327 /* Read the old score. If the element was not present starts from 0 */
5328 de = dictFind(zs->dict,ele);
5329 if (de) {
5330 double *oldscore = dictGetEntryVal(de);
5331 *score = *oldscore + scoreval;
5332 } else {
5333 *score = scoreval;
5334 }
5335 } else {
5336 *score = scoreval;
5337 }
5338
5339 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5340 * to both ZADD and ZINCRBY... */
e2665397 5341 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5342 /* case 1: New element */
e2665397 5343 incrRefCount(ele); /* added to hash */
5344 zslInsert(zs->zsl,*score,ele);
5345 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5346 server.dirty++;
e2665397 5347 if (doincrement)
e2665397 5348 addReplyDouble(c,*score);
91d71bfc 5349 else
5350 addReply(c,shared.cone);
fd8ccf44 5351 } else {
5352 dictEntry *de;
5353 double *oldscore;
5354
5355 /* case 2: Score update operation */
e2665397 5356 de = dictFind(zs->dict,ele);
dfc5e96c 5357 redisAssert(de != NULL);
fd8ccf44 5358 oldscore = dictGetEntryVal(de);
5359 if (*score != *oldscore) {
5360 int deleted;
5361
e2665397 5362 /* Remove and insert the element in the skip list with new score */
5363 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5364 redisAssert(deleted != 0);
e2665397 5365 zslInsert(zs->zsl,*score,ele);
5366 incrRefCount(ele);
5367 /* Update the score in the hash table */
5368 dictReplace(zs->dict,ele,score);
fd8ccf44 5369 server.dirty++;
2161a965 5370 } else {
5371 zfree(score);
fd8ccf44 5372 }
e2665397 5373 if (doincrement)
5374 addReplyDouble(c,*score);
5375 else
5376 addReply(c,shared.czero);
fd8ccf44 5377 }
5378}
5379
e2665397 5380static void zaddCommand(redisClient *c) {
5381 double scoreval;
5382
5383 scoreval = strtod(c->argv[2]->ptr,NULL);
5384 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5385}
5386
7db723ad 5387static void zincrbyCommand(redisClient *c) {
e2665397 5388 double scoreval;
5389
5390 scoreval = strtod(c->argv[2]->ptr,NULL);
5391 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5392}
5393
1b7106e7 5394static void zremCommand(redisClient *c) {
5395 robj *zsetobj;
5396 zset *zs;
dd88747b 5397 dictEntry *de;
5398 double *oldscore;
5399 int deleted;
1b7106e7 5400
dd88747b 5401 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5402 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 5403
dd88747b 5404 zs = zsetobj->ptr;
5405 de = dictFind(zs->dict,c->argv[2]);
5406 if (de == NULL) {
5407 addReply(c,shared.czero);
5408 return;
1b7106e7 5409 }
dd88747b 5410 /* Delete from the skiplist */
5411 oldscore = dictGetEntryVal(de);
5412 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5413 redisAssert(deleted != 0);
5414
5415 /* Delete from the hash table */
5416 dictDelete(zs->dict,c->argv[2]);
5417 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5418 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5419 server.dirty++;
5420 addReply(c,shared.cone);
1b7106e7 5421}
5422
1807985b 5423static void zremrangebyscoreCommand(redisClient *c) {
5424 double min = strtod(c->argv[2]->ptr,NULL);
5425 double max = strtod(c->argv[3]->ptr,NULL);
dd88747b 5426 long deleted;
1807985b 5427 robj *zsetobj;
5428 zset *zs;
5429
dd88747b 5430 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5431 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 5432
dd88747b 5433 zs = zsetobj->ptr;
5434 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5435 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5436 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5437 server.dirty += deleted;
5438 addReplyLong(c,deleted);
1807985b 5439}
5440
9212eafd
PN
5441static void zremrangebyrankCommand(redisClient *c) {
5442 int start = atoi(c->argv[2]->ptr);
5443 int end = atoi(c->argv[3]->ptr);
dd88747b 5444 int llen;
5445 long deleted;
9212eafd
PN
5446 robj *zsetobj;
5447 zset *zs;
5448
dd88747b 5449 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5450 checkType(c,zsetobj,REDIS_ZSET)) return;
5451 zs = zsetobj->ptr;
5452 llen = zs->zsl->length;
9212eafd 5453
dd88747b 5454 /* convert negative indexes */
5455 if (start < 0) start = llen+start;
5456 if (end < 0) end = llen+end;
5457 if (start < 0) start = 0;
5458 if (end < 0) end = 0;
9212eafd 5459
dd88747b 5460 /* indexes sanity checks */
5461 if (start > end || start >= llen) {
5462 addReply(c,shared.czero);
5463 return;
9212eafd 5464 }
dd88747b 5465 if (end >= llen) end = llen-1;
5466
5467 /* increment start and end because zsl*Rank functions
5468 * use 1-based rank */
5469 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5470 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5471 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5472 server.dirty += deleted;
5473 addReplyLong(c, deleted);
9212eafd
PN
5474}
5475
8f92e768
PN
5476typedef struct {
5477 dict *dict;
5478 double weight;
5479} zsetopsrc;
5480
5481static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5482 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5483 unsigned long size1, size2;
5484 size1 = d1->dict ? dictSize(d1->dict) : 0;
5485 size2 = d2->dict ? dictSize(d2->dict) : 0;
5486 return size1 - size2;
5487}
5488
d2764cd6
PN
/* Aggregation modes used to combine the scores of an element found in more
 * than one input set. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Fold 'val' into '*target' according to 'aggregate': add it, or keep the
 * smaller / larger of the two values. Any other mode trips an assertion. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisAssert(0 != 0);
        break;
    }
}
5505
2830ca53 5506static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
8f92e768 5507 int i, j, zsetnum;
d2764cd6 5508 int aggregate = REDIS_AGGR_SUM;
8f92e768 5509 zsetopsrc *src;
2830ca53
PN
5510 robj *dstobj;
5511 zset *dstzset;
b287c9bb
PN
5512 dictIterator *di;
5513 dictEntry *de;
5514
2830ca53
PN
5515 /* expect zsetnum input keys to be given */
5516 zsetnum = atoi(c->argv[2]->ptr);
5517 if (zsetnum < 1) {
5518 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5519 return;
b287c9bb 5520 }
2830ca53
PN
5521
5522 /* test if the expected number of keys would overflow */
5523 if (3+zsetnum > c->argc) {
b287c9bb
PN
5524 addReply(c,shared.syntaxerr);
5525 return;
5526 }
5527
2830ca53 5528 /* read keys to be used for input */
b9eed483 5529 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
2830ca53 5530 for (i = 0, j = 3; i < zsetnum; i++, j++) {
b287c9bb
PN
5531 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5532 if (!zsetobj) {
8f92e768 5533 src[i].dict = NULL;
b287c9bb
PN
5534 } else {
5535 if (zsetobj->type != REDIS_ZSET) {
8f92e768 5536 zfree(src);
b287c9bb
PN
5537 addReply(c,shared.wrongtypeerr);
5538 return;
5539 }
8f92e768 5540 src[i].dict = ((zset*)zsetobj->ptr)->dict;
b287c9bb 5541 }
2830ca53
PN
5542
5543 /* default all weights to 1 */
8f92e768 5544 src[i].weight = 1.0;
b287c9bb
PN
5545 }
5546
2830ca53
PN
5547 /* parse optional extra arguments */
5548 if (j < c->argc) {
d2764cd6 5549 int remaining = c->argc - j;
b287c9bb 5550
2830ca53 5551 while (remaining) {
d2764cd6 5552 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 5553 j++; remaining--;
2830ca53 5554 for (i = 0; i < zsetnum; i++, j++, remaining--) {
8f92e768 5555 src[i].weight = strtod(c->argv[j]->ptr, NULL);
2830ca53 5556 }
d2764cd6
PN
5557 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5558 j++; remaining--;
5559 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5560 aggregate = REDIS_AGGR_SUM;
5561 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5562 aggregate = REDIS_AGGR_MIN;
5563 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5564 aggregate = REDIS_AGGR_MAX;
5565 } else {
5566 zfree(src);
5567 addReply(c,shared.syntaxerr);
5568 return;
5569 }
5570 j++; remaining--;
2830ca53 5571 } else {
8f92e768 5572 zfree(src);
2830ca53
PN
5573 addReply(c,shared.syntaxerr);
5574 return;
5575 }
5576 }
5577 }
b287c9bb 5578
d2764cd6
PN
5579 /* sort sets from the smallest to largest, this will improve our
5580 * algorithm's performance */
5581 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5582
2830ca53
PN
5583 dstobj = createZsetObject();
5584 dstzset = dstobj->ptr;
5585
5586 if (op == REDIS_OP_INTER) {
8f92e768
PN
5587 /* skip going over all entries if the smallest zset is NULL or empty */
5588 if (src[0].dict && dictSize(src[0].dict) > 0) {
5589 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5590 * from small to large, all src[i > 0].dict are non-empty too */
5591 di = dictGetIterator(src[0].dict);
2830ca53 5592 while((de = dictNext(di)) != NULL) {
d2764cd6
PN
5593 double *score = zmalloc(sizeof(double)), value;
5594 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5595
d2764cd6
PN
5596 for (j = 1; j < zsetnum; j++) {
5597 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5598 if (other) {
d2764cd6
PN
5599 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5600 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5601 } else {
5602 break;
5603 }
5604 }
b287c9bb 5605
2830ca53 5606 /* skip entry when not present in every source dict */
8f92e768 5607 if (j != zsetnum) {
2830ca53
PN
5608 zfree(score);
5609 } else {
5610 robj *o = dictGetEntryKey(de);
5611 dictAdd(dstzset->dict,o,score);
5612 incrRefCount(o); /* added to dictionary */
5613 zslInsert(dstzset->zsl,*score,o);
5614 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
5615 }
5616 }
2830ca53
PN
5617 dictReleaseIterator(di);
5618 }
5619 } else if (op == REDIS_OP_UNION) {
5620 for (i = 0; i < zsetnum; i++) {
8f92e768 5621 if (!src[i].dict) continue;
2830ca53 5622
8f92e768 5623 di = dictGetIterator(src[i].dict);
2830ca53
PN
5624 while((de = dictNext(di)) != NULL) {
5625 /* skip key when already processed */
5626 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5627
d2764cd6
PN
5628 double *score = zmalloc(sizeof(double)), value;
5629 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5630
d2764cd6
PN
5631 /* because the zsets are sorted by size, its only possible
5632 * for sets at larger indices to hold this entry */
5633 for (j = (i+1); j < zsetnum; j++) {
5634 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5635 if (other) {
d2764cd6
PN
5636 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5637 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5638 }
5639 }
b287c9bb 5640
2830ca53
PN
5641 robj *o = dictGetEntryKey(de);
5642 dictAdd(dstzset->dict,o,score);
5643 incrRefCount(o); /* added to dictionary */
5644 zslInsert(dstzset->zsl,*score,o);
5645 incrRefCount(o); /* added to skiplist */
5646 }
5647 dictReleaseIterator(di);
b287c9bb 5648 }
2830ca53
PN
5649 } else {
5650 /* unknown operator */
5651 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
5652 }
5653
5654 deleteKey(c->db,dstkey);
3ea27d37 5655 if (dstzset->zsl->length) {
5656 dictAdd(c->db->dict,dstkey,dstobj);
5657 incrRefCount(dstkey);
5658 addReplyLong(c, dstzset->zsl->length);
5659 server.dirty++;
5660 } else {
8bca8773 5661 decrRefCount(dstobj);
3ea27d37 5662 addReply(c, shared.czero);
5663 }
8f92e768 5664 zfree(src);
b287c9bb
PN
5665}
5666
2830ca53
PN
5667static void zunionCommand(redisClient *c) {
5668 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
5669}
5670
2830ca53
PN
5671static void zinterCommand(redisClient *c) {
5672 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
5673}
5674
e3870fab 5675static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 5676 robj *o;
5677 int start = atoi(c->argv[2]->ptr);
5678 int end = atoi(c->argv[3]->ptr);
752da584 5679 int withscores = 0;
dd88747b 5680 int llen;
5681 int rangelen, j;
5682 zset *zsetobj;
5683 zskiplist *zsl;
5684 zskiplistNode *ln;
5685 robj *ele;
752da584 5686
5687 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5688 withscores = 1;
5689 } else if (c->argc >= 5) {
5690 addReply(c,shared.syntaxerr);
5691 return;
5692 }
cc812361 5693
dd88747b 5694 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
5695 checkType(c,o,REDIS_ZSET)) return;
5696 zsetobj = o->ptr;
5697 zsl = zsetobj->zsl;
5698 llen = zsl->length;
cc812361 5699
dd88747b 5700 /* convert negative indexes */
5701 if (start < 0) start = llen+start;
5702 if (end < 0) end = llen+end;
5703 if (start < 0) start = 0;
5704 if (end < 0) end = 0;
cc812361 5705
dd88747b 5706 /* indexes sanity checks */
5707 if (start > end || start >= llen) {
5708 /* Out of range start or start > end result in empty list */
5709 addReply(c,shared.emptymultibulk);
5710 return;
5711 }
5712 if (end >= llen) end = llen-1;
5713 rangelen = (end-start)+1;
cc812361 5714
dd88747b 5715 /* check if starting point is trivial, before searching
5716 * the element in log(N) time */
5717 if (reverse) {
5718 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5719 } else {
5720 ln = start == 0 ?
5721 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5722 }
cc812361 5723
dd88747b 5724 /* Return the result in form of a multi-bulk reply */
5725 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5726 withscores ? (rangelen*2) : rangelen));
5727 for (j = 0; j < rangelen; j++) {
5728 ele = ln->obj;
5729 addReplyBulk(c,ele);
5730 if (withscores)
5731 addReplyDouble(c,ln->score);
5732 ln = reverse ? ln->backward : ln->forward[0];
cc812361 5733 }
5734}
5735
e3870fab 5736static void zrangeCommand(redisClient *c) {
5737 zrangeGenericCommand(c,0);
5738}
5739
5740static void zrevrangeCommand(redisClient *c) {
5741 zrangeGenericCommand(c,1);
5742}
5743
f44dd428 5744/* This command implements both ZRANGEBYSCORE and ZCOUNT.
5745 * If justcount is non-zero, just the count is returned. */
5746static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 5747 robj *o;
f44dd428 5748 double min, max;
5749 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 5750 int offset = 0, limit = -1;
0500ef27
SH
5751 int withscores = 0;
5752 int badsyntax = 0;
5753
f44dd428 5754 /* Parse the min-max interval. If one of the values is prefixed
5755 * by the "(" character, it's considered "open". For instance
5756 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5757 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5758 if (((char*)c->argv[2]->ptr)[0] == '(') {
5759 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5760 minex = 1;
5761 } else {
5762 min = strtod(c->argv[2]->ptr,NULL);
5763 }
5764 if (((char*)c->argv[3]->ptr)[0] == '(') {
5765 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5766 maxex = 1;
5767 } else {
5768 max = strtod(c->argv[3]->ptr,NULL);
5769 }
5770
5771 /* Parse "WITHSCORES": note that if the command was called with
5772 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5773 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 5774 if (c->argc == 5 || c->argc == 8) {
3a3978b1 5775 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5776 withscores = 1;
5777 else
5778 badsyntax = 1;
0500ef27 5779 }
3a3978b1 5780 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 5781 badsyntax = 1;
0500ef27 5782 if (badsyntax) {
454d4e43 5783 addReplySds(c,
5784 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 5785 return;
0500ef27
SH
5786 }
5787
f44dd428 5788 /* Parse "LIMIT" */
0500ef27 5789 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 5790 addReply(c,shared.syntaxerr);
5791 return;
0500ef27 5792 } else if (c->argc == (7 + withscores)) {
80181f78 5793 offset = atoi(c->argv[5]->ptr);
5794 limit = atoi(c->argv[6]->ptr);
0b13687c 5795 if (offset < 0) offset = 0;
80181f78 5796 }
50c55df5 5797
f44dd428 5798 /* Ok, lookup the key and get the range */
50c55df5 5799 o = lookupKeyRead(c->db,c->argv[1]);
5800 if (o == NULL) {
f44dd428 5801 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
50c55df5 5802 } else {
5803 if (o->type != REDIS_ZSET) {
5804 addReply(c,shared.wrongtypeerr);
5805 } else {
5806 zset *zsetobj = o->ptr;
5807 zskiplist *zsl = zsetobj->zsl;
5808 zskiplistNode *ln;
f44dd428 5809 robj *ele, *lenobj = NULL;
5810 unsigned long rangelen = 0;
50c55df5 5811
f44dd428 5812 /* Get the first node with the score >= min, or with
5813 * score > min if 'minex' is true. */
50c55df5 5814 ln = zslFirstWithScore(zsl,min);
f44dd428 5815 while (minex && ln && ln->score == min) ln = ln->forward[0];
5816
50c55df5 5817 if (ln == NULL) {
5818 /* No element matching the speciifed interval */
f44dd428 5819 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 5820 return;
5821 }
5822
5823 /* We don't know in advance how many matching elements there
5824 * are in the list, so we push this object that will represent
5825 * the multi-bulk length in the output buffer, and will "fix"
5826 * it later */
f44dd428 5827 if (!justcount) {
5828 lenobj = createObject(REDIS_STRING,NULL);
5829 addReply(c,lenobj);
5830 decrRefCount(lenobj);
5831 }
50c55df5 5832
f44dd428 5833 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 5834 if (offset) {
5835 offset--;
5836 ln = ln->forward[0];
5837 continue;
5838 }
5839 if (limit == 0) break;
f44dd428 5840 if (!justcount) {
5841 ele = ln->obj;
dd88747b 5842 addReplyBulk(c,ele);
f44dd428 5843 if (withscores)
5844 addReplyDouble(c,ln->score);
5845 }
50c55df5 5846 ln = ln->forward[0];
5847 rangelen++;
80181f78 5848 if (limit > 0) limit--;
50c55df5 5849 }
f44dd428 5850 if (justcount) {
5851 addReplyLong(c,(long)rangelen);
5852 } else {
5853 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5854 withscores ? (rangelen*2) : rangelen);
5855 }
50c55df5 5856 }
5857 }
5858}
5859
f44dd428 5860static void zrangebyscoreCommand(redisClient *c) {
5861 genericZrangebyscoreCommand(c,0);
5862}
5863
5864static void zcountCommand(redisClient *c) {
5865 genericZrangebyscoreCommand(c,1);
5866}
5867
3c41331e 5868static void zcardCommand(redisClient *c) {
e197b441 5869 robj *o;
5870 zset *zs;
dd88747b 5871
5872 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5873 checkType(c,o,REDIS_ZSET)) return;
5874
5875 zs = o->ptr;
5876 addReplyUlong(c,zs->zsl->length);
e197b441 5877}
5878
6e333bbe 5879static void zscoreCommand(redisClient *c) {
5880 robj *o;
5881 zset *zs;
dd88747b 5882 dictEntry *de;
5883
5884 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5885 checkType(c,o,REDIS_ZSET)) return;
5886
5887 zs = o->ptr;
5888 de = dictFind(zs->dict,c->argv[2]);
5889 if (!de) {
96d8b4ee 5890 addReply(c,shared.nullbulk);
6e333bbe 5891 } else {
dd88747b 5892 double *score = dictGetEntryVal(de);
6e333bbe 5893
dd88747b 5894 addReplyDouble(c,*score);
6e333bbe 5895 }
5896}
5897
798d9e55 5898static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 5899 robj *o;
dd88747b 5900 zset *zs;
5901 zskiplist *zsl;
5902 dictEntry *de;
5903 unsigned long rank;
5904 double *score;
5905
5906 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5907 checkType(c,o,REDIS_ZSET)) return;
5908
5909 zs = o->ptr;
5910 zsl = zs->zsl;
5911 de = dictFind(zs->dict,c->argv[2]);
5912 if (!de) {
69d95c3e
PN
5913 addReply(c,shared.nullbulk);
5914 return;
5915 }
69d95c3e 5916
dd88747b 5917 score = dictGetEntryVal(de);
5918 rank = zslGetRank(zsl, *score, c->argv[2]);
5919 if (rank) {
5920 if (reverse) {
5921 addReplyLong(c, zsl->length - rank);
27b0ccca 5922 } else {
dd88747b 5923 addReplyLong(c, rank-1);
69d95c3e 5924 }
dd88747b 5925 } else {
5926 addReply(c,shared.nullbulk);
978c2c94 5927 }
5928}
5929
798d9e55
PN
5930static void zrankCommand(redisClient *c) {
5931 zrankGenericCommand(c, 0);
5932}
5933
5934static void zrevrankCommand(redisClient *c) {
5935 zrankGenericCommand(c, 1);
5936}
5937
cbba7dd7 5938/* =================================== Hashes =============================== */
978c2c94 5939static void hsetCommand(redisClient *c) {
5940 int update = 0;
5941 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5942
5943 if (o == NULL) {
5944 o = createHashObject();
5945 dictAdd(c->db->dict,c->argv[1],o);
5946 incrRefCount(c->argv[1]);
5947 } else {
5948 if (o->type != REDIS_HASH) {
5949 addReply(c,shared.wrongtypeerr);
5950 return;
5951 }
5952 }
bae2c7ec 5953 /* We want to convert the zipmap into an hash table right now if the
5954 * entry to be added is too big. Note that we check if the object
5955 * is integer encoded before to try fetching the length in the test below.
5956 * This is because integers are small, but currently stringObjectLen()
5957 * performs a slow conversion: not worth it. */
5958 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
5959 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
5960 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
5961 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
5962 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
5963 {
5964 convertToRealHash(o);
5965 }
5966
978c2c94 5967 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5968 unsigned char *zm = o->ptr;
b1befe6a 5969 robj *valobj = getDecodedObject(c->argv[3]);
978c2c94 5970
5971 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
b1befe6a 5972 valobj->ptr,sdslen(valobj->ptr),&update);
5973 decrRefCount(valobj);
cbba7dd7 5974 o->ptr = zm;
bae2c7ec 5975
e9484a85
PN
5976 /* And here there is the second check for hash conversion. */
5977 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
bae2c7ec 5978 convertToRealHash(o);
978c2c94 5979 } else {
bae2c7ec 5980 tryObjectEncoding(c->argv[2]);
5981 /* note that c->argv[3] is already encoded, as the latest arg
5982 * of a bulk command is always integer encoded if possible. */
2069d06a 5983 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
978c2c94 5984 incrRefCount(c->argv[2]);
5985 } else {
5986 update = 1;
5987 }
5988 incrRefCount(c->argv[3]);
5989 }
5990 server.dirty++;
5991 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
5992}
5993
01426b05 5994static void hincrbyCommand(redisClient *c) {
01426b05
PN
5995 long long value = 0, incr = 0;
5996 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5997
5998 if (o == NULL) {
5999 o = createHashObject();
6000 dictAdd(c->db->dict,c->argv[1],o);
6001 incrRefCount(c->argv[1]);
6002 } else {
6003 if (o->type != REDIS_HASH) {
6004 addReply(c,shared.wrongtypeerr);
6005 return;
6006 }
6007 }
6008
5e26ae88 6009 incr = strtoll(c->argv[3]->ptr, NULL, 10);
01426b05
PN
6010 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6011 unsigned char *zm = o->ptr;
6012 unsigned char *zval;
6013 unsigned int zvlen;
6014
6015 /* Find value if already present in hash */
6016 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6017 &zval,&zvlen)) {
6018 /* strtoll needs the char* to have a trailing \0, but
6019 * the zipmap doesn't include them. */
6020 sds szval = sdsnewlen(zval, zvlen);
6021 value = strtoll(szval,NULL,10);
6022 sdsfree(szval);
6023 }
6024
6025 value += incr;
6026 sds svalue = sdscatprintf(sdsempty(),"%lld",value);
6027 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
e9484a85 6028 (unsigned char*)svalue,sdslen(svalue),NULL);
01426b05
PN
6029 sdsfree(svalue);
6030 o->ptr = zm;
6031
e9484a85
PN
6032 /* Check if the zipmap needs to be converted. */
6033 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
01426b05
PN
6034 convertToRealHash(o);
6035 } else {
6036 robj *hval;
6037 dictEntry *de;
6038
6039 /* Find value if already present in hash */
6040 de = dictFind(o->ptr,c->argv[2]);
6041 if (de != NULL) {
6042 hval = dictGetEntryVal(de);
6043 if (hval->encoding == REDIS_ENCODING_RAW)
6044 value = strtoll(hval->ptr,NULL,10);
6045 else if (hval->encoding == REDIS_ENCODING_INT)
6046 value = (long)hval->ptr;
6047 else
6048 redisAssert(1 != 1);
6049 }
6050
6051 value += incr;
6052 hval = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
6053 tryObjectEncoding(hval);
01426b05
PN
6054 if (dictReplace(o->ptr,c->argv[2],hval)) {
6055 incrRefCount(c->argv[2]);
6056 }
6057 }
6058
6059 server.dirty++;
aa7c2934 6060 addReplyLongLong(c, value);
01426b05
PN
6061}
6062
978c2c94 6063static void hgetCommand(redisClient *c) {
dd88747b 6064 robj *o;
978c2c94 6065
dd88747b 6066 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6067 checkType(c,o,REDIS_HASH)) return;
6068
6069 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6070 unsigned char *zm = o->ptr;
6071 unsigned char *val;
6072 unsigned int vlen;
164ee595 6073 robj *field;
dd88747b 6074
164ee595 6075 field = getDecodedObject(c->argv[2]);
6076 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
dd88747b 6077 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6078 addReplySds(c,sdsnewlen(val,vlen));
6079 addReply(c,shared.crlf);
164ee595 6080 decrRefCount(field);
dd88747b 6081 return;
6082 } else {
6083 addReply(c,shared.nullbulk);
164ee595 6084 decrRefCount(field);
bcd11906 6085 return;
6086 }
dd88747b 6087 } else {
6088 struct dictEntry *de;
bcd11906 6089
dd88747b 6090 de = dictFind(o->ptr,c->argv[2]);
6091 if (de == NULL) {
6092 addReply(c,shared.nullbulk);
978c2c94 6093 } else {
dd88747b 6094 robj *e = dictGetEntryVal(de);
978c2c94 6095
dd88747b 6096 addReplyBulk(c,e);
978c2c94 6097 }
69d95c3e 6098 }
69d95c3e
PN
6099}
6100
07efaf74 6101static void hdelCommand(redisClient *c) {
dd88747b 6102 robj *o;
6103 int deleted = 0;
07efaf74 6104
dd88747b 6105 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6106 checkType(c,o,REDIS_HASH)) return;
07efaf74 6107
dd88747b 6108 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
2a1198b4 6109 robj *field = getDecodedObject(c->argv[2]);
6110
dd88747b 6111 o->ptr = zipmapDel((unsigned char*) o->ptr,
2a1198b4 6112 (unsigned char*) field->ptr,
6113 sdslen(field->ptr), &deleted);
6114 decrRefCount(field);
3ea27d37 6115 if (zipmapLen((unsigned char*) o->ptr) == 0)
6116 deleteKey(c->db,c->argv[1]);
dd88747b 6117 } else {
6118 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
3ea27d37 6119 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6120 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
07efaf74 6121 }
c77169b7 6122 if (deleted) server.dirty++;
dd88747b 6123 addReply(c,deleted ? shared.cone : shared.czero);
07efaf74 6124}
6125
92b27fe9 6126static void hlenCommand(redisClient *c) {
6127 robj *o;
6128 unsigned long len;
6129
dd88747b 6130 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 6131 checkType(c,o,REDIS_HASH)) return;
6132
6133 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6134 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6135 addReplyUlong(c,len);
6136}
6137
78409a0f 6138#define REDIS_GETALL_KEYS 1
6139#define REDIS_GETALL_VALS 2
6140static void genericHgetallCommand(redisClient *c, int flags) {
6141 robj *o, *lenobj;
6142 unsigned long count = 0;
6143
6144 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL
6145 || checkType(c,o,REDIS_HASH)) return;
6146
6147 lenobj = createObject(REDIS_STRING,NULL);
6148 addReply(c,lenobj);
6149 decrRefCount(lenobj);
6150
6151 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6152 unsigned char *p = zipmapRewind(o->ptr);
6153 unsigned char *field, *val;
6154 unsigned int flen, vlen;
6155
6156 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6157 robj *aux;
6158
6159 if (flags & REDIS_GETALL_KEYS) {
6160 aux = createStringObject((char*)field,flen);
6161 addReplyBulk(c,aux);
6162 decrRefCount(aux);
6163 count++;
6164 }
6165 if (flags & REDIS_GETALL_VALS) {
6166 aux = createStringObject((char*)val,vlen);
6167 addReplyBulk(c,aux);
6168 decrRefCount(aux);
6169 count++;
6170 }
6171 }
6172 } else {
6173 dictIterator *di = dictGetIterator(o->ptr);
6174 dictEntry *de;
6175
6176 while((de = dictNext(di)) != NULL) {
6177 robj *fieldobj = dictGetEntryKey(de);
6178 robj *valobj = dictGetEntryVal(de);
6179
6180 if (flags & REDIS_GETALL_KEYS) {
6181 addReplyBulk(c,fieldobj);
6182 count++;
6183 }
6184 if (flags & REDIS_GETALL_VALS) {
6185 addReplyBulk(c,valobj);
6186 count++;
6187 }
6188 }
6189 dictReleaseIterator(di);
6190 }
6191 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6192}
6193
6194static void hkeysCommand(redisClient *c) {
6195 genericHgetallCommand(c,REDIS_GETALL_KEYS);
6196}
6197
6198static void hvalsCommand(redisClient *c) {
6199 genericHgetallCommand(c,REDIS_GETALL_VALS);
6200}
6201
6202static void hgetallCommand(redisClient *c) {
6203 genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
6204}
6205
a86f14b1 6206static void hexistsCommand(redisClient *c) {
6207 robj *o;
6208 int exists = 0;
6209
6210 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6211 checkType(c,o,REDIS_HASH)) return;
6212
6213 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6214 robj *field;
6215 unsigned char *zm = o->ptr;
6216
6217 field = getDecodedObject(c->argv[2]);
6218 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6219 decrRefCount(field);
6220 } else {
6221 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6222 }
6223 addReply(c,exists ? shared.cone : shared.czero);
6224}
6225
ada386b2 6226static void convertToRealHash(robj *o) {
6227 unsigned char *key, *val, *p, *zm = o->ptr;
6228 unsigned int klen, vlen;
6229 dict *dict = dictCreate(&hashDictType,NULL);
6230
6231 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6232 p = zipmapRewind(zm);
6233 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6234 robj *keyobj, *valobj;
6235
6236 keyobj = createStringObject((char*)key,klen);
6237 valobj = createStringObject((char*)val,vlen);
6238 tryObjectEncoding(keyobj);
6239 tryObjectEncoding(valobj);
6240 dictAdd(dict,keyobj,valobj);
6241 }
6242 o->encoding = REDIS_ENCODING_HT;
6243 o->ptr = dict;
6244 zfree(zm);
6245}
6246
6b47e12e 6247/* ========================= Non type-specific commands ==================== */
6248
ed9b544e 6249static void flushdbCommand(redisClient *c) {
ca37e9cd 6250 server.dirty += dictSize(c->db->dict);
3305306f 6251 dictEmpty(c->db->dict);
6252 dictEmpty(c->db->expires);
ed9b544e 6253 addReply(c,shared.ok);
ed9b544e 6254}
6255
6256static void flushallCommand(redisClient *c) {
ca37e9cd 6257 server.dirty += emptyDb();
ed9b544e 6258 addReply(c,shared.ok);
500ece7c 6259 if (server.bgsavechildpid != -1) {
6260 kill(server.bgsavechildpid,SIGKILL);
6261 rdbRemoveTempFile(server.bgsavechildpid);
6262 }
f78fd11b 6263 rdbSave(server.dbfilename);
ca37e9cd 6264 server.dirty++;
ed9b544e 6265}
6266
56906eef 6267static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 6268 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 6269 so->type = type;
6270 so->pattern = pattern;
6271 return so;
6272}
6273
6274/* Return the value associated to the key with a name obtained
6275 * substituting the first occurence of '*' in 'pattern' with 'subst' */
56906eef 6276static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
ed9b544e 6277 char *p;
6278 sds spat, ssub;
6279 robj keyobj;
6280 int prefixlen, sublen, postfixlen;
ed9b544e 6281 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6282 struct {
f1017b3f 6283 long len;
6284 long free;
ed9b544e 6285 char buf[REDIS_SORTKEY_MAX+1];
6286 } keyname;
6287
28173a49 6288 /* If the pattern is "#" return the substitution object itself in order
6289 * to implement the "SORT ... GET #" feature. */
6290 spat = pattern->ptr;
6291 if (spat[0] == '#' && spat[1] == '\0') {
6292 return subst;
6293 }
6294
6295 /* The substitution object may be specially encoded. If so we create
9d65a1bb 6296 * a decoded object on the fly. Otherwise getDecodedObject will just
6297 * increment the ref count, that we'll decrement later. */
6298 subst = getDecodedObject(subst);
942a3961 6299
ed9b544e 6300 ssub = subst->ptr;
6301 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6302 p = strchr(spat,'*');
ed5a857a 6303 if (!p) {
6304 decrRefCount(subst);
6305 return NULL;
6306 }
ed9b544e 6307
6308 prefixlen = p-spat;
6309 sublen = sdslen(ssub);
6310 postfixlen = sdslen(spat)-(prefixlen+1);
6311 memcpy(keyname.buf,spat,prefixlen);
6312 memcpy(keyname.buf+prefixlen,ssub,sublen);
6313 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6314 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6315 keyname.len = prefixlen+sublen+postfixlen;
6316
dfc5e96c 6317 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
942a3961 6318 decrRefCount(subst);
6319
a4d1ba9a 6320 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
3305306f 6321 return lookupKeyRead(db,&keyobj);
ed9b544e 6322}
6323
6324/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6325 * the additional parameter is not standard but a BSD-specific we have to
6326 * pass sorting parameters via the global 'server' structure */
6327static int sortCompare(const void *s1, const void *s2) {
6328 const redisSortObject *so1 = s1, *so2 = s2;
6329 int cmp;
6330
6331 if (!server.sort_alpha) {
6332 /* Numeric sorting. Here it's trivial as we precomputed scores */
6333 if (so1->u.score > so2->u.score) {
6334 cmp = 1;
6335 } else if (so1->u.score < so2->u.score) {
6336 cmp = -1;
6337 } else {
6338 cmp = 0;
6339 }
6340 } else {
6341 /* Alphanumeric sorting */
6342 if (server.sort_bypattern) {
6343 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6344 /* At least one compare object is NULL */
6345 if (so1->u.cmpobj == so2->u.cmpobj)
6346 cmp = 0;
6347 else if (so1->u.cmpobj == NULL)
6348 cmp = -1;
6349 else
6350 cmp = 1;
6351 } else {
6352 /* We have both the objects, use strcoll */
6353 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6354 }
6355 } else {
6356 /* Compare elements directly */
9d65a1bb 6357 robj *dec1, *dec2;
6358
6359 dec1 = getDecodedObject(so1->obj);
6360 dec2 = getDecodedObject(so2->obj);
6361 cmp = strcoll(dec1->ptr,dec2->ptr);
6362 decrRefCount(dec1);
6363 decrRefCount(dec2);
ed9b544e 6364 }
6365 }
6366 return server.sort_desc ? -cmp : cmp;
6367}
6368
6369/* The SORT command is the most complex command in Redis. Warning: this code
6370 * is optimized for speed and a bit less for readability */
6371static void sortCommand(redisClient *c) {
ed9b544e 6372 list *operations;
6373 int outputlen = 0;
6374 int desc = 0, alpha = 0;
6375 int limit_start = 0, limit_count = -1, start, end;
6376 int j, dontsort = 0, vectorlen;
6377 int getop = 0; /* GET operation counter */
443c6409 6378 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 6379 redisSortObject *vector; /* Resulting vector to sort */
6380
6381 /* Lookup the key to sort. It must be of the right types */
3305306f 6382 sortval = lookupKeyRead(c->db,c->argv[1]);
6383 if (sortval == NULL) {
d922ae65 6384 addReply(c,shared.nullmultibulk);
ed9b544e 6385 return;
6386 }
a5eb649b 6387 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6388 sortval->type != REDIS_ZSET)
6389 {
c937aa89 6390 addReply(c,shared.wrongtypeerr);
ed9b544e 6391 return;
6392 }
6393
6394 /* Create a list of operations to perform for every sorted element.
6395 * Operations can be GET/DEL/INCR/DECR */
6396 operations = listCreate();
092dac2a 6397 listSetFreeMethod(operations,zfree);
ed9b544e 6398 j = 2;
6399
6400 /* Now we need to protect sortval incrementing its count, in the future
6401 * SORT may have options able to overwrite/delete keys during the sorting
6402 * and the sorted key itself may get destroied */
6403 incrRefCount(sortval);
6404
6405 /* The SORT command has an SQL-alike syntax, parse it */
6406 while(j < c->argc) {
6407 int leftargs = c->argc-j-1;
6408 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6409 desc = 0;
6410 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6411 desc = 1;
6412 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6413 alpha = 1;
6414 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6415 limit_start = atoi(c->argv[j+1]->ptr);
6416 limit_count = atoi(c->argv[j+2]->ptr);
6417 j+=2;
443c6409 6418 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6419 storekey = c->argv[j+1];
6420 j++;
ed9b544e 6421 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6422 sortby = c->argv[j+1];
6423 /* If the BY pattern does not contain '*', i.e. it is constant,
6424 * we don't need to sort nor to lookup the weight keys. */
6425 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6426 j++;
6427 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6428 listAddNodeTail(operations,createSortOperation(
6429 REDIS_SORT_GET,c->argv[j+1]));
6430 getop++;
6431 j++;
ed9b544e 6432 } else {
6433 decrRefCount(sortval);
6434 listRelease(operations);
c937aa89 6435 addReply(c,shared.syntaxerr);
ed9b544e 6436 return;
6437 }
6438 j++;
6439 }
6440
6441 /* Load the sorting vector with all the objects to sort */
a5eb649b 6442 switch(sortval->type) {
6443 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6444 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6445 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
dfc5e96c 6446 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
a5eb649b 6447 }
ed9b544e 6448 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 6449 j = 0;
a5eb649b 6450
ed9b544e 6451 if (sortval->type == REDIS_LIST) {
6452 list *list = sortval->ptr;
6208b3a7 6453 listNode *ln;
c7df85a4 6454 listIter li;
6208b3a7 6455
c7df85a4 6456 listRewind(list,&li);
6457 while((ln = listNext(&li))) {
ed9b544e 6458 robj *ele = ln->value;
6459 vector[j].obj = ele;
6460 vector[j].u.score = 0;
6461 vector[j].u.cmpobj = NULL;
ed9b544e 6462 j++;
6463 }
6464 } else {
a5eb649b 6465 dict *set;
ed9b544e 6466 dictIterator *di;
6467 dictEntry *setele;
6468
a5eb649b 6469 if (sortval->type == REDIS_SET) {
6470 set = sortval->ptr;
6471 } else {
6472 zset *zs = sortval->ptr;
6473 set = zs->dict;
6474 }
6475
ed9b544e 6476 di = dictGetIterator(set);
ed9b544e 6477 while((setele = dictNext(di)) != NULL) {
6478 vector[j].obj = dictGetEntryKey(setele);
6479 vector[j].u.score = 0;
6480 vector[j].u.cmpobj = NULL;
6481 j++;
6482 }
6483 dictReleaseIterator(di);
6484 }
dfc5e96c 6485 redisAssert(j == vectorlen);
ed9b544e 6486
6487 /* Now it's time to load the right scores in the sorting vector */
6488 if (dontsort == 0) {
6489 for (j = 0; j < vectorlen; j++) {
6490 if (sortby) {
6491 robj *byval;
6492
3305306f 6493 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
ed9b544e 6494 if (!byval || byval->type != REDIS_STRING) continue;
6495 if (alpha) {
9d65a1bb 6496 vector[j].u.cmpobj = getDecodedObject(byval);
ed9b544e 6497 } else {
942a3961 6498 if (byval->encoding == REDIS_ENCODING_RAW) {
6499 vector[j].u.score = strtod(byval->ptr,NULL);
6500 } else {
9d65a1bb 6501 /* Don't need to decode the object if it's
6502 * integer-encoded (the only encoding supported) so
6503 * far. We can just cast it */
f1017b3f 6504 if (byval->encoding == REDIS_ENCODING_INT) {
942a3961 6505 vector[j].u.score = (long)byval->ptr;
f1017b3f 6506 } else
dfc5e96c 6507 redisAssert(1 != 1);
942a3961 6508 }
ed9b544e 6509 }
6510 } else {
942a3961 6511 if (!alpha) {
6512 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6513 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6514 else {
6515 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6516 vector[j].u.score = (long) vector[j].obj->ptr;
6517 else
dfc5e96c 6518 redisAssert(1 != 1);
942a3961 6519 }
6520 }
ed9b544e 6521 }
6522 }
6523 }
6524
6525 /* We are ready to sort the vector... perform a bit of sanity check
6526 * on the LIMIT option too. We'll use a partial version of quicksort. */
6527 start = (limit_start < 0) ? 0 : limit_start;
6528 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6529 if (start >= vectorlen) {
6530 start = vectorlen-1;
6531 end = vectorlen-2;
6532 }
6533 if (end >= vectorlen) end = vectorlen-1;
6534
6535 if (dontsort == 0) {
6536 server.sort_desc = desc;
6537 server.sort_alpha = alpha;
6538 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 6539 if (sortby && (start != 0 || end != vectorlen-1))
6540 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6541 else
6542 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 6543 }
6544
6545 /* Send command output to the output buffer, performing the specified
6546 * GET/DEL/INCR/DECR operations if any. */
6547 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 6548 if (storekey == NULL) {
6549 /* STORE option not specified, sent the sorting result to client */
6550 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6551 for (j = start; j <= end; j++) {
6552 listNode *ln;
c7df85a4 6553 listIter li;
6554
dd88747b 6555 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 6556 listRewind(operations,&li);
6557 while((ln = listNext(&li))) {
443c6409 6558 redisSortOperation *sop = ln->value;
6559 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6560 vector[j].obj);
6561
6562 if (sop->type == REDIS_SORT_GET) {
6563 if (!val || val->type != REDIS_STRING) {
6564 addReply(c,shared.nullbulk);
6565 } else {
dd88747b 6566 addReplyBulk(c,val);
443c6409 6567 }
6568 } else {
dfc5e96c 6569 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 6570 }
6571 }
ed9b544e 6572 }
443c6409 6573 } else {
6574 robj *listObject = createListObject();
6575 list *listPtr = (list*) listObject->ptr;
6576
6577 /* STORE option specified, set the sorting result as a List object */
6578 for (j = start; j <= end; j++) {
6579 listNode *ln;
c7df85a4 6580 listIter li;
6581
443c6409 6582 if (!getop) {
6583 listAddNodeTail(listPtr,vector[j].obj);
6584 incrRefCount(vector[j].obj);
6585 }
c7df85a4 6586 listRewind(operations,&li);
6587 while((ln = listNext(&li))) {
443c6409 6588 redisSortOperation *sop = ln->value;
6589 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6590 vector[j].obj);
6591
6592 if (sop->type == REDIS_SORT_GET) {
6593 if (!val || val->type != REDIS_STRING) {
6594 listAddNodeTail(listPtr,createStringObject("",0));
6595 } else {
6596 listAddNodeTail(listPtr,val);
6597 incrRefCount(val);
6598 }
ed9b544e 6599 } else {
dfc5e96c 6600 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 6601 }
ed9b544e 6602 }
ed9b544e 6603 }
121796f7 6604 if (dictReplace(c->db->dict,storekey,listObject)) {
6605 incrRefCount(storekey);
6606 }
443c6409 6607 /* Note: we add 1 because the DB is dirty anyway since even if the
6608 * SORT result is empty a new key is set and maybe the old content
6609 * replaced. */
6610 server.dirty += 1+outputlen;
6611 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 6612 }
6613
6614 /* Cleanup */
6615 decrRefCount(sortval);
6616 listRelease(operations);
6617 for (j = 0; j < vectorlen; j++) {
6618 if (sortby && alpha && vector[j].u.cmpobj)
6619 decrRefCount(vector[j].u.cmpobj);
6620 }
6621 zfree(vector);
6622}
6623
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, 1.50T, 3.00P and so forth.
 *
 * 's' is the destination buffer and is always written: amounts of 1024
 * pebibytes or more fall back to the plain byte count followed by 'B'.
 * The longest possible output is 21 characters plus the terminator, so
 * the caller must provide at least 22 bytes. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Beyond the largest supported unit: report the raw byte count
         * rather than leaving the buffer untouched. */
        sprintf(s,"%lluB",n);
    }
}
6644
1c85b79f 6645/* Create the string returned by the INFO command. This is decoupled
6646 * by the INFO command itself as we need to report the same information
6647 * on memory corruption problems. */
6648static sds genRedisInfoString(void) {
ed9b544e 6649 sds info;
6650 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 6651 int j;
ec6c7a1d 6652 char hmem[64];
55a8298f 6653
b72f6a4b 6654 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 6655 info = sdscatprintf(sdsempty(),
6656 "redis_version:%s\r\n"
f1017b3f 6657 "arch_bits:%s\r\n"
7a932b74 6658 "multiplexing_api:%s\r\n"
0d7170a4 6659 "process_id:%ld\r\n"
682ac724 6660 "uptime_in_seconds:%ld\r\n"
6661 "uptime_in_days:%ld\r\n"
ed9b544e 6662 "connected_clients:%d\r\n"
6663 "connected_slaves:%d\r\n"
f86a74e9 6664 "blocked_clients:%d\r\n"
5fba9f71 6665 "used_memory:%zu\r\n"
ec6c7a1d 6666 "used_memory_human:%s\r\n"
ed9b544e 6667 "changes_since_last_save:%lld\r\n"
be2bb6b0 6668 "bgsave_in_progress:%d\r\n"
682ac724 6669 "last_save_time:%ld\r\n"
b3fad521 6670 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 6671 "total_connections_received:%lld\r\n"
6672 "total_commands_processed:%lld\r\n"
2a6a2ed1 6673 "expired_keys:%lld\r\n"
55a8298f 6674 "hash_max_zipmap_entries:%ld\r\n"
6675 "hash_max_zipmap_value:%ld\r\n"
ffc6b7f8 6676 "pubsub_channels:%ld\r\n"
6677 "pubsub_patterns:%u\r\n"
7d98e08c 6678 "vm_enabled:%d\r\n"
a0f643ea 6679 "role:%s\r\n"
ed9b544e 6680 ,REDIS_VERSION,
f1017b3f 6681 (sizeof(long) == 8) ? "64" : "32",
7a932b74 6682 aeGetApiName(),
0d7170a4 6683 (long) getpid(),
a0f643ea 6684 uptime,
6685 uptime/(3600*24),
ed9b544e 6686 listLength(server.clients)-listLength(server.slaves),
6687 listLength(server.slaves),
d5d55fc3 6688 server.blpop_blocked_clients,
b72f6a4b 6689 zmalloc_used_memory(),
ec6c7a1d 6690 hmem,
ed9b544e 6691 server.dirty,
9d65a1bb 6692 server.bgsavechildpid != -1,
ed9b544e 6693 server.lastsave,
b3fad521 6694 server.bgrewritechildpid != -1,
ed9b544e 6695 server.stat_numconnections,
6696 server.stat_numcommands,
2a6a2ed1 6697 server.stat_expiredkeys,
55a8298f 6698 server.hash_max_zipmap_entries,
6699 server.hash_max_zipmap_value,
ffc6b7f8 6700 dictSize(server.pubsub_channels),
6701 listLength(server.pubsub_patterns),
7d98e08c 6702 server.vm_enabled != 0,
a0f643ea 6703 server.masterhost == NULL ? "master" : "slave"
ed9b544e 6704 );
a0f643ea 6705 if (server.masterhost) {
6706 info = sdscatprintf(info,
6707 "master_host:%s\r\n"
6708 "master_port:%d\r\n"
6709 "master_link_status:%s\r\n"
6710 "master_last_io_seconds_ago:%d\r\n"
6711 ,server.masterhost,
6712 server.masterport,
6713 (server.replstate == REDIS_REPL_CONNECTED) ?
6714 "up" : "down",
f72b934d 6715 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 6716 );
6717 }
7d98e08c 6718 if (server.vm_enabled) {
1064ef87 6719 lockThreadedIO();
7d98e08c 6720 info = sdscatprintf(info,
6721 "vm_conf_max_memory:%llu\r\n"
6722 "vm_conf_page_size:%llu\r\n"
6723 "vm_conf_pages:%llu\r\n"
6724 "vm_stats_used_pages:%llu\r\n"
6725 "vm_stats_swapped_objects:%llu\r\n"
6726 "vm_stats_swappin_count:%llu\r\n"
6727 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 6728 "vm_stats_io_newjobs_len:%lu\r\n"
6729 "vm_stats_io_processing_len:%lu\r\n"
6730 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 6731 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 6732 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 6733 ,(unsigned long long) server.vm_max_memory,
6734 (unsigned long long) server.vm_page_size,
6735 (unsigned long long) server.vm_pages,
6736 (unsigned long long) server.vm_stats_used_pages,
6737 (unsigned long long) server.vm_stats_swapped_objects,
6738 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 6739 (unsigned long long) server.vm_stats_swapouts,
6740 (unsigned long) listLength(server.io_newjobs),
6741 (unsigned long) listLength(server.io_processing),
6742 (unsigned long) listLength(server.io_processed),
d5d55fc3 6743 (unsigned long) server.io_active_threads,
6744 (unsigned long) server.vm_blocked_clients
7d98e08c 6745 );
1064ef87 6746 unlockThreadedIO();
7d98e08c 6747 }
c3cb078d 6748 for (j = 0; j < server.dbnum; j++) {
6749 long long keys, vkeys;
6750
6751 keys = dictSize(server.db[j].dict);
6752 vkeys = dictSize(server.db[j].expires);
6753 if (keys || vkeys) {
9d65a1bb 6754 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 6755 j, keys, vkeys);
6756 }
6757 }
1c85b79f 6758 return info;
6759}
6760
6761static void infoCommand(redisClient *c) {
6762 sds info = genRedisInfoString();
83c6a618 6763 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6764 (unsigned long)sdslen(info)));
ed9b544e 6765 addReplySds(c,info);
70003d28 6766 addReply(c,shared.crlf);
ed9b544e 6767}
6768
3305306f 6769static void monitorCommand(redisClient *c) {
6770 /* ignore MONITOR if aleady slave or in monitor mode */
6771 if (c->flags & REDIS_SLAVE) return;
6772
6773 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6774 c->slaveseldb = 0;
6b47e12e 6775 listAddNodeTail(server.monitors,c);
3305306f 6776 addReply(c,shared.ok);
6777}
6778
6779/* ================================= Expire ================================= */
6780static int removeExpire(redisDb *db, robj *key) {
6781 if (dictDelete(db->expires,key) == DICT_OK) {
6782 return 1;
6783 } else {
6784 return 0;
6785 }
6786}
6787
6788static int setExpire(redisDb *db, robj *key, time_t when) {
6789 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6790 return 0;
6791 } else {
6792 incrRefCount(key);
6793 return 1;
6794 }
6795}
6796
bb32ede5 6797/* Return the expire time of the specified key, or -1 if no expire
6798 * is associated with this key (i.e. the key is non volatile) */
6799static time_t getExpire(redisDb *db, robj *key) {
6800 dictEntry *de;
6801
6802 /* No expire? return ASAP */
6803 if (dictSize(db->expires) == 0 ||
6804 (de = dictFind(db->expires,key)) == NULL) return -1;
6805
6806 return (time_t) dictGetEntryVal(de);
6807}
6808
3305306f 6809static int expireIfNeeded(redisDb *db, robj *key) {
6810 time_t when;
6811 dictEntry *de;
6812
6813 /* No expire? return ASAP */
6814 if (dictSize(db->expires) == 0 ||
6815 (de = dictFind(db->expires,key)) == NULL) return 0;
6816
6817 /* Lookup the expire */
6818 when = (time_t) dictGetEntryVal(de);
6819 if (time(NULL) <= when) return 0;
6820
6821 /* Delete the key */
6822 dictDelete(db->expires,key);
2a6a2ed1 6823 server.stat_expiredkeys++;
3305306f 6824 return dictDelete(db->dict,key) == DICT_OK;
6825}
6826
6827static int deleteIfVolatile(redisDb *db, robj *key) {
6828 dictEntry *de;
6829
6830 /* No expire? return ASAP */
6831 if (dictSize(db->expires) == 0 ||
6832 (de = dictFind(db->expires,key)) == NULL) return 0;
6833
6834 /* Delete the key */
0c66a471 6835 server.dirty++;
2a6a2ed1 6836 server.stat_expiredkeys++;
3305306f 6837 dictDelete(db->expires,key);
6838 return dictDelete(db->dict,key) == DICT_OK;
6839}
6840
802e8373 6841static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
3305306f 6842 dictEntry *de;
3305306f 6843
802e8373 6844 de = dictFind(c->db->dict,key);
3305306f 6845 if (de == NULL) {
6846 addReply(c,shared.czero);
6847 return;
6848 }
43e5ccdf 6849 if (seconds < 0) {
6850 if (deleteKey(c->db,key)) server.dirty++;
6851 addReply(c, shared.cone);
3305306f 6852 return;
6853 } else {
6854 time_t when = time(NULL)+seconds;
802e8373 6855 if (setExpire(c->db,key,when)) {
3305306f 6856 addReply(c,shared.cone);
77423026 6857 server.dirty++;
6858 } else {
3305306f 6859 addReply(c,shared.czero);
77423026 6860 }
3305306f 6861 return;
6862 }
6863}
6864
802e8373 6865static void expireCommand(redisClient *c) {
6866 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6867}
6868
6869static void expireatCommand(redisClient *c) {
6870 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6871}
6872
fd88489a 6873static void ttlCommand(redisClient *c) {
6874 time_t expire;
6875 int ttl = -1;
6876
6877 expire = getExpire(c->db,c->argv[1]);
6878 if (expire != -1) {
6879 ttl = (int) (expire-time(NULL));
6880 if (ttl < 0) ttl = -1;
6881 }
6882 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6883}
6884
6e469882 6885/* ================================ MULTI/EXEC ============================== */
6886
6887/* Client state initialization for MULTI/EXEC */
6888static void initClientMultiState(redisClient *c) {
6889 c->mstate.commands = NULL;
6890 c->mstate.count = 0;
6891}
6892
6893/* Release all the resources associated with MULTI/EXEC state */
6894static void freeClientMultiState(redisClient *c) {
6895 int j;
6896
6897 for (j = 0; j < c->mstate.count; j++) {
6898 int i;
6899 multiCmd *mc = c->mstate.commands+j;
6900
6901 for (i = 0; i < mc->argc; i++)
6902 decrRefCount(mc->argv[i]);
6903 zfree(mc->argv);
6904 }
6905 zfree(c->mstate.commands);
6906}
6907
6908/* Add a new command into the MULTI commands queue */
6909static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6910 multiCmd *mc;
6911 int j;
6912
6913 c->mstate.commands = zrealloc(c->mstate.commands,
6914 sizeof(multiCmd)*(c->mstate.count+1));
6915 mc = c->mstate.commands+c->mstate.count;
6916 mc->cmd = cmd;
6917 mc->argc = c->argc;
6918 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6919 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6920 for (j = 0; j < c->argc; j++)
6921 incrRefCount(mc->argv[j]);
6922 c->mstate.count++;
6923}
6924
6925static void multiCommand(redisClient *c) {
6926 c->flags |= REDIS_MULTI;
36c548f0 6927 addReply(c,shared.ok);
6e469882 6928}
6929
18b6cb76
DJ
6930static void discardCommand(redisClient *c) {
6931 if (!(c->flags & REDIS_MULTI)) {
6932 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6933 return;
6934 }
6935
6936 freeClientMultiState(c);
6937 initClientMultiState(c);
6938 c->flags &= (~REDIS_MULTI);
6939 addReply(c,shared.ok);
6940}
6941
6e469882 6942static void execCommand(redisClient *c) {
6943 int j;
6944 robj **orig_argv;
6945 int orig_argc;
6946
6947 if (!(c->flags & REDIS_MULTI)) {
6948 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6949 return;
6950 }
6951
6952 orig_argv = c->argv;
6953 orig_argc = c->argc;
6954 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6955 for (j = 0; j < c->mstate.count; j++) {
6956 c->argc = c->mstate.commands[j].argc;
6957 c->argv = c->mstate.commands[j].argv;
6958 call(c,c->mstate.commands[j].cmd);
6959 }
6960 c->argv = orig_argv;
6961 c->argc = orig_argc;
6962 freeClientMultiState(c);
6963 initClientMultiState(c);
6964 c->flags &= (~REDIS_MULTI);
6965}
6966
4409877e 6967/* =========================== Blocking Operations ========================= */
6968
6969/* Currently Redis blocking operations support is limited to list POP ops,
6970 * so the current implementation is not fully generic, but it is also not
6971 * completely specific so it will not require a rewrite to support new
6972 * kind of blocking operations in the future.
6973 *
6974 * Still it's important to note that list blocking operations can be already
6975 * used as a notification mechanism in order to implement other blocking
6976 * operations at application level, so there must be a very strong evidence
6977 * of usefulness and generality before new blocking operations are implemented.
6978 *
6979 * This is how the current blocking POP works, we use BLPOP as example:
6980 * - If the user calls BLPOP and the key exists and contains a non empty list
6981 * then LPOP is called instead. So BLPOP is semantically the same as LPOP
6982 * if there is not to block.
6983 * - If instead BLPOP is called and the key does not exists or the list is
6984 * empty we need to block. In order to do so we remove the notification for
6985 * new data to read in the client socket (so that we'll not serve new
6986 * requests if the blocking request is not served). Also we put the client
95242ab5 6987 * in a dictionary (db->blockingkeys) mapping keys to a list of clients
4409877e 6988 * blocking for this keys.
6989 * - If a PUSH operation against a key with blocked clients waiting is
6990 * performed, we serve the first in the list: basically instead to push
6991 * the new element inside the list we return it to the (first / oldest)
6992 * blocking client, unblock the client, and remove it from the list.
6993 *
6994 * The above comment and the source code should be enough in order to understand
6995 * the implementation and modify / fix it later.
6996 */
6997
6998/* Set a client in blocking mode for the specified key, with the specified
6999 * timeout */
b177fd30 7000static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 7001 dictEntry *de;
7002 list *l;
b177fd30 7003 int j;
4409877e 7004
b177fd30 7005 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7006 c->blockingkeysnum = numkeys;
4409877e 7007 c->blockingto = timeout;
b177fd30 7008 for (j = 0; j < numkeys; j++) {
7009 /* Add the key in the client structure, to map clients -> keys */
7010 c->blockingkeys[j] = keys[j];
7011 incrRefCount(keys[j]);
4409877e 7012
b177fd30 7013 /* And in the other "side", to map keys -> clients */
7014 de = dictFind(c->db->blockingkeys,keys[j]);
7015 if (de == NULL) {
7016 int retval;
7017
7018 /* For every key we take a list of clients blocked for it */
7019 l = listCreate();
7020 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7021 incrRefCount(keys[j]);
7022 assert(retval == DICT_OK);
7023 } else {
7024 l = dictGetEntryVal(de);
7025 }
7026 listAddNodeTail(l,c);
4409877e 7027 }
b177fd30 7028 /* Mark the client as a blocked client */
4409877e 7029 c->flags |= REDIS_BLOCKED;
d5d55fc3 7030 server.blpop_blocked_clients++;
4409877e 7031}
7032
7033/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 7034static void unblockClientWaitingData(redisClient *c) {
4409877e 7035 dictEntry *de;
7036 list *l;
b177fd30 7037 int j;
4409877e 7038
b177fd30 7039 assert(c->blockingkeys != NULL);
7040 /* The client may wait for multiple keys, so unblock it for every key. */
7041 for (j = 0; j < c->blockingkeysnum; j++) {
7042 /* Remove this client from the list of clients waiting for this key. */
7043 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7044 assert(de != NULL);
7045 l = dictGetEntryVal(de);
7046 listDelNode(l,listSearchKey(l,c));
7047 /* If the list is empty we need to remove it to avoid wasting memory */
7048 if (listLength(l) == 0)
7049 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7050 decrRefCount(c->blockingkeys[j]);
7051 }
7052 /* Cleanup the client structure */
7053 zfree(c->blockingkeys);
7054 c->blockingkeys = NULL;
4409877e 7055 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 7056 server.blpop_blocked_clients--;
5921aa36 7057 /* We want to process data if there is some command waiting
b0d8747d 7058 * in the input buffer. Note that this is safe even if
7059 * unblockClientWaitingData() gets called from freeClient() because
7060 * freeClient() will be smart enough to call this function
7061 * *after* c->querybuf was set to NULL. */
4409877e 7062 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7063}
7064
7065/* This should be called from any function PUSHing into lists.
7066 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7067 * 'ele' is the element pushed.
7068 *
7069 * If the function returns 0 there was no client waiting for a list push
7070 * against this key.
7071 *
7072 * If the function returns 1 there was a client waiting for a list push
7073 * against this key, the element was passed to this client thus it's not
7074 * needed to actually add it to the list and the caller should return asap. */
7075static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7076 struct dictEntry *de;
7077 redisClient *receiver;
7078 list *l;
7079 listNode *ln;
7080
7081 de = dictFind(c->db->blockingkeys,key);
7082 if (de == NULL) return 0;
7083 l = dictGetEntryVal(de);
7084 ln = listFirst(l);
7085 assert(ln != NULL);
7086 receiver = ln->value;
4409877e 7087
b177fd30 7088 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 7089 addReplyBulk(receiver,key);
7090 addReplyBulk(receiver,ele);
b0d8747d 7091 unblockClientWaitingData(receiver);
4409877e 7092 return 1;
7093}
7094
7095/* Blocking RPOP/LPOP */
7096static void blockingPopGenericCommand(redisClient *c, int where) {
7097 robj *o;
7098 time_t timeout;
b177fd30 7099 int j;
4409877e 7100
b177fd30 7101 for (j = 1; j < c->argc-1; j++) {
7102 o = lookupKeyWrite(c->db,c->argv[j]);
7103 if (o != NULL) {
7104 if (o->type != REDIS_LIST) {
7105 addReply(c,shared.wrongtypeerr);
4409877e 7106 return;
b177fd30 7107 } else {
7108 list *list = o->ptr;
7109 if (listLength(list) != 0) {
7110 /* If the list contains elements fall back to the usual
7111 * non-blocking POP operation */
7112 robj *argv[2], **orig_argv;
7113 int orig_argc;
7114
7115 /* We need to alter the command arguments before to call
7116 * popGenericCommand() as the command takes a single key. */
7117 orig_argv = c->argv;
7118 orig_argc = c->argc;
7119 argv[1] = c->argv[j];
7120 c->argv = argv;
7121 c->argc = 2;
7122
7123 /* Also the return value is different, we need to output
7124 * the multi bulk reply header and the key name. The
7125 * "real" command will add the last element (the value)
7126 * for us. If this souds like an hack to you it's just
7127 * because it is... */
7128 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 7129 addReplyBulk(c,argv[1]);
b177fd30 7130 popGenericCommand(c,where);
7131
7132 /* Fix the client structure with the original stuff */
7133 c->argv = orig_argv;
7134 c->argc = orig_argc;
7135 return;
7136 }
4409877e 7137 }
7138 }
7139 }
7140 /* If the list is empty or the key does not exists we must block */
b177fd30 7141 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 7142 if (timeout > 0) timeout += time(NULL);
b177fd30 7143 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 7144}
7145
7146static void blpopCommand(redisClient *c) {
7147 blockingPopGenericCommand(c,REDIS_HEAD);
7148}
7149
7150static void brpopCommand(redisClient *c) {
7151 blockingPopGenericCommand(c,REDIS_TAIL);
7152}
7153
ed9b544e 7154/* =============================== Replication ============================= */
7155
a4d1ba9a 7156static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7157 ssize_t nwritten, ret = size;
7158 time_t start = time(NULL);
7159
7160 timeout++;
7161 while(size) {
7162 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7163 nwritten = write(fd,ptr,size);
7164 if (nwritten == -1) return -1;
7165 ptr += nwritten;
7166 size -= nwritten;
7167 }
7168 if ((time(NULL)-start) > timeout) {
7169 errno = ETIMEDOUT;
7170 return -1;
7171 }
7172 }
7173 return ret;
7174}
7175
a4d1ba9a 7176static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7177 ssize_t nread, totread = 0;
7178 time_t start = time(NULL);
7179
7180 timeout++;
7181 while(size) {
7182 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7183 nread = read(fd,ptr,size);
7184 if (nread == -1) return -1;
7185 ptr += nread;
7186 size -= nread;
7187 totread += nread;
7188 }
7189 if ((time(NULL)-start) > timeout) {
7190 errno = ETIMEDOUT;
7191 return -1;
7192 }
7193 }
7194 return totread;
7195}
7196
/* Read a line from 'fd' one byte at a time using syncRead(), storing it as
 * a null terminated string into 'ptr', a buffer of 'size' bytes.
 *
 * The terminating "\n" (and a "\r" right before it, if any) is not stored.
 * At most 'size'-1 characters are stored: when the line is longer the
 * function returns what was read so far without waiting for the newline.
 * 'timeout' is applied by syncRead() to every single byte.
 *
 * Returns the number of characters stored, or -1 on error ('size' not
 * positive, or syncRead() failure). */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) {
        errno = EINVAL;
        return -1;
    }
    *ptr = '\0';
    size--; /* Reserve one byte for the null term. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        }
        *ptr++ = c;
        *ptr = '\0';
        nread++;
        /* Account for the byte just stored, otherwise a line longer than
         * the buffer would be written past its end. */
        size--;
    }
    return nread;
}
7217
7218static void syncCommand(redisClient *c) {
40d224a9 7219 /* ignore SYNC if aleady slave or in monitor mode */
7220 if (c->flags & REDIS_SLAVE) return;
7221
7222 /* SYNC can't be issued when the server has pending data to send to
7223 * the client about already issued commands. We need a fresh reply
7224 * buffer registering the differences between the BGSAVE and the current
7225 * dataset, so that we can copy to other slaves if needed. */
7226 if (listLength(c->reply) != 0) {
7227 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7228 return;
7229 }
7230
7231 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7232 /* Here we need to check if there is a background saving operation
7233 * in progress, or if it is required to start one */
9d65a1bb 7234 if (server.bgsavechildpid != -1) {
40d224a9 7235 /* Ok a background save is in progress. Let's check if it is a good
7236 * one for replication, i.e. if there is another slave that is
7237 * registering differences since the server forked to save */
7238 redisClient *slave;
7239 listNode *ln;
c7df85a4 7240 listIter li;
40d224a9 7241
c7df85a4 7242 listRewind(server.slaves,&li);
7243 while((ln = listNext(&li))) {
40d224a9 7244 slave = ln->value;
7245 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 7246 }
7247 if (ln) {
7248 /* Perfect, the server is already registering differences for
7249 * another slave. Set the right state, and copy the buffer. */
7250 listRelease(c->reply);
7251 c->reply = listDup(slave->reply);
40d224a9 7252 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7253 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7254 } else {
7255 /* No way, we need to wait for the next BGSAVE in order to
7256 * register differences */
7257 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7258 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7259 }
7260 } else {
7261 /* Ok we don't have a BGSAVE in progress, let's start one */
7262 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7263 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7264 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7265 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7266 return;
7267 }
7268 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7269 }
6208b3a7 7270 c->repldbfd = -1;
40d224a9 7271 c->flags |= REDIS_SLAVE;
7272 c->slaveseldb = 0;
6b47e12e 7273 listAddNodeTail(server.slaves,c);
40d224a9 7274 return;
7275}
7276
6208b3a7 7277static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7278 redisClient *slave = privdata;
7279 REDIS_NOTUSED(el);
7280 REDIS_NOTUSED(mask);
7281 char buf[REDIS_IOBUF_LEN];
7282 ssize_t nwritten, buflen;
7283
7284 if (slave->repldboff == 0) {
7285 /* Write the bulk write count before to transfer the DB. In theory here
7286 * we don't know how much room there is in the output buffer of the
7287 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7288 * operations) will never be smaller than the few bytes we need. */
7289 sds bulkcount;
7290
7291 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7292 slave->repldbsize);
7293 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7294 {
7295 sdsfree(bulkcount);
7296 freeClient(slave);
7297 return;
7298 }
7299 sdsfree(bulkcount);
7300 }
7301 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7302 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7303 if (buflen <= 0) {
7304 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7305 (buflen == 0) ? "premature EOF" : strerror(errno));
7306 freeClient(slave);
7307 return;
7308 }
7309 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 7310 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 7311 strerror(errno));
7312 freeClient(slave);
7313 return;
7314 }
7315 slave->repldboff += nwritten;
7316 if (slave->repldboff == slave->repldbsize) {
7317 close(slave->repldbfd);
7318 slave->repldbfd = -1;
7319 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7320 slave->replstate = REDIS_REPL_ONLINE;
7321 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 7322 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 7323 freeClient(slave);
7324 return;
7325 }
7326 addReplySds(slave,sdsempty());
7327 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7328 }
7329}
ed9b544e 7330
a3b21203 7331/* This function is called at the end of every backgrond saving.
7332 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7333 * otherwise REDIS_ERR is passed to the function.
7334 *
7335 * The goal of this function is to handle slaves waiting for a successful
7336 * background saving in order to perform non-blocking synchronization. */
7337static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 7338 listNode *ln;
7339 int startbgsave = 0;
c7df85a4 7340 listIter li;
ed9b544e 7341
c7df85a4 7342 listRewind(server.slaves,&li);
7343 while((ln = listNext(&li))) {
6208b3a7 7344 redisClient *slave = ln->value;
ed9b544e 7345
6208b3a7 7346 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7347 startbgsave = 1;
7348 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7349 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 7350 struct redis_stat buf;
6208b3a7 7351
7352 if (bgsaveerr != REDIS_OK) {
7353 freeClient(slave);
7354 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7355 continue;
7356 }
7357 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 7358 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 7359 freeClient(slave);
7360 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7361 continue;
7362 }
7363 slave->repldboff = 0;
7364 slave->repldbsize = buf.st_size;
7365 slave->replstate = REDIS_REPL_SEND_BULK;
7366 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 7367 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 7368 freeClient(slave);
7369 continue;
7370 }
7371 }
ed9b544e 7372 }
6208b3a7 7373 if (startbgsave) {
7374 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 7375 listIter li;
7376
7377 listRewind(server.slaves,&li);
6208b3a7 7378 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 7379 while((ln = listNext(&li))) {
6208b3a7 7380 redisClient *slave = ln->value;
ed9b544e 7381
6208b3a7 7382 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7383 freeClient(slave);
7384 }
7385 }
7386 }
ed9b544e 7387}
7388
7389static int syncWithMaster(void) {
d0ccebcf 7390 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 7391 long dumpsize;
ed9b544e 7392 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 7393 int dfd, maxtries = 5;
ed9b544e 7394
7395 if (fd == -1) {
7396 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7397 strerror(errno));
7398 return REDIS_ERR;
7399 }
d0ccebcf 7400
7401 /* AUTH with the master if required. */
7402 if(server.masterauth) {
7403 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7404 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7405 close(fd);
7406 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7407 strerror(errno));
7408 return REDIS_ERR;
7409 }
7410 /* Read the AUTH result. */
7411 if (syncReadLine(fd,buf,1024,3600) == -1) {
7412 close(fd);
7413 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7414 strerror(errno));
7415 return REDIS_ERR;
7416 }
7417 if (buf[0] != '+') {
7418 close(fd);
7419 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7420 return REDIS_ERR;
7421 }
7422 }
7423
ed9b544e 7424 /* Issue the SYNC command */
7425 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7426 close(fd);
7427 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7428 strerror(errno));
7429 return REDIS_ERR;
7430 }
7431 /* Read the bulk write count */
8c4d91fc 7432 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 7433 close(fd);
7434 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7435 strerror(errno));
7436 return REDIS_ERR;
7437 }
4aa701c1 7438 if (buf[0] != '$') {
7439 close(fd);
7440 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7441 return REDIS_ERR;
7442 }
18e61fa2 7443 dumpsize = strtol(buf+1,NULL,10);
7444 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 7445 /* Read the bulk write data on a temp file */
8c5abee8 7446 while(maxtries--) {
7447 snprintf(tmpfile,256,
7448 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7449 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7450 if (dfd != -1) break;
5de9ad7c 7451 sleep(1);
8c5abee8 7452 }
ed9b544e 7453 if (dfd == -1) {
7454 close(fd);
7455 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7456 return REDIS_ERR;
7457 }
7458 while(dumpsize) {
7459 int nread, nwritten;
7460
7461 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7462 if (nread == -1) {
7463 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7464 strerror(errno));
7465 close(fd);
7466 close(dfd);
7467 return REDIS_ERR;
7468 }
7469 nwritten = write(dfd,buf,nread);
7470 if (nwritten == -1) {
7471 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7472 close(fd);
7473 close(dfd);
7474 return REDIS_ERR;
7475 }
7476 dumpsize -= nread;
7477 }
7478 close(dfd);
7479 if (rename(tmpfile,server.dbfilename) == -1) {
7480 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7481 unlink(tmpfile);
7482 close(fd);
7483 return REDIS_ERR;
7484 }
7485 emptyDb();
f78fd11b 7486 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 7487 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7488 close(fd);
7489 return REDIS_ERR;
7490 }
7491 server.master = createClient(fd);
7492 server.master->flags |= REDIS_MASTER;
179b3952 7493 server.master->authenticated = 1;
ed9b544e 7494 server.replstate = REDIS_REPL_CONNECTED;
7495 return REDIS_OK;
7496}
7497
321b0e13 7498static void slaveofCommand(redisClient *c) {
7499 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7500 !strcasecmp(c->argv[2]->ptr,"one")) {
7501 if (server.masterhost) {
7502 sdsfree(server.masterhost);
7503 server.masterhost = NULL;
7504 if (server.master) freeClient(server.master);
7505 server.replstate = REDIS_REPL_NONE;
7506 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7507 }
7508 } else {
7509 sdsfree(server.masterhost);
7510 server.masterhost = sdsdup(c->argv[1]->ptr);
7511 server.masterport = atoi(c->argv[2]->ptr);
7512 if (server.master) freeClient(server.master);
7513 server.replstate = REDIS_REPL_CONNECT;
7514 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7515 server.masterhost, server.masterport);
7516 }
7517 addReply(c,shared.ok);
7518}
7519
3fd78bcd 7520/* ============================ Maxmemory directive ======================== */
7521
a5819310 7522/* Try to free one object form the pre-allocated objects free list.
7523 * This is useful under low mem conditions as by default we take 1 million
7524 * free objects allocated. On success REDIS_OK is returned, otherwise
7525 * REDIS_ERR. */
7526static int tryFreeOneObjectFromFreelist(void) {
f870935d 7527 robj *o;
7528
a5819310 7529 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7530 if (listLength(server.objfreelist)) {
7531 listNode *head = listFirst(server.objfreelist);
7532 o = listNodeValue(head);
7533 listDelNode(server.objfreelist,head);
7534 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7535 zfree(o);
7536 return REDIS_OK;
7537 } else {
7538 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7539 return REDIS_ERR;
7540 }
f870935d 7541}
7542
3fd78bcd 7543/* This function gets called when 'maxmemory' is set on the config file to limit
7544 * the max memory used by the server, and we are out of memory.
7545 * This function will try to, in order:
7546 *
7547 * - Free objects from the free list
7548 * - Try to remove keys with an EXPIRE set
7549 *
7550 * It is not possible to free enough memory to reach used-memory < maxmemory
7551 * the server will start refusing commands that will enlarge even more the
7552 * memory usage.
7553 */
7554static void freeMemoryIfNeeded(void) {
7555 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 7556 int j, k, freed = 0;
7557
7558 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7559 for (j = 0; j < server.dbnum; j++) {
7560 int minttl = -1;
7561 robj *minkey = NULL;
7562 struct dictEntry *de;
7563
7564 if (dictSize(server.db[j].expires)) {
7565 freed = 1;
7566 /* From a sample of three keys drop the one nearest to
7567 * the natural expire */
7568 for (k = 0; k < 3; k++) {
7569 time_t t;
7570
7571 de = dictGetRandomKey(server.db[j].expires);
7572 t = (time_t) dictGetEntryVal(de);
7573 if (minttl == -1 || t < minttl) {
7574 minkey = dictGetEntryKey(de);
7575 minttl = t;
3fd78bcd 7576 }
3fd78bcd 7577 }
a5819310 7578 deleteKey(server.db+j,minkey);
3fd78bcd 7579 }
3fd78bcd 7580 }
a5819310 7581 if (!freed) return; /* nothing to free... */
3fd78bcd 7582 }
7583}
7584
f80dff62 7585/* ============================== Append Only file ========================== */
7586
7587static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7588 sds buf = sdsempty();
7589 int j;
7590 ssize_t nwritten;
7591 time_t now;
7592 robj *tmpargv[3];
7593
7594 /* The DB this command was targetting is not the same as the last command
7595 * we appendend. To issue a SELECT command is needed. */
7596 if (dictid != server.appendseldb) {
7597 char seldb[64];
7598
7599 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 7600 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 7601 (unsigned long)strlen(seldb),seldb);
f80dff62 7602 server.appendseldb = dictid;
7603 }
7604
7605 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7606 * EXPIREs into EXPIREATs calls */
7607 if (cmd->proc == expireCommand) {
7608 long when;
7609
7610 tmpargv[0] = createStringObject("EXPIREAT",8);
7611 tmpargv[1] = argv[1];
7612 incrRefCount(argv[1]);
7613 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7614 tmpargv[2] = createObject(REDIS_STRING,
7615 sdscatprintf(sdsempty(),"%ld",when));
7616 argv = tmpargv;
7617 }
7618
7619 /* Append the actual command */
7620 buf = sdscatprintf(buf,"*%d\r\n",argc);
7621 for (j = 0; j < argc; j++) {
7622 robj *o = argv[j];
7623
9d65a1bb 7624 o = getDecodedObject(o);
83c6a618 7625 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
f80dff62 7626 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7627 buf = sdscatlen(buf,"\r\n",2);
9d65a1bb 7628 decrRefCount(o);
f80dff62 7629 }
7630
7631 /* Free the objects from the modified argv for EXPIREAT */
7632 if (cmd->proc == expireCommand) {
7633 for (j = 0; j < 3; j++)
7634 decrRefCount(argv[j]);
7635 }
7636
7637 /* We want to perform a single write. This should be guaranteed atomic
7638 * at least if the filesystem we are writing is a real physical one.
7639 * While this will save us against the server being killed I don't think
7640 * there is much to do about the whole server stopping for power problems
7641 * or alike */
7642 nwritten = write(server.appendfd,buf,sdslen(buf));
7643 if (nwritten != (signed)sdslen(buf)) {
7644 /* Ooops, we are in troubles. The best thing to do for now is
7645 * to simply exit instead to give the illusion that everything is
7646 * working as expected. */
7647 if (nwritten == -1) {
7648 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7649 } else {
7650 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7651 }
7652 exit(1);
7653 }
85a83172 7654 /* If a background append only file rewriting is in progress we want to
7655 * accumulate the differences between the child DB and the current one
7656 * in a buffer, so that when the child process will do its work we
7657 * can append the differences to the new append only file. */
7658 if (server.bgrewritechildpid != -1)
7659 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7660
7661 sdsfree(buf);
f80dff62 7662 now = time(NULL);
7663 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7664 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7665 now-server.lastfsync > 1))
7666 {
7667 fsync(server.appendfd); /* Let's try to get this data on the disk */
7668 server.lastfsync = now;
7669 }
7670}
7671
7672/* In Redis commands are always executed in the context of a client, so in
7673 * order to load the append only file we need to create a fake client. */
7674static struct redisClient *createFakeClient(void) {
7675 struct redisClient *c = zmalloc(sizeof(*c));
7676
7677 selectDb(c,0);
7678 c->fd = -1;
7679 c->querybuf = sdsempty();
7680 c->argc = 0;
7681 c->argv = NULL;
7682 c->flags = 0;
9387d17d 7683 /* We set the fake client as a slave waiting for the synchronization
7684 * so that Redis will not try to send replies to this client. */
7685 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 7686 c->reply = listCreate();
7687 listSetFreeMethod(c->reply,decrRefCount);
7688 listSetDupMethod(c->reply,dupClientReplyValue);
7689 return c;
7690}
7691
7692static void freeFakeClient(struct redisClient *c) {
7693 sdsfree(c->querybuf);
7694 listRelease(c->reply);
7695 zfree(c);
7696}
7697
7698/* Replay the append log file. On error REDIS_OK is returned. On non fatal
7699 * error (the append only file is zero-length) REDIS_ERR is returned. On
7700 * fatal error an error message is logged and the program exists. */
7701int loadAppendOnlyFile(char *filename) {
7702 struct redisClient *fakeClient;
7703 FILE *fp = fopen(filename,"r");
7704 struct redis_stat sb;
b492cf00 7705 unsigned long long loadedkeys = 0;
f80dff62 7706
7707 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7708 return REDIS_ERR;
7709
7710 if (fp == NULL) {
7711 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7712 exit(1);
7713 }
7714
7715 fakeClient = createFakeClient();
7716 while(1) {
7717 int argc, j;
7718 unsigned long len;
7719 robj **argv;
7720 char buf[128];
7721 sds argsds;
7722 struct redisCommand *cmd;
7723
7724 if (fgets(buf,sizeof(buf),fp) == NULL) {
7725 if (feof(fp))
7726 break;
7727 else
7728 goto readerr;
7729 }
7730 if (buf[0] != '*') goto fmterr;
7731 argc = atoi(buf+1);
7732 argv = zmalloc(sizeof(robj*)*argc);
7733 for (j = 0; j < argc; j++) {
7734 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7735 if (buf[0] != '$') goto fmterr;
7736 len = strtol(buf+1,NULL,10);
7737 argsds = sdsnewlen(NULL,len);
0f151ef1 7738 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 7739 argv[j] = createObject(REDIS_STRING,argsds);
7740 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7741 }
7742
7743 /* Command lookup */
7744 cmd = lookupCommand(argv[0]->ptr);
7745 if (!cmd) {
7746 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7747 exit(1);
7748 }
bdcb92f2 7749 /* Try object encoding */
f80dff62 7750 if (cmd->flags & REDIS_CMD_BULK)
7751 tryObjectEncoding(argv[argc-1]);
7752 /* Run the command in the context of a fake client */
7753 fakeClient->argc = argc;
7754 fakeClient->argv = argv;
7755 cmd->proc(fakeClient);
7756 /* Discard the reply objects list from the fake client */
7757 while(listLength(fakeClient->reply))
7758 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7759 /* Clean up, ready for the next command */
7760 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7761 zfree(argv);
b492cf00 7762 /* Handle swapping while loading big datasets when VM is on */
7763 loadedkeys++;
7764 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7765 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 7766 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 7767 }
7768 }
f80dff62 7769 }
7770 fclose(fp);
7771 freeFakeClient(fakeClient);
7772 return REDIS_OK;
7773
7774readerr:
7775 if (feof(fp)) {
7776 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7777 } else {
7778 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7779 }
7780 exit(1);
7781fmterr:
7782 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7783 exit(1);
7784}
7785
9d65a1bb 7786/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
9c8e3cee 7787static int fwriteBulkObject(FILE *fp, robj *obj) {
9d65a1bb 7788 char buf[128];
b9bc0eef 7789 int decrrc = 0;
7790
f2d9f50f 7791 /* Avoid the incr/decr ref count business if possible to help
7792 * copy-on-write (we are often in a child process when this function
7793 * is called).
7794 * Also makes sure that key objects don't get incrRefCount-ed when VM
7795 * is enabled */
7796 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 7797 obj = getDecodedObject(obj);
7798 decrrc = 1;
7799 }
9d65a1bb 7800 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7801 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 7802 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7803 goto err;
9d65a1bb 7804 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 7805 if (decrrc) decrRefCount(obj);
9d65a1bb 7806 return 1;
7807err:
b9bc0eef 7808 if (decrrc) decrRefCount(obj);
9d65a1bb 7809 return 0;
7810}
7811
/* Write the binary-safe string 's' of 'len' bytes into 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* %lu matches the unsigned long argument. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* Skip fwrite() for an empty payload: zero items written would look
     * like an error. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
7823
/* Write the double 'd', formatted with "%.17g", into 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t paylen;

    /* The payload is built together with its trailing CRLF, that is not
     * part of the announced length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0) return 0;
    return 1;
}
7834
/* Write the long 'l', formatted in decimal, into 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t paylen;

    /* The payload is built together with its trailing CRLF, that is not
     * part of the announced length. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0) return 0;
    return 1;
}
7845
7846/* Write a sequence of commands able to fully rebuild the dataset into
7847 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7848static int rewriteAppendOnlyFile(char *filename) {
7849 dictIterator *di = NULL;
7850 dictEntry *de;
7851 FILE *fp;
7852 char tmpfile[256];
7853 int j;
7854 time_t now = time(NULL);
7855
7856 /* Note that we have to use a different temp name here compared to the
7857 * one used by rewriteAppendOnlyFileBackground() function. */
7858 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7859 fp = fopen(tmpfile,"w");
7860 if (!fp) {
7861 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7862 return REDIS_ERR;
7863 }
7864 for (j = 0; j < server.dbnum; j++) {
7865 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7866 redisDb *db = server.db+j;
7867 dict *d = db->dict;
7868 if (dictSize(d) == 0) continue;
7869 di = dictGetIterator(d);
7870 if (!di) {
7871 fclose(fp);
7872 return REDIS_ERR;
7873 }
7874
7875 /* SELECT the new DB */
7876 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 7877 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 7878
7879 /* Iterate this DB writing every entry */
7880 while((de = dictNext(di)) != NULL) {
e7546c63 7881 robj *key, *o;
7882 time_t expiretime;
7883 int swapped;
7884
7885 key = dictGetEntryKey(de);
b9bc0eef 7886 /* If the value for this key is swapped, load a preview in memory.
7887 * We use a "swapped" flag to remember if we need to free the
7888 * value object instead to just increment the ref count anyway
7889 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 7890 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7891 key->storage == REDIS_VM_SWAPPING) {
e7546c63 7892 o = dictGetEntryVal(de);
7893 swapped = 0;
7894 } else {
7895 o = vmPreviewObject(key);
e7546c63 7896 swapped = 1;
7897 }
7898 expiretime = getExpire(db,key);
9d65a1bb 7899
7900 /* Save the key and associated value */
9d65a1bb 7901 if (o->type == REDIS_STRING) {
7902 /* Emit a SET command */
7903 char cmd[]="*3\r\n$3\r\nSET\r\n";
7904 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7905 /* Key and value */
9c8e3cee 7906 if (fwriteBulkObject(fp,key) == 0) goto werr;
7907 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 7908 } else if (o->type == REDIS_LIST) {
7909 /* Emit the RPUSHes needed to rebuild the list */
7910 list *list = o->ptr;
7911 listNode *ln;
c7df85a4 7912 listIter li;
9d65a1bb 7913
c7df85a4 7914 listRewind(list,&li);
7915 while((ln = listNext(&li))) {
9d65a1bb 7916 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7917 robj *eleobj = listNodeValue(ln);
7918
7919 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 7920 if (fwriteBulkObject(fp,key) == 0) goto werr;
7921 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 7922 }
7923 } else if (o->type == REDIS_SET) {
7924 /* Emit the SADDs needed to rebuild the set */
7925 dict *set = o->ptr;
7926 dictIterator *di = dictGetIterator(set);
7927 dictEntry *de;
7928
7929 while((de = dictNext(di)) != NULL) {
7930 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7931 robj *eleobj = dictGetEntryKey(de);
7932
7933 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 7934 if (fwriteBulkObject(fp,key) == 0) goto werr;
7935 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 7936 }
7937 dictReleaseIterator(di);
7938 } else if (o->type == REDIS_ZSET) {
7939 /* Emit the ZADDs needed to rebuild the sorted set */
7940 zset *zs = o->ptr;
7941 dictIterator *di = dictGetIterator(zs->dict);
7942 dictEntry *de;
7943
7944 while((de = dictNext(di)) != NULL) {
7945 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7946 robj *eleobj = dictGetEntryKey(de);
7947 double *score = dictGetEntryVal(de);
7948
7949 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 7950 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 7951 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 7952 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 7953 }
7954 dictReleaseIterator(di);
9c8e3cee 7955 } else if (o->type == REDIS_HASH) {
7956 char cmd[]="*4\r\n$4\r\nHSET\r\n";
7957
7958 /* Emit the HSETs needed to rebuild the hash */
7959 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7960 unsigned char *p = zipmapRewind(o->ptr);
7961 unsigned char *field, *val;
7962 unsigned int flen, vlen;
7963
7964 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
7965 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7966 if (fwriteBulkObject(fp,key) == 0) goto werr;
7967 if (fwriteBulkString(fp,(char*)field,flen) == -1)
7968 return -1;
7969 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
7970 return -1;
7971 }
7972 } else {
7973 dictIterator *di = dictGetIterator(o->ptr);
7974 dictEntry *de;
7975
7976 while((de = dictNext(di)) != NULL) {
7977 robj *field = dictGetEntryKey(de);
7978 robj *val = dictGetEntryVal(de);
7979
7980 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7981 if (fwriteBulkObject(fp,key) == 0) goto werr;
7982 if (fwriteBulkObject(fp,field) == -1) return -1;
7983 if (fwriteBulkObject(fp,val) == -1) return -1;
7984 }
7985 dictReleaseIterator(di);
7986 }
9d65a1bb 7987 } else {
78409a0f 7988 redisAssert(0);
9d65a1bb 7989 }
7990 /* Save the expire time */
7991 if (expiretime != -1) {
e96e4fbf 7992 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 7993 /* If this key is already expired skip it */
7994 if (expiretime < now) continue;
7995 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 7996 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 7997 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
7998 }
b9bc0eef 7999 if (swapped) decrRefCount(o);
9d65a1bb 8000 }
8001 dictReleaseIterator(di);
8002 }
8003
8004 /* Make sure data will not remain on the OS's output buffers */
8005 fflush(fp);
8006 fsync(fileno(fp));
8007 fclose(fp);
8008
8009 /* Use RENAME to make sure the DB file is changed atomically only
8010 * if the generate DB file is ok. */
8011 if (rename(tmpfile,filename) == -1) {
8012 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8013 unlink(tmpfile);
8014 return REDIS_ERR;
8015 }
8016 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8017 return REDIS_OK;
8018
8019werr:
8020 fclose(fp);
8021 unlink(tmpfile);
e96e4fbf 8022 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 8023 if (di) dictReleaseIterator(di);
8024 return REDIS_ERR;
8025}
8026
8027/* This is how rewriting of the append only file in background works:
8028 *
8029 * 1) The user calls BGREWRITEAOF
8030 * 2) Redis calls this function, that forks():
8031 * 2a) the child rewrite the append only file in a temp file.
8032 * 2b) the parent accumulates differences in server.bgrewritebuf.
8033 * 3) When the child finished '2a' exists.
8034 * 4) The parent will trap the exit code, if it's OK, will append the
8035 * data accumulated into server.bgrewritebuf into the temp file, and
8036 * finally will rename(2) the temp file in the actual file name.
8037 * The the new file is reopened as the new append only file. Profit!
8038 */
8039static int rewriteAppendOnlyFileBackground(void) {
8040 pid_t childpid;
8041
8042 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 8043 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 8044 if ((childpid = fork()) == 0) {
8045 /* Child */
8046 char tmpfile[256];
9d65a1bb 8047
054e426d 8048 if (server.vm_enabled) vmReopenSwapFile();
8049 close(server.fd);
9d65a1bb 8050 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8051 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 8052 _exit(0);
9d65a1bb 8053 } else {
478c2c6f 8054 _exit(1);
9d65a1bb 8055 }
8056 } else {
8057 /* Parent */
8058 if (childpid == -1) {
8059 redisLog(REDIS_WARNING,
8060 "Can't rewrite append only file in background: fork: %s",
8061 strerror(errno));
8062 return REDIS_ERR;
8063 }
8064 redisLog(REDIS_NOTICE,
8065 "Background append only file rewriting started by pid %d",childpid);
8066 server.bgrewritechildpid = childpid;
884d4b39 8067 updateDictResizePolicy();
85a83172 8068 /* We set appendseldb to -1 in order to force the next call to the
8069 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8070 * accumulated by the parent into server.bgrewritebuf will start
8071 * with a SELECT statement and it will be safe to merge. */
8072 server.appendseldb = -1;
9d65a1bb 8073 return REDIS_OK;
8074 }
8075 return REDIS_OK; /* unreached */
8076}
8077
8078static void bgrewriteaofCommand(redisClient *c) {
8079 if (server.bgrewritechildpid != -1) {
8080 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8081 return;
8082 }
8083 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 8084 char *status = "+Background append only file rewriting started\r\n";
8085 addReplySds(c,sdsnew(status));
9d65a1bb 8086 } else {
8087 addReply(c,shared.err);
8088 }
8089}
8090
/* Unlink the temp file used by the background append only file rewrite
 * performed by the child having the specified pid. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8097
996cb5f7 8098/* Virtual Memory is composed mainly of two subsystems:
8099 * - Blocking Virtual Memory
8100 * - Threaded Virtual Memory I/O
8101 * The two parts are not fully decoupled, but functions are split among two
8102 * different sections of the source code (delimited by comments) in order to
8103 * make more clear what functionality is about the blocking VM and what about
8104 * the threaded (not blocking) VM.
8105 *
8106 * Redis VM design:
8107 *
8108 * Redis VM is a blocking VM (one that blocks reading swapped values from
8109 * disk into memory when a value swapped out is needed in memory) that is made
8110 * unblocking by trying to examine the command argument vector in order to
8111 * load in background values that will likely be needed in order to exec
8112 * the command. The command is executed only once all the relevant keys
8113 * are loaded into memory.
8114 *
8115 * This is basically almost as simple as a blocking VM, but almost as
8116 * parallel as a fully non-blocking VM.
8117 */
8118
8119/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 8120
8121/* substitute the first occurrence of '%p' with the process pid in the
8122 * swap file name. */
8123static void expandVmSwapFilename(void) {
8124 char *p = strstr(server.vm_swap_file,"%p");
8125 sds new;
8126
8127 if (!p) return;
8128 new = sdsempty();
8129 *p = '\0';
8130 new = sdscat(new,server.vm_swap_file);
8131 new = sdscatprintf(new,"%ld",(long) getpid());
8132 new = sdscat(new,p+2);
8133 zfree(server.vm_swap_file);
8134 server.vm_swap_file = new;
8135}
8136
75680a3c 8137static void vmInit(void) {
8138 off_t totsize;
996cb5f7 8139 int pipefds[2];
bcaa7a4f 8140 size_t stacksize;
75680a3c 8141
4ad37480 8142 if (server.vm_max_threads != 0)
8143 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8144
054e426d 8145 expandVmSwapFilename();
8146 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
6fa987e3 8147 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8148 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8149 }
75680a3c 8150 if (server.vm_fp == NULL) {
6fa987e3 8151 redisLog(REDIS_WARNING,
8152 "Impossible to open the swap file: %s. Exiting.",
8153 strerror(errno));
75680a3c 8154 exit(1);
8155 }
8156 server.vm_fd = fileno(server.vm_fp);
8157 server.vm_next_page = 0;
8158 server.vm_near_pages = 0;
7d98e08c 8159 server.vm_stats_used_pages = 0;
8160 server.vm_stats_swapped_objects = 0;
8161 server.vm_stats_swapouts = 0;
8162 server.vm_stats_swapins = 0;
75680a3c 8163 totsize = server.vm_pages*server.vm_page_size;
8164 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8165 if (ftruncate(server.vm_fd,totsize) == -1) {
8166 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8167 strerror(errno));
8168 exit(1);
8169 } else {
8170 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8171 }
7d30035d 8172 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 8173 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 8174 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 8175 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 8176
996cb5f7 8177 /* Initialize threaded I/O (used by Virtual Memory) */
8178 server.io_newjobs = listCreate();
8179 server.io_processing = listCreate();
8180 server.io_processed = listCreate();
d5d55fc3 8181 server.io_ready_clients = listCreate();
92f8e882 8182 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 8183 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8184 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 8185 server.io_active_threads = 0;
996cb5f7 8186 if (pipe(pipefds) == -1) {
8187 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8188 ,strerror(errno));
8189 exit(1);
8190 }
8191 server.io_ready_pipe_read = pipefds[0];
8192 server.io_ready_pipe_write = pipefds[1];
8193 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 8194 /* LZF requires a lot of stack */
8195 pthread_attr_init(&server.io_threads_attr);
8196 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8197 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8198 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 8199 /* Listen for events in the threaded I/O pipe */
8200 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8201 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8202 oom("creating file event");
75680a3c 8203}
8204
06224fec 8205/* Mark the page as used */
8206static void vmMarkPageUsed(off_t page) {
8207 off_t byte = page/8;
8208 int bit = page&7;
970e10bb 8209 redisAssert(vmFreePage(page) == 1);
06224fec 8210 server.vm_bitmap[byte] |= 1<<bit;
8211}
8212
8213/* Mark N contiguous pages as used, with 'page' being the first. */
8214static void vmMarkPagesUsed(off_t page, off_t count) {
8215 off_t j;
8216
8217 for (j = 0; j < count; j++)
7d30035d 8218 vmMarkPageUsed(page+j);
7d98e08c 8219 server.vm_stats_used_pages += count;
7c775e09 8220 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8221 (long long)count, (long long)page);
06224fec 8222}
8223
8224/* Mark the page as free */
8225static void vmMarkPageFree(off_t page) {
8226 off_t byte = page/8;
8227 int bit = page&7;
970e10bb 8228 redisAssert(vmFreePage(page) == 0);
06224fec 8229 server.vm_bitmap[byte] &= ~(1<<bit);
8230}
8231
8232/* Mark N contiguous pages as free, with 'page' being the first. */
8233static void vmMarkPagesFree(off_t page, off_t count) {
8234 off_t j;
8235
8236 for (j = 0; j < count; j++)
7d30035d 8237 vmMarkPageFree(page+j);
7d98e08c 8238 server.vm_stats_used_pages -= count;
7c775e09 8239 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8240 (long long)count, (long long)page);
06224fec 8241}
8242
8243/* Test if the page is free */
8244static int vmFreePage(off_t page) {
8245 off_t byte = page/8;
8246 int bit = page&7;
7d30035d 8247 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 8248}
8249
8250/* Find N contiguous free pages storing the first page of the cluster in *first.
3a66edc7 8251 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8252 * REDIS_ERR is returned.
06224fec 8253 *
8254 * This function uses a simple algorithm: we try to allocate
8255 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8256 * again from the start of the swap file searching for free spaces.
8257 *
8258 * If it looks pretty clear that there are no free pages near our offset
8259 * we try to find less populated places doing a forward jump of
8260 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8261 * without hurry, and then we jump again and so forth...
8262 *
8263 * This function can be improved using a free list to avoid to guess
8264 * too much, since we could collect data about freed pages.
8265 *
8266 * note: I implemented this function just after watching an episode of
8267 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8268 */
c7df85a4 8269static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 8270 off_t base, offset = 0, since_jump = 0, numfree = 0;
8271
8272 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8273 server.vm_near_pages = 0;
8274 server.vm_next_page = 0;
8275 }
8276 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8277 base = server.vm_next_page;
8278
8279 while(offset < server.vm_pages) {
8280 off_t this = base+offset;
8281
8282 /* If we overflow, restart from page zero */
8283 if (this >= server.vm_pages) {
8284 this -= server.vm_pages;
8285 if (this == 0) {
8286 /* Just overflowed, what we found on tail is no longer
8287 * interesting, as it's no longer contiguous. */
8288 numfree = 0;
8289 }
8290 }
8291 if (vmFreePage(this)) {
8292 /* This is a free page */
8293 numfree++;
8294 /* Already got N free pages? Return to the caller, with success */
8295 if (numfree == n) {
7d30035d 8296 *first = this-(n-1);
8297 server.vm_next_page = this+1;
7c775e09 8298 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 8299 return REDIS_OK;
06224fec 8300 }
8301 } else {
8302 /* The current one is not a free page */
8303 numfree = 0;
8304 }
8305
8306 /* Fast-forward if the current page is not free and we already
8307 * searched enough near this place. */
8308 since_jump++;
8309 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8310 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8311 since_jump = 0;
8312 /* Note that even if we rewind after the jump, we are don't need
8313 * to make sure numfree is set to zero as we only jump *if* it
8314 * is set to zero. */
8315 } else {
8316 /* Otherwise just check the next page */
8317 offset++;
8318 }
8319 }
3a66edc7 8320 return REDIS_ERR;
8321}
8322
a5819310 8323/* Write the specified object at the specified page of the swap file */
8324static int vmWriteObjectOnSwap(robj *o, off_t page) {
8325 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8326 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8327 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8328 redisLog(REDIS_WARNING,
9ebed7cf 8329 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 8330 strerror(errno));
8331 return REDIS_ERR;
8332 }
8333 rdbSaveObject(server.vm_fp,o);
ba76a8f9 8334 fflush(server.vm_fp);
a5819310 8335 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8336 return REDIS_OK;
8337}
8338
3a66edc7 8339/* Swap the 'val' object relative to 'key' into disk. Store all the information
8340 * needed to later retrieve the object into the key object.
8341 * If we can't find enough contiguous empty pages to swap the object on disk
8342 * REDIS_ERR is returned. */
a69a0c9c 8343static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 8344 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 8345 off_t page;
8346
8347 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 8348 assert(key->refcount == 1);
3a66edc7 8349 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 8350 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 8351 key->vm.page = page;
8352 key->vm.usedpages = pages;
8353 key->storage = REDIS_VM_SWAPPED;
d894161b 8354 key->vtype = val->type;
3a66edc7 8355 decrRefCount(val); /* Deallocate the object from memory. */
8356 vmMarkPagesUsed(page,pages);
7d30035d 8357 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8358 (unsigned char*) key->ptr,
8359 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 8360 server.vm_stats_swapped_objects++;
8361 server.vm_stats_swapouts++;
3a66edc7 8362 return REDIS_OK;
8363}
8364
a5819310 8365static robj *vmReadObjectFromSwap(off_t page, int type) {
8366 robj *o;
3a66edc7 8367
a5819310 8368 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8369 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 8370 redisLog(REDIS_WARNING,
d5d55fc3 8371 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 8372 strerror(errno));
478c2c6f 8373 _exit(1);
3a66edc7 8374 }
a5819310 8375 o = rdbLoadObject(type,server.vm_fp);
8376 if (o == NULL) {
d5d55fc3 8377 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 8378 _exit(1);
3a66edc7 8379 }
a5819310 8380 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8381 return o;
8382}
8383
8384/* Load the value object relative to the 'key' object from swap to memory.
8385 * The newly allocated object is returned.
8386 *
8387 * If preview is true the unserialized object is returned to the caller but
8388 * no changes are made to the key object, nor the pages are marked as freed */
8389static robj *vmGenericLoadObject(robj *key, int preview) {
8390 robj *val;
8391
d5d55fc3 8392 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 8393 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 8394 if (!preview) {
8395 key->storage = REDIS_VM_MEMORY;
8396 key->vm.atime = server.unixtime;
8397 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8398 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8399 (unsigned char*) key->ptr);
7d98e08c 8400 server.vm_stats_swapped_objects--;
38aba9a1 8401 } else {
8402 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8403 (unsigned char*) key->ptr);
7e69548d 8404 }
7d98e08c 8405 server.vm_stats_swapins++;
3a66edc7 8406 return val;
06224fec 8407}
8408
7e69548d 8409/* Plain object loading, from swap to memory */
8410static robj *vmLoadObject(robj *key) {
996cb5f7 8411 /* If we are loading the object in background, stop it, we
8412 * need to load this object synchronously ASAP. */
8413 if (key->storage == REDIS_VM_LOADING)
8414 vmCancelThreadedIOJob(key);
7e69548d 8415 return vmGenericLoadObject(key,0);
8416}
8417
8418/* Just load the value on disk, without to modify the key.
8419 * This is useful when we want to perform some operation on the value
8420 * without to really bring it from swap to memory, like while saving the
8421 * dataset or rewriting the append only log. */
8422static robj *vmPreviewObject(robj *key) {
8423 return vmGenericLoadObject(key,1);
8424}
8425
4ef8de8a 8426/* How a good candidate is this object for swapping?
8427 * The better candidate it is, the greater the returned value.
8428 *
8429 * Currently we try to perform a fast estimation of the object size in
8430 * memory, and combine it with aging informations.
8431 *
8432 * Basically swappability = idle-time * log(estimated size)
8433 *
8434 * Bigger objects are preferred over smaller objects, but not
8435 * proportionally, this is why we use the logarithm. This algorithm is
8436 * just a first try and will probably be tuned later. */
8437static double computeObjectSwappability(robj *o) {
8438 time_t age = server.unixtime - o->vm.atime;
8439 long asize = 0;
8440 list *l;
8441 dict *d;
8442 struct dictEntry *de;
8443 int z;
8444
8445 if (age <= 0) return 0;
8446 switch(o->type) {
8447 case REDIS_STRING:
8448 if (o->encoding != REDIS_ENCODING_RAW) {
8449 asize = sizeof(*o);
8450 } else {
8451 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8452 }
8453 break;
8454 case REDIS_LIST:
8455 l = o->ptr;
8456 listNode *ln = listFirst(l);
8457
8458 asize = sizeof(list);
8459 if (ln) {
8460 robj *ele = ln->value;
8461 long elesize;
8462
8463 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8464 (sizeof(*o)+sdslen(ele->ptr)) :
8465 sizeof(*o);
8466 asize += (sizeof(listNode)+elesize)*listLength(l);
8467 }
8468 break;
8469 case REDIS_SET:
8470 case REDIS_ZSET:
8471 z = (o->type == REDIS_ZSET);
8472 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8473
8474 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8475 if (z) asize += sizeof(zset)-sizeof(dict);
8476 if (dictSize(d)) {
8477 long elesize;
8478 robj *ele;
8479
8480 de = dictGetRandomKey(d);
8481 ele = dictGetEntryKey(de);
8482 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8483 (sizeof(*o)+sdslen(ele->ptr)) :
8484 sizeof(*o);
8485 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8486 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8487 }
8488 break;
a97b9060 8489 case REDIS_HASH:
8490 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8491 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8492 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8493 unsigned int klen, vlen;
8494 unsigned char *key, *val;
8495
8496 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8497 klen = 0;
8498 vlen = 0;
8499 }
8500 asize = len*(klen+vlen+3);
8501 } else if (o->encoding == REDIS_ENCODING_HT) {
8502 d = o->ptr;
8503 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8504 if (dictSize(d)) {
8505 long elesize;
8506 robj *ele;
8507
8508 de = dictGetRandomKey(d);
8509 ele = dictGetEntryKey(de);
8510 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8511 (sizeof(*o)+sdslen(ele->ptr)) :
8512 sizeof(*o);
8513 ele = dictGetEntryVal(de);
8514 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8515 (sizeof(*o)+sdslen(ele->ptr)) :
8516 sizeof(*o);
8517 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8518 }
8519 }
8520 break;
4ef8de8a 8521 }
c8c72447 8522 return (double)age*log(1+asize);
4ef8de8a 8523}
8524
8525/* Try to swap an object that's a good candidate for swapping.
8526 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 8527 * to swap any object at all.
8528 *
8529 * If 'usethreaded' is true, Redis will try to swap the object in background
8530 * using I/O threads. */
8531static int vmSwapOneObject(int usethreads) {
4ef8de8a 8532 int j, i;
8533 struct dictEntry *best = NULL;
8534 double best_swappability = 0;
b9bc0eef 8535 redisDb *best_db = NULL;
4ef8de8a 8536 robj *key, *val;
8537
8538 for (j = 0; j < server.dbnum; j++) {
8539 redisDb *db = server.db+j;
b72f6a4b 8540 /* Why maxtries is set to 100?
8541 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8542 * are swappable objects */
b0d8747d 8543 int maxtries = 100;
4ef8de8a 8544
8545 if (dictSize(db->dict) == 0) continue;
8546 for (i = 0; i < 5; i++) {
8547 dictEntry *de;
8548 double swappability;
8549
e3cadb8a 8550 if (maxtries) maxtries--;
4ef8de8a 8551 de = dictGetRandomKey(db->dict);
8552 key = dictGetEntryKey(de);
8553 val = dictGetEntryVal(de);
1064ef87 8554 /* Only swap objects that are currently in memory.
8555 *
8556 * Also don't swap shared objects if threaded VM is on, as we
8557 * try to ensure that the main thread does not touch the
8558 * object while the I/O thread is using it, but we can't
8559 * control other keys without adding additional mutex. */
8560 if (key->storage != REDIS_VM_MEMORY ||
8561 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 8562 if (maxtries) i--; /* don't count this try */
8563 continue;
8564 }
4ef8de8a 8565 swappability = computeObjectSwappability(val);
8566 if (!best || swappability > best_swappability) {
8567 best = de;
8568 best_swappability = swappability;
b9bc0eef 8569 best_db = db;
4ef8de8a 8570 }
8571 }
8572 }
7c775e09 8573 if (best == NULL) return REDIS_ERR;
4ef8de8a 8574 key = dictGetEntryKey(best);
8575 val = dictGetEntryVal(best);
8576
e3cadb8a 8577 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 8578 key->ptr, best_swappability);
8579
8580 /* Unshare the key if needed */
8581 if (key->refcount > 1) {
8582 robj *newkey = dupStringObject(key);
8583 decrRefCount(key);
8584 key = dictGetEntryKey(best) = newkey;
8585 }
8586 /* Swap it */
a69a0c9c 8587 if (usethreads) {
b9bc0eef 8588 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 8589 return REDIS_OK;
8590 } else {
a69a0c9c 8591 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8592 dictGetEntryVal(best) = NULL;
8593 return REDIS_OK;
8594 } else {
8595 return REDIS_ERR;
8596 }
4ef8de8a 8597 }
8598}
8599
/* Swap one object out using the blocking VM: see vmSwapOneObject() for the
 * return value. The explicit (void) makes this a real prototype instead of
 * an old-style declaration with unspecified arguments. */
static int vmSwapOneObjectBlocking(void) {
    return vmSwapOneObject(0);
}
8603
/* Swap one object out using the I/O threads: see vmSwapOneObject() for the
 * return value. The explicit (void) makes this a real prototype instead of
 * an old-style declaration with unspecified arguments. */
static int vmSwapOneObjectThreaded(void) {
    return vmSwapOneObject(1);
}
8607
7e69548d 8608/* Return true if it's safe to swap out objects in a given moment.
8609 * Basically we don't want to swap objects out while there is a BGSAVE
8610 * or a BGAEOREWRITE running in backgroud. */
8611static int vmCanSwapOut(void) {
8612 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8613}
8614
1b03836c 8615/* Delete a key if swapped. Returns 1 if the key was found, was swapped
8616 * and was deleted. Otherwise 0 is returned. */
8617static int deleteIfSwapped(redisDb *db, robj *key) {
8618 dictEntry *de;
8619 robj *foundkey;
8620
8621 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8622 foundkey = dictGetEntryKey(de);
8623 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8624 deleteKey(db,key);
8625 return 1;
8626}
8627
996cb5f7 8628/* =================== Virtual Memory - Threaded I/O ======================= */
8629
b9bc0eef 8630static void freeIOJob(iojob *j) {
d5d55fc3 8631 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8632 j->type == REDIS_IOJOB_DO_SWAP ||
8633 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 8634 decrRefCount(j->val);
78ebe4c8 8635 /* We don't decrRefCount the j->key field as we did't incremented
8636 * the count creating IO Jobs. This is because the key field here is
8637 * just used as an indentifier and if a key is removed the Job should
8638 * never be touched again. */
b9bc0eef 8639 zfree(j);
8640}
8641
996cb5f7 8642/* Every time a thread finished a Job, it writes a byte into the write side
8643 * of an unix pipe in order to "awake" the main thread, and this function
8644 * is called.
 *
 * For every byte read from 'fd' one completed job is removed from
 * server.io_processed. Canceled jobs are just freed, the other ones are
 * post-processed here according to their type (LOAD, PREPARE_SWAP, DO_SWAP).
 *
 * A single call handles at most REDIS_MAX_COMPLETED_JOBS_PROCESSED percent
 * of the jobs found in the processed queue at the first iteration (but at
 * least one job), then returns without reading further bytes from the pipe.
 *
 * The 'el', 'privdata' and 'mask' arguments are not used. */
8645static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8646 int mask)
8647{
8648 char buf[1];
b0d8747d 8649 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 8650 REDIS_NOTUSED(el);
8651 REDIS_NOTUSED(mask);
8652 REDIS_NOTUSED(privdata);
8653
8654 /* For every byte we read in the read side of the pipe, there is one
8655 * I/O job completed to process. */
8656 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 8657 iojob *j;
8658 listNode *ln;
8659 robj *key;
8660 struct dictEntry *de;
8661
996cb5f7 8662 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 8663
8664 /* Get the processed element (the oldest one) */
8665 lockThreadedIO();
1064ef87 8666 assert(listLength(server.io_processed) != 0);
/* The limit of jobs to process is computed only once per call, the first
 * time we see the queue. */
f6c0bba8 8667 if (toprocess == -1) {
8668 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8669 if (toprocess <= 0) toprocess = 1;
8670 }
b9bc0eef 8671 ln = listFirst(server.io_processed);
8672 j = ln->value;
8673 listDelNode(server.io_processed,ln);
8674 unlockThreadedIO();
8675 /* If this job is marked as canceled, just ignore it */
8676 if (j->canceled) {
8677 freeIOJob(j);
8678 continue;
8679 }
8680 /* Post process it in the main thread, as there are things we
8681 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 8682 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 8683 de = dictFind(j->db->dict,j->key);
8684 assert(de != NULL);
8685 key = dictGetEntryKey(de);
8686 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 8687 redisDb *db;
8688
b9bc0eef 8689 /* Key loaded, bring it at home */
8690 key->storage = REDIS_VM_MEMORY;
8691 key->vm.atime = server.unixtime;
8692 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8693 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8694 (unsigned char*) key->ptr);
8695 server.vm_stats_swapped_objects--;
8696 server.vm_stats_swapins++;
/* The dictionary entry takes its own reference to the loaded value, as
 * freeIOJob() below releases the one held by the job. */
d5d55fc3 8697 dictGetEntryVal(de) = j->val;
8698 incrRefCount(j->val);
/* Save the db pointer: the job can't be accessed after freeIOJob(). */
8699 db = j->db;
b9bc0eef 8700 freeIOJob(j);
d5d55fc3 8701 /* Handle clients waiting for this key to be loaded. */
8702 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 8703 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8704 /* Now we know the amount of pages required to swap this object.
8705 * Let's find some space for it, and queue this task again
8706 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 8707 if (!vmCanSwapOut() ||
8708 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8709 {
8710 /* Ooops... no space or we can't swap as there is
8711 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 8712 freeIOJob(j);
054e426d 8713 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 8714 } else {
c7df85a4 8715 /* Note that we need to mark these pages as used now,
8716 * if the job will be canceled, we'll mark them as freed
8717 * again. */
8718 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 8719 j->type = REDIS_IOJOB_DO_SWAP;
8720 lockThreadedIO();
8721 queueIOJob(j);
8722 unlockThreadedIO();
8723 }
8724 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8725 robj *val;
8726
8727 /* Key swapped. We can finally free some memory. */
/* Dump some state on standard output when the assertion below is going
 * to fail. */
6c96ba7d 8728 if (key->storage != REDIS_VM_SWAPPING) {
8729 printf("key->storage: %d\n",key->storage);
8730 printf("key->name: %s\n",(char*)key->ptr);
8731 printf("key->refcount: %d\n",key->refcount);
8732 printf("val: %p\n",(void*)j->val);
8733 printf("val->type: %d\n",j->val->type);
8734 printf("val->ptr: %s\n",(char*)j->val->ptr);
8735 }
8736 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 8737 val = dictGetEntryVal(de);
8738 key->vm.page = j->page;
8739 key->vm.usedpages = j->pages;
8740 key->storage = REDIS_VM_SWAPPED;
8741 key->vtype = j->val->type;
8742 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 8743 dictGetEntryVal(de) = NULL;
b9bc0eef 8744 redisLog(REDIS_DEBUG,
8745 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8746 (unsigned char*) key->ptr,
8747 (unsigned long long) j->page, (unsigned long long) j->pages);
8748 server.vm_stats_swapped_objects++;
8749 server.vm_stats_swapouts++;
8750 freeIOJob(j);
f11b8647 8751 /* Put a few more swap requests in queue if we are still
8752 * out of memory */
b0d8747d 8753 if (trytoswap && vmCanSwapOut() &&
8754 zmalloc_used_memory() > server.vm_max_memory)
8755 {
f11b8647 8756 int more = 1;
8757 while(more) {
8758 lockThreadedIO();
8759 more = listLength(server.io_newjobs) <
8760 (unsigned) server.vm_max_threads;
8761 unlockThreadedIO();
8762 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 8763 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8764 trytoswap = 0;
8765 break;
8766 }
f11b8647 8767 }
8768 }
b9bc0eef 8769 }
/* Canceled jobs never get here (see the 'continue' above), so they are
 * not counted against the limit. */
c953f24b 8770 processed++;
f6c0bba8 8771 if (processed == toprocess) return;
996cb5f7 8772 }
/* The loop stopped because read(2) did not return 1: EAGAIN is not
 * reported, any other error is logged. */
8773 if (retval < 0 && errno != EAGAIN) {
8774 redisLog(REDIS_WARNING,
8775 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8776 strerror(errno));
8777 }
8778}
8779
/* Acquire server.io_mutex, the mutex this file holds while accessing the
 * queues of I/O jobs. */
8780static void lockThreadedIO(void) {
8781 pthread_mutex_lock(&server.io_mutex);
8782}
8783
/* Release server.io_mutex, acquired with lockThreadedIO(). */
8784static void unlockThreadedIO(void) {
8785 pthread_mutex_unlock(&server.io_mutex);
8786}
8787
8788/* Remove the specified object from the threaded I/O queue if still not
8789 * processed, otherwise make sure to flag it as canceled.
 *
 * 'o' is the key object of the job: jobs are matched comparing the key
 * pointers. The key must be in the REDIS_VM_LOADING or REDIS_VM_SWAPPING
 * state, and is put back into the REDIS_VM_SWAPPED or REDIS_VM_MEMORY state
 * respectively once the job is canceled.
 *
 * If the job is currently being processed by an I/O thread, the function
 * releases the lock, sleeps for a moment and searches again, until the job
 * is found in another queue. */
8790static void vmCancelThreadedIOJob(robj *o) {
8791 list *lists[3] = {
6c96ba7d 8792 server.io_newjobs, /* 0 */
8793 server.io_processing, /* 1 */
8794 server.io_processed /* 2 */
996cb5f7 8795 };
8796 int i;
8797
8798 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 8799again:
996cb5f7 8800 lockThreadedIO();
8801 /* Search for a matching key in one of the queues */
8802 for (i = 0; i < 3; i++) {
8803 listNode *ln;
c7df85a4 8804 listIter li;
996cb5f7 8805
c7df85a4 8806 listRewind(lists[i],&li);
8807 while ((ln = listNext(&li)) != NULL) {
996cb5f7 8808 iojob *job = ln->value;
8809
6c96ba7d 8810 if (job->canceled) continue; /* Skip this, already canceled. */
78ebe4c8 8811 if (job->key == o) {
970e10bb 8812 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8813 (void*)job, (char*)o->ptr, job->type, i);
427a2153 8814 /* Mark the pages as free since the swap didn't happen
8815 * or happened but is now discarded. Nothing is done here for a job
 * in the processing queue (i == 1), as we wait for it below. */
970e10bb 8816 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 8817 vmMarkPagesFree(job->page,job->pages);
8818 /* Cancel the job. It depends on the list the job is
8819 * living in. */
996cb5f7 8820 switch(i) {
8821 case 0: /* io_newjobs */
6c96ba7d 8822 /* If the job was not yet processed the best thing to do
996cb5f7 8823 * is to remove it from the queue at all */
6c96ba7d 8824 freeIOJob(job);
996cb5f7 8825 listDelNode(lists[i],ln);
8826 break;
8827 case 1: /* io_processing */
d5d55fc3 8828 /* Oh Shi- the thread is messing with the Job:
8829 *
8830 * Probably it's accessing the object if this is a
8831 * PREPARE_SWAP or DO_SWAP job.
8832 * If it's a LOAD job it may be reading from disk and
8833 * if we don't wait for the job to terminate before to
8834 * cancel it, maybe in a few microseconds data can be
8835 * corrupted in these pages. So the short story is:
8836 *
8837 * Better to wait for the job to move into the
8838 * next queue (processed)... */
8839
8840 /* We try again and again until the job is completed. */
8841 unlockThreadedIO();
8842 /* But let's wait some time for the I/O thread
8843 * to finish with this job. After all this condition
8844 * should be very rare. */
8845 usleep(1);
8846 goto again;
996cb5f7 8847 case 2: /* io_processed */
2e111efe 8848 /* The job was already processed, that's easy...
8849 * just mark it as canceled so that we'll ignore it
8850 * when processing completed jobs. */
996cb5f7 8851 job->canceled = 1;
8852 break;
8853 }
c7df85a4 8854 /* Finally we have to adjust the storage type of the object
8855 * in order to "UNDO" the operation. */
996cb5f7 8856 if (o->storage == REDIS_VM_LOADING)
8857 o->storage = REDIS_VM_SWAPPED;
8858 else if (o->storage == REDIS_VM_SWAPPING)
8859 o->storage = REDIS_VM_MEMORY;
8860 unlockThreadedIO();
8861 return;
8862 }
8863 }
8864 }
8865 unlockThreadedIO();
8866 assert(1 != 1); /* We should never reach this */
8867}
8868
b9bc0eef 8869static void *IOThreadEntryPoint(void *arg) {
8870 iojob *j;
8871 listNode *ln;
8872 REDIS_NOTUSED(arg);
8873
8874 pthread_detach(pthread_self());
8875 while(1) {
8876 /* Get a new job to process */
8877 lockThreadedIO();
8878 if (listLength(server.io_newjobs) == 0) {
8879 /* No new jobs in queue, exit. */
9ebed7cf 8880 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8881 (long) pthread_self());
b9bc0eef 8882 server.io_active_threads--;
8883 unlockThreadedIO();
8884 return NULL;
8885 }
8886 ln = listFirst(server.io_newjobs);
8887 j = ln->value;
8888 listDelNode(server.io_newjobs,ln);
8889 /* Add the job in the processing queue */
8890 j->thread = pthread_self();
8891 listAddNodeTail(server.io_processing,j);
8892 ln = listLast(server.io_processing); /* We use ln later to remove it */
8893 unlockThreadedIO();
9ebed7cf 8894 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8895 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 8896
8897 /* Process the Job */
8898 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 8899 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 8900 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8901 FILE *fp = fopen("/dev/null","w+");
8902 j->pages = rdbSavedObjectPages(j->val,fp);
8903 fclose(fp);
8904 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 8905 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8906 j->canceled = 1;
b9bc0eef 8907 }
8908
8909 /* Done: insert the job into the processed queue */
9ebed7cf 8910 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8911 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 8912 lockThreadedIO();
8913 listDelNode(server.io_processing,ln);
8914 listAddNodeTail(server.io_processed,j);
8915 unlockThreadedIO();
8916
8917 /* Signal the main thread there is new stuff to process */
8918 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8919 }
8920 return NULL; /* never reached */
8921}
8922
8923static void spawnIOThread(void) {
8924 pthread_t thread;
478c2c6f 8925 sigset_t mask, omask;
a97b9060 8926 int err;
b9bc0eef 8927
478c2c6f 8928 sigemptyset(&mask);
8929 sigaddset(&mask,SIGCHLD);
8930 sigaddset(&mask,SIGHUP);
8931 sigaddset(&mask,SIGPIPE);
8932 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 8933 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
8934 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
8935 strerror(err));
8936 usleep(1000000);
8937 }
478c2c6f 8938 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 8939 server.io_active_threads++;
8940}
8941
4ee9488d 8942/* We need to wait for the last thread to exit before we are able to
8943 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 8944static void waitEmptyIOJobsQueue(void) {
4ee9488d 8945 while(1) {
76b7233a 8946 int io_processed_len;
8947
4ee9488d 8948 lockThreadedIO();
054e426d 8949 if (listLength(server.io_newjobs) == 0 &&
8950 listLength(server.io_processing) == 0 &&
8951 server.io_active_threads == 0)
8952 {
4ee9488d 8953 unlockThreadedIO();
8954 return;
8955 }
76b7233a 8956 /* While waiting for empty jobs queue condition we post-process some
8957 * finshed job, as I/O threads may be hanging trying to write against
8958 * the io_ready_pipe_write FD but there are so much pending jobs that
8959 * it's blocking. */
8960 io_processed_len = listLength(server.io_processed);
4ee9488d 8961 unlockThreadedIO();
76b7233a 8962 if (io_processed_len) {
8963 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8964 usleep(1000); /* 1 millisecond */
8965 } else {
8966 usleep(10000); /* 10 milliseconds */
8967 }
4ee9488d 8968 }
8969}
8970
054e426d 8971static void vmReopenSwapFile(void) {
478c2c6f 8972 /* Note: we don't close the old one as we are in the child process
8973 * and don't want to mess at all with the original file object. */
054e426d 8974 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8975 if (server.vm_fp == NULL) {
8976 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8977 server.vm_swap_file);
478c2c6f 8978 _exit(1);
054e426d 8979 }
8980 server.vm_fd = fileno(server.vm_fp);
8981}
8982
b9bc0eef 8983/* This function must be called while with threaded IO locked */
8984static void queueIOJob(iojob *j) {
6c96ba7d 8985 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8986 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 8987 listAddNodeTail(server.io_newjobs,j);
8988 if (server.io_active_threads < server.vm_max_threads)
8989 spawnIOThread();
8990}
8991
8992static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
8993 iojob *j;
8994
8995 assert(key->storage == REDIS_VM_MEMORY);
8996 assert(key->refcount == 1);
8997
8998 j = zmalloc(sizeof(*j));
8999 j->type = REDIS_IOJOB_PREPARE_SWAP;
9000 j->db = db;
78ebe4c8 9001 j->key = key;
b9bc0eef 9002 j->val = val;
9003 incrRefCount(val);
9004 j->canceled = 0;
9005 j->thread = (pthread_t) -1;
f11b8647 9006 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 9007
9008 lockThreadedIO();
9009 queueIOJob(j);
9010 unlockThreadedIO();
9011 return REDIS_OK;
9012}
9013
b0d8747d 9014/* ============ Virtual Memory - Blocking clients on missing keys =========== */
9015
d5d55fc3 9016/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9017 * If there is not already a job loading the key, it is craeted.
9018 * The key is added to the io_keys list in the client structure, and also
9019 * in the hash table mapping swapped keys to waiting clients, that is,
9020 * server.io_waited_keys. */
9021static int waitForSwappedKey(redisClient *c, robj *key) {
9022 struct dictEntry *de;
9023 robj *o;
9024 list *l;
9025
9026 /* If the key does not exist or is already in RAM we don't need to
9027 * block the client at all. */
9028 de = dictFind(c->db->dict,key);
9029 if (de == NULL) return 0;
9030 o = dictGetEntryKey(de);
9031 if (o->storage == REDIS_VM_MEMORY) {
9032 return 0;
9033 } else if (o->storage == REDIS_VM_SWAPPING) {
9034 /* We were swapping the key, undo it! */
9035 vmCancelThreadedIOJob(o);
9036 return 0;
9037 }
9038
9039 /* OK: the key is either swapped, or being loaded just now. */
9040
9041 /* Add the key to the list of keys this client is waiting for.
9042 * This maps clients to keys they are waiting for. */
9043 listAddNodeTail(c->io_keys,key);
9044 incrRefCount(key);
9045
9046 /* Add the client to the swapped keys => clients waiting map. */
9047 de = dictFind(c->db->io_keys,key);
9048 if (de == NULL) {
9049 int retval;
9050
9051 /* For every key we take a list of clients blocked for it */
9052 l = listCreate();
9053 retval = dictAdd(c->db->io_keys,key,l);
9054 incrRefCount(key);
9055 assert(retval == DICT_OK);
9056 } else {
9057 l = dictGetEntryVal(de);
9058 }
9059 listAddNodeTail(l,c);
9060
9061 /* Are we already loading the key from disk? If not create a job */
9062 if (o->storage == REDIS_VM_SWAPPED) {
9063 iojob *j;
9064
9065 o->storage = REDIS_VM_LOADING;
9066 j = zmalloc(sizeof(*j));
9067 j->type = REDIS_IOJOB_LOAD;
9068 j->db = c->db;
78ebe4c8 9069 j->key = o;
d5d55fc3 9070 j->key->vtype = o->vtype;
9071 j->page = o->vm.page;
9072 j->val = NULL;
9073 j->canceled = 0;
9074 j->thread = (pthread_t) -1;
9075 lockThreadedIO();
9076 queueIOJob(j);
9077 unlockThreadedIO();
9078 }
9079 return 1;
9080}
9081
76583ea4
PN
9082/* Preload keys needed for the ZUNION and ZINTER commands. */
9083static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9084 int i, num;
9085 num = atoi(c->argv[2]->ptr);
9086 for (i = 0; i < num; i++) {
9087 waitForSwappedKey(c,c->argv[3+i]);
9088 }
9089}
9090
b0d8747d 9091/* Is this client attempting to run a command against swapped keys?
d5d55fc3 9092 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 9093 *
d5d55fc3 9094 * The important idea about this function is that it can fail! If keys will
9095 * still be swapped when the client is resumed, this key lookups will
9096 * just block loading keys from disk. In practical terms this should only
9097 * happen with SORT BY command or if there is a bug in this function.
9098 *
9099 * Return 1 if the client is marked as blocked, 0 if the client can
9100 * continue as the keys it is going to access appear to be in memory. */
9101static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
7c775e09 9102 int j, last;
9103
76583ea4
PN
9104 if (cmd->vm_preload_proc != NULL) {
9105 cmd->vm_preload_proc(c);
9106 } else {
9107 if (cmd->vm_firstkey == 0) return 0;
9108 last = cmd->vm_lastkey;
9109 if (last < 0) last = c->argc+last;
9110 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9111 waitForSwappedKey(c,c->argv[j]);
9112 }
9113
d5d55fc3 9114 /* If the client was blocked for at least one key, mark it as blocked. */
9115 if (listLength(c->io_keys)) {
9116 c->flags |= REDIS_IO_WAIT;
9117 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9118 server.vm_blocked_clients++;
9119 return 1;
9120 } else {
9121 return 0;
9122 }
9123}
9124
9125/* Remove the 'key' from the list of blocked keys for a given client.
9126 *
9127 * The function returns 1 when there are no longer blocking keys after
9128 * the current one was removed (and the client can be unblocked). */
9129static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9130 list *l;
9131 listNode *ln;
9132 listIter li;
9133 struct dictEntry *de;
9134
9135 /* Remove the key from the list of keys this client is waiting for. */
9136 listRewind(c->io_keys,&li);
9137 while ((ln = listNext(&li)) != NULL) {
9138 if (compareStringObjects(ln->value,key) == 0) {
9139 listDelNode(c->io_keys,ln);
9140 break;
9141 }
9142 }
9143 assert(ln != NULL);
9144
9145 /* Remove the client form the key => waiting clients map. */
9146 de = dictFind(c->db->io_keys,key);
9147 assert(de != NULL);
9148 l = dictGetEntryVal(de);
9149 ln = listSearchKey(l,c);
9150 assert(ln != NULL);
9151 listDelNode(l,ln);
9152 if (listLength(l) == 0)
9153 dictDelete(c->db->io_keys,key);
9154
9155 return listLength(c->io_keys) == 0;
9156}
9157
9158static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9159 struct dictEntry *de;
9160 list *l;
9161 listNode *ln;
9162 int len;
9163
9164 de = dictFind(db->io_keys,key);
9165 if (!de) return;
9166
9167 l = dictGetEntryVal(de);
9168 len = listLength(l);
9169 /* Note: we can't use something like while(listLength(l)) as the list
9170 * can be freed by the calling function when we remove the last element. */
9171 while (len--) {
9172 ln = listFirst(l);
9173 redisClient *c = ln->value;
9174
9175 if (dontWaitForSwappedKey(c,key)) {
9176 /* Put the client in the list of clients ready to go as we
9177 * loaded all the keys about it. */
9178 listAddNodeTail(server.io_ready_clients,c);
9179 }
9180 }
b0d8747d 9181}
b0d8747d 9182
500ece7c 9183/* =========================== Remote Configuration ========================= */
9184
9185static void configSetCommand(redisClient *c) {
9186 robj *o = getDecodedObject(c->argv[3]);
9187 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9188 zfree(server.dbfilename);
9189 server.dbfilename = zstrdup(o->ptr);
9190 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9191 zfree(server.requirepass);
9192 server.requirepass = zstrdup(o->ptr);
9193 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9194 zfree(server.masterauth);
9195 server.masterauth = zstrdup(o->ptr);
9196 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9197 server.maxmemory = strtoll(o->ptr, NULL, 10);
9198 } else {
9199 addReplySds(c,sdscatprintf(sdsempty(),
9200 "-ERR not supported CONFIG parameter %s\r\n",
9201 (char*)c->argv[2]->ptr));
9202 decrRefCount(o);
9203 return;
9204 }
9205 decrRefCount(o);
9206 addReply(c,shared.ok);
9207}
9208
9209static void configGetCommand(redisClient *c) {
9210 robj *o = getDecodedObject(c->argv[2]);
9211 robj *lenobj = createObject(REDIS_STRING,NULL);
9212 char *pattern = o->ptr;
9213 int matches = 0;
9214
9215 addReply(c,lenobj);
9216 decrRefCount(lenobj);
9217
9218 if (stringmatch(pattern,"dbfilename",0)) {
9219 addReplyBulkCString(c,"dbfilename");
9220 addReplyBulkCString(c,server.dbfilename);
9221 matches++;
9222 }
9223 if (stringmatch(pattern,"requirepass",0)) {
9224 addReplyBulkCString(c,"requirepass");
9225 addReplyBulkCString(c,server.requirepass);
9226 matches++;
9227 }
9228 if (stringmatch(pattern,"masterauth",0)) {
9229 addReplyBulkCString(c,"masterauth");
9230 addReplyBulkCString(c,server.masterauth);
9231 matches++;
9232 }
9233 if (stringmatch(pattern,"maxmemory",0)) {
9234 char buf[128];
9235
9236 snprintf(buf,128,"%llu\n",server.maxmemory);
9237 addReplyBulkCString(c,"maxmemory");
9238 addReplyBulkCString(c,buf);
9239 matches++;
9240 }
9241 decrRefCount(o);
9242 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9243}
9244
9245static void configCommand(redisClient *c) {
9246 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9247 if (c->argc != 4) goto badarity;
9248 configSetCommand(c);
9249 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9250 if (c->argc != 3) goto badarity;
9251 configGetCommand(c);
9252 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9253 if (c->argc != 2) goto badarity;
9254 server.stat_numcommands = 0;
9255 server.stat_numconnections = 0;
9256 server.stat_expiredkeys = 0;
9257 server.stat_starttime = time(NULL);
9258 addReply(c,shared.ok);
9259 } else {
9260 addReplySds(c,sdscatprintf(sdsempty(),
9261 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9262 }
9263 return;
9264
9265badarity:
9266 addReplySds(c,sdscatprintf(sdsempty(),
9267 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9268 (char*) c->argv[1]->ptr));
9269}
9270
befec3cd 9271/* =========================== Pubsub implementation ======================== */
9272
ffc6b7f8 9273static void freePubsubPattern(void *p) {
9274 pubsubPattern *pat = p;
9275
9276 decrRefCount(pat->pattern);
9277 zfree(pat);
9278}
9279
9280static int listMatchPubsubPattern(void *a, void *b) {
9281 pubsubPattern *pa = a, *pb = b;
9282
9283 return (pa->client == pb->client) &&
9284 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9285}
9286
9287/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9288 * 0 if the client was already subscribed to that channel. */
9289static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 9290 struct dictEntry *de;
9291 list *clients = NULL;
9292 int retval = 0;
9293
ffc6b7f8 9294 /* Add the channel to the client -> channels hash table */
9295 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 9296 retval = 1;
ffc6b7f8 9297 incrRefCount(channel);
9298 /* Add the client to the channel -> list of clients hash table */
9299 de = dictFind(server.pubsub_channels,channel);
befec3cd 9300 if (de == NULL) {
9301 clients = listCreate();
ffc6b7f8 9302 dictAdd(server.pubsub_channels,channel,clients);
9303 incrRefCount(channel);
befec3cd 9304 } else {
9305 clients = dictGetEntryVal(de);
9306 }
9307 listAddNodeTail(clients,c);
9308 }
9309 /* Notify the client */
9310 addReply(c,shared.mbulk3);
9311 addReply(c,shared.subscribebulk);
ffc6b7f8 9312 addReplyBulk(c,channel);
9313 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 9314 return retval;
9315}
9316
ffc6b7f8 9317/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9318 * 0 if the client was not subscribed to the specified channel. */
9319static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 9320 struct dictEntry *de;
9321 list *clients;
9322 listNode *ln;
9323 int retval = 0;
9324
ffc6b7f8 9325 /* Remove the channel from the client -> channels hash table */
9326 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 9327 we have in the hash tables. Protect it... */
ffc6b7f8 9328 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 9329 retval = 1;
ffc6b7f8 9330 /* Remove the client from the channel -> clients list hash table */
9331 de = dictFind(server.pubsub_channels,channel);
befec3cd 9332 assert(de != NULL);
9333 clients = dictGetEntryVal(de);
9334 ln = listSearchKey(clients,c);
9335 assert(ln != NULL);
9336 listDelNode(clients,ln);
ff767a75 9337 if (listLength(clients) == 0) {
9338 /* Free the list and associated hash entry at all if this was
9339 * the latest client, so that it will be possible to abuse
ffc6b7f8 9340 * Redis PUBSUB creating millions of channels. */
9341 dictDelete(server.pubsub_channels,channel);
ff767a75 9342 }
befec3cd 9343 }
9344 /* Notify the client */
9345 if (notify) {
9346 addReply(c,shared.mbulk3);
9347 addReply(c,shared.unsubscribebulk);
ffc6b7f8 9348 addReplyBulk(c,channel);
9349 addReplyLong(c,dictSize(c->pubsub_channels)+
9350 listLength(c->pubsub_patterns));
9351
9352 }
9353 decrRefCount(channel); /* it is finally safe to release it */
9354 return retval;
9355}
9356
9357/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9358static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9359 int retval = 0;
9360
9361 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9362 retval = 1;
9363 pubsubPattern *pat;
9364 listAddNodeTail(c->pubsub_patterns,pattern);
9365 incrRefCount(pattern);
9366 pat = zmalloc(sizeof(*pat));
9367 pat->pattern = getDecodedObject(pattern);
9368 pat->client = c;
9369 listAddNodeTail(server.pubsub_patterns,pat);
9370 }
9371 /* Notify the client */
9372 addReply(c,shared.mbulk3);
9373 addReply(c,shared.psubscribebulk);
9374 addReplyBulk(c,pattern);
9375 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9376 return retval;
9377}
9378
9379/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9380 * 0 if the client was not subscribed to the specified channel. */
9381static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9382 listNode *ln;
9383 pubsubPattern pat;
9384 int retval = 0;
9385
9386 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9387 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9388 retval = 1;
9389 listDelNode(c->pubsub_patterns,ln);
9390 pat.client = c;
9391 pat.pattern = pattern;
9392 ln = listSearchKey(server.pubsub_patterns,&pat);
9393 listDelNode(server.pubsub_patterns,ln);
9394 }
9395 /* Notify the client */
9396 if (notify) {
9397 addReply(c,shared.mbulk3);
9398 addReply(c,shared.punsubscribebulk);
9399 addReplyBulk(c,pattern);
9400 addReplyLong(c,dictSize(c->pubsub_channels)+
9401 listLength(c->pubsub_patterns));
befec3cd 9402 }
ffc6b7f8 9403 decrRefCount(pattern);
befec3cd 9404 return retval;
9405}
9406
ffc6b7f8 9407/* Unsubscribe from all the channels. Return the number of channels the
9408 * client was subscribed from. */
9409static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9410 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 9411 dictEntry *de;
9412 int count = 0;
9413
9414 while((de = dictNext(di)) != NULL) {
ffc6b7f8 9415 robj *channel = dictGetEntryKey(de);
befec3cd 9416
ffc6b7f8 9417 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 9418 }
9419 dictReleaseIterator(di);
9420 return count;
9421}
9422
ffc6b7f8 9423/* Unsubscribe from all the patterns. Return the number of patterns the
9424 * client was subscribed from. */
9425static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9426 listNode *ln;
9427 listIter li;
9428 int count = 0;
9429
9430 listRewind(c->pubsub_patterns,&li);
9431 while ((ln = listNext(&li)) != NULL) {
9432 robj *pattern = ln->value;
9433
9434 count += pubsubUnsubscribePattern(c,pattern,notify);
9435 }
9436 return count;
9437}
9438
befec3cd 9439/* Publish a message */
ffc6b7f8 9440static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 9441 int receivers = 0;
9442 struct dictEntry *de;
ffc6b7f8 9443 listNode *ln;
9444 listIter li;
befec3cd 9445
ffc6b7f8 9446 /* Send to clients listening for that channel */
9447 de = dictFind(server.pubsub_channels,channel);
befec3cd 9448 if (de) {
9449 list *list = dictGetEntryVal(de);
9450 listNode *ln;
9451 listIter li;
9452
9453 listRewind(list,&li);
9454 while ((ln = listNext(&li)) != NULL) {
9455 redisClient *c = ln->value;
9456
9457 addReply(c,shared.mbulk3);
9458 addReply(c,shared.messagebulk);
ffc6b7f8 9459 addReplyBulk(c,channel);
befec3cd 9460 addReplyBulk(c,message);
9461 receivers++;
9462 }
9463 }
ffc6b7f8 9464 /* Send to clients listening to matching channels */
9465 if (listLength(server.pubsub_patterns)) {
9466 listRewind(server.pubsub_patterns,&li);
9467 channel = getDecodedObject(channel);
9468 while ((ln = listNext(&li)) != NULL) {
9469 pubsubPattern *pat = ln->value;
9470
9471 if (stringmatchlen((char*)pat->pattern->ptr,
9472 sdslen(pat->pattern->ptr),
9473 (char*)channel->ptr,
9474 sdslen(channel->ptr),0)) {
9475 addReply(pat->client,shared.mbulk3);
9476 addReply(pat->client,shared.messagebulk);
9477 addReplyBulk(pat->client,channel);
9478 addReplyBulk(pat->client,message);
9479 receivers++;
9480 }
9481 }
9482 decrRefCount(channel);
9483 }
befec3cd 9484 return receivers;
9485}
9486
9487static void subscribeCommand(redisClient *c) {
9488 int j;
9489
9490 for (j = 1; j < c->argc; j++)
ffc6b7f8 9491 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 9492}
9493
9494static void unsubscribeCommand(redisClient *c) {
9495 if (c->argc == 1) {
ffc6b7f8 9496 pubsubUnsubscribeAllChannels(c,1);
9497 return;
9498 } else {
9499 int j;
9500
9501 for (j = 1; j < c->argc; j++)
9502 pubsubUnsubscribeChannel(c,c->argv[j],1);
9503 }
9504}
9505
9506static void psubscribeCommand(redisClient *c) {
9507 int j;
9508
9509 for (j = 1; j < c->argc; j++)
9510 pubsubSubscribePattern(c,c->argv[j]);
9511}
9512
9513static void punsubscribeCommand(redisClient *c) {
9514 if (c->argc == 1) {
9515 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 9516 return;
9517 } else {
9518 int j;
9519
9520 for (j = 1; j < c->argc; j++)
ffc6b7f8 9521 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 9522 }
9523}
9524
9525static void publishCommand(redisClient *c) {
9526 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9527 addReplyLong(c,receivers);
9528}
9529
7f957c92 9530/* ================================= Debugging ============================== */
9531
9532static void debugCommand(redisClient *c) {
9533 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9534 *((char*)-1) = 'x';
210e29f7 9535 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9536 if (rdbSave(server.dbfilename) != REDIS_OK) {
9537 addReply(c,shared.err);
9538 return;
9539 }
9540 emptyDb();
9541 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9542 addReply(c,shared.err);
9543 return;
9544 }
9545 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9546 addReply(c,shared.ok);
71c2b467 9547 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9548 emptyDb();
9549 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9550 addReply(c,shared.err);
9551 return;
9552 }
9553 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9554 addReply(c,shared.ok);
333298da 9555 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9556 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9557 robj *key, *val;
9558
9559 if (!de) {
9560 addReply(c,shared.nokeyerr);
9561 return;
9562 }
9563 key = dictGetEntryKey(de);
9564 val = dictGetEntryVal(de);
59146ef3 9565 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9566 key->storage == REDIS_VM_SWAPPING)) {
07efaf74 9567 char *strenc;
9568 char buf[128];
9569
9570 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9571 strenc = strencoding[val->encoding];
9572 } else {
9573 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9574 strenc = buf;
9575 }
ace06542 9576 addReplySds(c,sdscatprintf(sdsempty(),
9577 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 9578 "encoding:%s serializedlength:%lld\r\n",
682ac724 9579 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 9580 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 9581 } else {
9582 addReplySds(c,sdscatprintf(sdsempty(),
9583 "+Key at:%p refcount:%d, value swapped at: page %llu "
9584 "using %llu pages\r\n",
9585 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9586 (unsigned long long) key->vm.usedpages));
9587 }
78ebe4c8 9588 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
9589 lookupKeyRead(c->db,c->argv[2]);
9590 addReply(c,shared.ok);
7d30035d 9591 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9592 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9593 robj *key, *val;
9594
9595 if (!server.vm_enabled) {
9596 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9597 return;
9598 }
9599 if (!de) {
9600 addReply(c,shared.nokeyerr);
9601 return;
9602 }
9603 key = dictGetEntryKey(de);
9604 val = dictGetEntryVal(de);
4ef8de8a 9605 /* If the key is shared we want to create a copy */
9606 if (key->refcount > 1) {
9607 robj *newkey = dupStringObject(key);
9608 decrRefCount(key);
9609 key = dictGetEntryKey(de) = newkey;
9610 }
9611 /* Swap it */
7d30035d 9612 if (key->storage != REDIS_VM_MEMORY) {
9613 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 9614 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 9615 dictGetEntryVal(de) = NULL;
9616 addReply(c,shared.ok);
9617 } else {
9618 addReply(c,shared.err);
9619 }
7f957c92 9620 } else {
333298da 9621 addReplySds(c,sdsnew(
bdcb92f2 9622 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 9623 }
9624}
56906eef 9625
6c96ba7d 9626static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 9627 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
6c96ba7d 9628 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
dfc5e96c 9629#ifdef HAVE_BACKTRACE
9630 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9631 *((char*)-1) = 'x';
9632#endif
9633}
9634
bcfc686d 9635/* =================================== Main! ================================ */
56906eef 9636
bcfc686d 9637#ifdef __linux__
/* Return the integer found in the first line of
 * /proc/sys/vm/overcommit_memory, or -1 if the file can't be opened or
 * read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    got = fgets(line,64,fp);
    fclose(fp);

    return (got == NULL) ? -1 : atoi(line);
}
9651
9652void linuxOvercommitMemoryWarning(void) {
9653 if (linuxOvercommitMemoryValue() == 0) {
9654 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9655 }
9656}
9657#endif /* __linux__ */
9658
9659static void daemonize(void) {
9660 int fd;
9661 FILE *fp;
9662
9663 if (fork() != 0) exit(0); /* parent exits */
9664 setsid(); /* create a new session */
9665
9666 /* Every output goes to /dev/null. If Redis is daemonized but
9667 * the 'logfile' is set to 'stdout' in the configuration file
9668 * it will not log at all. */
9669 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9670 dup2(fd, STDIN_FILENO);
9671 dup2(fd, STDOUT_FILENO);
9672 dup2(fd, STDERR_FILENO);
9673 if (fd > STDERR_FILENO) close(fd);
9674 }
9675 /* Try to write the pid file */
9676 fp = fopen(server.pidfile,"w");
9677 if (fp) {
9678 fprintf(fp,"%d\n",getpid());
9679 fclose(fp);
56906eef 9680 }
56906eef 9681}
9682
42ab0172
AO
9683static void version() {
9684 printf("Redis server version %s\n", REDIS_VERSION);
9685 exit(0);
9686}
9687
/* Print the command line usage on standard error and exit with status 1. */
static void usage()
{
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs("       ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
9693
bcfc686d 9694int main(int argc, char **argv) {
9651a787 9695 time_t start;
9696
bcfc686d 9697 initServerConfig();
9698 if (argc == 2) {
44efe66e 9699 if (strcmp(argv[1], "-v") == 0 ||
9700 strcmp(argv[1], "--version") == 0) version();
9701 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 9702 resetServerSaveParams();
9703 loadServerConfig(argv[1]);
723fb69b
AO
9704 } else if ((argc > 2)) {
9705 usage();
bcfc686d 9706 } else {
9707 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9708 }
bcfc686d 9709 if (server.daemonize) daemonize();
71c54b21 9710 initServer();
bcfc686d 9711 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9712#ifdef __linux__
9713 linuxOvercommitMemoryWarning();
9714#endif
9651a787 9715 start = time(NULL);
bcfc686d 9716 if (server.appendonly) {
9717 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 9718 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 9719 } else {
9720 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 9721 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 9722 }
bcfc686d 9723 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 9724 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 9725 aeMain(server.el);
9726 aeDeleteEventLoop(server.el);
9727 return 0;
9728}
9729
9730/* ============================= Backtrace support ========================= */
9731
9732#ifdef HAVE_BACKTRACE
9733static char *findFuncName(void *pointer, unsigned long *offset);
9734
56906eef 9735static void *getMcontextEip(ucontext_t *uc) {
9736#if defined(__FreeBSD__)
9737 return (void*) uc->uc_mcontext.mc_eip;
9738#elif defined(__dietlibc__)
9739 return (void*) uc->uc_mcontext.eip;
06db1f50 9740#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 9741 #if __x86_64__
9742 return (void*) uc->uc_mcontext->__ss.__rip;
9743 #else
56906eef 9744 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 9745 #endif
06db1f50 9746#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 9747 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 9748 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 9749 #else
9750 return (void*) uc->uc_mcontext->__ss.__eip;
9751 #endif
54bac49d 9752#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 9753 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 9754#elif defined(__ia64__) /* Linux IA64 */
9755 return (void*) uc->uc_mcontext.sc_ip;
9756#else
9757 return NULL;
56906eef 9758#endif
9759}
9760
9761static void segvHandler(int sig, siginfo_t *info, void *secret) {
9762 void *trace[100];
9763 char **messages = NULL;
9764 int i, trace_size = 0;
9765 unsigned long offset=0;
56906eef 9766 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 9767 sds infostring;
56906eef 9768 REDIS_NOTUSED(info);
9769
9770 redisLog(REDIS_WARNING,
9771 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 9772 infostring = genRedisInfoString();
9773 redisLog(REDIS_WARNING, "%s",infostring);
9774 /* It's not safe to sdsfree() the returned string under memory
9775 * corruption conditions. Let it leak as we are going to abort */
56906eef 9776
9777 trace_size = backtrace(trace, 100);
de96dbfe 9778 /* overwrite sigaction with caller's address */
b91cf5ef 9779 if (getMcontextEip(uc) != NULL) {
9780 trace[1] = getMcontextEip(uc);
9781 }
56906eef 9782 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 9783
d76412d1 9784 for (i=1; i<trace_size; ++i) {
56906eef 9785 char *fn = findFuncName(trace[i], &offset), *p;
9786
9787 p = strchr(messages[i],'+');
9788 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9789 redisLog(REDIS_WARNING,"%s", messages[i]);
9790 } else {
9791 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
9792 }
9793 }
b177fd30 9794 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 9795 _exit(0);
fe3bbfbe 9796}
56906eef 9797
9798static void setupSigSegvAction(void) {
9799 struct sigaction act;
9800
9801 sigemptyset (&act.sa_mask);
9802 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
9803 * is used. Otherwise, sa_handler is used */
9804 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
9805 act.sa_sigaction = segvHandler;
9806 sigaction (SIGSEGV, &act, NULL);
9807 sigaction (SIGBUS, &act, NULL);
12fea928 9808 sigaction (SIGFPE, &act, NULL);
9809 sigaction (SIGILL, &act, NULL);
9810 sigaction (SIGBUS, &act, NULL);
e65fdc78 9811 return;
56906eef 9812}
e65fdc78 9813
bcfc686d 9814#include "staticsymbols.h"
9815/* This function try to convert a pointer into a function name. It's used in
9816 * oreder to provide a backtrace under segmentation fault that's able to
9817 * display functions declared as static (otherwise the backtrace is useless). */
9818static char *findFuncName(void *pointer, unsigned long *offset){
9819 int i, ret = -1;
9820 unsigned long off, minoff = 0;
ed9b544e 9821
bcfc686d 9822 /* Try to match against the Symbol with the smallest offset */
9823 for (i=0; symsTable[i].pointer; i++) {
9824 unsigned long lp = (unsigned long) pointer;
0bc03378 9825
bcfc686d 9826 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
9827 off=lp-symsTable[i].pointer;
9828 if (ret < 0 || off < minoff) {
9829 minoff=off;
9830 ret=i;
9831 }
9832 }
0bc03378 9833 }
bcfc686d 9834 if (ret == -1) return NULL;
9835 *offset = minoff;
9836 return symsTable[ret].name;
0bc03378 9837}
bcfc686d 9838#else /* HAVE_BACKTRACE */
/* Fallback used when HAVE_BACKTRACE is not defined: no crash handler is
 * installed, so this is intentionally a no-op. */
static void setupSigSegvAction(void) {
}
bcfc686d 9841#endif /* HAVE_BACKTRACE */
0bc03378 9842
ed9b544e 9843
ed9b544e 9844
bcfc686d 9845/* The End */
9846
9847
ed9b544e 9848