]> git.saurik.com Git - redis.git/blame - redis.c
Sharing of small integer objects: may save a lot of memory with datasets having many...
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
cac154c5 30#define REDIS_VERSION "1.3.8"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
c9468bcf 40#define __USE_POSIX199309
54bac49d 41#define __USE_UNIX98
ed9b544e 42#include <signal.h>
fbf9bcdb 43
44#ifdef HAVE_BACKTRACE
c9468bcf 45#include <execinfo.h>
46#include <ucontext.h>
fbf9bcdb 47#endif /* HAVE_BACKTRACE */
48
ed9b544e 49#include <sys/wait.h>
50#include <errno.h>
51#include <assert.h>
52#include <ctype.h>
53#include <stdarg.h>
54#include <inttypes.h>
55#include <arpa/inet.h>
56#include <sys/stat.h>
57#include <fcntl.h>
58#include <sys/time.h>
59#include <sys/resource.h>
2895e862 60#include <sys/uio.h>
f78fd11b 61#include <limits.h>
a7866db6 62#include <math.h>
92f8e882 63#include <pthread.h>
0bc1b2f6 64
65#if defined(__sun)
5043dff3 66#include "solarisfixes.h"
67#endif
ed9b544e 68
c9468bcf 69#include "redis.h"
ed9b544e 70#include "ae.h" /* Event driven programming library */
71#include "sds.h" /* Dynamic safe strings */
72#include "anet.h" /* Networking the easy way */
73#include "dict.h" /* Hash tables */
74#include "adlist.h" /* Linked lists */
75#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 76#include "lzf.h" /* LZF compression library */
77#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
5234952b 78#include "zipmap.h"
ed9b544e 79
80/* Error codes */
81#define REDIS_OK 0
82#define REDIS_ERR -1
83
84/* Static server configuration */
85#define REDIS_SERVERPORT 6379 /* TCP port */
86#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 87#define REDIS_IOBUF_LEN 1024
ed9b544e 88#define REDIS_LOADBUF_LEN 1024
248ea310 89#define REDIS_STATIC_ARGS 8
ed9b544e 90#define REDIS_DEFAULT_DBNUM 16
91#define REDIS_CONFIGLINE_MAX 1024
92#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
1763929f 94#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* try to expire 10 keys/loop */
6f376729 95#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 96#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99#define REDIS_WRITEV_THRESHOLD 3
100/* Max number of iovecs used for each writev call */
101#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 102
103/* Hash table parameters */
104#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 105
106/* Command flags */
3fd78bcd 107#define REDIS_CMD_BULK 1 /* Bulk write command */
108#define REDIS_CMD_INLINE 2 /* Inline command */
109/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113#define REDIS_CMD_DENYOOM 4
4005fef1 114#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 115
116/* Object types */
117#define REDIS_STRING 0
118#define REDIS_LIST 1
119#define REDIS_SET 2
1812e024 120#define REDIS_ZSET 3
121#define REDIS_HASH 4
f78fd11b 122
5234952b 123/* Objects encoding. Some kind of objects like Strings and Hashes can be
124 * internally represented in multiple ways. The 'encoding' field of the object
125 * is set to one of these values for this object. */
942a3961 126#define REDIS_ENCODING_RAW 0 /* Raw representation */
127#define REDIS_ENCODING_INT 1 /* Encoded as integer */
5234952b 128#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
129#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
942a3961 130
07efaf74 131static char* strencoding[] = {
132 "raw", "int", "zipmap", "hashtable"
133};
134
f78fd11b 135/* Object types only used for dumping to disk */
bb32ede5 136#define REDIS_EXPIRETIME 253
ed9b544e 137#define REDIS_SELECTDB 254
138#define REDIS_EOF 255
139
f78fd11b 140/* Defines related to the dump file format. To store 32 bits lengths for short
141 * keys requires a lot of space, so we check the most significant 2 bits of
142 * the first byte to interpret the length:
143 *
144 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
145 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
146 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 147 * 11|000000 this means: specially encoded object will follow. The six bits
148 * number specify the kind of object that follows.
149 * See the REDIS_RDB_ENC_* defines.
f78fd11b 150 *
10c43610 151 * Lengths up to 63 are stored using a single byte, most DB keys, and many
152 * values, will fit inside. */
f78fd11b 153#define REDIS_RDB_6BITLEN 0
154#define REDIS_RDB_14BITLEN 1
155#define REDIS_RDB_32BITLEN 2
17be1a4a 156#define REDIS_RDB_ENCVAL 3
f78fd11b 157#define REDIS_RDB_LENERR UINT_MAX
158
a4d1ba9a 159/* When a length of a string object stored on disk has the first two bits
160 * set, the remaining six bits specify a special encoding for the object
161 * according to the following defines: */
162#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
163#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
164#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 165#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
a4d1ba9a 166
75680a3c 167/* Virtual memory object->where field. */
168#define REDIS_VM_MEMORY 0 /* The object is on memory */
169#define REDIS_VM_SWAPPED 1 /* The object is on disk */
170#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
171#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
172
06224fec 173/* Virtual memory static configuration stuff.
174 * Check vmFindContiguousPages() to know more about these magic numbers. */
175#define REDIS_VM_MAX_NEAR_PAGES 65536
176#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 177#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 178#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
f6c0bba8 179/* The following is the *percentage* of completed I/O jobs to process when the
180 * handler is called. While Virtual Memory I/O operations are performed by
181 * threads, these operations must be processed by the main thread when completed
182 * in order to take effect. */
c953f24b 183#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 184
ed9b544e 185/* Client flags */
d5d55fc3 186#define REDIS_SLAVE 1 /* This client is a slave server */
187#define REDIS_MASTER 2 /* This client is a master server */
188#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
189#define REDIS_MULTI 8 /* This client is in a MULTI context */
190#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
191#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
ed9b544e 192
40d224a9 193/* Slave replication state - slave side */
ed9b544e 194#define REDIS_REPL_NONE 0 /* No active replication */
195#define REDIS_REPL_CONNECT 1 /* Must connect to master */
196#define REDIS_REPL_CONNECTED 2 /* Connected to master */
197
40d224a9 198/* Slave replication state - from the point of view of master
199 * Note that in SEND_BULK and ONLINE state the slave receives new updates
200 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
201 * to start the next background saving in order to send updates to it. */
202#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
203#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
204#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
205#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
206
ed9b544e 207/* List related stuff */
208#define REDIS_HEAD 0
209#define REDIS_TAIL 1
210
211/* Sort operations */
212#define REDIS_SORT_GET 0
443c6409 213#define REDIS_SORT_ASC 1
214#define REDIS_SORT_DESC 2
ed9b544e 215#define REDIS_SORTKEY_MAX 1024
216
217/* Log levels */
218#define REDIS_DEBUG 0
f870935d 219#define REDIS_VERBOSE 1
220#define REDIS_NOTICE 2
221#define REDIS_WARNING 3
ed9b544e 222
223/* Anti-warning macro... */
224#define REDIS_NOTUSED(V) ((void) V)
225
6b47e12e 226#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
227#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 228
48f0308a 229/* Append only defines */
230#define APPENDFSYNC_NO 0
231#define APPENDFSYNC_ALWAYS 1
232#define APPENDFSYNC_EVERYSEC 2
233
cbba7dd7 234/* Hashes related defaults */
235#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
236#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
237
dfc5e96c 238/* We can print the stacktrace, so our assert is defined this way: */
478c2c6f 239#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
6c96ba7d 240static void _redisAssert(char *estr, char *file, int line);
dfc5e96c 241
ed9b544e 242/*================================= Data types ============================== */
243
244/* A redis object, that is a type able to hold a string / list / set */
75680a3c 245
246/* The VM object structure: where an object is stored on disk (first page and
 * page count) and when it was last accessed.
 * NOTE(review): the trailing `vm` declarator below also defines a file-scope
 * variable named `vm` of this type; that looks unintentional -- confirm
 * nothing references it before dropping it. */
247struct redisObjectVM {
3a66edc7 248 off_t page; /* the page at which the object is stored on disk */
249 off_t usedpages; /* number of pages used on disk */
250 time_t atime; /* Last access time */
75680a3c 251} vm;
252
253/* The actual Redis Object: a pointer to the value plus its type, encoding,
 * VM storage state and reference count. */
ed9b544e 254typedef struct redisObject {
ed9b544e 255 void *ptr;
942a3961 256 unsigned char type; /* One of the object types: REDIS_STRING, REDIS_LIST, ... */
257 unsigned char encoding; /* One of the REDIS_ENCODING_* values */
d894161b 258 unsigned char storage; /* If this object is a key, where is the value?
259 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
260 unsigned char vtype; /* If this object is a key, and value is swapped out,
261 * this is the type of the swapped out object. */
ed9b544e 262 int refcount;
75680a3c 263 /* VM fields, these are only allocated if VM is active, otherwise the
264 * object allocation function will just allocate
265 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
266 * Redis without VM active will not have any overhead. */
267 struct redisObjectVM vm;
ed9b544e 268} robj;
269
/* Macro used to initialize a Redis string object allocated on the stack.
 * It sets refcount to 1, type to REDIS_STRING, encoding to REDIS_ENCODING_RAW
 * and ptr to _ptr; when server.vm_enabled is set it also marks the object
 * storage as REDIS_VM_MEMORY. No other field is touched.
 *
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * The expansion deliberately does NOT end with a semicolon: the caller
 * supplies it, so the macro behaves as exactly one statement and is safe as
 * the body of an unbraced if/else. Arguments are parenthesized so that
 * expressions such as *p can be passed as _var. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
281
/* One Redis database: the keyspace dictionary plus the side dictionaries that
 * track expires and the keys clients are blocked on. */
3305306f 282typedef struct redisDb {
4409877e 283 dict *dict; /* The keyspace for this DB */
284 dict *expires; /* Timeout of keys with a timeout set */
285 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 286 dict *io_keys; /* Keys with clients waiting for VM I/O */
3305306f 287 int id; /* presumably the index of this DB -- TODO confirm */
288} redisDb;
289
6e469882 290/* Client MULTI/EXEC state */
/* A single command stored in a MULTI context: its argument vector, the
 * argument count, and the command table entry it refers to. */
291typedef struct multiCmd {
292 robj **argv;
293 int argc;
294 struct redisCommand *cmd;
295} multiCmd;
296
/* Per-client MULTI state: the array of stored commands and how many entries
 * it holds (embedded in redisClient as 'mstate'). */
297typedef struct multiState {
298 multiCmd *commands; /* Array of MULTI commands */
299 int count; /* Total number of MULTI commands */
300} multiState;
301
ed9b544e 302/* With multiplexing we need to keep per-client state.
303 * Clients are kept in a linked list. */
304typedef struct redisClient {
305 int fd;
3305306f 306 redisDb *db;
ed9b544e 307 int dictid;
308 sds querybuf;
e8a74421 309 robj **argv, **mbargv;
310 int argc, mbargc;
40d224a9 311 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 312 int multibulk; /* multi bulk command format active */
ed9b544e 313 list *reply;
314 int sentlen;
315 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 316 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 317 int slaveseldb; /* slave selected db, if this client is a slave */
318 int authenticated; /* when requirepass is non-NULL */
319 int replstate; /* replication state if this is a slave */
320 int repldbfd; /* replication DB file descriptor */
6e469882 321 long repldboff; /* replication DB file offset */
40d224a9 322 off_t repldbsize; /* replication DB file size */
6e469882 323 multiState mstate; /* MULTI/EXEC state */
d5d55fc3 324 robj **blockingkeys; /* The key we are waiting to terminate a blocking
4409877e 325 * operation such as BLPOP. Otherwise NULL. */
b177fd30 326 int blockingkeysnum; /* Number of blocking keys */
4409877e 327 time_t blockingto; /* Blocking operation timeout. If UNIX current time
328 * is >= blockingto then the operation timed out. */
92f8e882 329 list *io_keys; /* Keys this client is waiting to be loaded from the
330 * swap file in order to continue. */
ffc6b7f8 331 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
332 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 333} redisClient;
334
/* Element type of server.saveparams: a number of seconds paired with a number
 * of changes. NOTE(review): presumably "save when at least `changes`
 * modifications happened within `seconds`" -- confirm against the code that
 * reads server.saveparams. */
335struct saveparam {
336 time_t seconds;
337 int changes;
338};
339
340/* Global server state structure */
341struct redisServer {
342 int port;
343 int fd;
3305306f 344 redisDb *db;
ed9b544e 345 long long dirty; /* changes to DB from the last save */
346 list *clients;
87eca727 347 list *slaves, *monitors;
ed9b544e 348 char neterr[ANET_ERR_LEN];
349 aeEventLoop *el;
350 int cronloops; /* number of times the cron function run */
351 list *objfreelist; /* A list of freed objects to avoid malloc() */
352 time_t lastsave; /* Unix time of last successful save */
ed9b544e 353 /* Fields used only for stats */
354 time_t stat_starttime; /* server start time */
355 long long stat_numcommands; /* number of processed commands */
356 long long stat_numconnections; /* number of connections received */
2a6a2ed1 357 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 358 /* Configuration */
359 int verbosity;
360 int glueoutputbuf;
361 int maxidletime;
362 int dbnum;
363 int daemonize;
44b38ef4 364 int appendonly;
48f0308a 365 int appendfsync;
366 time_t lastfsync;
44b38ef4 367 int appendfd;
368 int appendseldb;
ed329fcf 369 char *pidfile;
9f3c422c 370 pid_t bgsavechildpid;
9d65a1bb 371 pid_t bgrewritechildpid;
372 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
ed9b544e 373 struct saveparam *saveparams;
374 int saveparamslen;
375 char *logfile;
376 char *bindaddr;
377 char *dbfilename;
44b38ef4 378 char *appendfilename;
abcb223e 379 char *requirepass;
10c43610 380 int shareobjects;
121f70cf 381 int rdbcompression;
ed9b544e 382 /* Replication related */
383 int isslave;
d0ccebcf 384 char *masterauth;
ed9b544e 385 char *masterhost;
386 int masterport;
40d224a9 387 redisClient *master; /* client that is master for this slave */
ed9b544e 388 int replstate;
285add55 389 unsigned int maxclients;
4ef8de8a 390 unsigned long long maxmemory;
d5d55fc3 391 unsigned int blpop_blocked_clients;
392 unsigned int vm_blocked_clients;
ed9b544e 393 /* Sort parameters - qsort_r() is only available under BSD so we
394 * have to take this state global, in order to pass it to sortCompare() */
395 int sort_desc;
396 int sort_alpha;
397 int sort_bypattern;
75680a3c 398 /* Virtual memory configuration */
399 int vm_enabled;
054e426d 400 char *vm_swap_file;
75680a3c 401 off_t vm_page_size;
402 off_t vm_pages;
4ef8de8a 403 unsigned long long vm_max_memory;
cbba7dd7 404 /* Hashes config */
405 size_t hash_max_zipmap_entries;
406 size_t hash_max_zipmap_value;
75680a3c 407 /* Virtual memory state */
408 FILE *vm_fp;
409 int vm_fd;
410 off_t vm_next_page; /* Next probably empty page */
411 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 412 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 413 time_t unixtime; /* Unix time sampled every second. */
92f8e882 414 /* Virtual memory I/O threads stuff */
92f8e882 415 /* An I/O thread processes an element taken from the io_newjobs queue and
996cb5f7 416 * puts the result of the operation in the io_processed list. While the
417 * job is being processed, it's put on the io_processing queue. */
418 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
419 list *io_processing; /* List of VM I/O jobs being processed */
420 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 421 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 422 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 423 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
424 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 425 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 426 int io_active_threads; /* Number of running I/O threads */
427 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 428 /* Our main thread is blocked on the event loop, looking for sockets ready
429 * to be read or written, so when a threaded I/O operation is ready to be
430 * processed by the main thread, the I/O thread will use a unix pipe to
431 * awake the main thread. The followings are the two pipe FDs. */
432 int io_ready_pipe_read;
433 int io_ready_pipe_write;
7d98e08c 434 /* Virtual memory stats */
435 unsigned long long vm_stats_used_pages;
436 unsigned long long vm_stats_swapped_objects;
437 unsigned long long vm_stats_swapouts;
438 unsigned long long vm_stats_swapins;
befec3cd 439 /* Pubsub */
ffc6b7f8 440 dict *pubsub_channels; /* Map channels to list of subscribed clients */
441 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 442 /* Misc */
b9bc0eef 443 FILE *devnull;
ed9b544e 444};
445
/* Pairs a client with a pattern object. NOTE(review): presumably the element
 * type of the pubsub_patterns lists ("A list of pubsub_patterns") -- confirm. */
ffc6b7f8 446typedef struct pubsubPattern {
447 redisClient *client;
448 robj *pattern;
449} pubsubPattern;
450
/* Signature shared by the command implementations (getCommand, setCommand,
 * ...): each one receives the client the command is executed for. */
ed9b544e 451typedef void redisCommandProc(redisClient *c);
/* One entry of the command table (see cmdTable). */
452struct redisCommand {
453 char *name;
454 redisCommandProc *proc;
455 int arity; /* NOTE(review): cmdTable also uses negative values (e.g. -2);
            * presumably "at least N arguments" -- confirm */
456 int flags; /* REDIS_CMD_* flags, OR-ed together */
76583ea4
PN
457 /* Use a function to determine which keys need to be loaded
458 * in the background prior to executing this command. Takes precedence
459 * over vm_firstkey and others, ignored when NULL */
460 redisCommandProc *vm_preload_proc;
7c775e09 461 /* What keys should be loaded in background when calling this command? */
462 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
463 int vm_lastkey; /* The last argument that's a key */
464 int vm_keystep; /* The step between first and last key */
ed9b544e 465};
466
/* Pairs a function name with a pointer value stored as an unsigned long.
 * NOTE(review): presumably used to map addresses back to function names when
 * printing a stack trace -- confirm against the code that uses it. */
de96dbfe 467struct redisFunctionSym {
468 char *name;
56906eef 469 unsigned long pointer;
de96dbfe 470};
471
/* An object together with the key it is compared by: either a numeric score
 * or another object. NOTE(review): the struct does not record which union
 * member is active; presumably that is decided by the server.sort_* state --
 * confirm in sortCompare(). */
ed9b544e 472typedef struct _redisSortObject {
473 robj *obj;
474 union {
475 double score;
476 robj *cmpobj;
477 } u;
478} redisSortObject;
479
/* An operation type paired with a pattern object.
 * NOTE(review): 'type' is presumably one of the REDIS_SORT_* "Sort operations"
 * constants (REDIS_SORT_GET, ...) -- confirm. */
480typedef struct _redisSortOperation {
481 int type;
482 robj *pattern;
483} redisSortOperation;
484
6b47e12e 485/* ZSETs use a specialized version of Skiplists */
486
/* A skiplist node: a score and an object, a single backward pointer, and the
 * 'forward' / 'span' pointers.
 * NOTE(review): 'forward' and 'span' are presumably per-level arrays sized by
 * the node's level -- confirm in the node creation code. */
487typedef struct zskiplistNode {
488 struct zskiplistNode **forward;
e3870fab 489 struct zskiplistNode *backward;
912b9165 490 unsigned int *span;
6b47e12e 491 double score;
492 robj *obj;
493} zskiplistNode;
494
/* The skiplist itself: header and tail node pointers plus a length and a
 * level. NOTE(review): 'length' is presumably the element count and 'level'
 * the highest level in use (bounded by ZSKIPLIST_MAXLEVEL) -- confirm. */
495typedef struct zskiplist {
e3870fab 496 struct zskiplistNode *header, *tail;
d13f767c 497 unsigned long length;
6b47e12e 498 int level;
499} zskiplist;
500
/* A sorted set (REDIS_ZSET value): it holds both a dict and a skiplist. */
1812e024 501typedef struct zset {
502 dict *dict;
6b47e12e 503 zskiplist *zsl;
1812e024 504} zset;
505
6b47e12e 506/* Our shared "common" objects */
507
/* Size of the shared.integers array below. */
05df7621 508#define REDIS_SHARED_INTEGERS 10000
/* Pointers to the shared objects, collected in the single global 'shared'. */
ed9b544e 509struct sharedObjectsStruct {
c937aa89 510 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 511 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 512 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
513 *outofrangeerr, *plus,
ed9b544e 514 *select0, *select1, *select2, *select3, *select4,
befec3cd 515 *select5, *select6, *select7, *select8, *select9,
ffc6b7f8 516 *messagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
05df7621 517 *psubscribebulk, *punsubscribebulk, *integers[REDIS_SHARED_INTEGERS];
ed9b544e 518} shared;
519
a7866db6 520/* Global vars that are actually used as constants. The following double
521 * values are used for double on-disk serialization, and are initialized
522 * at runtime to avoid strange compiler optimizations. */
523
524static double R_Zero, R_PosInf, R_NegInf, R_Nan;
525
92f8e882 526/* VM threaded I/O request message */
b9bc0eef 527#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
528#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
529#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 530typedef struct iojob {
996cb5f7 531 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 532 redisDb *db;/* Redis database */
92f8e882 533 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 534 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
92f8e882 535 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
536 off_t page; /* Swap page where to read/write the object */
248ea310 537 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 538 int canceled; /* True if this command was canceled by blocking side of VM */
539 pthread_t thread; /* ID of the thread processing this entry */
540} iojob;
92f8e882 541
ed9b544e 542/*================================ Prototypes =============================== */
543
544static void freeStringObject(robj *o);
545static void freeListObject(robj *o);
546static void freeSetObject(robj *o);
547static void decrRefCount(void *o);
548static robj *createObject(int type, void *ptr);
549static void freeClient(redisClient *c);
f78fd11b 550static int rdbLoad(char *filename);
ed9b544e 551static void addReply(redisClient *c, robj *obj);
552static void addReplySds(redisClient *c, sds s);
553static void incrRefCount(robj *o);
f78fd11b 554static int rdbSaveBackground(char *filename);
ed9b544e 555static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 556static robj *dupStringObject(robj *o);
248ea310 557static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
44b38ef4 558static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 559static int syncWithMaster(void);
05df7621 560static robj *tryObjectEncoding(robj *o);
9d65a1bb 561static robj *getDecodedObject(robj *o);
3305306f 562static int removeExpire(redisDb *db, robj *key);
563static int expireIfNeeded(redisDb *db, robj *key);
564static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 565static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 566static int deleteKey(redisDb *db, robj *key);
bb32ede5 567static time_t getExpire(redisDb *db, robj *key);
568static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 569static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 570static void freeMemoryIfNeeded(void);
de96dbfe 571static int processCommand(redisClient *c);
56906eef 572static void setupSigSegvAction(void);
a3b21203 573static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 574static void aofRemoveTempFile(pid_t childpid);
0ea663ea 575static size_t stringObjectLen(robj *o);
638e42ac 576static void processInputBuffer(redisClient *c);
6b47e12e 577static zskiplist *zslCreate(void);
fd8ccf44 578static void zslFree(zskiplist *zsl);
2b59cfdf 579static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 580static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 581static void initClientMultiState(redisClient *c);
582static void freeClientMultiState(redisClient *c);
583static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 584static void unblockClientWaitingData(redisClient *c);
4409877e 585static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 586static void vmInit(void);
a35ddf12 587static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 588static robj *vmLoadObject(robj *key);
7e69548d 589static robj *vmPreviewObject(robj *key);
a69a0c9c 590static int vmSwapOneObjectBlocking(void);
591static int vmSwapOneObjectThreaded(void);
7e69548d 592static int vmCanSwapOut(void);
a5819310 593static int tryFreeOneObjectFromFreelist(void);
996cb5f7 594static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
595static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
596static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 597static void lockThreadedIO(void);
598static void unlockThreadedIO(void);
599static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
600static void freeIOJob(iojob *j);
601static void queueIOJob(iojob *j);
a5819310 602static int vmWriteObjectOnSwap(robj *o, off_t page);
603static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 604static void waitEmptyIOJobsQueue(void);
605static void vmReopenSwapFile(void);
970e10bb 606static int vmFreePage(off_t page);
76583ea4 607static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
d5d55fc3 608static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
609static int dontWaitForSwappedKey(redisClient *c, robj *key);
610static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
611static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
612static struct redisCommand *lookupCommand(char *name);
613static void call(redisClient *c, struct redisCommand *cmd);
614static void resetClient(redisClient *c);
ada386b2 615static void convertToRealHash(robj *o);
ffc6b7f8 616static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
617static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
618static void freePubsubPattern(void *p);
619static int listMatchPubsubPattern(void *a, void *b);
620static int compareStringObjects(robj *a, robj *b);
befec3cd 621static void usage();
ed9b544e 622
abcb223e 623static void authCommand(redisClient *c);
ed9b544e 624static void pingCommand(redisClient *c);
625static void echoCommand(redisClient *c);
626static void setCommand(redisClient *c);
627static void setnxCommand(redisClient *c);
628static void getCommand(redisClient *c);
629static void delCommand(redisClient *c);
630static void existsCommand(redisClient *c);
631static void incrCommand(redisClient *c);
632static void decrCommand(redisClient *c);
633static void incrbyCommand(redisClient *c);
634static void decrbyCommand(redisClient *c);
635static void selectCommand(redisClient *c);
636static void randomkeyCommand(redisClient *c);
637static void keysCommand(redisClient *c);
638static void dbsizeCommand(redisClient *c);
639static void lastsaveCommand(redisClient *c);
640static void saveCommand(redisClient *c);
641static void bgsaveCommand(redisClient *c);
9d65a1bb 642static void bgrewriteaofCommand(redisClient *c);
ed9b544e 643static void shutdownCommand(redisClient *c);
644static void moveCommand(redisClient *c);
645static void renameCommand(redisClient *c);
646static void renamenxCommand(redisClient *c);
647static void lpushCommand(redisClient *c);
648static void rpushCommand(redisClient *c);
649static void lpopCommand(redisClient *c);
650static void rpopCommand(redisClient *c);
651static void llenCommand(redisClient *c);
652static void lindexCommand(redisClient *c);
653static void lrangeCommand(redisClient *c);
654static void ltrimCommand(redisClient *c);
655static void typeCommand(redisClient *c);
656static void lsetCommand(redisClient *c);
657static void saddCommand(redisClient *c);
658static void sremCommand(redisClient *c);
a4460ef4 659static void smoveCommand(redisClient *c);
ed9b544e 660static void sismemberCommand(redisClient *c);
661static void scardCommand(redisClient *c);
12fea928 662static void spopCommand(redisClient *c);
2abb95a9 663static void srandmemberCommand(redisClient *c);
ed9b544e 664static void sinterCommand(redisClient *c);
665static void sinterstoreCommand(redisClient *c);
40d224a9 666static void sunionCommand(redisClient *c);
667static void sunionstoreCommand(redisClient *c);
f4f56e1d 668static void sdiffCommand(redisClient *c);
669static void sdiffstoreCommand(redisClient *c);
ed9b544e 670static void syncCommand(redisClient *c);
671static void flushdbCommand(redisClient *c);
672static void flushallCommand(redisClient *c);
673static void sortCommand(redisClient *c);
674static void lremCommand(redisClient *c);
0f5f7e9a 675static void rpoplpushcommand(redisClient *c);
ed9b544e 676static void infoCommand(redisClient *c);
70003d28 677static void mgetCommand(redisClient *c);
87eca727 678static void monitorCommand(redisClient *c);
3305306f 679static void expireCommand(redisClient *c);
802e8373 680static void expireatCommand(redisClient *c);
f6b141c5 681static void getsetCommand(redisClient *c);
fd88489a 682static void ttlCommand(redisClient *c);
321b0e13 683static void slaveofCommand(redisClient *c);
7f957c92 684static void debugCommand(redisClient *c);
f6b141c5 685static void msetCommand(redisClient *c);
686static void msetnxCommand(redisClient *c);
fd8ccf44 687static void zaddCommand(redisClient *c);
7db723ad 688static void zincrbyCommand(redisClient *c);
cc812361 689static void zrangeCommand(redisClient *c);
50c55df5 690static void zrangebyscoreCommand(redisClient *c);
f44dd428 691static void zcountCommand(redisClient *c);
e3870fab 692static void zrevrangeCommand(redisClient *c);
3c41331e 693static void zcardCommand(redisClient *c);
1b7106e7 694static void zremCommand(redisClient *c);
6e333bbe 695static void zscoreCommand(redisClient *c);
1807985b 696static void zremrangebyscoreCommand(redisClient *c);
6e469882 697static void multiCommand(redisClient *c);
698static void execCommand(redisClient *c);
18b6cb76 699static void discardCommand(redisClient *c);
4409877e 700static void blpopCommand(redisClient *c);
701static void brpopCommand(redisClient *c);
4b00bebd 702static void appendCommand(redisClient *c);
39191553 703static void substrCommand(redisClient *c);
69d95c3e 704static void zrankCommand(redisClient *c);
798d9e55 705static void zrevrankCommand(redisClient *c);
978c2c94 706static void hsetCommand(redisClient *c);
707static void hgetCommand(redisClient *c);
07efaf74 708static void hdelCommand(redisClient *c);
92b27fe9 709static void hlenCommand(redisClient *c);
9212eafd 710static void zremrangebyrankCommand(redisClient *c);
2830ca53
PN
711static void zunionCommand(redisClient *c);
712static void zinterCommand(redisClient *c);
78409a0f 713static void hkeysCommand(redisClient *c);
714static void hvalsCommand(redisClient *c);
715static void hgetallCommand(redisClient *c);
a86f14b1 716static void hexistsCommand(redisClient *c);
500ece7c 717static void configCommand(redisClient *c);
01426b05 718static void hincrbyCommand(redisClient *c);
befec3cd 719static void subscribeCommand(redisClient *c);
720static void unsubscribeCommand(redisClient *c);
ffc6b7f8 721static void psubscribeCommand(redisClient *c);
722static void punsubscribeCommand(redisClient *c);
befec3cd 723static void publishCommand(redisClient *c);
f6b141c5 724
ed9b544e 725/*================================= Globals ================================= */
726
727/* Global vars */
728static struct redisServer server; /* server global state */
729static struct redisCommand cmdTable[] = {
76583ea4
PN
730 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
731 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
732 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
733 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
734 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
735 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
736 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
737 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
738 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
739 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
740 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
741 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
742 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
743 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
744 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
745 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
746 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
748 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
749 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
750 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
751 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
752 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
753 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
754 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
755 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
756 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
757 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
758 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
760 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
761 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
762 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
763 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
764 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
765 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
766 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
767 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
768 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
769 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
770 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
771 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
772 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
773 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
774 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
776 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
780 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
781 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
782 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
01426b05 783 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
784 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
785 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
786 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
788 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
789 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 790 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
791 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
792 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
793 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
794 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
795 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
796 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
797 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
798 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
800 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
801 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
802 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
803 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
804 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
805 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
806 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
807 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
808 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
809 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
810 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
811 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
812 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
814 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
958cd5f3 815 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
816 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
818 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
819 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
820 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
821 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
823 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
824 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
825 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 826 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 827 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 829 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 831 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
76583ea4 832 {NULL,NULL,0,0,NULL,0,0,0}
ed9b544e 833};
bcfc686d 834
ed9b544e 835/*============================ Utility functions ============================ */
836
/* Lower-case helper for the glob matcher: tolower() is only defined for
 * values representable as unsigned char (or EOF), so negative char values
 * are returned unchanged instead of being handed to it. */
static int stringmatchFold(int ch) {
    return (ch < 0) ? ch : tolower(ch);
}

/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of bytes, including the empty one
 *   ?       exactly one byte
 *   [abc]   one byte out of a set; a leading '^' negates the set, "a-z" is
 *           a range, and a backslash makes the next byte a literal member
 *   \x      the byte x taken literally
 *
 * When 'nocase' is non-zero literal bytes, set members and ranges are
 * compared case-insensitively (escaped set members are always compared
 * exactly). An unterminated set ("[ab") is accepted as if it were closed.
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise.
 * Both lengths must be non-negative. No byte at or beyond the given lengths
 * is ever read, so neither buffer has to be NUL-terminated. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while (patternLen > 0) {
        switch (pattern[0]) {
        case '*':
            /* Collapse a run of stars, never looking past the pattern. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* trailing star matches whatever is left */
            /* Try the rest of the pattern at every remaining offset. */
            while (stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match = 0;

            /* A set always consumes one byte: without this check the
             * string terminator would be tested and stringLen would go
             * negative, producing false matches such as "[^a]*" vs "". */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = (patternLen > 0 && pattern[0] == '^');
            if (not) {
                pattern++;
                patternLen--;
            }
            while (1) {
                if (patternLen == 0) {
                    /* Unterminated set: step back so that the common
                     * advance after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];

                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = stringmatchFold(start);
                        end = stringmatchFold(end);
                        c = stringmatchFold(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (stringmatchFold(pattern[0]) ==
                            stringmatchFold(string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            /* A trailing backslash stands for itself. */
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (stringmatchFold(pattern[0]) != stringmatchFold(string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String exhausted: only trailing stars may remain. */
            while (patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    return (patternLen == 0 && stringLen == 0);
}
959
/* Glob-match two NUL-terminated C strings: measures both with strlen() and
 * delegates to stringmatchlen(). Returns whatever stringmatchlen() returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int patternLen = strlen(pattern);
    int stringLen = strlen(string);

    return stringmatchlen(pattern,patternLen,string,stringLen,nocase);
}
963
56906eef 964static void redisLog(int level, const char *fmt, ...) {
ed9b544e 965 va_list ap;
966 FILE *fp;
967
968 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
969 if (!fp) return;
970
971 va_start(ap, fmt);
972 if (level >= server.verbosity) {
6766f45e 973 char *c = ".-*#";
1904ecc1 974 char buf[64];
975 time_t now;
976
977 now = time(NULL);
6c9385e0 978 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 979 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 980 vfprintf(fp, fmt, ap);
981 fprintf(fp,"\n");
982 fflush(fp);
983 }
984 va_end(ap);
985
986 if (server.logfile) fclose(fp);
987}
988
989/*====================== Hash table type implementation ==================== */
990
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
994
/* Dict destructor callback that releases 'val' with zfree().
 * 'privdata' is ignored. */
static void dictVanillaFree(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    zfree(val);
}
1000
4409877e 1001static void dictListDestructor(void *privdata, void *val)
1002{
1003 DICT_NOTUSED(privdata);
1004 listRelease((list*)val);
1005}
1006
ed9b544e 1007static int sdsDictKeyCompare(void *privdata, const void *key1,
1008 const void *key2)
1009{
1010 int l1,l2;
1011 DICT_NOTUSED(privdata);
1012
1013 l1 = sdslen((sds)key1);
1014 l2 = sdslen((sds)key2);
1015 if (l1 != l2) return 0;
1016 return memcmp(key1, key2, l1) == 0;
1017}
1018
/* Dict destructor callback for redis objects: drops one reference with
 * decrRefCount(). A NULL 'val' is skipped, since the values of swapped out
 * keys are set to NULL. 'privdata' is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1026
942a3961 1027static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1028 const void *key2)
1029{
1030 const robj *o1 = key1, *o2 = key2;
1031 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1032}
1033
942a3961 1034static unsigned int dictObjHash(const void *key) {
ed9b544e 1035 const robj *o = key;
1036 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1037}
1038
942a3961 1039static int dictEncObjKeyCompare(void *privdata, const void *key1,
1040 const void *key2)
1041{
9d65a1bb 1042 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1043 int cmp;
942a3961 1044
2a1198b4 1045 if (o1->encoding == REDIS_ENCODING_INT &&
1046 o2->encoding == REDIS_ENCODING_INT &&
db5946fc 1047 o1->ptr == o2->ptr) return 1;
2a1198b4 1048
9d65a1bb 1049 o1 = getDecodedObject(o1);
1050 o2 = getDecodedObject(o2);
1051 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1052 decrRefCount(o1);
1053 decrRefCount(o2);
1054 return cmp;
942a3961 1055}
1056
1057static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1058 robj *o = (robj*) key;
942a3961 1059
ed9e4966 1060 if (o->encoding == REDIS_ENCODING_RAW) {
1061 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1062 } else {
1063 if (o->encoding == REDIS_ENCODING_INT) {
1064 char buf[32];
1065 int len;
1066
1067 len = snprintf(buf,32,"%ld",(long)o->ptr);
1068 return dictGenHashFunction((unsigned char*)buf, len);
1069 } else {
1070 unsigned int hash;
1071
1072 o = getDecodedObject(o);
1073 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1074 decrRefCount(o);
1075 return hash;
1076 }
1077 }
942a3961 1078}
1079
f2d9f50f 1080/* Sets type and expires */
ed9b544e 1081static dictType setDictType = {
942a3961 1082 dictEncObjHash, /* hash function */
ed9b544e 1083 NULL, /* key dup */
1084 NULL, /* val dup */
942a3961 1085 dictEncObjKeyCompare, /* key compare */
ed9b544e 1086 dictRedisObjectDestructor, /* key destructor */
1087 NULL /* val destructor */
1088};
1089
f2d9f50f 1090/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1812e024 1091static dictType zsetDictType = {
1092 dictEncObjHash, /* hash function */
1093 NULL, /* key dup */
1094 NULL, /* val dup */
1095 dictEncObjKeyCompare, /* key compare */
1096 dictRedisObjectDestructor, /* key destructor */
da0a1620 1097 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1098};
1099
f2d9f50f 1100/* Db->dict */
5234952b 1101static dictType dbDictType = {
942a3961 1102 dictObjHash, /* hash function */
ed9b544e 1103 NULL, /* key dup */
1104 NULL, /* val dup */
942a3961 1105 dictObjKeyCompare, /* key compare */
ed9b544e 1106 dictRedisObjectDestructor, /* key destructor */
1107 dictRedisObjectDestructor /* val destructor */
1108};
1109
f2d9f50f 1110/* Db->expires */
1111static dictType keyptrDictType = {
1112 dictObjHash, /* hash function */
1113 NULL, /* key dup */
1114 NULL, /* val dup */
1115 dictObjKeyCompare, /* key compare */
1116 dictRedisObjectDestructor, /* key destructor */
1117 NULL /* val destructor */
1118};
1119
5234952b 1120/* Hash type hash table (note that small hashes are represented with zimpaps) */
1121static dictType hashDictType = {
1122 dictEncObjHash, /* hash function */
1123 NULL, /* key dup */
1124 NULL, /* val dup */
1125 dictEncObjKeyCompare, /* key compare */
1126 dictRedisObjectDestructor, /* key destructor */
1127 dictRedisObjectDestructor /* val destructor */
1128};
1129
4409877e 1130/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1131 * lists as values. It's used for blocking operations (BLPOP) and to
1132 * map swapped keys to a list of clients waiting for this keys to be loaded. */
4409877e 1133static dictType keylistDictType = {
1134 dictObjHash, /* hash function */
1135 NULL, /* key dup */
1136 NULL, /* val dup */
1137 dictObjKeyCompare, /* key compare */
1138 dictRedisObjectDestructor, /* key destructor */
1139 dictListDestructor /* val destructor */
1140};
1141
42ab0172
AO
1142static void version();
1143
ed9b544e 1144/* ========================= Random utility functions ======================= */
1145
1146/* Redis generally does not try to recover from out of memory conditions
1147 * when allocating objects or strings, it is not clear if it will be possible
1148 * to report this condition to the client since the networking layer itself
1149 * is based on heap allocation for send buffers, so we simply abort.
1150 * At least the code will be simpler to read... */
1151static void oom(const char *msg) {
71c54b21 1152 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1153 sleep(1);
1154 abort();
1155}
1156
1157/* ====================== Redis server networking stuff ===================== */
56906eef 1158static void closeTimedoutClients(void) {
ed9b544e 1159 redisClient *c;
ed9b544e 1160 listNode *ln;
1161 time_t now = time(NULL);
c7df85a4 1162 listIter li;
ed9b544e 1163
c7df85a4 1164 listRewind(server.clients,&li);
1165 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1166 c = listNodeValue(ln);
f86a74e9 1167 if (server.maxidletime &&
1168 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1169 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1170 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1171 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1172 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1173 {
f870935d 1174 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1175 freeClient(c);
f86a74e9 1176 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1177 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1178 addReply(c,shared.nullmultibulk);
b0d8747d 1179 unblockClientWaitingData(c);
f86a74e9 1180 }
ed9b544e 1181 }
1182 }
ed9b544e 1183}
1184
12fea928 1185static int htNeedsResize(dict *dict) {
1186 long long size, used;
1187
1188 size = dictSlots(dict);
1189 used = dictSize(dict);
1190 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1191 (used*100/size < REDIS_HT_MINFILL));
1192}
1193
0bc03378 1194/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1195 * we resize the hash table to save memory */
56906eef 1196static void tryResizeHashTables(void) {
0bc03378 1197 int j;
1198
1199 for (j = 0; j < server.dbnum; j++) {
12fea928 1200 if (htNeedsResize(server.db[j].dict)) {
f870935d 1201 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
0bc03378 1202 dictResize(server.db[j].dict);
f870935d 1203 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
0bc03378 1204 }
12fea928 1205 if (htNeedsResize(server.db[j].expires))
1206 dictResize(server.db[j].expires);
0bc03378 1207 }
1208}
1209
9d65a1bb 1210/* A background saving child (BGSAVE) terminated its work. Handle this. */
1211void backgroundSaveDoneHandler(int statloc) {
1212 int exitcode = WEXITSTATUS(statloc);
1213 int bysignal = WIFSIGNALED(statloc);
1214
1215 if (!bysignal && exitcode == 0) {
1216 redisLog(REDIS_NOTICE,
1217 "Background saving terminated with success");
1218 server.dirty = 0;
1219 server.lastsave = time(NULL);
1220 } else if (!bysignal && exitcode != 0) {
1221 redisLog(REDIS_WARNING, "Background saving error");
1222 } else {
1223 redisLog(REDIS_WARNING,
454eea7c 1224 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1225 rdbRemoveTempFile(server.bgsavechildpid);
1226 }
1227 server.bgsavechildpid = -1;
1228 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1229 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1230 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1231}
1232
1233/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1234 * Handle this. */
1235void backgroundRewriteDoneHandler(int statloc) {
1236 int exitcode = WEXITSTATUS(statloc);
1237 int bysignal = WIFSIGNALED(statloc);
1238
1239 if (!bysignal && exitcode == 0) {
1240 int fd;
1241 char tmpfile[256];
1242
1243 redisLog(REDIS_NOTICE,
1244 "Background append only file rewriting terminated with success");
1245 /* Now it's time to flush the differences accumulated by the parent */
1246 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1247 fd = open(tmpfile,O_WRONLY|O_APPEND);
1248 if (fd == -1) {
1249 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1250 goto cleanup;
1251 }
1252 /* Flush our data... */
1253 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1254 (signed) sdslen(server.bgrewritebuf)) {
1255 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1256 close(fd);
1257 goto cleanup;
1258 }
b32627cd 1259 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1260 /* Now our work is to rename the temp file into the stable file. And
1261 * switch the file descriptor used by the server for append only. */
1262 if (rename(tmpfile,server.appendfilename) == -1) {
1263 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1264 close(fd);
1265 goto cleanup;
1266 }
1267 /* Mission completed... almost */
1268 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1269 if (server.appendfd != -1) {
1270 /* If append only is actually enabled... */
1271 close(server.appendfd);
1272 server.appendfd = fd;
1273 fsync(fd);
85a83172 1274 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1275 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1276 } else {
1277 /* If append only is disabled we just generate a dump in this
1278 * format. Why not? */
1279 close(fd);
1280 }
1281 } else if (!bysignal && exitcode != 0) {
1282 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1283 } else {
1284 redisLog(REDIS_WARNING,
454eea7c 1285 "Background append only file rewriting terminated by signal %d",
1286 WTERMSIG(statloc));
9d65a1bb 1287 }
1288cleanup:
1289 sdsfree(server.bgrewritebuf);
1290 server.bgrewritebuf = sdsempty();
1291 aofRemoveTempFile(server.bgrewritechildpid);
1292 server.bgrewritechildpid = -1;
1293}
1294
884d4b39 1295/* This function is called once a background process of some kind terminates,
1296 * as we want to avoid resizing the hash tables when there is a child in order
1297 * to play well with copy-on-write (otherwise when a resize happens lots of
1298 * memory pages are copied). The goal of this function is to update the ability
1299 * for dict.c to resize the hash tables accordingly to the fact we have o not
1300 * running childs. */
1301static void updateDictResizePolicy(void) {
1302 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1303 dictEnableResize();
1304 else
1305 dictDisableResize();
1306}
1307
56906eef 1308static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1309 int j, loops = server.cronloops++;
ed9b544e 1310 REDIS_NOTUSED(eventLoop);
1311 REDIS_NOTUSED(id);
1312 REDIS_NOTUSED(clientData);
1313
3a66edc7 1314 /* We take a cached value of the unix time in the global state because
1315 * with virtual memory and aging there is to store the current time
1316 * in objects at every object access, and accuracy is not needed.
1317 * To access a global var is faster than calling time(NULL) */
1318 server.unixtime = time(NULL);
1319
0bc03378 1320 /* Show some info about non-empty databases */
ed9b544e 1321 for (j = 0; j < server.dbnum; j++) {
dec423d9 1322 long long size, used, vkeys;
94754ccc 1323
3305306f 1324 size = dictSlots(server.db[j].dict);
1325 used = dictSize(server.db[j].dict);
94754ccc 1326 vkeys = dictSize(server.db[j].expires);
1763929f 1327 if (!(loops % 50) && (used || vkeys)) {
f870935d 1328 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1329 /* dictPrintStats(server.dict); */
ed9b544e 1330 }
ed9b544e 1331 }
1332
0bc03378 1333 /* We don't want to resize the hash tables while a bacground saving
1334 * is in progress: the saving child is created using fork() that is
1335 * implemented with a copy-on-write semantic in most modern systems, so
1336 * if we resize the HT while there is the saving child at work actually
1337 * a lot of memory movements in the parent will cause a lot of pages
1338 * copied. */
884d4b39 1339 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1 &&
1340 !(loops % 10))
1341 {
1342 tryResizeHashTables();
1343 }
0bc03378 1344
ed9b544e 1345 /* Show information about connected clients */
1763929f 1346 if (!(loops % 50)) {
bdcb92f2 1347 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1348 listLength(server.clients)-listLength(server.slaves),
1349 listLength(server.slaves),
bdcb92f2 1350 zmalloc_used_memory());
ed9b544e 1351 }
1352
1353 /* Close connections of timedout clients */
1763929f 1354 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1355 closeTimedoutClients();
1356
9d65a1bb 1357 /* Check if a background saving or AOF rewrite in progress terminated */
1358 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1359 int statloc;
9d65a1bb 1360 pid_t pid;
1361
1362 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1363 if (pid == server.bgsavechildpid) {
1364 backgroundSaveDoneHandler(statloc);
ed9b544e 1365 } else {
9d65a1bb 1366 backgroundRewriteDoneHandler(statloc);
ed9b544e 1367 }
884d4b39 1368 updateDictResizePolicy();
ed9b544e 1369 }
1370 } else {
1371 /* If there is not a background saving in progress check if
1372 * we have to save now */
1373 time_t now = time(NULL);
1374 for (j = 0; j < server.saveparamslen; j++) {
1375 struct saveparam *sp = server.saveparams+j;
1376
1377 if (server.dirty >= sp->changes &&
1378 now-server.lastsave > sp->seconds) {
1379 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1380 sp->changes, sp->seconds);
f78fd11b 1381 rdbSaveBackground(server.dbfilename);
ed9b544e 1382 break;
1383 }
1384 }
1385 }
94754ccc 1386
f2324293 1387 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1388 * will use few CPU cycles if there are few expiring keys, otherwise
1389 * it will get more aggressive to avoid that too much memory is used by
1390 * keys that can be removed from the keyspace. */
94754ccc 1391 for (j = 0; j < server.dbnum; j++) {
f2324293 1392 int expired;
94754ccc 1393 redisDb *db = server.db+j;
94754ccc 1394
f2324293 1395 /* Continue to expire if at the end of the cycle more than 25%
1396 * of the keys were expired. */
1397 do {
4ef8de8a 1398 long num = dictSize(db->expires);
94754ccc 1399 time_t now = time(NULL);
1400
f2324293 1401 expired = 0;
94754ccc 1402 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1403 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1404 while (num--) {
1405 dictEntry *de;
1406 time_t t;
1407
1408 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1409 t = (time_t) dictGetEntryVal(de);
1410 if (now > t) {
1411 deleteKey(db,dictGetEntryKey(de));
f2324293 1412 expired++;
2a6a2ed1 1413 server.stat_expiredkeys++;
94754ccc 1414 }
1415 }
f2324293 1416 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1417 }
1418
4ef8de8a 1419 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1420 * is enbled. Try to free objects from the free list first. */
7e69548d 1421 if (vmCanSwapOut()) {
1422 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1423 server.vm_max_memory)
1424 {
72e9fd40 1425 int retval;
1426
a5819310 1427 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1428 retval = (server.vm_max_threads == 0) ?
1429 vmSwapOneObjectBlocking() :
1430 vmSwapOneObjectThreaded();
1763929f 1431 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1432 zmalloc_used_memory() >
1433 (server.vm_max_memory+server.vm_max_memory/10))
1434 {
1435 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1436 }
72e9fd40 1437 /* Note that when using threade I/O we free just one object,
1438 * because anyway when the I/O thread in charge to swap this
1439 * object out will finish, the handler of completed jobs
1440 * will try to swap more objects if we are still out of memory. */
1441 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1442 }
1443 }
1444
ed9b544e 1445 /* Check if we should connect to a MASTER */
1763929f 1446 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1447 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1448 if (syncWithMaster() == REDIS_OK) {
1449 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1450 }
1451 }
1763929f 1452 return 100;
ed9b544e 1453}
1454
d5d55fc3 1455/* This function gets called every time Redis is entering the
1456 * main loop of the event driven library, that is, before to sleep
1457 * for ready file descriptors. */
1458static void beforeSleep(struct aeEventLoop *eventLoop) {
1459 REDIS_NOTUSED(eventLoop);
1460
1461 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1462 listIter li;
1463 listNode *ln;
1464
1465 listRewind(server.io_ready_clients,&li);
1466 while((ln = listNext(&li))) {
1467 redisClient *c = ln->value;
1468 struct redisCommand *cmd;
1469
1470 /* Resume the client. */
1471 listDelNode(server.io_ready_clients,ln);
1472 c->flags &= (~REDIS_IO_WAIT);
1473 server.vm_blocked_clients--;
1474 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1475 readQueryFromClient, c);
1476 cmd = lookupCommand(c->argv[0]->ptr);
1477 assert(cmd != NULL);
1478 call(c,cmd);
1479 resetClient(c);
1480 /* There may be more data to process in the input buffer. */
1481 if (c->querybuf && sdslen(c->querybuf) > 0)
1482 processInputBuffer(c);
1483 }
1484 }
1485}
1486
ed9b544e 1487static void createSharedObjects(void) {
05df7621 1488 int j;
1489
ed9b544e 1490 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1491 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1492 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1493 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1494 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1495 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1496 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1497 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1498 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1499 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1500 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1501 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1502 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1503 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1504 "-ERR no such key\r\n"));
ed9b544e 1505 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1506 "-ERR syntax error\r\n"));
c937aa89 1507 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1508 "-ERR source and destination objects are the same\r\n"));
1509 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1510 "-ERR index out of range\r\n"));
ed9b544e 1511 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1512 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1513 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1514 shared.select0 = createStringObject("select 0\r\n",10);
1515 shared.select1 = createStringObject("select 1\r\n",10);
1516 shared.select2 = createStringObject("select 2\r\n",10);
1517 shared.select3 = createStringObject("select 3\r\n",10);
1518 shared.select4 = createStringObject("select 4\r\n",10);
1519 shared.select5 = createStringObject("select 5\r\n",10);
1520 shared.select6 = createStringObject("select 6\r\n",10);
1521 shared.select7 = createStringObject("select 7\r\n",10);
1522 shared.select8 = createStringObject("select 8\r\n",10);
1523 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1524 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1525 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1526 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1527 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1528 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1529 shared.mbulk3 = createStringObject("*3\r\n",4);
05df7621 1530 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1531 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1532 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1533 }
ed9b544e 1534}
1535
1536static void appendServerSaveParams(time_t seconds, int changes) {
1537 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1538 server.saveparams[server.saveparamslen].seconds = seconds;
1539 server.saveparams[server.saveparamslen].changes = changes;
1540 server.saveparamslen++;
1541}
1542
bcfc686d 1543static void resetServerSaveParams() {
ed9b544e 1544 zfree(server.saveparams);
1545 server.saveparams = NULL;
1546 server.saveparamslen = 0;
1547}
1548
1549static void initServerConfig() {
1550 server.dbnum = REDIS_DEFAULT_DBNUM;
1551 server.port = REDIS_SERVERPORT;
f870935d 1552 server.verbosity = REDIS_VERBOSE;
ed9b544e 1553 server.maxidletime = REDIS_MAXIDLETIME;
1554 server.saveparams = NULL;
1555 server.logfile = NULL; /* NULL = log on standard output */
1556 server.bindaddr = NULL;
1557 server.glueoutputbuf = 1;
1558 server.daemonize = 0;
44b38ef4 1559 server.appendonly = 0;
4e141d5a 1560 server.appendfsync = APPENDFSYNC_ALWAYS;
48f0308a 1561 server.lastfsync = time(NULL);
44b38ef4 1562 server.appendfd = -1;
1563 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1564 server.pidfile = zstrdup("/var/run/redis.pid");
1565 server.dbfilename = zstrdup("dump.rdb");
1566 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1567 server.requirepass = NULL;
10c43610 1568 server.shareobjects = 0;
b0553789 1569 server.rdbcompression = 1;
285add55 1570 server.maxclients = 0;
d5d55fc3 1571 server.blpop_blocked_clients = 0;
3fd78bcd 1572 server.maxmemory = 0;
75680a3c 1573 server.vm_enabled = 0;
054e426d 1574 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1575 server.vm_page_size = 256; /* 256 bytes per page */
1576 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1577 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1578 server.vm_max_threads = 4;
d5d55fc3 1579 server.vm_blocked_clients = 0;
cbba7dd7 1580 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1581 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
75680a3c 1582
bcfc686d 1583 resetServerSaveParams();
ed9b544e 1584
1585 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1586 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1587 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1588 /* Replication related */
1589 server.isslave = 0;
d0ccebcf 1590 server.masterauth = NULL;
ed9b544e 1591 server.masterhost = NULL;
1592 server.masterport = 6379;
1593 server.master = NULL;
1594 server.replstate = REDIS_REPL_NONE;
a7866db6 1595
1596 /* Double constants initialization */
1597 R_Zero = 0.0;
1598 R_PosInf = 1.0/R_Zero;
1599 R_NegInf = -1.0/R_Zero;
1600 R_Nan = R_Zero/R_Zero;
ed9b544e 1601}
1602
1603static void initServer() {
1604 int j;
1605
1606 signal(SIGHUP, SIG_IGN);
1607 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1608 setupSigSegvAction();
ed9b544e 1609
b9bc0eef 1610 server.devnull = fopen("/dev/null","w");
1611 if (server.devnull == NULL) {
1612 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1613 exit(1);
1614 }
ed9b544e 1615 server.clients = listCreate();
1616 server.slaves = listCreate();
87eca727 1617 server.monitors = listCreate();
ed9b544e 1618 server.objfreelist = listCreate();
1619 createSharedObjects();
1620 server.el = aeCreateEventLoop();
3305306f 1621 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1622 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1623 if (server.fd == -1) {
1624 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1625 exit(1);
1626 }
3305306f 1627 for (j = 0; j < server.dbnum; j++) {
5234952b 1628 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1629 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
4409877e 1630 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1631 if (server.vm_enabled)
1632 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1633 server.db[j].id = j;
1634 }
ffc6b7f8 1635 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1636 server.pubsub_patterns = listCreate();
1637 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1638 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1639 server.cronloops = 0;
9f3c422c 1640 server.bgsavechildpid = -1;
9d65a1bb 1641 server.bgrewritechildpid = -1;
1642 server.bgrewritebuf = sdsempty();
ed9b544e 1643 server.lastsave = time(NULL);
1644 server.dirty = 0;
ed9b544e 1645 server.stat_numcommands = 0;
1646 server.stat_numconnections = 0;
2a6a2ed1 1647 server.stat_expiredkeys = 0;
ed9b544e 1648 server.stat_starttime = time(NULL);
3a66edc7 1649 server.unixtime = time(NULL);
d8f8b666 1650 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1651 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1652 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1653
1654 if (server.appendonly) {
71eba477 1655 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1656 if (server.appendfd == -1) {
1657 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1658 strerror(errno));
1659 exit(1);
1660 }
1661 }
75680a3c 1662
1663 if (server.vm_enabled) vmInit();
ed9b544e 1664}
1665
1666/* Empty the whole database */
ca37e9cd 1667static long long emptyDb() {
ed9b544e 1668 int j;
ca37e9cd 1669 long long removed = 0;
ed9b544e 1670
3305306f 1671 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1672 removed += dictSize(server.db[j].dict);
3305306f 1673 dictEmpty(server.db[j].dict);
1674 dictEmpty(server.db[j].expires);
1675 }
ca37e9cd 1676 return removed;
ed9b544e 1677}
1678
/* Translate a "yes"/"no" string (compared case insensitively) into 1/0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    return (strcasecmp(s,"no") == 0) ? 0 : -1;
}
1684
ed9b544e 1685/* I agree, this is a very rudimental way to load a configuration...
1686 will improve later if the config gets more complex */
1687static void loadServerConfig(char *filename) {
c9a111ac 1688 FILE *fp;
ed9b544e 1689 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1690 int linenum = 0;
1691 sds line = NULL;
6bccf64a
AO
1692 char *errormsg = "Fatal error, can't open config file '%s'";
1693 char *errorbuf = zmalloc(sizeof(char)*(strlen(errormsg)+strlen(filename)));
1694 sprintf(errorbuf, errormsg, filename);
c9a111ac 1695
1696 if (filename[0] == '-' && filename[1] == '\0')
1697 fp = stdin;
1698 else {
1699 if ((fp = fopen(filename,"r")) == NULL) {
6bccf64a 1700 redisLog(REDIS_WARNING, errorbuf);
c9a111ac 1701 exit(1);
1702 }
ed9b544e 1703 }
c9a111ac 1704
ed9b544e 1705 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1706 sds *argv;
1707 int argc, j;
1708
1709 linenum++;
1710 line = sdsnew(buf);
1711 line = sdstrim(line," \t\r\n");
1712
1713 /* Skip comments and blank lines*/
1714 if (line[0] == '#' || line[0] == '\0') {
1715 sdsfree(line);
1716 continue;
1717 }
1718
1719 /* Split into arguments */
1720 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1721 sdstolower(argv[0]);
1722
1723 /* Execute config directives */
bb0b03a3 1724 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1725 server.maxidletime = atoi(argv[1]);
0150db36 1726 if (server.maxidletime < 0) {
ed9b544e 1727 err = "Invalid timeout value"; goto loaderr;
1728 }
bb0b03a3 1729 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1730 server.port = atoi(argv[1]);
1731 if (server.port < 1 || server.port > 65535) {
1732 err = "Invalid port"; goto loaderr;
1733 }
bb0b03a3 1734 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1735 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1736 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1737 int seconds = atoi(argv[1]);
1738 int changes = atoi(argv[2]);
1739 if (seconds < 1 || changes < 0) {
1740 err = "Invalid save parameters"; goto loaderr;
1741 }
1742 appendServerSaveParams(seconds,changes);
bb0b03a3 1743 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1744 if (chdir(argv[1]) == -1) {
1745 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1746 argv[1], strerror(errno));
1747 exit(1);
1748 }
bb0b03a3 1749 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1750 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1751 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1752 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1753 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1754 else {
1755 err = "Invalid log level. Must be one of debug, notice, warning";
1756 goto loaderr;
1757 }
bb0b03a3 1758 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1759 FILE *logfp;
ed9b544e 1760
1761 server.logfile = zstrdup(argv[1]);
bb0b03a3 1762 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1763 zfree(server.logfile);
1764 server.logfile = NULL;
1765 }
1766 if (server.logfile) {
1767 /* Test if we are able to open the file. The server will not
1768 * be able to abort just for this problem later... */
c9a111ac 1769 logfp = fopen(server.logfile,"a");
1770 if (logfp == NULL) {
ed9b544e 1771 err = sdscatprintf(sdsempty(),
1772 "Can't open the log file: %s", strerror(errno));
1773 goto loaderr;
1774 }
c9a111ac 1775 fclose(logfp);
ed9b544e 1776 }
bb0b03a3 1777 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1778 server.dbnum = atoi(argv[1]);
1779 if (server.dbnum < 1) {
1780 err = "Invalid number of databases"; goto loaderr;
1781 }
b3f83f12
JZ
1782 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1783 loadServerConfig(argv[1]);
285add55 1784 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1785 server.maxclients = atoi(argv[1]);
3fd78bcd 1786 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
d4465900 1787 server.maxmemory = strtoll(argv[1], NULL, 10);
bb0b03a3 1788 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1789 server.masterhost = sdsnew(argv[1]);
1790 server.masterport = atoi(argv[2]);
1791 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1792 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1793 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1794 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1795 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1796 err = "argument must be 'yes' or 'no'"; goto loaderr;
1797 }
bb0b03a3 1798 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
85dd2f3a 1799 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
10c43610 1800 err = "argument must be 'yes' or 'no'"; goto loaderr;
1801 }
121f70cf 1802 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1803 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1804 err = "argument must be 'yes' or 'no'"; goto loaderr;
1805 }
bb0b03a3 1806 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1807 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1808 err = "argument must be 'yes' or 'no'"; goto loaderr;
1809 }
44b38ef4 1810 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1811 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1812 err = "argument must be 'yes' or 'no'"; goto loaderr;
1813 }
48f0308a 1814 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1815 if (!strcasecmp(argv[1],"no")) {
48f0308a 1816 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1817 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1818 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1819 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1820 server.appendfsync = APPENDFSYNC_EVERYSEC;
1821 } else {
1822 err = "argument must be 'no', 'always' or 'everysec'";
1823 goto loaderr;
1824 }
bb0b03a3 1825 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1826 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1827 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 1828 zfree(server.pidfile);
054e426d 1829 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1830 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 1831 zfree(server.dbfilename);
054e426d 1832 server.dbfilename = zstrdup(argv[1]);
75680a3c 1833 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1834 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1835 err = "argument must be 'yes' or 'no'"; goto loaderr;
1836 }
054e426d 1837 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1838 zfree(server.vm_swap_file);
054e426d 1839 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1840 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1841 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1842 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1843 server.vm_page_size = strtoll(argv[1], NULL, 10);
1844 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1845 server.vm_pages = strtoll(argv[1], NULL, 10);
92f8e882 1846 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1847 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 1848 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1849 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1850 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1851 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1852 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1853 server.vm_max_threads = strtoll(argv[1], NULL, 10);
ed9b544e 1854 } else {
1855 err = "Bad directive or wrong number of arguments"; goto loaderr;
1856 }
1857 for (j = 0; j < argc; j++)
1858 sdsfree(argv[j]);
1859 zfree(argv);
1860 sdsfree(line);
1861 }
c9a111ac 1862 if (fp != stdin) fclose(fp);
ed9b544e 1863 return;
1864
1865loaderr:
1866 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1867 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1868 fprintf(stderr, ">>> '%s'\n", line);
1869 fprintf(stderr, "%s\n", err);
1870 exit(1);
1871}
1872
1873static void freeClientArgv(redisClient *c) {
1874 int j;
1875
1876 for (j = 0; j < c->argc; j++)
1877 decrRefCount(c->argv[j]);
e8a74421 1878 for (j = 0; j < c->mbargc; j++)
1879 decrRefCount(c->mbargv[j]);
ed9b544e 1880 c->argc = 0;
e8a74421 1881 c->mbargc = 0;
ed9b544e 1882}
1883
1884static void freeClient(redisClient *c) {
1885 listNode *ln;
1886
4409877e 1887 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 1888 * call, we have to set querybuf to NULL *before* to call
1889 * unblockClientWaitingData() to avoid processInputBuffer() will get
1890 * called. Also it is important to remove the file events after
1891 * this, because this call adds the READABLE event. */
4409877e 1892 sdsfree(c->querybuf);
1893 c->querybuf = NULL;
1894 if (c->flags & REDIS_BLOCKED)
b0d8747d 1895 unblockClientWaitingData(c);
4409877e 1896
ffc6b7f8 1897 /* Unsubscribe from all the pubsub channels */
1898 pubsubUnsubscribeAllChannels(c,0);
1899 pubsubUnsubscribeAllPatterns(c,0);
1900 dictRelease(c->pubsub_channels);
1901 listRelease(c->pubsub_patterns);
befec3cd 1902 /* Obvious cleanup */
ed9b544e 1903 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1904 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 1905 listRelease(c->reply);
1906 freeClientArgv(c);
1907 close(c->fd);
92f8e882 1908 /* Remove from the list of clients */
ed9b544e 1909 ln = listSearchKey(server.clients,c);
dfc5e96c 1910 redisAssert(ln != NULL);
ed9b544e 1911 listDelNode(server.clients,ln);
d5d55fc3 1912 /* Remove from the list of clients waiting for swapped keys */
1913 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1914 ln = listSearchKey(server.io_ready_clients,c);
1915 if (ln) {
1916 listDelNode(server.io_ready_clients,ln);
1917 server.vm_blocked_clients--;
1918 }
1919 }
1920 while (server.vm_enabled && listLength(c->io_keys)) {
1921 ln = listFirst(c->io_keys);
1922 dontWaitForSwappedKey(c,ln->value);
92f8e882 1923 }
b3e3d0d7 1924 listRelease(c->io_keys);
befec3cd 1925 /* Master/slave cleanup */
ed9b544e 1926 if (c->flags & REDIS_SLAVE) {
6208b3a7 1927 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1928 close(c->repldbfd);
87eca727 1929 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1930 ln = listSearchKey(l,c);
dfc5e96c 1931 redisAssert(ln != NULL);
87eca727 1932 listDelNode(l,ln);
ed9b544e 1933 }
1934 if (c->flags & REDIS_MASTER) {
1935 server.master = NULL;
1936 server.replstate = REDIS_REPL_CONNECT;
1937 }
befec3cd 1938 /* Release memory */
93ea3759 1939 zfree(c->argv);
e8a74421 1940 zfree(c->mbargv);
6e469882 1941 freeClientMultiState(c);
ed9b544e 1942 zfree(c);
1943}
1944
cc30e368 1945#define GLUEREPLY_UP_TO (1024)
ed9b544e 1946static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 1947 int copylen = 0;
1948 char buf[GLUEREPLY_UP_TO];
6208b3a7 1949 listNode *ln;
c7df85a4 1950 listIter li;
ed9b544e 1951 robj *o;
1952
c7df85a4 1953 listRewind(c->reply,&li);
1954 while((ln = listNext(&li))) {
c28b42ac 1955 int objlen;
1956
ed9b544e 1957 o = ln->value;
c28b42ac 1958 objlen = sdslen(o->ptr);
1959 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1960 memcpy(buf+copylen,o->ptr,objlen);
1961 copylen += objlen;
ed9b544e 1962 listDelNode(c->reply,ln);
c28b42ac 1963 } else {
1964 if (copylen == 0) return;
1965 break;
ed9b544e 1966 }
ed9b544e 1967 }
c28b42ac 1968 /* Now the output buffer is empty, add the new single element */
1969 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1970 listAddNodeHead(c->reply,o);
ed9b544e 1971}
1972
1973static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1974 redisClient *c = privdata;
1975 int nwritten = 0, totwritten = 0, objlen;
1976 robj *o;
1977 REDIS_NOTUSED(el);
1978 REDIS_NOTUSED(mask);
1979
2895e862 1980 /* Use writev() if we have enough buffers to send */
7ea870c0 1981 if (!server.glueoutputbuf &&
1982 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
1983 !(c->flags & REDIS_MASTER))
2895e862 1984 {
1985 sendReplyToClientWritev(el, fd, privdata, mask);
1986 return;
1987 }
2895e862 1988
ed9b544e 1989 while(listLength(c->reply)) {
c28b42ac 1990 if (server.glueoutputbuf && listLength(c->reply) > 1)
1991 glueReplyBuffersIfNeeded(c);
1992
ed9b544e 1993 o = listNodeValue(listFirst(c->reply));
1994 objlen = sdslen(o->ptr);
1995
1996 if (objlen == 0) {
1997 listDelNode(c->reply,listFirst(c->reply));
1998 continue;
1999 }
2000
2001 if (c->flags & REDIS_MASTER) {
6f376729 2002 /* Don't reply to a master */
ed9b544e 2003 nwritten = objlen - c->sentlen;
2004 } else {
a4d1ba9a 2005 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2006 if (nwritten <= 0) break;
2007 }
2008 c->sentlen += nwritten;
2009 totwritten += nwritten;
2010 /* If we fully sent the object on head go to the next one */
2011 if (c->sentlen == objlen) {
2012 listDelNode(c->reply,listFirst(c->reply));
2013 c->sentlen = 0;
2014 }
6f376729 2015 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2016 * bytes, in a single threaded server it's a good idea to serve
6f376729 2017 * other clients as well, even if a very large request comes from
2018 * super fast link that is always able to accept data (in real world
12f9d551 2019 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2020 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2021 }
2022 if (nwritten == -1) {
2023 if (errno == EAGAIN) {
2024 nwritten = 0;
2025 } else {
f870935d 2026 redisLog(REDIS_VERBOSE,
ed9b544e 2027 "Error writing to client: %s", strerror(errno));
2028 freeClient(c);
2029 return;
2030 }
2031 }
2032 if (totwritten > 0) c->lastinteraction = time(NULL);
2033 if (listLength(c->reply) == 0) {
2034 c->sentlen = 0;
2035 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2036 }
2037}
2038
2895e862 2039static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2040{
2041 redisClient *c = privdata;
2042 int nwritten = 0, totwritten = 0, objlen, willwrite;
2043 robj *o;
2044 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2045 int offset, ion = 0;
2046 REDIS_NOTUSED(el);
2047 REDIS_NOTUSED(mask);
2048
2049 listNode *node;
2050 while (listLength(c->reply)) {
2051 offset = c->sentlen;
2052 ion = 0;
2053 willwrite = 0;
2054
2055 /* fill-in the iov[] array */
2056 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2057 o = listNodeValue(node);
2058 objlen = sdslen(o->ptr);
2059
2060 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2061 break;
2062
2063 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2064 break; /* no more iovecs */
2065
2066 iov[ion].iov_base = ((char*)o->ptr) + offset;
2067 iov[ion].iov_len = objlen - offset;
2068 willwrite += objlen - offset;
2069 offset = 0; /* just for the first item */
2070 ion++;
2071 }
2072
2073 if(willwrite == 0)
2074 break;
2075
2076 /* write all collected blocks at once */
2077 if((nwritten = writev(fd, iov, ion)) < 0) {
2078 if (errno != EAGAIN) {
f870935d 2079 redisLog(REDIS_VERBOSE,
2895e862 2080 "Error writing to client: %s", strerror(errno));
2081 freeClient(c);
2082 return;
2083 }
2084 break;
2085 }
2086
2087 totwritten += nwritten;
2088 offset = c->sentlen;
2089
2090 /* remove written robjs from c->reply */
2091 while (nwritten && listLength(c->reply)) {
2092 o = listNodeValue(listFirst(c->reply));
2093 objlen = sdslen(o->ptr);
2094
2095 if(nwritten >= objlen - offset) {
2096 listDelNode(c->reply, listFirst(c->reply));
2097 nwritten -= objlen - offset;
2098 c->sentlen = 0;
2099 } else {
2100 /* partial write */
2101 c->sentlen += nwritten;
2102 break;
2103 }
2104 offset = 0;
2105 }
2106 }
2107
2108 if (totwritten > 0)
2109 c->lastinteraction = time(NULL);
2110
2111 if (listLength(c->reply) == 0) {
2112 c->sentlen = 0;
2113 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2114 }
2115}
2116
ed9b544e 2117static struct redisCommand *lookupCommand(char *name) {
2118 int j = 0;
2119 while(cmdTable[j].name != NULL) {
bb0b03a3 2120 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 2121 j++;
2122 }
2123 return NULL;
2124}
2125
2126/* resetClient prepare the client to process the next command */
2127static void resetClient(redisClient *c) {
2128 freeClientArgv(c);
2129 c->bulklen = -1;
e8a74421 2130 c->multibulk = 0;
ed9b544e 2131}
2132
6e469882 2133/* Call() is the core of Redis execution of a command */
2134static void call(redisClient *c, struct redisCommand *cmd) {
2135 long long dirty;
2136
2137 dirty = server.dirty;
2138 cmd->proc(c);
4005fef1 2139 dirty = server.dirty-dirty;
2140
2141 if (server.appendonly && dirty)
6e469882 2142 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2143 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2144 listLength(server.slaves))
248ea310 2145 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2146 if (listLength(server.monitors))
248ea310 2147 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2148 server.stat_numcommands++;
2149}
2150
ed9b544e 2151/* If this function gets called we already read a whole
2152 * command, argments are in the client argv/argc fields.
2153 * processCommand() execute the command or prepare the
2154 * server for a bulk read from the client.
2155 *
2156 * If 1 is returned the client is still alive and valid and
2157 * and other operations can be performed by the caller. Otherwise
2158 * if 0 is returned the client was destroied (i.e. after QUIT). */
2159static int processCommand(redisClient *c) {
2160 struct redisCommand *cmd;
ed9b544e 2161
3fd78bcd 2162 /* Free some memory if needed (maxmemory setting) */
2163 if (server.maxmemory) freeMemoryIfNeeded();
2164
e8a74421 2165 /* Handle the multi bulk command type. This is an alternative protocol
2166 * supported by Redis in order to receive commands that are composed of
2167 * multiple binary-safe "bulk" arguments. The latency of processing is
2168 * a bit higher but this allows things like multi-sets, so if this
2169 * protocol is used only for MSET and similar commands this is a big win. */
2170 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2171 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2172 if (c->multibulk <= 0) {
2173 resetClient(c);
2174 return 1;
2175 } else {
2176 decrRefCount(c->argv[c->argc-1]);
2177 c->argc--;
2178 return 1;
2179 }
2180 } else if (c->multibulk) {
2181 if (c->bulklen == -1) {
2182 if (((char*)c->argv[0]->ptr)[0] != '$') {
2183 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2184 resetClient(c);
2185 return 1;
2186 } else {
2187 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2188 decrRefCount(c->argv[0]);
2189 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2190 c->argc--;
2191 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2192 resetClient(c);
2193 return 1;
2194 }
2195 c->argc--;
2196 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2197 return 1;
2198 }
2199 } else {
2200 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2201 c->mbargv[c->mbargc] = c->argv[0];
2202 c->mbargc++;
2203 c->argc--;
2204 c->multibulk--;
2205 if (c->multibulk == 0) {
2206 robj **auxargv;
2207 int auxargc;
2208
2209 /* Here we need to swap the multi-bulk argc/argv with the
2210 * normal argc/argv of the client structure. */
2211 auxargv = c->argv;
2212 c->argv = c->mbargv;
2213 c->mbargv = auxargv;
2214
2215 auxargc = c->argc;
2216 c->argc = c->mbargc;
2217 c->mbargc = auxargc;
2218
2219 /* We need to set bulklen to something different than -1
2220 * in order for the code below to process the command without
2221 * to try to read the last argument of a bulk command as
2222 * a special argument. */
2223 c->bulklen = 0;
2224 /* continue below and process the command */
2225 } else {
2226 c->bulklen = -1;
2227 return 1;
2228 }
2229 }
2230 }
2231 /* -- end of multi bulk commands processing -- */
2232
ed9b544e 2233 /* The QUIT command is handled as a special case. Normal command
2234 * procs are unable to close the client connection safely */
bb0b03a3 2235 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2236 freeClient(c);
2237 return 0;
2238 }
d5d55fc3 2239
2240 /* Now lookup the command and check ASAP about trivial error conditions
2241 * such wrong arity, bad command name and so forth. */
ed9b544e 2242 cmd = lookupCommand(c->argv[0]->ptr);
2243 if (!cmd) {
2c14807b 2244 addReplySds(c,
2245 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2246 (char*)c->argv[0]->ptr));
ed9b544e 2247 resetClient(c);
2248 return 1;
2249 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2250 (c->argc < -cmd->arity)) {
454d4e43 2251 addReplySds(c,
2252 sdscatprintf(sdsempty(),
2253 "-ERR wrong number of arguments for '%s' command\r\n",
2254 cmd->name));
ed9b544e 2255 resetClient(c);
2256 return 1;
2257 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2258 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2259 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2260
2261 decrRefCount(c->argv[c->argc-1]);
2262 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2263 c->argc--;
2264 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2265 resetClient(c);
2266 return 1;
2267 }
2268 c->argc--;
2269 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2270 /* It is possible that the bulk read is already in the
8d0490e7 2271 * buffer. Check this condition and handle it accordingly.
2272 * This is just a fast path, alternative to call processInputBuffer().
2273 * It's a good idea since the code is small and this condition
2274 * happens most of the times. */
ed9b544e 2275 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2276 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2277 c->argc++;
2278 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2279 } else {
d5d55fc3 2280 /* Otherwise return... there is to read the last argument
2281 * from the socket. */
ed9b544e 2282 return 1;
2283 }
2284 }
942a3961 2285 /* Let's try to encode the bulk object to save space. */
2286 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2287 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2288
e63943a4 2289 /* Check if the user is authenticated */
2290 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2291 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2292 resetClient(c);
2293 return 1;
2294 }
2295
b61a28fe 2296 /* Handle the maxmemory directive */
2297 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2298 zmalloc_used_memory() > server.maxmemory)
2299 {
2300 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2301 resetClient(c);
2302 return 1;
2303 }
2304
d6cc8867 2305 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
ffc6b7f8 2306 if (dictSize(c->pubsub_channels) > 0 &&
2307 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2308 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2309 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2310 resetClient(c);
2311 return 1;
2312 }
2313
ed9b544e 2314 /* Exec the command */
18b6cb76 2315 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
6e469882 2316 queueMultiCommand(c,cmd);
2317 addReply(c,shared.queued);
2318 } else {
d5d55fc3 2319 if (server.vm_enabled && server.vm_max_threads > 0 &&
2320 blockClientOnSwappedKeys(cmd,c)) return 1;
6e469882 2321 call(c,cmd);
2322 }
ed9b544e 2323
2324 /* Prepare the client for the next command */
ed9b544e 2325 resetClient(c);
2326 return 1;
2327}
2328
248ea310 2329static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2330 listNode *ln;
c7df85a4 2331 listIter li;
ed9b544e 2332 int outc = 0, j;
93ea3759 2333 robj **outv;
248ea310 2334 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2335 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2336 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2337 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2338 robj *lenobj;
93ea3759 2339
2340 if (argc <= REDIS_STATIC_ARGS) {
2341 outv = static_outv;
2342 } else {
248ea310 2343 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2344 }
248ea310 2345
2346 lenobj = createObject(REDIS_STRING,
2347 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2348 lenobj->refcount = 0;
2349 outv[outc++] = lenobj;
ed9b544e 2350 for (j = 0; j < argc; j++) {
248ea310 2351 lenobj = createObject(REDIS_STRING,
2352 sdscatprintf(sdsempty(),"$%lu\r\n",
2353 (unsigned long) stringObjectLen(argv[j])));
2354 lenobj->refcount = 0;
2355 outv[outc++] = lenobj;
ed9b544e 2356 outv[outc++] = argv[j];
248ea310 2357 outv[outc++] = shared.crlf;
ed9b544e 2358 }
ed9b544e 2359
40d224a9 2360 /* Increment all the refcounts at start and decrement at end in order to
2361 * be sure to free objects if there is no slave in a replication state
2362 * able to be feed with commands */
2363 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2364 listRewind(slaves,&li);
2365 while((ln = listNext(&li))) {
ed9b544e 2366 redisClient *slave = ln->value;
40d224a9 2367
2368 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2369 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2370
2371 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2372 if (slave->slaveseldb != dictid) {
2373 robj *selectcmd;
2374
2375 switch(dictid) {
2376 case 0: selectcmd = shared.select0; break;
2377 case 1: selectcmd = shared.select1; break;
2378 case 2: selectcmd = shared.select2; break;
2379 case 3: selectcmd = shared.select3; break;
2380 case 4: selectcmd = shared.select4; break;
2381 case 5: selectcmd = shared.select5; break;
2382 case 6: selectcmd = shared.select6; break;
2383 case 7: selectcmd = shared.select7; break;
2384 case 8: selectcmd = shared.select8; break;
2385 case 9: selectcmd = shared.select9; break;
2386 default:
2387 selectcmd = createObject(REDIS_STRING,
2388 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2389 selectcmd->refcount = 0;
2390 break;
2391 }
2392 addReply(slave,selectcmd);
2393 slave->slaveseldb = dictid;
2394 }
2395 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2396 }
40d224a9 2397 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2398 if (outv != static_outv) zfree(outv);
ed9b544e 2399}
2400
638e42ac 2401static void processInputBuffer(redisClient *c) {
ed9b544e 2402again:
4409877e 2403 /* Before to process the input buffer, make sure the client is not
2404 * waitig for a blocking operation such as BLPOP. Note that the first
2405 * iteration the client is never blocked, otherwise the processInputBuffer
2406 * would not be called at all, but after the execution of the first commands
2407 * in the input buffer the client may be blocked, and the "goto again"
2408 * will try to reiterate. The following line will make it return asap. */
92f8e882 2409 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2410 if (c->bulklen == -1) {
2411 /* Read the first line of the query */
2412 char *p = strchr(c->querybuf,'\n');
2413 size_t querylen;
644fafa3 2414
ed9b544e 2415 if (p) {
2416 sds query, *argv;
2417 int argc, j;
2418
2419 query = c->querybuf;
2420 c->querybuf = sdsempty();
2421 querylen = 1+(p-(query));
2422 if (sdslen(query) > querylen) {
2423 /* leave data after the first line of the query in the buffer */
2424 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2425 }
2426 *p = '\0'; /* remove "\n" */
2427 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2428 sdsupdatelen(query);
2429
2430 /* Now we can split the query in arguments */
ed9b544e 2431 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2432 sdsfree(query);
2433
2434 if (c->argv) zfree(c->argv);
2435 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2436
2437 for (j = 0; j < argc; j++) {
ed9b544e 2438 if (sdslen(argv[j])) {
2439 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2440 c->argc++;
2441 } else {
2442 sdsfree(argv[j]);
2443 }
2444 }
2445 zfree(argv);
7c49733c 2446 if (c->argc) {
2447 /* Execute the command. If the client is still valid
2448 * after processCommand() return and there is something
2449 * on the query buffer try to process the next command. */
2450 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2451 } else {
2452 /* Nothing to process, argc == 0. Just process the query
2453 * buffer if it's not empty or return to the caller */
2454 if (sdslen(c->querybuf)) goto again;
2455 }
ed9b544e 2456 return;
644fafa3 2457 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2458 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2459 freeClient(c);
2460 return;
2461 }
2462 } else {
2463 /* Bulk read handling. Note that if we are at this point
2464 the client already sent a command terminated with a newline,
2465 we are reading the bulk data that is actually the last
2466 argument of the command. */
2467 int qbl = sdslen(c->querybuf);
2468
2469 if (c->bulklen <= qbl) {
2470 /* Copy everything but the final CRLF as final argument */
2471 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2472 c->argc++;
2473 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2474 /* Process the command. If the client is still valid after
2475 * the processing and there is more data in the buffer
2476 * try to parse it. */
2477 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2478 return;
2479 }
2480 }
2481}
2482
638e42ac 2483static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2484 redisClient *c = (redisClient*) privdata;
2485 char buf[REDIS_IOBUF_LEN];
2486 int nread;
2487 REDIS_NOTUSED(el);
2488 REDIS_NOTUSED(mask);
2489
2490 nread = read(fd, buf, REDIS_IOBUF_LEN);
2491 if (nread == -1) {
2492 if (errno == EAGAIN) {
2493 nread = 0;
2494 } else {
f870935d 2495 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2496 freeClient(c);
2497 return;
2498 }
2499 } else if (nread == 0) {
f870935d 2500 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2501 freeClient(c);
2502 return;
2503 }
2504 if (nread) {
2505 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2506 c->lastinteraction = time(NULL);
2507 } else {
2508 return;
2509 }
168ac5c6 2510 processInputBuffer(c);
638e42ac 2511}
2512
ed9b544e 2513static int selectDb(redisClient *c, int id) {
2514 if (id < 0 || id >= server.dbnum)
2515 return REDIS_ERR;
3305306f 2516 c->db = &server.db[id];
ed9b544e 2517 return REDIS_OK;
2518}
2519
40d224a9 2520static void *dupClientReplyValue(void *o) {
2521 incrRefCount((robj*)o);
12d090d2 2522 return o;
40d224a9 2523}
2524
/* Match method for lists of string objects: returns 1 when
 * compareStringObjects() reports the two objects as equal, 0 otherwise. */
static int listMatchObjects(void *a, void *b) {
    int diff = compareStringObjects(a,b);

    return diff ? 0 : 1;
}
2528
ed9b544e 2529static redisClient *createClient(int fd) {
2530 redisClient *c = zmalloc(sizeof(*c));
2531
2532 anetNonBlock(NULL,fd);
2533 anetTcpNoDelay(NULL,fd);
2534 if (!c) return NULL;
2535 selectDb(c,0);
2536 c->fd = fd;
2537 c->querybuf = sdsempty();
2538 c->argc = 0;
93ea3759 2539 c->argv = NULL;
ed9b544e 2540 c->bulklen = -1;
e8a74421 2541 c->multibulk = 0;
2542 c->mbargc = 0;
2543 c->mbargv = NULL;
ed9b544e 2544 c->sentlen = 0;
2545 c->flags = 0;
2546 c->lastinteraction = time(NULL);
abcb223e 2547 c->authenticated = 0;
40d224a9 2548 c->replstate = REDIS_REPL_NONE;
6b47e12e 2549 c->reply = listCreate();
ed9b544e 2550 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2551 listSetDupMethod(c->reply,dupClientReplyValue);
92f8e882 2552 c->blockingkeys = NULL;
2553 c->blockingkeysnum = 0;
2554 c->io_keys = listCreate();
2555 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2556 c->pubsub_channels = dictCreate(&setDictType,NULL);
2557 c->pubsub_patterns = listCreate();
2558 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2559 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2560 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2561 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2562 freeClient(c);
2563 return NULL;
2564 }
6b47e12e 2565 listAddNodeTail(server.clients,c);
6e469882 2566 initClientMultiState(c);
ed9b544e 2567 return c;
2568}
2569
2570static void addReply(redisClient *c, robj *obj) {
2571 if (listLength(c->reply) == 0 &&
6208b3a7 2572 (c->replstate == REDIS_REPL_NONE ||
2573 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2574 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2575 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2576
2577 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2578 obj = dupStringObject(obj);
2579 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2580 }
9d65a1bb 2581 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2582}
2583
2584static void addReplySds(redisClient *c, sds s) {
2585 robj *o = createObject(REDIS_STRING,s);
2586 addReply(c,o);
2587 decrRefCount(o);
2588}
2589
e2665397 2590static void addReplyDouble(redisClient *c, double d) {
2591 char buf[128];
2592
2593 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2594 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2595 (unsigned long) strlen(buf),buf));
e2665397 2596}
2597
f44dd428 2598static void addReplyLong(redisClient *c, long l) {
2599 char buf[128];
2600 size_t len;
2601
dd88747b 2602 if (l == 0) {
2603 addReply(c,shared.czero);
2604 return;
2605 } else if (l == 1) {
2606 addReply(c,shared.cone);
2607 return;
2608 }
f44dd428 2609 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2610 addReplySds(c,sdsnewlen(buf,len));
2611}
2612
aa7c2934
PN
2613static void addReplyLongLong(redisClient *c, long long ll) {
2614 char buf[128];
2615 size_t len;
2616
2617 if (ll == 0) {
2618 addReply(c,shared.czero);
2619 return;
2620 } else if (ll == 1) {
2621 addReply(c,shared.cone);
2622 return;
2623 }
2624 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2625 addReplySds(c,sdsnewlen(buf,len));
2626}
2627
92b27fe9 2628static void addReplyUlong(redisClient *c, unsigned long ul) {
2629 char buf[128];
2630 size_t len;
2631
dd88747b 2632 if (ul == 0) {
2633 addReply(c,shared.czero);
2634 return;
2635 } else if (ul == 1) {
2636 addReply(c,shared.cone);
2637 return;
2638 }
92b27fe9 2639 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2640 addReplySds(c,sdsnewlen(buf,len));
2641}
2642
942a3961 2643static void addReplyBulkLen(redisClient *c, robj *obj) {
2644 size_t len;
2645
2646 if (obj->encoding == REDIS_ENCODING_RAW) {
2647 len = sdslen(obj->ptr);
2648 } else {
2649 long n = (long)obj->ptr;
2650
e054afda 2651 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2652 len = 1;
2653 if (n < 0) {
2654 len++;
2655 n = -n;
2656 }
2657 while((n = n/10) != 0) {
2658 len++;
2659 }
2660 }
83c6a618 2661 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
942a3961 2662}
2663
dd88747b 2664static void addReplyBulk(redisClient *c, robj *obj) {
2665 addReplyBulkLen(c,obj);
2666 addReply(c,obj);
2667 addReply(c,shared.crlf);
2668}
2669
500ece7c 2670/* In the CONFIG command we need to add vanilla C string as bulk replies */
2671static void addReplyBulkCString(redisClient *c, char *s) {
2672 if (s == NULL) {
2673 addReply(c,shared.nullbulk);
2674 } else {
2675 robj *o = createStringObject(s,strlen(s));
2676 addReplyBulk(c,o);
2677 decrRefCount(o);
2678 }
2679}
2680
ed9b544e 2681static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2682 int cport, cfd;
2683 char cip[128];
285add55 2684 redisClient *c;
ed9b544e 2685 REDIS_NOTUSED(el);
2686 REDIS_NOTUSED(mask);
2687 REDIS_NOTUSED(privdata);
2688
2689 cfd = anetAccept(server.neterr, fd, cip, &cport);
2690 if (cfd == AE_ERR) {
f870935d 2691 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2692 return;
2693 }
f870935d 2694 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2695 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2696 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2697 close(cfd); /* May be already closed, just ingore errors */
2698 return;
2699 }
285add55 2700 /* If maxclient directive is set and this is one client more... close the
2701 * connection. Note that we create the client instead to check before
2702 * for this condition, since now the socket is already set in nonblocking
2703 * mode and we can send an error for free using the Kernel I/O */
2704 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2705 char *err = "-ERR max number of clients reached\r\n";
2706
2707 /* That's a best effort error message, don't check write errors */
fee803ba 2708 if (write(c->fd,err,strlen(err)) == -1) {
2709 /* Nothing to do, Just to avoid the warning... */
2710 }
285add55 2711 freeClient(c);
2712 return;
2713 }
ed9b544e 2714 server.stat_numconnections++;
2715}
2716
2717/* ======================= Redis objects implementation ===================== */
2718
2719static robj *createObject(int type, void *ptr) {
2720 robj *o;
2721
a5819310 2722 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2723 if (listLength(server.objfreelist)) {
2724 listNode *head = listFirst(server.objfreelist);
2725 o = listNodeValue(head);
2726 listDelNode(server.objfreelist,head);
a5819310 2727 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2728 } else {
75680a3c 2729 if (server.vm_enabled) {
a5819310 2730 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2731 o = zmalloc(sizeof(*o));
2732 } else {
2733 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2734 }
ed9b544e 2735 }
ed9b544e 2736 o->type = type;
942a3961 2737 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2738 o->ptr = ptr;
2739 o->refcount = 1;
3a66edc7 2740 if (server.vm_enabled) {
1064ef87 2741 /* Note that this code may run in the context of an I/O thread
2742 * and accessing to server.unixtime in theory is an error
2743 * (no locks). But in practice this is safe, and even if we read
2744 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2745 o->vm.atime = server.unixtime;
2746 o->storage = REDIS_VM_MEMORY;
2747 }
ed9b544e 2748 return o;
2749}
2750
2751static robj *createStringObject(char *ptr, size_t len) {
2752 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2753}
2754
4ef8de8a 2755static robj *dupStringObject(robj *o) {
b9bc0eef 2756 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2757 return createStringObject(o->ptr,sdslen(o->ptr));
2758}
2759
ed9b544e 2760static robj *createListObject(void) {
2761 list *l = listCreate();
2762
ed9b544e 2763 listSetFreeMethod(l,decrRefCount);
2764 return createObject(REDIS_LIST,l);
2765}
2766
2767static robj *createSetObject(void) {
2768 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2769 return createObject(REDIS_SET,d);
2770}
2771
5234952b 2772static robj *createHashObject(void) {
2773 /* All the Hashes start as zipmaps. Will be automatically converted
2774 * into hash tables if there are enough elements or big elements
2775 * inside. */
2776 unsigned char *zm = zipmapNew();
2777 robj *o = createObject(REDIS_HASH,zm);
2778 o->encoding = REDIS_ENCODING_ZIPMAP;
2779 return o;
2780}
2781
1812e024 2782static robj *createZsetObject(void) {
6b47e12e 2783 zset *zs = zmalloc(sizeof(*zs));
2784
2785 zs->dict = dictCreate(&zsetDictType,NULL);
2786 zs->zsl = zslCreate();
2787 return createObject(REDIS_ZSET,zs);
1812e024 2788}
2789
ed9b544e 2790static void freeStringObject(robj *o) {
942a3961 2791 if (o->encoding == REDIS_ENCODING_RAW) {
2792 sdsfree(o->ptr);
2793 }
ed9b544e 2794}
2795
2796static void freeListObject(robj *o) {
2797 listRelease((list*) o->ptr);
2798}
2799
2800static void freeSetObject(robj *o) {
2801 dictRelease((dict*) o->ptr);
2802}
2803
fd8ccf44 2804static void freeZsetObject(robj *o) {
2805 zset *zs = o->ptr;
2806
2807 dictRelease(zs->dict);
2808 zslFree(zs->zsl);
2809 zfree(zs);
2810}
2811
ed9b544e 2812static void freeHashObject(robj *o) {
cbba7dd7 2813 switch (o->encoding) {
2814 case REDIS_ENCODING_HT:
2815 dictRelease((dict*) o->ptr);
2816 break;
2817 case REDIS_ENCODING_ZIPMAP:
2818 zfree(o->ptr);
2819 break;
2820 default:
2821 redisAssert(0);
2822 break;
2823 }
ed9b544e 2824}
2825
2826static void incrRefCount(robj *o) {
2827 o->refcount++;
2828}
2829
2830static void decrRefCount(void *obj) {
2831 robj *o = obj;
94754ccc 2832
970e10bb 2833 /* Object is a key of a swapped out value, or in the process of being
2834 * loaded. */
996cb5f7 2835 if (server.vm_enabled &&
2836 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2837 {
996cb5f7 2838 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 2839 redisAssert(o->type == REDIS_STRING);
a35ddf12 2840 freeStringObject(o);
2841 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 2842 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 2843 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2844 !listAddNodeHead(server.objfreelist,o))
2845 zfree(o);
a5819310 2846 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 2847 server.vm_stats_swapped_objects--;
a35ddf12 2848 return;
2849 }
996cb5f7 2850 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 2851 if (--(o->refcount) == 0) {
996cb5f7 2852 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2853 vmCancelThreadedIOJob(obj);
ed9b544e 2854 switch(o->type) {
2855 case REDIS_STRING: freeStringObject(o); break;
2856 case REDIS_LIST: freeListObject(o); break;
2857 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 2858 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 2859 case REDIS_HASH: freeHashObject(o); break;
78409a0f 2860 default: redisAssert(0); break;
ed9b544e 2861 }
a5819310 2862 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2863 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2864 !listAddNodeHead(server.objfreelist,o))
2865 zfree(o);
a5819310 2866 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2867 }
2868}
2869
942a3961 2870static robj *lookupKey(redisDb *db, robj *key) {
2871 dictEntry *de = dictFind(db->dict,key);
3a66edc7 2872 if (de) {
55cf8433 2873 robj *key = dictGetEntryKey(de);
2874 robj *val = dictGetEntryVal(de);
3a66edc7 2875
55cf8433 2876 if (server.vm_enabled) {
996cb5f7 2877 if (key->storage == REDIS_VM_MEMORY ||
2878 key->storage == REDIS_VM_SWAPPING)
2879 {
2880 /* If we were swapping the object out, stop it, this key
2881 * was requested. */
2882 if (key->storage == REDIS_VM_SWAPPING)
2883 vmCancelThreadedIOJob(key);
55cf8433 2884 /* Update the access time of the key for the aging algorithm. */
2885 key->vm.atime = server.unixtime;
2886 } else {
d5d55fc3 2887 int notify = (key->storage == REDIS_VM_LOADING);
2888
55cf8433 2889 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 2890 redisAssert(val == NULL);
55cf8433 2891 val = vmLoadObject(key);
2892 dictGetEntryVal(de) = val;
d5d55fc3 2893
2894 /* Clients blocked by the VM subsystem may be waiting for
2895 * this key... */
2896 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 2897 }
2898 }
2899 return val;
3a66edc7 2900 } else {
2901 return NULL;
2902 }
942a3961 2903}
2904
2905static robj *lookupKeyRead(redisDb *db, robj *key) {
2906 expireIfNeeded(db,key);
2907 return lookupKey(db,key);
2908}
2909
2910static robj *lookupKeyWrite(redisDb *db, robj *key) {
2911 deleteIfVolatile(db,key);
2912 return lookupKey(db,key);
2913}
2914
92b27fe9 2915static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2916 robj *o = lookupKeyRead(c->db, key);
2917 if (!o) addReply(c,reply);
2918 return o;
2919}
2920
2921static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2922 robj *o = lookupKeyWrite(c->db, key);
2923 if (!o) addReply(c,reply);
2924 return o;
2925}
2926
2927static int checkType(redisClient *c, robj *o, int type) {
2928 if (o->type != type) {
2929 addReply(c,shared.wrongtypeerr);
2930 return 1;
2931 }
2932 return 0;
2933}
2934
942a3961 2935static int deleteKey(redisDb *db, robj *key) {
2936 int retval;
2937
2938 /* We need to protect key from destruction: after the first dictDelete()
2939 * it may happen that 'key' is no longer valid if we don't increment
2940 * it's count. This may happen when we get the object reference directly
2941 * from the hash table with dictRandomKey() or dict iterators */
2942 incrRefCount(key);
2943 if (dictSize(db->expires)) dictDelete(db->expires,key);
2944 retval = dictDelete(db->dict,key);
2945 decrRefCount(key);
2946
2947 return retval == DICT_OK;
2948}
2949
724a51b1 2950/* Check if the nul-terminated string 's' can be represented by a long
2951 * (that is, is a number that fits into long without any other space or
2952 * character before or after the digits).
2953 *
2954 * If so, the function returns REDIS_OK and *longval is set to the value
2955 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 2956static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 2957 char buf[32], *endptr;
2958 long value;
2959 int slen;
2960
2961 value = strtol(s, &endptr, 10);
2962 if (endptr[0] != '\0') return REDIS_ERR;
2963 slen = snprintf(buf,32,"%ld",value);
2964
2965 /* If the number converted back into a string is not identical
2966 * then it's not possible to encode the string as integer */
f69f2cba 2967 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 2968 if (longval) *longval = value;
2969 return REDIS_OK;
2970}
2971
942a3961 2972/* Try to encode a string object in order to save space */
05df7621 2973static robj *tryObjectEncoding(robj *o) {
942a3961 2974 long value;
942a3961 2975 sds s = o->ptr;
3305306f 2976
942a3961 2977 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 2978 return o; /* Already encoded */
3305306f 2979
05df7621 2980 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 2981 * everywhere in the "object space" of Redis. Encoded objects can only
2982 * appear as "values" (and not, for instance, as keys) */
05df7621 2983 if (o->refcount > 1) return o;
3305306f 2984
942a3961 2985 /* Currently we try to encode only strings */
dfc5e96c 2986 redisAssert(o->type == REDIS_STRING);
94754ccc 2987
724a51b1 2988 /* Check if we can represent this string as a long integer */
05df7621 2989 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 2990
2991 /* Ok, this object can be encoded */
05df7621 2992 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2993 decrRefCount(o);
2994 incrRefCount(shared.integers[value]);
2995 return shared.integers[value];
2996 } else {
2997 o->encoding = REDIS_ENCODING_INT;
2998 sdsfree(o->ptr);
2999 o->ptr = (void*) value;
3000 return o;
3001 }
942a3961 3002}
3003
9d65a1bb 3004/* Get a decoded version of an encoded object (returned as a new object).
3005 * If the object is already raw-encoded just increment the ref count. */
3006static robj *getDecodedObject(robj *o) {
942a3961 3007 robj *dec;
3008
9d65a1bb 3009 if (o->encoding == REDIS_ENCODING_RAW) {
3010 incrRefCount(o);
3011 return o;
3012 }
942a3961 3013 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3014 char buf[32];
3015
3016 snprintf(buf,32,"%ld",(long)o->ptr);
3017 dec = createStringObject(buf,strlen(buf));
3018 return dec;
3019 } else {
dfc5e96c 3020 redisAssert(1 != 1);
942a3961 3021 }
3305306f 3022}
3023
d7f43c08 3024/* Compare two string objects via strcmp() or alike.
3025 * Note that the objects may be integer-encoded. In such a case we
3026 * use snprintf() to get a string representation of the numbers on the stack
1fd9bc8a 3027 * and compare the strings, it's much faster than calling getDecodedObject().
3028 *
3029 * Important note: if objects are not integer encoded, but binary-safe strings,
3030 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3031 * binary safe. */
724a51b1 3032static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3033 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3034 char bufa[128], bufb[128], *astr, *bstr;
3035 int bothsds = 1;
724a51b1 3036
e197b441 3037 if (a == b) return 0;
d7f43c08 3038 if (a->encoding != REDIS_ENCODING_RAW) {
3039 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3040 astr = bufa;
3041 bothsds = 0;
724a51b1 3042 } else {
d7f43c08 3043 astr = a->ptr;
724a51b1 3044 }
d7f43c08 3045 if (b->encoding != REDIS_ENCODING_RAW) {
3046 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3047 bstr = bufb;
3048 bothsds = 0;
3049 } else {
3050 bstr = b->ptr;
3051 }
3052 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3053}
3054
0ea663ea 3055static size_t stringObjectLen(robj *o) {
dfc5e96c 3056 redisAssert(o->type == REDIS_STRING);
0ea663ea 3057 if (o->encoding == REDIS_ENCODING_RAW) {
3058 return sdslen(o->ptr);
3059 } else {
3060 char buf[32];
3061
3062 return snprintf(buf,32,"%ld",(long)o->ptr);
3063 }
3064}
3065
06233c45 3066/*============================ RDB saving/loading =========================== */
ed9b544e 3067
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 if nothing
 * could be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t nitems = fwrite(&type,1,1,fp);

    return (nitems == 0) ? -1 : 0;
}
3072
/* Write the time 't' to 'fp' as a 32 bit signed integer, in the byte
 * order of the host. Returns 0 on success, -1 if nothing could be
 * written. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t stamp = (int32_t) t;
    size_t nitems = fwrite(&stamp,4,1,fp);

    return (nitems == 0) ? -1 : 0;
}
3078
e3566d4b 3079/* check rdbLoadLen() comments for more info */
f78fd11b 3080static int rdbSaveLen(FILE *fp, uint32_t len) {
3081 unsigned char buf[2];
3082
3083 if (len < (1<<6)) {
3084 /* Save a 6 bit len */
10c43610 3085 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3086 if (fwrite(buf,1,1,fp) == 0) return -1;
3087 } else if (len < (1<<14)) {
3088 /* Save a 14 bit len */
10c43610 3089 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3090 buf[1] = len&0xFF;
17be1a4a 3091 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3092 } else {
3093 /* Save a 32 bit len */
10c43610 3094 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3095 if (fwrite(buf,1,1,fp) == 0) return -1;
3096 len = htonl(len);
3097 if (fwrite(&len,4,1,fp) == 0) return -1;
3098 }
3099 return 0;
3100}
3101
e3566d4b 3102/* String objects in the form "2391" "-100" without any space and with a
3103 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3104 * encoded as integers to save space */
b1befe6a 3105static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
e3566d4b 3106 long long value;
3107 char *endptr, buf[32];
3108
3109 /* Check if it's possible to encode this value as a number */
3110 value = strtoll(s, &endptr, 10);
3111 if (endptr[0] != '\0') return 0;
3112 snprintf(buf,32,"%lld",value);
3113
3114 /* If the number converted back into a string is not identical
3115 * then it's not possible to encode the string as integer */
b1befe6a 3116 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
e3566d4b 3117
3118 /* Finally check if it fits in our ranges */
3119 if (value >= -(1<<7) && value <= (1<<7)-1) {
3120 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3121 enc[1] = value&0xFF;
3122 return 2;
3123 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3124 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3125 enc[1] = value&0xFF;
3126 enc[2] = (value>>8)&0xFF;
3127 return 3;
3128 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3129 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3130 enc[1] = value&0xFF;
3131 enc[2] = (value>>8)&0xFF;
3132 enc[3] = (value>>16)&0xFF;
3133 enc[4] = (value>>24)&0xFF;
3134 return 5;
3135 } else {
3136 return 0;
3137 }
3138}
3139
b1befe6a 3140static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3141 size_t comprlen, outlen;
774e3047 3142 unsigned char byte;
3143 void *out;
3144
3145 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3146 if (len <= 4) return 0;
3147 outlen = len-4;
3a2694c4 3148 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3149 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3150 if (comprlen == 0) {
88e85998 3151 zfree(out);
774e3047 3152 return 0;
3153 }
3154 /* Data compressed! Let's save it on disk */
3155 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3156 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3157 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3158 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3159 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3160 zfree(out);
774e3047 3161 return comprlen;
3162
3163writeerr:
88e85998 3164 zfree(out);
774e3047 3165 return -1;
3166}
3167
e3566d4b 3168/* Save a string objet as [len][data] on disk. If the object is a string
3169 * representation of an integer value we try to safe it in a special form */
b1befe6a 3170static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3171 int enclen;
10c43610 3172
774e3047 3173 /* Try integer encoding */
e3566d4b 3174 if (len <= 11) {
3175 unsigned char buf[5];
b1befe6a 3176 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3177 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3178 return 0;
3179 }
3180 }
774e3047 3181
3182 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3183 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3184 if (server.rdbcompression && len > 20) {
774e3047 3185 int retval;
3186
b1befe6a 3187 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3188 if (retval == -1) return -1;
3189 if (retval > 0) return 0;
3190 /* retval == 0 means data can't be compressed, save the old way */
3191 }
3192
3193 /* Store verbatim */
10c43610 3194 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3195 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3196 return 0;
3197}
3198
942a3961 3199/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3200static int rdbSaveStringObject(FILE *fp, robj *obj) {
3201 int retval;
942a3961 3202
f2d9f50f 3203 /* Avoid incr/decr ref count business when possible.
3204 * This plays well with copy-on-write given that we are probably
3205 * in a child process (BGSAVE). Also this makes sure key objects
3206 * of swapped objects are not incRefCount-ed (an assert does not allow
3207 * this in order to avoid bugs) */
3208 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3209 obj = getDecodedObject(obj);
b1befe6a 3210 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3211 decrRefCount(obj);
3212 } else {
b1befe6a 3213 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3214 }
9d65a1bb 3215 return retval;
942a3961 3216}
3217
/* Save a double value. Doubles are saved as strings ("%.17g" format)
 * prefixed by an unsigned 8 bit integer specifying the length of the
 * representation. This 8 bit integer has special values, written alone,
 * in order to specify the following conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1;    /* the special values take just the prefix byte */

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        char *digits = (char*)buf+1;

        snprintf(digits,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(digits);
        len = buf[0]+1;
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
3244
06233c45 3245/* Save a Redis object. */
3246static int rdbSaveObject(FILE *fp, robj *o) {
3247 if (o->type == REDIS_STRING) {
3248 /* Save a string value */
3249 if (rdbSaveStringObject(fp,o) == -1) return -1;
3250 } else if (o->type == REDIS_LIST) {
3251 /* Save a list value */
3252 list *list = o->ptr;
c7df85a4 3253 listIter li;
06233c45 3254 listNode *ln;
3255
06233c45 3256 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3257 listRewind(list,&li);
3258 while((ln = listNext(&li))) {
06233c45 3259 robj *eleobj = listNodeValue(ln);
3260
3261 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3262 }
3263 } else if (o->type == REDIS_SET) {
3264 /* Save a set value */
3265 dict *set = o->ptr;
3266 dictIterator *di = dictGetIterator(set);
3267 dictEntry *de;
3268
3269 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3270 while((de = dictNext(di)) != NULL) {
3271 robj *eleobj = dictGetEntryKey(de);
3272
3273 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3274 }
3275 dictReleaseIterator(di);
3276 } else if (o->type == REDIS_ZSET) {
3277 /* Save a set value */
3278 zset *zs = o->ptr;
3279 dictIterator *di = dictGetIterator(zs->dict);
3280 dictEntry *de;
3281
3282 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3283 while((de = dictNext(di)) != NULL) {
3284 robj *eleobj = dictGetEntryKey(de);
3285 double *score = dictGetEntryVal(de);
3286
3287 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3288 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3289 }
3290 dictReleaseIterator(di);
b1befe6a 3291 } else if (o->type == REDIS_HASH) {
3292 /* Save a hash value */
3293 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3294 unsigned char *p = zipmapRewind(o->ptr);
3295 unsigned int count = zipmapLen(o->ptr);
3296 unsigned char *key, *val;
3297 unsigned int klen, vlen;
3298
3299 if (rdbSaveLen(fp,count) == -1) return -1;
3300 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3301 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3302 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3303 }
3304 } else {
3305 dictIterator *di = dictGetIterator(o->ptr);
3306 dictEntry *de;
3307
3308 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3309 while((de = dictNext(di)) != NULL) {
3310 robj *key = dictGetEntryKey(de);
3311 robj *val = dictGetEntryVal(de);
3312
3313 if (rdbSaveStringObject(fp,key) == -1) return -1;
3314 if (rdbSaveStringObject(fp,val) == -1) return -1;
3315 }
3316 dictReleaseIterator(di);
3317 }
06233c45 3318 } else {
78409a0f 3319 redisAssert(0);
06233c45 3320 }
3321 return 0;
3322}
3323
3324/* Return the length the object will have on disk if saved with
3325 * the rdbSaveObject() function. Currently we use a trick to get
3326 * this length with very little changes to the code. In the future
3327 * we could switch to a faster solution. */
b9bc0eef 3328static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3329 if (fp == NULL) fp = server.devnull;
06233c45 3330 rewind(fp);
3331 assert(rdbSaveObject(fp,o) != 1);
3332 return ftello(fp);
3333}
3334
06224fec 3335/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3336static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3337 off_t bytes = rdbSavedObjectLen(o,fp);
06224fec 3338
3339 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3340}
3341
ed9b544e 3342/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3343static int rdbSave(char *filename) {
ed9b544e 3344 dictIterator *di = NULL;
3345 dictEntry *de;
ed9b544e 3346 FILE *fp;
3347 char tmpfile[256];
3348 int j;
bb32ede5 3349 time_t now = time(NULL);
ed9b544e 3350
2316bb3b 3351 /* Wait for I/O therads to terminate, just in case this is a
3352 * foreground-saving, to avoid seeking the swap file descriptor at the
3353 * same time. */
3354 if (server.vm_enabled)
3355 waitEmptyIOJobsQueue();
3356
a3b21203 3357 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3358 fp = fopen(tmpfile,"w");
3359 if (!fp) {
3360 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3361 return REDIS_ERR;
3362 }
f78fd11b 3363 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3364 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3365 redisDb *db = server.db+j;
3366 dict *d = db->dict;
3305306f 3367 if (dictSize(d) == 0) continue;
ed9b544e 3368 di = dictGetIterator(d);
3369 if (!di) {
3370 fclose(fp);
3371 return REDIS_ERR;
3372 }
3373
3374 /* Write the SELECT DB opcode */
f78fd11b 3375 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3376 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3377
3378 /* Iterate this DB writing every entry */
3379 while((de = dictNext(di)) != NULL) {
3380 robj *key = dictGetEntryKey(de);
3381 robj *o = dictGetEntryVal(de);
bb32ede5 3382 time_t expiretime = getExpire(db,key);
3383
3384 /* Save the expire time */
3385 if (expiretime != -1) {
3386 /* If this key is already expired skip it */
3387 if (expiretime < now) continue;
3388 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3389 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3390 }
7e69548d 3391 /* Save the key and associated value. This requires special
3392 * handling if the value is swapped out. */
996cb5f7 3393 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3394 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3395 /* Save type, key, value */
3396 if (rdbSaveType(fp,o->type) == -1) goto werr;
3397 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3398 if (rdbSaveObject(fp,o) == -1) goto werr;
3399 } else {
996cb5f7 3400 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3401 robj *po;
7e69548d 3402 /* Get a preview of the object in memory */
3403 po = vmPreviewObject(key);
7e69548d 3404 /* Save type, key, value */
3405 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3406 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3407 if (rdbSaveObject(fp,po) == -1) goto werr;
3408 /* Remove the loaded object from memory */
3409 decrRefCount(po);
7e69548d 3410 }
ed9b544e 3411 }
3412 dictReleaseIterator(di);
3413 }
3414 /* EOF opcode */
f78fd11b 3415 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3416
3417 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3418 fflush(fp);
3419 fsync(fileno(fp));
3420 fclose(fp);
3421
3422 /* Use RENAME to make sure the DB file is changed atomically only
3423 * if the generate DB file is ok. */
3424 if (rename(tmpfile,filename) == -1) {
325d1eb4 3425 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3426 unlink(tmpfile);
3427 return REDIS_ERR;
3428 }
3429 redisLog(REDIS_NOTICE,"DB saved on disk");
3430 server.dirty = 0;
3431 server.lastsave = time(NULL);
3432 return REDIS_OK;
3433
3434werr:
3435 fclose(fp);
3436 unlink(tmpfile);
3437 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3438 if (di) dictReleaseIterator(di);
3439 return REDIS_ERR;
3440}
3441
f78fd11b 3442static int rdbSaveBackground(char *filename) {
ed9b544e 3443 pid_t childpid;
3444
9d65a1bb 3445 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3446 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3447 if ((childpid = fork()) == 0) {
3448 /* Child */
054e426d 3449 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3450 close(server.fd);
f78fd11b 3451 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3452 _exit(0);
ed9b544e 3453 } else {
478c2c6f 3454 _exit(1);
ed9b544e 3455 }
3456 } else {
3457 /* Parent */
5a7c647e 3458 if (childpid == -1) {
3459 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3460 strerror(errno));
3461 return REDIS_ERR;
3462 }
ed9b544e 3463 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3464 server.bgsavechildpid = childpid;
884d4b39 3465 updateDictResizePolicy();
ed9b544e 3466 return REDIS_OK;
3467 }
3468 return REDIS_OK; /* unreached */
3469}
3470
/* Remove the "temp-<pid>.rdb" file that a saving process with the given
 * pid writes to. The result of unlink() is ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3477
/* Read a single type/opcode byte from 'fp'. Returns the byte value
 * (0-255), or -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char opcode;

    return (fread(&opcode,1,1,fp) == 0) ? -1 : opcode;
}
3483
/* Read a time stored as a raw 32 bit signed integer from 'fp'.
 * Returns the value converted to time_t, or -1 on a short read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stamp;

    if (fread(&stamp,4,1,fp) != 1) return -1;
    return (time_t) stamp;
}
3489
e3566d4b 3490/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3491 * of this file for a description of how this are stored on disk.
3492 *
3493 * isencoded is set to 1 if the readed length is not actually a length but
3494 * an "encoding type", check the above comments for more info */
c78a8ccc 3495static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3496 unsigned char buf[2];
3497 uint32_t len;
c78a8ccc 3498 int type;
f78fd11b 3499
e3566d4b 3500 if (isencoded) *isencoded = 0;
c78a8ccc 3501 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3502 type = (buf[0]&0xC0)>>6;
3503 if (type == REDIS_RDB_6BITLEN) {
3504 /* Read a 6 bit len */
3505 return buf[0]&0x3F;
3506 } else if (type == REDIS_RDB_ENCVAL) {
3507 /* Read a 6 bit len encoding type */
3508 if (isencoded) *isencoded = 1;
3509 return buf[0]&0x3F;
3510 } else if (type == REDIS_RDB_14BITLEN) {
3511 /* Read a 14 bit len */
3512 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3513 return ((buf[0]&0x3F)<<8)|buf[1];
3514 } else {
3515 /* Read a 32 bit len */
f78fd11b 3516 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3517 return ntohl(len);
f78fd11b 3518 }
f78fd11b 3519}
3520
e3566d4b 3521static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3522 unsigned char enc[4];
3523 long long val;
3524
3525 if (enctype == REDIS_RDB_ENC_INT8) {
3526 if (fread(enc,1,1,fp) == 0) return NULL;
3527 val = (signed char)enc[0];
3528 } else if (enctype == REDIS_RDB_ENC_INT16) {
3529 uint16_t v;
3530 if (fread(enc,2,1,fp) == 0) return NULL;
3531 v = enc[0]|(enc[1]<<8);
3532 val = (int16_t)v;
3533 } else if (enctype == REDIS_RDB_ENC_INT32) {
3534 uint32_t v;
3535 if (fread(enc,4,1,fp) == 0) return NULL;
3536 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3537 val = (int32_t)v;
3538 } else {
3539 val = 0; /* anti-warning */
78409a0f 3540 redisAssert(0);
e3566d4b 3541 }
3542 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3543}
3544
c78a8ccc 3545static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3546 unsigned int len, clen;
3547 unsigned char *c = NULL;
3548 sds val = NULL;
3549
c78a8ccc 3550 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3551 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3552 if ((c = zmalloc(clen)) == NULL) goto err;
3553 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3554 if (fread(c,clen,1,fp) == 0) goto err;
3555 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3556 zfree(c);
88e85998 3557 return createObject(REDIS_STRING,val);
3558err:
3559 zfree(c);
3560 sdsfree(val);
3561 return NULL;
3562}
3563
c78a8ccc 3564static robj *rdbLoadStringObject(FILE*fp) {
e3566d4b 3565 int isencoded;
3566 uint32_t len;
f78fd11b 3567 sds val;
3568
c78a8ccc 3569 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3570 if (isencoded) {
3571 switch(len) {
3572 case REDIS_RDB_ENC_INT8:
3573 case REDIS_RDB_ENC_INT16:
3574 case REDIS_RDB_ENC_INT32:
bdcb92f2 3575 return rdbLoadIntegerObject(fp,len);
88e85998 3576 case REDIS_RDB_ENC_LZF:
bdcb92f2 3577 return rdbLoadLzfStringObject(fp);
e3566d4b 3578 default:
78409a0f 3579 redisAssert(0);
e3566d4b 3580 }
3581 }
3582
f78fd11b 3583 if (len == REDIS_RDB_LENERR) return NULL;
3584 val = sdsnewlen(NULL,len);
3585 if (len && fread(val,len,1,fp) == 0) {
3586 sdsfree(val);
3587 return NULL;
3588 }
bdcb92f2 3589 return createObject(REDIS_STRING,val);
f78fd11b 3590}
3591
a7866db6 3592/* For information about double serialization check rdbSaveDoubleValue() */
3593static int rdbLoadDoubleValue(FILE *fp, double *val) {
3594 char buf[128];
3595 unsigned char len;
3596
3597 if (fread(&len,1,1,fp) == 0) return -1;
3598 switch(len) {
3599 case 255: *val = R_NegInf; return 0;
3600 case 254: *val = R_PosInf; return 0;
3601 case 253: *val = R_Nan; return 0;
3602 default:
3603 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3604 buf[len] = '\0';
a7866db6 3605 sscanf(buf, "%lg", val);
3606 return 0;
3607 }
3608}
3609
c78a8ccc 3610/* Load a Redis object of the specified type from the specified file.
3611 * On success a newly allocated object is returned, otherwise NULL. */
3612static robj *rdbLoadObject(int type, FILE *fp) {
3613 robj *o;
3614
bcd11906 3615 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 3616 if (type == REDIS_STRING) {
3617 /* Read string value */
3618 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3619 o = tryObjectEncoding(o);
c78a8ccc 3620 } else if (type == REDIS_LIST || type == REDIS_SET) {
3621 /* Read list/set value */
3622 uint32_t listlen;
3623
3624 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3625 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 3626 /* It's faster to expand the dict to the right size asap in order
3627 * to avoid rehashing */
3628 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3629 dictExpand(o->ptr,listlen);
c78a8ccc 3630 /* Load every single element of the list/set */
3631 while(listlen--) {
3632 robj *ele;
3633
3634 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3635 ele = tryObjectEncoding(ele);
c78a8ccc 3636 if (type == REDIS_LIST) {
3637 listAddNodeTail((list*)o->ptr,ele);
3638 } else {
3639 dictAdd((dict*)o->ptr,ele,NULL);
3640 }
3641 }
3642 } else if (type == REDIS_ZSET) {
3643 /* Read list/set value */
ada386b2 3644 size_t zsetlen;
c78a8ccc 3645 zset *zs;
3646
3647 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3648 o = createZsetObject();
3649 zs = o->ptr;
3650 /* Load every single element of the list/set */
3651 while(zsetlen--) {
3652 robj *ele;
3653 double *score = zmalloc(sizeof(double));
3654
3655 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3656 ele = tryObjectEncoding(ele);
c78a8ccc 3657 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3658 dictAdd(zs->dict,ele,score);
3659 zslInsert(zs->zsl,*score,ele);
3660 incrRefCount(ele); /* added to skiplist */
3661 }
ada386b2 3662 } else if (type == REDIS_HASH) {
3663 size_t hashlen;
3664
3665 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3666 o = createHashObject();
3667 /* Too many entries? Use an hash table. */
3668 if (hashlen > server.hash_max_zipmap_entries)
3669 convertToRealHash(o);
3670 /* Load every key/value, then set it into the zipmap or hash
3671 * table, as needed. */
3672 while(hashlen--) {
3673 robj *key, *val;
3674
3675 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3676 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3677 /* If we are using a zipmap and there are too big values
3678 * the object is converted to real hash table encoding. */
3679 if (o->encoding != REDIS_ENCODING_HT &&
3680 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3681 sdslen(val->ptr) > server.hash_max_zipmap_value))
3682 {
3683 convertToRealHash(o);
3684 }
3685
3686 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3687 unsigned char *zm = o->ptr;
3688
3689 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3690 val->ptr,sdslen(val->ptr),NULL);
3691 o->ptr = zm;
3692 decrRefCount(key);
3693 decrRefCount(val);
3694 } else {
05df7621 3695 key = tryObjectEncoding(key);
3696 val = tryObjectEncoding(val);
ada386b2 3697 dictAdd((dict*)o->ptr,key,val);
ada386b2 3698 }
3699 }
c78a8ccc 3700 } else {
78409a0f 3701 redisAssert(0);
c78a8ccc 3702 }
3703 return o;
3704}
3705
f78fd11b 3706static int rdbLoad(char *filename) {
ed9b544e 3707 FILE *fp;
f78fd11b 3708 robj *keyobj = NULL;
3709 uint32_t dbid;
bb32ede5 3710 int type, retval, rdbver;
3305306f 3711 dict *d = server.db[0].dict;
bb32ede5 3712 redisDb *db = server.db+0;
f78fd11b 3713 char buf[1024];
bb32ede5 3714 time_t expiretime = -1, now = time(NULL);
b492cf00 3715 long long loadedkeys = 0;
bb32ede5 3716
ed9b544e 3717 fp = fopen(filename,"r");
3718 if (!fp) return REDIS_ERR;
3719 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 3720 buf[9] = '\0';
3721 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 3722 fclose(fp);
3723 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3724 return REDIS_ERR;
3725 }
f78fd11b 3726 rdbver = atoi(buf+5);
c78a8ccc 3727 if (rdbver != 1) {
f78fd11b 3728 fclose(fp);
3729 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3730 return REDIS_ERR;
3731 }
ed9b544e 3732 while(1) {
3733 robj *o;
3734
3735 /* Read type. */
f78fd11b 3736 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 3737 if (type == REDIS_EXPIRETIME) {
3738 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3739 /* We read the time so we need to read the object type again */
3740 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3741 }
ed9b544e 3742 if (type == REDIS_EOF) break;
3743 /* Handle SELECT DB opcode as a special case */
3744 if (type == REDIS_SELECTDB) {
c78a8ccc 3745 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 3746 goto eoferr;
ed9b544e 3747 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 3748 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 3749 exit(1);
3750 }
bb32ede5 3751 db = server.db+dbid;
3752 d = db->dict;
ed9b544e 3753 continue;
3754 }
3755 /* Read key */
c78a8ccc 3756 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3757 /* Read value */
3758 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
ed9b544e 3759 /* Add the new object in the hash table */
f78fd11b 3760 retval = dictAdd(d,keyobj,o);
ed9b544e 3761 if (retval == DICT_ERR) {
f78fd11b 3762 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
ed9b544e 3763 exit(1);
3764 }
bb32ede5 3765 /* Set the expire time if needed */
3766 if (expiretime != -1) {
3767 setExpire(db,keyobj,expiretime);
3768 /* Delete this key if already expired */
3769 if (expiretime < now) deleteKey(db,keyobj);
3770 expiretime = -1;
3771 }
f78fd11b 3772 keyobj = o = NULL;
b492cf00 3773 /* Handle swapping while loading big datasets when VM is on */
3774 loadedkeys++;
3775 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3776 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 3777 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 3778 }
3779 }
ed9b544e 3780 }
3781 fclose(fp);
3782 return REDIS_OK;
3783
3784eoferr: /* unexpected end of file is handled here with a fatal exit */
e3566d4b 3785 if (keyobj) decrRefCount(keyobj);
f80dff62 3786 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 3787 exit(1);
3788 return REDIS_ERR; /* Just to avoid warning */
3789}
3790
3791/*================================== Commands =============================== */
3792
abcb223e 3793static void authCommand(redisClient *c) {
2e77c2ee 3794 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
3795 c->authenticated = 1;
3796 addReply(c,shared.ok);
3797 } else {
3798 c->authenticated = 0;
fa4c0aba 3799 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
3800 }
3801}
3802
ed9b544e 3803static void pingCommand(redisClient *c) {
3804 addReply(c,shared.pong);
3805}
3806
3807static void echoCommand(redisClient *c) {
dd88747b 3808 addReplyBulk(c,c->argv[1]);
ed9b544e 3809}
3810
3811/*=================================== Strings =============================== */
3812
3813static void setGenericCommand(redisClient *c, int nx) {
3814 int retval;
3815
333fd216 3816 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3305306f 3817 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3818 if (retval == DICT_ERR) {
3819 if (!nx) {
1b03836c 3820 /* If the key is about a swapped value, we want a new key object
3821 * to overwrite the old. So we delete the old key in the database.
3822 * This will also make sure that swap pages about the old object
3823 * will be marked as free. */
ddfaca9d 3824 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
1b03836c 3825 incrRefCount(c->argv[1]);
3305306f 3826 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3827 incrRefCount(c->argv[2]);
3828 } else {
c937aa89 3829 addReply(c,shared.czero);
ed9b544e 3830 return;
3831 }
3832 } else {
3833 incrRefCount(c->argv[1]);
3834 incrRefCount(c->argv[2]);
3835 }
3836 server.dirty++;
3305306f 3837 removeExpire(c->db,c->argv[1]);
c937aa89 3838 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 3839}
3840
3841static void setCommand(redisClient *c) {
a4d1ba9a 3842 setGenericCommand(c,0);
ed9b544e 3843}
3844
3845static void setnxCommand(redisClient *c) {
a4d1ba9a 3846 setGenericCommand(c,1);
ed9b544e 3847}
3848
322fc7d8 3849static int getGenericCommand(redisClient *c) {
dd88747b 3850 robj *o;
3851
3852 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 3853 return REDIS_OK;
dd88747b 3854
3855 if (o->type != REDIS_STRING) {
3856 addReply(c,shared.wrongtypeerr);
3857 return REDIS_ERR;
ed9b544e 3858 } else {
dd88747b 3859 addReplyBulk(c,o);
3860 return REDIS_OK;
ed9b544e 3861 }
3862}
3863
322fc7d8 3864static void getCommand(redisClient *c) {
3865 getGenericCommand(c);
3866}
3867
f6b141c5 3868static void getsetCommand(redisClient *c) {
322fc7d8 3869 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 3870 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3871 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3872 } else {
3873 incrRefCount(c->argv[1]);
3874 }
3875 incrRefCount(c->argv[2]);
3876 server.dirty++;
3877 removeExpire(c->db,c->argv[1]);
3878}
3879
70003d28 3880static void mgetCommand(redisClient *c) {
70003d28 3881 int j;
3882
c937aa89 3883 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 3884 for (j = 1; j < c->argc; j++) {
3305306f 3885 robj *o = lookupKeyRead(c->db,c->argv[j]);
3886 if (o == NULL) {
c937aa89 3887 addReply(c,shared.nullbulk);
70003d28 3888 } else {
70003d28 3889 if (o->type != REDIS_STRING) {
c937aa89 3890 addReply(c,shared.nullbulk);
70003d28 3891 } else {
dd88747b 3892 addReplyBulk(c,o);
70003d28 3893 }
3894 }
3895 }
3896}
3897
6c446631 3898static void msetGenericCommand(redisClient *c, int nx) {
906573e7 3899 int j, busykeys = 0;
6c446631 3900
3901 if ((c->argc % 2) == 0) {
454d4e43 3902 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 3903 return;
3904 }
3905 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3906 * set nothing at all if at least one already key exists. */
3907 if (nx) {
3908 for (j = 1; j < c->argc; j += 2) {
906573e7 3909 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3910 busykeys++;
6c446631 3911 }
3912 }
3913 }
906573e7 3914 if (busykeys) {
3915 addReply(c, shared.czero);
3916 return;
3917 }
6c446631 3918
3919 for (j = 1; j < c->argc; j += 2) {
3920 int retval;
3921
05df7621 3922 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
6c446631 3923 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3924 if (retval == DICT_ERR) {
3925 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3926 incrRefCount(c->argv[j+1]);
3927 } else {
3928 incrRefCount(c->argv[j]);
3929 incrRefCount(c->argv[j+1]);
3930 }
3931 removeExpire(c->db,c->argv[j]);
3932 }
3933 server.dirty += (c->argc-1)/2;
3934 addReply(c, nx ? shared.cone : shared.ok);
3935}
3936
3937static void msetCommand(redisClient *c) {
3938 msetGenericCommand(c,0);
3939}
3940
3941static void msetnxCommand(redisClient *c) {
3942 msetGenericCommand(c,1);
3943}
3944
d68ed120 3945static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 3946 long long value;
3947 int retval;
3948 robj *o;
3949
3305306f 3950 o = lookupKeyWrite(c->db,c->argv[1]);
3951 if (o == NULL) {
ed9b544e 3952 value = 0;
3953 } else {
ed9b544e 3954 if (o->type != REDIS_STRING) {
3955 value = 0;
3956 } else {
3957 char *eptr;
3958
942a3961 3959 if (o->encoding == REDIS_ENCODING_RAW)
3960 value = strtoll(o->ptr, &eptr, 10);
3961 else if (o->encoding == REDIS_ENCODING_INT)
3962 value = (long)o->ptr;
3963 else
dfc5e96c 3964 redisAssert(1 != 1);
ed9b544e 3965 }
3966 }
3967
3968 value += incr;
3969 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
05df7621 3970 o = tryObjectEncoding(o);
3305306f 3971 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 3972 if (retval == DICT_ERR) {
3305306f 3973 dictReplace(c->db->dict,c->argv[1],o);
3974 removeExpire(c->db,c->argv[1]);
ed9b544e 3975 } else {
3976 incrRefCount(c->argv[1]);
3977 }
3978 server.dirty++;
c937aa89 3979 addReply(c,shared.colon);
ed9b544e 3980 addReply(c,o);
3981 addReply(c,shared.crlf);
3982}
3983
3984static void incrCommand(redisClient *c) {
a4d1ba9a 3985 incrDecrCommand(c,1);
ed9b544e 3986}
3987
3988static void decrCommand(redisClient *c) {
a4d1ba9a 3989 incrDecrCommand(c,-1);
ed9b544e 3990}
3991
3992static void incrbyCommand(redisClient *c) {
d68ed120 3993 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3994 incrDecrCommand(c,incr);
ed9b544e 3995}
3996
3997static void decrbyCommand(redisClient *c) {
d68ed120 3998 long long incr = strtoll(c->argv[2]->ptr, NULL, 10);
a4d1ba9a 3999 incrDecrCommand(c,-incr);
ed9b544e 4000}
4001
4b00bebd 4002static void appendCommand(redisClient *c) {
4003 int retval;
4004 size_t totlen;
4005 robj *o;
4006
4007 o = lookupKeyWrite(c->db,c->argv[1]);
4008 if (o == NULL) {
4009 /* Create the key */
4010 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4011 incrRefCount(c->argv[1]);
4012 incrRefCount(c->argv[2]);
4013 totlen = stringObjectLen(c->argv[2]);
4014 } else {
4015 dictEntry *de;
4016
4017 de = dictFind(c->db->dict,c->argv[1]);
4018 assert(de != NULL);
4019
4020 o = dictGetEntryVal(de);
4021 if (o->type != REDIS_STRING) {
4022 addReply(c,shared.wrongtypeerr);
4023 return;
4024 }
4025 /* If the object is specially encoded or shared we have to make
4026 * a copy */
4027 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4028 robj *decoded = getDecodedObject(o);
4029
4030 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4031 decrRefCount(decoded);
4032 dictReplace(c->db->dict,c->argv[1],o);
4033 }
4034 /* APPEND! */
4035 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4036 o->ptr = sdscatlen(o->ptr,
4037 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4038 } else {
4039 o->ptr = sdscatprintf(o->ptr, "%ld",
4040 (unsigned long) c->argv[2]->ptr);
4041 }
4042 totlen = sdslen(o->ptr);
4043 }
4044 server.dirty++;
4045 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4046}
4047
39191553 4048static void substrCommand(redisClient *c) {
4049 robj *o;
4050 long start = atoi(c->argv[2]->ptr);
4051 long end = atoi(c->argv[3]->ptr);
dd88747b 4052 size_t rangelen, strlen;
4053 sds range;
39191553 4054
dd88747b 4055 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4056 checkType(c,o,REDIS_STRING)) return;
39191553 4057
dd88747b 4058 o = getDecodedObject(o);
4059 strlen = sdslen(o->ptr);
8fe7fad7 4060
dd88747b 4061 /* convert negative indexes */
4062 if (start < 0) start = strlen+start;
4063 if (end < 0) end = strlen+end;
4064 if (start < 0) start = 0;
4065 if (end < 0) end = 0;
39191553 4066
dd88747b 4067 /* indexes sanity checks */
4068 if (start > end || (size_t)start >= strlen) {
4069 /* Out of range start or start > end result in null reply */
4070 addReply(c,shared.nullbulk);
4071 decrRefCount(o);
4072 return;
39191553 4073 }
dd88747b 4074 if ((size_t)end >= strlen) end = strlen-1;
4075 rangelen = (end-start)+1;
4076
4077 /* Return the result */
4078 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4079 range = sdsnewlen((char*)o->ptr+start,rangelen);
4080 addReplySds(c,range);
4081 addReply(c,shared.crlf);
4082 decrRefCount(o);
39191553 4083}
4084
ed9b544e 4085/* ========================= Type agnostic commands ========================= */
4086
4087static void delCommand(redisClient *c) {
5109cdff 4088 int deleted = 0, j;
4089
4090 for (j = 1; j < c->argc; j++) {
4091 if (deleteKey(c->db,c->argv[j])) {
4092 server.dirty++;
4093 deleted++;
4094 }
4095 }
dd88747b 4096 addReplyLong(c,deleted);
ed9b544e 4097}
4098
4099static void existsCommand(redisClient *c) {
3305306f 4100 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
ed9b544e 4101}
4102
4103static void selectCommand(redisClient *c) {
4104 int id = atoi(c->argv[1]->ptr);
4105
4106 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4107 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4108 } else {
4109 addReply(c,shared.ok);
4110 }
4111}
4112
4113static void randomkeyCommand(redisClient *c) {
4114 dictEntry *de;
3305306f 4115
4116 while(1) {
4117 de = dictGetRandomKey(c->db->dict);
ce7bef07 4118 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4119 }
ed9b544e 4120 if (de == NULL) {
ce7bef07 4121 addReply(c,shared.plus);
ed9b544e 4122 addReply(c,shared.crlf);
4123 } else {
c937aa89 4124 addReply(c,shared.plus);
ed9b544e 4125 addReply(c,dictGetEntryKey(de));
4126 addReply(c,shared.crlf);
4127 }
4128}
4129
4130static void keysCommand(redisClient *c) {
4131 dictIterator *di;
4132 dictEntry *de;
4133 sds pattern = c->argv[1]->ptr;
4134 int plen = sdslen(pattern);
a3f9eec2 4135 unsigned long numkeys = 0;
ed9b544e 4136 robj *lenobj = createObject(REDIS_STRING,NULL);
4137
3305306f 4138 di = dictGetIterator(c->db->dict);
ed9b544e 4139 addReply(c,lenobj);
4140 decrRefCount(lenobj);
4141 while((de = dictNext(di)) != NULL) {
4142 robj *keyobj = dictGetEntryKey(de);
3305306f 4143
ed9b544e 4144 sds key = keyobj->ptr;
4145 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4146 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4147 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4148 addReplyBulk(c,keyobj);
3305306f 4149 numkeys++;
3305306f 4150 }
ed9b544e 4151 }
4152 }
4153 dictReleaseIterator(di);
a3f9eec2 4154 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4155}
4156
4157static void dbsizeCommand(redisClient *c) {
4158 addReplySds(c,
3305306f 4159 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4160}
4161
4162static void lastsaveCommand(redisClient *c) {
4163 addReplySds(c,
c937aa89 4164 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4165}
4166
4167static void typeCommand(redisClient *c) {
3305306f 4168 robj *o;
ed9b544e 4169 char *type;
3305306f 4170
4171 o = lookupKeyRead(c->db,c->argv[1]);
4172 if (o == NULL) {
c937aa89 4173 type = "+none";
ed9b544e 4174 } else {
ed9b544e 4175 switch(o->type) {
c937aa89 4176 case REDIS_STRING: type = "+string"; break;
4177 case REDIS_LIST: type = "+list"; break;
4178 case REDIS_SET: type = "+set"; break;
412a8bce 4179 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4180 case REDIS_HASH: type = "+hash"; break;
4181 default: type = "+unknown"; break;
ed9b544e 4182 }
4183 }
4184 addReplySds(c,sdsnew(type));
4185 addReply(c,shared.crlf);
4186}
4187
4188static void saveCommand(redisClient *c) {
9d65a1bb 4189 if (server.bgsavechildpid != -1) {
05557f6d 4190 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4191 return;
4192 }
f78fd11b 4193 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4194 addReply(c,shared.ok);
4195 } else {
4196 addReply(c,shared.err);
4197 }
4198}
4199
4200static void bgsaveCommand(redisClient *c) {
9d65a1bb 4201 if (server.bgsavechildpid != -1) {
ed9b544e 4202 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4203 return;
4204 }
f78fd11b 4205 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4206 char *status = "+Background saving started\r\n";
4207 addReplySds(c,sdsnew(status));
ed9b544e 4208 } else {
4209 addReply(c,shared.err);
4210 }
4211}
4212
4213static void shutdownCommand(redisClient *c) {
4214 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
a3b21203 4215 /* Kill the saving child if there is a background saving in progress.
4216 We want to avoid race conditions, for instance our saving child may
4217 overwrite the synchronous saving did by SHUTDOWN. */
9d65a1bb 4218 if (server.bgsavechildpid != -1) {
9f3c422c 4219 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4220 kill(server.bgsavechildpid,SIGKILL);
a3b21203 4221 rdbRemoveTempFile(server.bgsavechildpid);
9f3c422c 4222 }
ac945e2d 4223 if (server.appendonly) {
4224 /* Append only file: fsync() the AOF and exit */
4225 fsync(server.appendfd);
054e426d 4226 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4227 exit(0);
ed9b544e 4228 } else {
ac945e2d 4229 /* Snapshotting. Perform a SYNC SAVE and exit */
4230 if (rdbSave(server.dbfilename) == REDIS_OK) {
4231 if (server.daemonize)
4232 unlink(server.pidfile);
4233 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4234 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
054e426d 4235 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4236 exit(0);
4237 } else {
dd88747b 4238 /* Ooops.. error saving! The best we can do is to continue
4239 * operating. Note that if there was a background saving process,
4240 * in the next cron() Redis will be notified that the background
4241 * saving aborted, handling special stuff like slaves pending for
4242 * synchronization... */
ac945e2d 4243 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
dd88747b 4244 addReplySds(c,
4245 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
ac945e2d 4246 }
ed9b544e 4247 }
4248}
4249
4250static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4251 robj *o;
4252
4253 /* To use the same key as src and dst is probably an error */
4254 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4255 addReply(c,shared.sameobjecterr);
ed9b544e 4256 return;
4257 }
4258
dd88747b 4259 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4260 return;
dd88747b 4261
ed9b544e 4262 incrRefCount(o);
3305306f 4263 deleteIfVolatile(c->db,c->argv[2]);
4264 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4265 if (nx) {
4266 decrRefCount(o);
c937aa89 4267 addReply(c,shared.czero);
ed9b544e 4268 return;
4269 }
3305306f 4270 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4271 } else {
4272 incrRefCount(c->argv[2]);
4273 }
3305306f 4274 deleteKey(c->db,c->argv[1]);
ed9b544e 4275 server.dirty++;
c937aa89 4276 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4277}
4278
4279static void renameCommand(redisClient *c) {
4280 renameGenericCommand(c,0);
4281}
4282
4283static void renamenxCommand(redisClient *c) {
4284 renameGenericCommand(c,1);
4285}
4286
4287static void moveCommand(redisClient *c) {
3305306f 4288 robj *o;
4289 redisDb *src, *dst;
ed9b544e 4290 int srcid;
4291
4292 /* Obtain source and target DB pointers */
3305306f 4293 src = c->db;
4294 srcid = c->db->id;
ed9b544e 4295 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4296 addReply(c,shared.outofrangeerr);
ed9b544e 4297 return;
4298 }
3305306f 4299 dst = c->db;
4300 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4301
4302 /* If the user is moving using as target the same
4303 * DB as the source DB it is probably an error. */
4304 if (src == dst) {
c937aa89 4305 addReply(c,shared.sameobjecterr);
ed9b544e 4306 return;
4307 }
4308
4309 /* Check if the element exists and get a reference */
3305306f 4310 o = lookupKeyWrite(c->db,c->argv[1]);
4311 if (!o) {
c937aa89 4312 addReply(c,shared.czero);
ed9b544e 4313 return;
4314 }
4315
4316 /* Try to add the element to the target DB */
3305306f 4317 deleteIfVolatile(dst,c->argv[1]);
4318 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4319 addReply(c,shared.czero);
ed9b544e 4320 return;
4321 }
3305306f 4322 incrRefCount(c->argv[1]);
ed9b544e 4323 incrRefCount(o);
4324
4325 /* OK! key moved, free the entry in the source DB */
3305306f 4326 deleteKey(src,c->argv[1]);
ed9b544e 4327 server.dirty++;
c937aa89 4328 addReply(c,shared.cone);
ed9b544e 4329}
4330
4331/* =================================== Lists ================================ */
4332static void pushGenericCommand(redisClient *c, int where) {
4333 robj *lobj;
ed9b544e 4334 list *list;
3305306f 4335
4336 lobj = lookupKeyWrite(c->db,c->argv[1]);
4337 if (lobj == NULL) {
95242ab5 4338 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4339 addReply(c,shared.cone);
95242ab5 4340 return;
4341 }
ed9b544e 4342 lobj = createListObject();
4343 list = lobj->ptr;
4344 if (where == REDIS_HEAD) {
6b47e12e 4345 listAddNodeHead(list,c->argv[2]);
ed9b544e 4346 } else {
6b47e12e 4347 listAddNodeTail(list,c->argv[2]);
ed9b544e 4348 }
3305306f 4349 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4350 incrRefCount(c->argv[1]);
4351 incrRefCount(c->argv[2]);
4352 } else {
ed9b544e 4353 if (lobj->type != REDIS_LIST) {
4354 addReply(c,shared.wrongtypeerr);
4355 return;
4356 }
95242ab5 4357 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4358 addReply(c,shared.cone);
95242ab5 4359 return;
4360 }
ed9b544e 4361 list = lobj->ptr;
4362 if (where == REDIS_HEAD) {
6b47e12e 4363 listAddNodeHead(list,c->argv[2]);
ed9b544e 4364 } else {
6b47e12e 4365 listAddNodeTail(list,c->argv[2]);
ed9b544e 4366 }
4367 incrRefCount(c->argv[2]);
4368 }
4369 server.dirty++;
520b5a33 4370 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
ed9b544e 4371}
4372
4373static void lpushCommand(redisClient *c) {
4374 pushGenericCommand(c,REDIS_HEAD);
4375}
4376
4377static void rpushCommand(redisClient *c) {
4378 pushGenericCommand(c,REDIS_TAIL);
4379}
4380
4381static void llenCommand(redisClient *c) {
3305306f 4382 robj *o;
ed9b544e 4383 list *l;
dd88747b 4384
4385 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4386 checkType(c,o,REDIS_LIST)) return;
ed9b544e 4387
dd88747b 4388 l = o->ptr;
4389 addReplyUlong(c,listLength(l));
ed9b544e 4390}
4391
4392static void lindexCommand(redisClient *c) {
3305306f 4393 robj *o;
ed9b544e 4394 int index = atoi(c->argv[2]->ptr);
dd88747b 4395 list *list;
4396 listNode *ln;
4397
4398 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4399 checkType(c,o,REDIS_LIST)) return;
4400 list = o->ptr;
4401
4402 ln = listIndex(list, index);
4403 if (ln == NULL) {
c937aa89 4404 addReply(c,shared.nullbulk);
ed9b544e 4405 } else {
dd88747b 4406 robj *ele = listNodeValue(ln);
4407 addReplyBulk(c,ele);
ed9b544e 4408 }
4409}
4410
4411static void lsetCommand(redisClient *c) {
3305306f 4412 robj *o;
ed9b544e 4413 int index = atoi(c->argv[2]->ptr);
dd88747b 4414 list *list;
4415 listNode *ln;
4416
4417 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4418 checkType(c,o,REDIS_LIST)) return;
4419 list = o->ptr;
4420
4421 ln = listIndex(list, index);
4422 if (ln == NULL) {
4423 addReply(c,shared.outofrangeerr);
ed9b544e 4424 } else {
dd88747b 4425 robj *ele = listNodeValue(ln);
ed9b544e 4426
dd88747b 4427 decrRefCount(ele);
4428 listNodeValue(ln) = c->argv[3];
4429 incrRefCount(c->argv[3]);
4430 addReply(c,shared.ok);
4431 server.dirty++;
ed9b544e 4432 }
4433}
4434
4435static void popGenericCommand(redisClient *c, int where) {
3305306f 4436 robj *o;
dd88747b 4437 list *list;
4438 listNode *ln;
3305306f 4439
dd88747b 4440 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4441 checkType(c,o,REDIS_LIST)) return;
4442 list = o->ptr;
ed9b544e 4443
dd88747b 4444 if (where == REDIS_HEAD)
4445 ln = listFirst(list);
4446 else
4447 ln = listLast(list);
ed9b544e 4448
dd88747b 4449 if (ln == NULL) {
4450 addReply(c,shared.nullbulk);
4451 } else {
4452 robj *ele = listNodeValue(ln);
4453 addReplyBulk(c,ele);
4454 listDelNode(list,ln);
3ea27d37 4455 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4456 server.dirty++;
ed9b544e 4457 }
4458}
4459
4460static void lpopCommand(redisClient *c) {
4461 popGenericCommand(c,REDIS_HEAD);
4462}
4463
4464static void rpopCommand(redisClient *c) {
4465 popGenericCommand(c,REDIS_TAIL);
4466}
4467
4468static void lrangeCommand(redisClient *c) {
3305306f 4469 robj *o;
ed9b544e 4470 int start = atoi(c->argv[2]->ptr);
4471 int end = atoi(c->argv[3]->ptr);
dd88747b 4472 int llen;
4473 int rangelen, j;
4474 list *list;
4475 listNode *ln;
4476 robj *ele;
4477
4478 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
4479 checkType(c,o,REDIS_LIST)) return;
4480 list = o->ptr;
4481 llen = listLength(list);
4482
4483 /* convert negative indexes */
4484 if (start < 0) start = llen+start;
4485 if (end < 0) end = llen+end;
4486 if (start < 0) start = 0;
4487 if (end < 0) end = 0;
4488
4489 /* indexes sanity checks */
4490 if (start > end || start >= llen) {
4491 /* Out of range start or start > end result in empty list */
4492 addReply(c,shared.emptymultibulk);
4493 return;
4494 }
4495 if (end >= llen) end = llen-1;
4496 rangelen = (end-start)+1;
3305306f 4497
dd88747b 4498 /* Return the result in form of a multi-bulk reply */
4499 ln = listIndex(list, start);
4500 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4501 for (j = 0; j < rangelen; j++) {
4502 ele = listNodeValue(ln);
4503 addReplyBulk(c,ele);
4504 ln = ln->next;
ed9b544e 4505 }
4506}
4507
4508static void ltrimCommand(redisClient *c) {
3305306f 4509 robj *o;
ed9b544e 4510 int start = atoi(c->argv[2]->ptr);
4511 int end = atoi(c->argv[3]->ptr);
dd88747b 4512 int llen;
4513 int j, ltrim, rtrim;
4514 list *list;
4515 listNode *ln;
4516
4517 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4518 checkType(c,o,REDIS_LIST)) return;
4519 list = o->ptr;
4520 llen = listLength(list);
4521
4522 /* convert negative indexes */
4523 if (start < 0) start = llen+start;
4524 if (end < 0) end = llen+end;
4525 if (start < 0) start = 0;
4526 if (end < 0) end = 0;
4527
4528 /* indexes sanity checks */
4529 if (start > end || start >= llen) {
4530 /* Out of range start or start > end result in empty list */
4531 ltrim = llen;
4532 rtrim = 0;
ed9b544e 4533 } else {
dd88747b 4534 if (end >= llen) end = llen-1;
4535 ltrim = start;
4536 rtrim = llen-end-1;
4537 }
ed9b544e 4538
dd88747b 4539 /* Remove list elements to perform the trim */
4540 for (j = 0; j < ltrim; j++) {
4541 ln = listFirst(list);
4542 listDelNode(list,ln);
4543 }
4544 for (j = 0; j < rtrim; j++) {
4545 ln = listLast(list);
4546 listDelNode(list,ln);
ed9b544e 4547 }
3ea27d37 4548 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4549 server.dirty++;
4550 addReply(c,shared.ok);
ed9b544e 4551}
4552
4553static void lremCommand(redisClient *c) {
3305306f 4554 robj *o;
dd88747b 4555 list *list;
4556 listNode *ln, *next;
4557 int toremove = atoi(c->argv[2]->ptr);
4558 int removed = 0;
4559 int fromtail = 0;
a4d1ba9a 4560
dd88747b 4561 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4562 checkType(c,o,REDIS_LIST)) return;
4563 list = o->ptr;
4564
4565 if (toremove < 0) {
4566 toremove = -toremove;
4567 fromtail = 1;
4568 }
4569 ln = fromtail ? list->tail : list->head;
4570 while (ln) {
4571 robj *ele = listNodeValue(ln);
4572
4573 next = fromtail ? ln->prev : ln->next;
4574 if (compareStringObjects(ele,c->argv[3]) == 0) {
4575 listDelNode(list,ln);
4576 server.dirty++;
4577 removed++;
4578 if (toremove && removed == toremove) break;
ed9b544e 4579 }
dd88747b 4580 ln = next;
ed9b544e 4581 }
3ea27d37 4582 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4583 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 4584}
4585
12f9d551 4586/* This is the semantic of this command:
0f5f7e9a 4587 * RPOPLPUSH srclist dstlist:
12f9d551 4588 * IF LLEN(srclist) > 0
4589 * element = RPOP srclist
4590 * LPUSH dstlist element
4591 * RETURN element
4592 * ELSE
4593 * RETURN nil
4594 * END
4595 * END
4596 *
4597 * The idea is to be able to get an element from a list in a reliable way
4598 * since the element is not just returned but pushed against another list
4599 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4600 */
0f5f7e9a 4601static void rpoplpushcommand(redisClient *c) {
12f9d551 4602 robj *sobj;
dd88747b 4603 list *srclist;
4604 listNode *ln;
4605
4606 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4607 checkType(c,sobj,REDIS_LIST)) return;
4608 srclist = sobj->ptr;
4609 ln = listLast(srclist);
12f9d551 4610
dd88747b 4611 if (ln == NULL) {
12f9d551 4612 addReply(c,shared.nullbulk);
4613 } else {
dd88747b 4614 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4615 robj *ele = listNodeValue(ln);
4616 list *dstlist;
e20fb74f 4617
dd88747b 4618 if (dobj && dobj->type != REDIS_LIST) {
4619 addReply(c,shared.wrongtypeerr);
4620 return;
4621 }
12f9d551 4622
dd88747b 4623 /* Add the element to the target list (unless it's directly
4624 * passed to some BLPOP-ing client */
4625 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4626 if (dobj == NULL) {
4627 /* Create the list if the key does not exist */
4628 dobj = createListObject();
4629 dictAdd(c->db->dict,c->argv[2],dobj);
4630 incrRefCount(c->argv[2]);
12f9d551 4631 }
dd88747b 4632 dstlist = dobj->ptr;
4633 listAddNodeHead(dstlist,ele);
4634 incrRefCount(ele);
12f9d551 4635 }
dd88747b 4636
4637 /* Send the element to the client as reply as well */
4638 addReplyBulk(c,ele);
4639
4640 /* Finally remove the element from the source list */
4641 listDelNode(srclist,ln);
3ea27d37 4642 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4643 server.dirty++;
12f9d551 4644 }
4645}
4646
ed9b544e 4647/* ==================================== Sets ================================ */
4648
4649static void saddCommand(redisClient *c) {
ed9b544e 4650 robj *set;
4651
3305306f 4652 set = lookupKeyWrite(c->db,c->argv[1]);
4653 if (set == NULL) {
ed9b544e 4654 set = createSetObject();
3305306f 4655 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 4656 incrRefCount(c->argv[1]);
4657 } else {
ed9b544e 4658 if (set->type != REDIS_SET) {
c937aa89 4659 addReply(c,shared.wrongtypeerr);
ed9b544e 4660 return;
4661 }
4662 }
4663 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4664 incrRefCount(c->argv[2]);
4665 server.dirty++;
c937aa89 4666 addReply(c,shared.cone);
ed9b544e 4667 } else {
c937aa89 4668 addReply(c,shared.czero);
ed9b544e 4669 }
4670}
4671
4672static void sremCommand(redisClient *c) {
3305306f 4673 robj *set;
ed9b544e 4674
dd88747b 4675 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4676 checkType(c,set,REDIS_SET)) return;
4677
4678 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4679 server.dirty++;
4680 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 4681 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4682 addReply(c,shared.cone);
ed9b544e 4683 } else {
dd88747b 4684 addReply(c,shared.czero);
ed9b544e 4685 }
4686}
4687
a4460ef4 4688static void smoveCommand(redisClient *c) {
4689 robj *srcset, *dstset;
4690
4691 srcset = lookupKeyWrite(c->db,c->argv[1]);
4692 dstset = lookupKeyWrite(c->db,c->argv[2]);
4693
4694 /* If the source key does not exist return 0, if it's of the wrong type
4695 * raise an error */
4696 if (srcset == NULL || srcset->type != REDIS_SET) {
4697 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4698 return;
4699 }
4700 /* Error if the destination key is not a set as well */
4701 if (dstset && dstset->type != REDIS_SET) {
4702 addReply(c,shared.wrongtypeerr);
4703 return;
4704 }
4705 /* Remove the element from the source set */
4706 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4707 /* Key not found in the src set! return zero */
4708 addReply(c,shared.czero);
4709 return;
4710 }
3ea27d37 4711 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4712 deleteKey(c->db,c->argv[1]);
a4460ef4 4713 server.dirty++;
4714 /* Add the element to the destination set */
4715 if (!dstset) {
4716 dstset = createSetObject();
4717 dictAdd(c->db->dict,c->argv[2],dstset);
4718 incrRefCount(c->argv[2]);
4719 }
4720 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4721 incrRefCount(c->argv[3]);
4722 addReply(c,shared.cone);
4723}
4724
ed9b544e 4725static void sismemberCommand(redisClient *c) {
3305306f 4726 robj *set;
ed9b544e 4727
dd88747b 4728 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4729 checkType(c,set,REDIS_SET)) return;
4730
4731 if (dictFind(set->ptr,c->argv[2]))
4732 addReply(c,shared.cone);
4733 else
c937aa89 4734 addReply(c,shared.czero);
ed9b544e 4735}
4736
4737static void scardCommand(redisClient *c) {
3305306f 4738 robj *o;
ed9b544e 4739 dict *s;
dd88747b 4740
4741 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4742 checkType(c,o,REDIS_SET)) return;
ed9b544e 4743
dd88747b 4744 s = o->ptr;
4745 addReplyUlong(c,dictSize(s));
ed9b544e 4746}
4747
12fea928 4748static void spopCommand(redisClient *c) {
4749 robj *set;
4750 dictEntry *de;
4751
dd88747b 4752 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4753 checkType(c,set,REDIS_SET)) return;
4754
4755 de = dictGetRandomKey(set->ptr);
4756 if (de == NULL) {
12fea928 4757 addReply(c,shared.nullbulk);
4758 } else {
dd88747b 4759 robj *ele = dictGetEntryKey(de);
12fea928 4760
dd88747b 4761 addReplyBulk(c,ele);
4762 dictDelete(set->ptr,ele);
4763 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 4764 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4765 server.dirty++;
12fea928 4766 }
4767}
4768
2abb95a9 4769static void srandmemberCommand(redisClient *c) {
4770 robj *set;
4771 dictEntry *de;
4772
dd88747b 4773 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4774 checkType(c,set,REDIS_SET)) return;
4775
4776 de = dictGetRandomKey(set->ptr);
4777 if (de == NULL) {
2abb95a9 4778 addReply(c,shared.nullbulk);
4779 } else {
dd88747b 4780 robj *ele = dictGetEntryKey(de);
2abb95a9 4781
dd88747b 4782 addReplyBulk(c,ele);
2abb95a9 4783 }
4784}
4785
ed9b544e 4786static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4787 dict **d1 = (void*) s1, **d2 = (void*) s2;
4788
3305306f 4789 return dictSize(*d1)-dictSize(*d2);
ed9b544e 4790}
4791
682ac724 4792static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 4793 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4794 dictIterator *di;
4795 dictEntry *de;
4796 robj *lenobj = NULL, *dstset = NULL;
682ac724 4797 unsigned long j, cardinality = 0;
ed9b544e 4798
ed9b544e 4799 for (j = 0; j < setsnum; j++) {
4800 robj *setobj;
3305306f 4801
4802 setobj = dstkey ?
4803 lookupKeyWrite(c->db,setskeys[j]) :
4804 lookupKeyRead(c->db,setskeys[j]);
4805 if (!setobj) {
ed9b544e 4806 zfree(dv);
5faa6025 4807 if (dstkey) {
fdcaae84 4808 if (deleteKey(c->db,dstkey))
4809 server.dirty++;
0d36ded0 4810 addReply(c,shared.czero);
5faa6025 4811 } else {
4812 addReply(c,shared.nullmultibulk);
4813 }
ed9b544e 4814 return;
4815 }
ed9b544e 4816 if (setobj->type != REDIS_SET) {
4817 zfree(dv);
c937aa89 4818 addReply(c,shared.wrongtypeerr);
ed9b544e 4819 return;
4820 }
4821 dv[j] = setobj->ptr;
4822 }
4823 /* Sort sets from the smallest to largest, this will improve our
4824 * algorithm's performace */
4825 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4826
4827 /* The first thing we should output is the total number of elements...
4828 * since this is a multi-bulk write, but at this stage we don't know
4829 * the intersection set size, so we use a trick, append an empty object
4830 * to the output list and save the pointer to later modify it with the
4831 * right length */
4832 if (!dstkey) {
4833 lenobj = createObject(REDIS_STRING,NULL);
4834 addReply(c,lenobj);
4835 decrRefCount(lenobj);
4836 } else {
4837 /* If we have a target key where to store the resulting set
4838 * create this key with an empty set inside */
4839 dstset = createSetObject();
ed9b544e 4840 }
4841
4842 /* Iterate all the elements of the first (smallest) set, and test
4843 * the element against all the other sets, if at least one set does
4844 * not include the element it is discarded */
4845 di = dictGetIterator(dv[0]);
ed9b544e 4846
4847 while((de = dictNext(di)) != NULL) {
4848 robj *ele;
4849
4850 for (j = 1; j < setsnum; j++)
4851 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4852 if (j != setsnum)
4853 continue; /* at least one set does not contain the member */
4854 ele = dictGetEntryKey(de);
4855 if (!dstkey) {
dd88747b 4856 addReplyBulk(c,ele);
ed9b544e 4857 cardinality++;
4858 } else {
4859 dictAdd(dstset->ptr,ele,NULL);
4860 incrRefCount(ele);
4861 }
4862 }
4863 dictReleaseIterator(di);
4864
83cdfe18 4865 if (dstkey) {
3ea27d37 4866 /* Store the resulting set into the target, if the intersection
4867 * is not an empty set. */
83cdfe18 4868 deleteKey(c->db,dstkey);
3ea27d37 4869 if (dictSize((dict*)dstset->ptr) > 0) {
4870 dictAdd(c->db->dict,dstkey,dstset);
4871 incrRefCount(dstkey);
d36c4e97 4872 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 4873 } else {
4874 decrRefCount(dstset);
d36c4e97 4875 addReply(c,shared.czero);
3ea27d37 4876 }
40d224a9 4877 server.dirty++;
d36c4e97 4878 } else {
4879 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 4880 }
ed9b544e 4881 zfree(dv);
4882}
4883
4884static void sinterCommand(redisClient *c) {
4885 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4886}
4887
4888static void sinterstoreCommand(redisClient *c) {
4889 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4890}
4891
f4f56e1d 4892#define REDIS_OP_UNION 0
4893#define REDIS_OP_DIFF 1
2830ca53 4894#define REDIS_OP_INTER 2
f4f56e1d 4895
4896static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 4897 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4898 dictIterator *di;
4899 dictEntry *de;
f4f56e1d 4900 robj *dstset = NULL;
40d224a9 4901 int j, cardinality = 0;
4902
40d224a9 4903 for (j = 0; j < setsnum; j++) {
4904 robj *setobj;
4905
4906 setobj = dstkey ?
4907 lookupKeyWrite(c->db,setskeys[j]) :
4908 lookupKeyRead(c->db,setskeys[j]);
4909 if (!setobj) {
4910 dv[j] = NULL;
4911 continue;
4912 }
4913 if (setobj->type != REDIS_SET) {
4914 zfree(dv);
4915 addReply(c,shared.wrongtypeerr);
4916 return;
4917 }
4918 dv[j] = setobj->ptr;
4919 }
4920
4921 /* We need a temp set object to store our union. If the dstkey
4922 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4923 * this set object will be the resulting object to set into the target key*/
4924 dstset = createSetObject();
4925
40d224a9 4926 /* Iterate all the elements of all the sets, add every element a single
4927 * time to the result set */
4928 for (j = 0; j < setsnum; j++) {
51829ed3 4929 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 4930 if (!dv[j]) continue; /* non existing keys are like empty sets */
4931
4932 di = dictGetIterator(dv[j]);
40d224a9 4933
4934 while((de = dictNext(di)) != NULL) {
4935 robj *ele;
4936
4937 /* dictAdd will not add the same element multiple times */
4938 ele = dictGetEntryKey(de);
f4f56e1d 4939 if (op == REDIS_OP_UNION || j == 0) {
4940 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
4941 incrRefCount(ele);
40d224a9 4942 cardinality++;
4943 }
f4f56e1d 4944 } else if (op == REDIS_OP_DIFF) {
4945 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
4946 cardinality--;
4947 }
40d224a9 4948 }
4949 }
4950 dictReleaseIterator(di);
51829ed3 4951
d36c4e97 4952 /* result set is empty? Exit asap. */
4953 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 4954 }
4955
f4f56e1d 4956 /* Output the content of the resulting set, if not in STORE mode */
4957 if (!dstkey) {
4958 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
4959 di = dictGetIterator(dstset->ptr);
f4f56e1d 4960 while((de = dictNext(di)) != NULL) {
4961 robj *ele;
4962
4963 ele = dictGetEntryKey(de);
dd88747b 4964 addReplyBulk(c,ele);
f4f56e1d 4965 }
4966 dictReleaseIterator(di);
d36c4e97 4967 decrRefCount(dstset);
83cdfe18
AG
4968 } else {
4969 /* If we have a target key where to store the resulting set
4970 * create this key with the result set inside */
4971 deleteKey(c->db,dstkey);
3ea27d37 4972 if (dictSize((dict*)dstset->ptr) > 0) {
4973 dictAdd(c->db->dict,dstkey,dstset);
4974 incrRefCount(dstkey);
d36c4e97 4975 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 4976 } else {
4977 decrRefCount(dstset);
d36c4e97 4978 addReply(c,shared.czero);
3ea27d37 4979 }
40d224a9 4980 server.dirty++;
4981 }
4982 zfree(dv);
4983}
4984
4985static void sunionCommand(redisClient *c) {
f4f56e1d 4986 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 4987}
4988
4989static void sunionstoreCommand(redisClient *c) {
f4f56e1d 4990 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
4991}
4992
4993static void sdiffCommand(redisClient *c) {
4994 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
4995}
4996
4997static void sdiffstoreCommand(redisClient *c) {
4998 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 4999}
5000
6b47e12e 5001/* ==================================== ZSets =============================== */
5002
5003/* ZSETs are ordered sets using two data structures to hold the same elements
5004 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5005 * data structure.
5006 *
5007 * The elements are added to an hash table mapping Redis objects to scores.
5008 * At the same time the elements are added to a skip list mapping scores
5009 * to Redis objects (so objects are sorted by scores in this "view"). */
5010
5011/* This skiplist implementation is almost a C translation of the original
5012 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5013 * Alternative to Balanced Trees", modified in three ways:
5014 * a) this implementation allows for repeated values.
5015 * b) the comparison is not just by key (our 'score') but by satellite data.
5016 * c) there is a back pointer, so it's a doubly linked list with the back
5017 * pointers being only at "level 1". This allows to traverse the list
5018 * from tail to head, useful for ZREVRANGE. */
5019
5020static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5021 zskiplistNode *zn = zmalloc(sizeof(*zn));
5022
5023 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2b37892e
PN
5024 if (level > 0)
5025 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
6b47e12e 5026 zn->score = score;
5027 zn->obj = obj;
5028 return zn;
5029}
5030
5031static zskiplist *zslCreate(void) {
5032 int j;
5033 zskiplist *zsl;
5034
5035 zsl = zmalloc(sizeof(*zsl));
5036 zsl->level = 1;
cc812361 5037 zsl->length = 0;
6b47e12e 5038 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5039 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5040 zsl->header->forward[j] = NULL;
94e543b5 5041
5042 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5043 if (j < ZSKIPLIST_MAXLEVEL-1)
5044 zsl->header->span[j] = 0;
69d95c3e 5045 }
e3870fab 5046 zsl->header->backward = NULL;
5047 zsl->tail = NULL;
6b47e12e 5048 return zsl;
5049}
5050
fd8ccf44 5051static void zslFreeNode(zskiplistNode *node) {
5052 decrRefCount(node->obj);
ad807e6f 5053 zfree(node->forward);
69d95c3e 5054 zfree(node->span);
fd8ccf44 5055 zfree(node);
5056}
5057
5058static void zslFree(zskiplist *zsl) {
ad807e6f 5059 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5060
ad807e6f 5061 zfree(zsl->header->forward);
69d95c3e 5062 zfree(zsl->header->span);
ad807e6f 5063 zfree(zsl->header);
fd8ccf44 5064 while(node) {
599379dd 5065 next = node->forward[0];
fd8ccf44 5066 zslFreeNode(node);
5067 node = next;
5068 }
ad807e6f 5069 zfree(zsl);
fd8ccf44 5070}
5071
6b47e12e 5072static int zslRandomLevel(void) {
5073 int level = 1;
5074 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5075 level += 1;
10c2baa5 5076 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5077}
5078
5079static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5080 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5081 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5082 int i, level;
5083
5084 x = zsl->header;
5085 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5086 /* store rank that is crossed to reach the insert position */
5087 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5088
9d60e6e4 5089 while (x->forward[i] &&
5090 (x->forward[i]->score < score ||
5091 (x->forward[i]->score == score &&
69d95c3e 5092 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5093 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5094 x = x->forward[i];
69d95c3e 5095 }
6b47e12e 5096 update[i] = x;
5097 }
6b47e12e 5098 /* we assume the key is not already inside, since we allow duplicated
5099 * scores, and the re-insertion of score and redis object should never
5100 * happpen since the caller of zslInsert() should test in the hash table
5101 * if the element is already inside or not. */
5102 level = zslRandomLevel();
5103 if (level > zsl->level) {
69d95c3e 5104 for (i = zsl->level; i < level; i++) {
2b37892e 5105 rank[i] = 0;
6b47e12e 5106 update[i] = zsl->header;
2b37892e 5107 update[i]->span[i-1] = zsl->length;
69d95c3e 5108 }
6b47e12e 5109 zsl->level = level;
5110 }
5111 x = zslCreateNode(level,score,obj);
5112 for (i = 0; i < level; i++) {
5113 x->forward[i] = update[i]->forward[i];
5114 update[i]->forward[i] = x;
69d95c3e
PN
5115
5116 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5117 if (i > 0) {
5118 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5119 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5120 }
6b47e12e 5121 }
69d95c3e
PN
5122
5123 /* increment span for untouched levels */
5124 for (i = level; i < zsl->level; i++) {
2b37892e 5125 update[i]->span[i-1]++;
69d95c3e
PN
5126 }
5127
bb975144 5128 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5129 if (x->forward[0])
5130 x->forward[0]->backward = x;
5131 else
5132 zsl->tail = x;
cc812361 5133 zsl->length++;
6b47e12e 5134}
5135
84105336
PN
5136/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5137void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5138 int i;
5139 for (i = 0; i < zsl->level; i++) {
5140 if (update[i]->forward[i] == x) {
5141 if (i > 0) {
5142 update[i]->span[i-1] += x->span[i-1] - 1;
5143 }
5144 update[i]->forward[i] = x->forward[i];
5145 } else {
5146 /* invariant: i > 0, because update[0]->forward[0]
5147 * is always equal to x */
5148 update[i]->span[i-1] -= 1;
5149 }
5150 }
5151 if (x->forward[0]) {
5152 x->forward[0]->backward = x->backward;
5153 } else {
5154 zsl->tail = x->backward;
5155 }
5156 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5157 zsl->level--;
5158 zsl->length--;
5159}
5160
50c55df5 5161/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5162static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5163 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5164 int i;
5165
5166 x = zsl->header;
5167 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5168 while (x->forward[i] &&
5169 (x->forward[i]->score < score ||
5170 (x->forward[i]->score == score &&
5171 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5172 x = x->forward[i];
5173 update[i] = x;
5174 }
5175 /* We may have multiple elements with the same score, what we need
5176 * is to find the element with both the right score and object. */
5177 x = x->forward[0];
50c55df5 5178 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
84105336 5179 zslDeleteNode(zsl, x, update);
9d60e6e4 5180 zslFreeNode(x);
9d60e6e4 5181 return 1;
5182 } else {
5183 return 0; /* not found */
e197b441 5184 }
5185 return 0; /* not found */
fd8ccf44 5186}
5187
1807985b 5188/* Delete all the elements with score between min and max from the skiplist.
5189 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5190 * Note that this function takes the reference to the hash table view of the
5191 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5192static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5193 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5194 unsigned long removed = 0;
5195 int i;
5196
5197 x = zsl->header;
5198 for (i = zsl->level-1; i >= 0; i--) {
5199 while (x->forward[i] && x->forward[i]->score < min)
5200 x = x->forward[i];
5201 update[i] = x;
5202 }
5203 /* We may have multiple elements with the same score, what we need
5204 * is to find the element with both the right score and object. */
5205 x = x->forward[0];
5206 while (x && x->score <= max) {
84105336
PN
5207 zskiplistNode *next = x->forward[0];
5208 zslDeleteNode(zsl, x, update);
1807985b 5209 dictDelete(dict,x->obj);
5210 zslFreeNode(x);
1807985b 5211 removed++;
5212 x = next;
5213 }
5214 return removed; /* not found */
5215}
1807985b 5216
9212eafd 5217/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5218 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5219static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5220 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5221 unsigned long traversed = 0, removed = 0;
5222 int i;
5223
9212eafd
PN
5224 x = zsl->header;
5225 for (i = zsl->level-1; i >= 0; i--) {
5226 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5227 traversed += i > 0 ? x->span[i-1] : 1;
5228 x = x->forward[i];
1807985b 5229 }
9212eafd
PN
5230 update[i] = x;
5231 }
5232
5233 traversed++;
5234 x = x->forward[0];
5235 while (x && traversed <= end) {
84105336
PN
5236 zskiplistNode *next = x->forward[0];
5237 zslDeleteNode(zsl, x, update);
1807985b 5238 dictDelete(dict,x->obj);
5239 zslFreeNode(x);
1807985b 5240 removed++;
9212eafd 5241 traversed++;
1807985b 5242 x = next;
5243 }
9212eafd 5244 return removed;
1807985b 5245}
5246
50c55df5 5247/* Find the first node having a score equal or greater than the specified one.
5248 * Returns NULL if there is no match. */
5249static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5250 zskiplistNode *x;
5251 int i;
5252
5253 x = zsl->header;
5254 for (i = zsl->level-1; i >= 0; i--) {
5255 while (x->forward[i] && x->forward[i]->score < score)
5256 x = x->forward[i];
5257 }
5258 /* We may have multiple elements with the same score, what we need
5259 * is to find the element with both the right score and object. */
5260 return x->forward[0];
5261}
5262
27b0ccca
PN
5263/* Find the rank for an element by both score and key.
5264 * Returns 0 when the element cannot be found, rank otherwise.
5265 * Note that the rank is 1-based due to the span of zsl->header to the
5266 * first element. */
5267static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5268 zskiplistNode *x;
5269 unsigned long rank = 0;
5270 int i;
5271
5272 x = zsl->header;
5273 for (i = zsl->level-1; i >= 0; i--) {
5274 while (x->forward[i] &&
5275 (x->forward[i]->score < score ||
5276 (x->forward[i]->score == score &&
5277 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5278 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5279 x = x->forward[i];
5280 }
5281
5282 /* x might be equal to zsl->header, so test if obj is non-NULL */
5283 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5284 return rank;
5285 }
5286 }
5287 return 0;
5288}
5289
e74825c2
PN
5290/* Finds an element by its rank. The rank argument needs to be 1-based. */
5291zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5292 zskiplistNode *x;
5293 unsigned long traversed = 0;
5294 int i;
5295
5296 x = zsl->header;
5297 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 5298 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5299 {
a50ea45c 5300 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5301 x = x->forward[i];
5302 }
e74825c2
PN
5303 if (traversed == rank) {
5304 return x;
5305 }
5306 }
5307 return NULL;
5308}
5309
fd8ccf44 5310/* The actual Z-commands implementations */
5311
7db723ad 5312/* This generic command implements both ZADD and ZINCRBY.
e2665397 5313 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5314 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5315static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5316 robj *zsetobj;
5317 zset *zs;
5318 double *score;
5319
e2665397 5320 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5321 if (zsetobj == NULL) {
5322 zsetobj = createZsetObject();
e2665397 5323 dictAdd(c->db->dict,key,zsetobj);
5324 incrRefCount(key);
fd8ccf44 5325 } else {
5326 if (zsetobj->type != REDIS_ZSET) {
5327 addReply(c,shared.wrongtypeerr);
5328 return;
5329 }
5330 }
fd8ccf44 5331 zs = zsetobj->ptr;
e2665397 5332
7db723ad 5333 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5334 * needs to handle the two different conditions. It's all about setting
5335 * '*score', that is, the new score to set, to the right value. */
5336 score = zmalloc(sizeof(double));
5337 if (doincrement) {
5338 dictEntry *de;
5339
5340 /* Read the old score. If the element was not present starts from 0 */
5341 de = dictFind(zs->dict,ele);
5342 if (de) {
5343 double *oldscore = dictGetEntryVal(de);
5344 *score = *oldscore + scoreval;
5345 } else {
5346 *score = scoreval;
5347 }
5348 } else {
5349 *score = scoreval;
5350 }
5351
5352 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5353 * to both ZADD and ZINCRBY... */
e2665397 5354 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5355 /* case 1: New element */
e2665397 5356 incrRefCount(ele); /* added to hash */
5357 zslInsert(zs->zsl,*score,ele);
5358 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5359 server.dirty++;
e2665397 5360 if (doincrement)
e2665397 5361 addReplyDouble(c,*score);
91d71bfc 5362 else
5363 addReply(c,shared.cone);
fd8ccf44 5364 } else {
5365 dictEntry *de;
5366 double *oldscore;
5367
5368 /* case 2: Score update operation */
e2665397 5369 de = dictFind(zs->dict,ele);
dfc5e96c 5370 redisAssert(de != NULL);
fd8ccf44 5371 oldscore = dictGetEntryVal(de);
5372 if (*score != *oldscore) {
5373 int deleted;
5374
e2665397 5375 /* Remove and insert the element in the skip list with new score */
5376 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5377 redisAssert(deleted != 0);
e2665397 5378 zslInsert(zs->zsl,*score,ele);
5379 incrRefCount(ele);
5380 /* Update the score in the hash table */
5381 dictReplace(zs->dict,ele,score);
fd8ccf44 5382 server.dirty++;
2161a965 5383 } else {
5384 zfree(score);
fd8ccf44 5385 }
e2665397 5386 if (doincrement)
5387 addReplyDouble(c,*score);
5388 else
5389 addReply(c,shared.czero);
fd8ccf44 5390 }
5391}
5392
e2665397 5393static void zaddCommand(redisClient *c) {
5394 double scoreval;
5395
5396 scoreval = strtod(c->argv[2]->ptr,NULL);
5397 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5398}
5399
7db723ad 5400static void zincrbyCommand(redisClient *c) {
e2665397 5401 double scoreval;
5402
5403 scoreval = strtod(c->argv[2]->ptr,NULL);
5404 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5405}
5406
1b7106e7 5407static void zremCommand(redisClient *c) {
5408 robj *zsetobj;
5409 zset *zs;
dd88747b 5410 dictEntry *de;
5411 double *oldscore;
5412 int deleted;
1b7106e7 5413
dd88747b 5414 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5415 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 5416
dd88747b 5417 zs = zsetobj->ptr;
5418 de = dictFind(zs->dict,c->argv[2]);
5419 if (de == NULL) {
5420 addReply(c,shared.czero);
5421 return;
1b7106e7 5422 }
dd88747b 5423 /* Delete from the skiplist */
5424 oldscore = dictGetEntryVal(de);
5425 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5426 redisAssert(deleted != 0);
5427
5428 /* Delete from the hash table */
5429 dictDelete(zs->dict,c->argv[2]);
5430 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5431 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5432 server.dirty++;
5433 addReply(c,shared.cone);
1b7106e7 5434}
5435
1807985b 5436static void zremrangebyscoreCommand(redisClient *c) {
5437 double min = strtod(c->argv[2]->ptr,NULL);
5438 double max = strtod(c->argv[3]->ptr,NULL);
dd88747b 5439 long deleted;
1807985b 5440 robj *zsetobj;
5441 zset *zs;
5442
dd88747b 5443 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5444 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 5445
dd88747b 5446 zs = zsetobj->ptr;
5447 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5448 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5449 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5450 server.dirty += deleted;
5451 addReplyLong(c,deleted);
1807985b 5452}
5453
9212eafd
PN
5454static void zremrangebyrankCommand(redisClient *c) {
5455 int start = atoi(c->argv[2]->ptr);
5456 int end = atoi(c->argv[3]->ptr);
dd88747b 5457 int llen;
5458 long deleted;
9212eafd
PN
5459 robj *zsetobj;
5460 zset *zs;
5461
dd88747b 5462 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5463 checkType(c,zsetobj,REDIS_ZSET)) return;
5464 zs = zsetobj->ptr;
5465 llen = zs->zsl->length;
9212eafd 5466
dd88747b 5467 /* convert negative indexes */
5468 if (start < 0) start = llen+start;
5469 if (end < 0) end = llen+end;
5470 if (start < 0) start = 0;
5471 if (end < 0) end = 0;
9212eafd 5472
dd88747b 5473 /* indexes sanity checks */
5474 if (start > end || start >= llen) {
5475 addReply(c,shared.czero);
5476 return;
9212eafd 5477 }
dd88747b 5478 if (end >= llen) end = llen-1;
5479
5480 /* increment start and end because zsl*Rank functions
5481 * use 1-based rank */
5482 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5483 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5484 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5485 server.dirty += deleted;
5486 addReplyLong(c, deleted);
9212eafd
PN
5487}
5488
8f92e768
PN
5489typedef struct {
5490 dict *dict;
5491 double weight;
5492} zsetopsrc;
5493
5494static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5495 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5496 unsigned long size1, size2;
5497 size1 = d1->dict ? dictSize(d1->dict) : 0;
5498 size2 = d2->dict ? dictSize(d2->dict) : 0;
5499 return size1 - size2;
5500}
5501
d2764cd6
PN
/* How scores of the same member coming from different inputs are merged. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Fold 'val' into '*target' according to 'aggregate' (one of the
 * REDIS_AGGR_* constants). Any other value trips an assertion. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        *target = val < *target ? val : *target;
        break;
    case REDIS_AGGR_MAX:
        *target = val > *target ? val : *target;
        break;
    default:
        /* safety net */
        redisAssert(0 != 0);
        break;
    }
}
5518
2830ca53 5519static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
8f92e768 5520 int i, j, zsetnum;
d2764cd6 5521 int aggregate = REDIS_AGGR_SUM;
8f92e768 5522 zsetopsrc *src;
2830ca53
PN
5523 robj *dstobj;
5524 zset *dstzset;
b287c9bb
PN
5525 dictIterator *di;
5526 dictEntry *de;
5527
2830ca53
PN
5528 /* expect zsetnum input keys to be given */
5529 zsetnum = atoi(c->argv[2]->ptr);
5530 if (zsetnum < 1) {
5531 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5532 return;
b287c9bb 5533 }
2830ca53
PN
5534
5535 /* test if the expected number of keys would overflow */
5536 if (3+zsetnum > c->argc) {
b287c9bb
PN
5537 addReply(c,shared.syntaxerr);
5538 return;
5539 }
5540
2830ca53 5541 /* read keys to be used for input */
b9eed483 5542 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
2830ca53 5543 for (i = 0, j = 3; i < zsetnum; i++, j++) {
b287c9bb
PN
5544 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5545 if (!zsetobj) {
8f92e768 5546 src[i].dict = NULL;
b287c9bb
PN
5547 } else {
5548 if (zsetobj->type != REDIS_ZSET) {
8f92e768 5549 zfree(src);
b287c9bb
PN
5550 addReply(c,shared.wrongtypeerr);
5551 return;
5552 }
8f92e768 5553 src[i].dict = ((zset*)zsetobj->ptr)->dict;
b287c9bb 5554 }
2830ca53
PN
5555
5556 /* default all weights to 1 */
8f92e768 5557 src[i].weight = 1.0;
b287c9bb
PN
5558 }
5559
2830ca53
PN
5560 /* parse optional extra arguments */
5561 if (j < c->argc) {
d2764cd6 5562 int remaining = c->argc - j;
b287c9bb 5563
2830ca53 5564 while (remaining) {
d2764cd6 5565 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 5566 j++; remaining--;
2830ca53 5567 for (i = 0; i < zsetnum; i++, j++, remaining--) {
8f92e768 5568 src[i].weight = strtod(c->argv[j]->ptr, NULL);
2830ca53 5569 }
d2764cd6
PN
5570 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5571 j++; remaining--;
5572 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5573 aggregate = REDIS_AGGR_SUM;
5574 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5575 aggregate = REDIS_AGGR_MIN;
5576 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5577 aggregate = REDIS_AGGR_MAX;
5578 } else {
5579 zfree(src);
5580 addReply(c,shared.syntaxerr);
5581 return;
5582 }
5583 j++; remaining--;
2830ca53 5584 } else {
8f92e768 5585 zfree(src);
2830ca53
PN
5586 addReply(c,shared.syntaxerr);
5587 return;
5588 }
5589 }
5590 }
b287c9bb 5591
d2764cd6
PN
5592 /* sort sets from the smallest to largest, this will improve our
5593 * algorithm's performance */
5594 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5595
2830ca53
PN
5596 dstobj = createZsetObject();
5597 dstzset = dstobj->ptr;
5598
5599 if (op == REDIS_OP_INTER) {
8f92e768
PN
5600 /* skip going over all entries if the smallest zset is NULL or empty */
5601 if (src[0].dict && dictSize(src[0].dict) > 0) {
5602 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5603 * from small to large, all src[i > 0].dict are non-empty too */
5604 di = dictGetIterator(src[0].dict);
2830ca53 5605 while((de = dictNext(di)) != NULL) {
d2764cd6
PN
5606 double *score = zmalloc(sizeof(double)), value;
5607 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5608
d2764cd6
PN
5609 for (j = 1; j < zsetnum; j++) {
5610 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5611 if (other) {
d2764cd6
PN
5612 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5613 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5614 } else {
5615 break;
5616 }
5617 }
b287c9bb 5618
2830ca53 5619 /* skip entry when not present in every source dict */
8f92e768 5620 if (j != zsetnum) {
2830ca53
PN
5621 zfree(score);
5622 } else {
5623 robj *o = dictGetEntryKey(de);
5624 dictAdd(dstzset->dict,o,score);
5625 incrRefCount(o); /* added to dictionary */
5626 zslInsert(dstzset->zsl,*score,o);
5627 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
5628 }
5629 }
2830ca53
PN
5630 dictReleaseIterator(di);
5631 }
5632 } else if (op == REDIS_OP_UNION) {
5633 for (i = 0; i < zsetnum; i++) {
8f92e768 5634 if (!src[i].dict) continue;
2830ca53 5635
8f92e768 5636 di = dictGetIterator(src[i].dict);
2830ca53
PN
5637 while((de = dictNext(di)) != NULL) {
5638 /* skip key when already processed */
5639 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5640
d2764cd6
PN
5641 double *score = zmalloc(sizeof(double)), value;
5642 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5643
d2764cd6
PN
5644 /* because the zsets are sorted by size, its only possible
5645 * for sets at larger indices to hold this entry */
5646 for (j = (i+1); j < zsetnum; j++) {
5647 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5648 if (other) {
d2764cd6
PN
5649 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5650 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5651 }
5652 }
b287c9bb 5653
2830ca53
PN
5654 robj *o = dictGetEntryKey(de);
5655 dictAdd(dstzset->dict,o,score);
5656 incrRefCount(o); /* added to dictionary */
5657 zslInsert(dstzset->zsl,*score,o);
5658 incrRefCount(o); /* added to skiplist */
5659 }
5660 dictReleaseIterator(di);
b287c9bb 5661 }
2830ca53
PN
5662 } else {
5663 /* unknown operator */
5664 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
5665 }
5666
5667 deleteKey(c->db,dstkey);
3ea27d37 5668 if (dstzset->zsl->length) {
5669 dictAdd(c->db->dict,dstkey,dstobj);
5670 incrRefCount(dstkey);
5671 addReplyLong(c, dstzset->zsl->length);
5672 server.dirty++;
5673 } else {
8bca8773 5674 decrRefCount(dstobj);
3ea27d37 5675 addReply(c, shared.czero);
5676 }
8f92e768 5677 zfree(src);
b287c9bb
PN
5678}
5679
2830ca53
PN
5680static void zunionCommand(redisClient *c) {
5681 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
5682}
5683
2830ca53
PN
5684static void zinterCommand(redisClient *c) {
5685 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
5686}
5687
e3870fab 5688static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 5689 robj *o;
5690 int start = atoi(c->argv[2]->ptr);
5691 int end = atoi(c->argv[3]->ptr);
752da584 5692 int withscores = 0;
dd88747b 5693 int llen;
5694 int rangelen, j;
5695 zset *zsetobj;
5696 zskiplist *zsl;
5697 zskiplistNode *ln;
5698 robj *ele;
752da584 5699
5700 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5701 withscores = 1;
5702 } else if (c->argc >= 5) {
5703 addReply(c,shared.syntaxerr);
5704 return;
5705 }
cc812361 5706
dd88747b 5707 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL ||
5708 checkType(c,o,REDIS_ZSET)) return;
5709 zsetobj = o->ptr;
5710 zsl = zsetobj->zsl;
5711 llen = zsl->length;
cc812361 5712
dd88747b 5713 /* convert negative indexes */
5714 if (start < 0) start = llen+start;
5715 if (end < 0) end = llen+end;
5716 if (start < 0) start = 0;
5717 if (end < 0) end = 0;
cc812361 5718
dd88747b 5719 /* indexes sanity checks */
5720 if (start > end || start >= llen) {
5721 /* Out of range start or start > end result in empty list */
5722 addReply(c,shared.emptymultibulk);
5723 return;
5724 }
5725 if (end >= llen) end = llen-1;
5726 rangelen = (end-start)+1;
cc812361 5727
dd88747b 5728 /* check if starting point is trivial, before searching
5729 * the element in log(N) time */
5730 if (reverse) {
5731 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5732 } else {
5733 ln = start == 0 ?
5734 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5735 }
cc812361 5736
dd88747b 5737 /* Return the result in form of a multi-bulk reply */
5738 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5739 withscores ? (rangelen*2) : rangelen));
5740 for (j = 0; j < rangelen; j++) {
5741 ele = ln->obj;
5742 addReplyBulk(c,ele);
5743 if (withscores)
5744 addReplyDouble(c,ln->score);
5745 ln = reverse ? ln->backward : ln->forward[0];
cc812361 5746 }
5747}
5748
e3870fab 5749static void zrangeCommand(redisClient *c) {
5750 zrangeGenericCommand(c,0);
5751}
5752
5753static void zrevrangeCommand(redisClient *c) {
5754 zrangeGenericCommand(c,1);
5755}
5756
f44dd428 5757/* This command implements both ZRANGEBYSCORE and ZCOUNT.
5758 * If justcount is non-zero, just the count is returned. */
5759static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 5760 robj *o;
f44dd428 5761 double min, max;
5762 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 5763 int offset = 0, limit = -1;
0500ef27
SH
5764 int withscores = 0;
5765 int badsyntax = 0;
5766
f44dd428 5767 /* Parse the min-max interval. If one of the values is prefixed
5768 * by the "(" character, it's considered "open". For instance
5769 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5770 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5771 if (((char*)c->argv[2]->ptr)[0] == '(') {
5772 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5773 minex = 1;
5774 } else {
5775 min = strtod(c->argv[2]->ptr,NULL);
5776 }
5777 if (((char*)c->argv[3]->ptr)[0] == '(') {
5778 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5779 maxex = 1;
5780 } else {
5781 max = strtod(c->argv[3]->ptr,NULL);
5782 }
5783
5784 /* Parse "WITHSCORES": note that if the command was called with
5785 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5786 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 5787 if (c->argc == 5 || c->argc == 8) {
3a3978b1 5788 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5789 withscores = 1;
5790 else
5791 badsyntax = 1;
0500ef27 5792 }
3a3978b1 5793 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 5794 badsyntax = 1;
0500ef27 5795 if (badsyntax) {
454d4e43 5796 addReplySds(c,
5797 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 5798 return;
0500ef27
SH
5799 }
5800
f44dd428 5801 /* Parse "LIMIT" */
0500ef27 5802 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 5803 addReply(c,shared.syntaxerr);
5804 return;
0500ef27 5805 } else if (c->argc == (7 + withscores)) {
80181f78 5806 offset = atoi(c->argv[5]->ptr);
5807 limit = atoi(c->argv[6]->ptr);
0b13687c 5808 if (offset < 0) offset = 0;
80181f78 5809 }
50c55df5 5810
f44dd428 5811 /* Ok, lookup the key and get the range */
50c55df5 5812 o = lookupKeyRead(c->db,c->argv[1]);
5813 if (o == NULL) {
f44dd428 5814 addReply(c,justcount ? shared.czero : shared.nullmultibulk);
50c55df5 5815 } else {
5816 if (o->type != REDIS_ZSET) {
5817 addReply(c,shared.wrongtypeerr);
5818 } else {
5819 zset *zsetobj = o->ptr;
5820 zskiplist *zsl = zsetobj->zsl;
5821 zskiplistNode *ln;
f44dd428 5822 robj *ele, *lenobj = NULL;
5823 unsigned long rangelen = 0;
50c55df5 5824
f44dd428 5825 /* Get the first node with the score >= min, or with
5826 * score > min if 'minex' is true. */
50c55df5 5827 ln = zslFirstWithScore(zsl,min);
f44dd428 5828 while (minex && ln && ln->score == min) ln = ln->forward[0];
5829
50c55df5 5830 if (ln == NULL) {
5831 /* No element matching the speciifed interval */
f44dd428 5832 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 5833 return;
5834 }
5835
5836 /* We don't know in advance how many matching elements there
5837 * are in the list, so we push this object that will represent
5838 * the multi-bulk length in the output buffer, and will "fix"
5839 * it later */
f44dd428 5840 if (!justcount) {
5841 lenobj = createObject(REDIS_STRING,NULL);
5842 addReply(c,lenobj);
5843 decrRefCount(lenobj);
5844 }
50c55df5 5845
f44dd428 5846 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 5847 if (offset) {
5848 offset--;
5849 ln = ln->forward[0];
5850 continue;
5851 }
5852 if (limit == 0) break;
f44dd428 5853 if (!justcount) {
5854 ele = ln->obj;
dd88747b 5855 addReplyBulk(c,ele);
f44dd428 5856 if (withscores)
5857 addReplyDouble(c,ln->score);
5858 }
50c55df5 5859 ln = ln->forward[0];
5860 rangelen++;
80181f78 5861 if (limit > 0) limit--;
50c55df5 5862 }
f44dd428 5863 if (justcount) {
5864 addReplyLong(c,(long)rangelen);
5865 } else {
5866 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5867 withscores ? (rangelen*2) : rangelen);
5868 }
50c55df5 5869 }
5870 }
5871}
5872
f44dd428 5873static void zrangebyscoreCommand(redisClient *c) {
5874 genericZrangebyscoreCommand(c,0);
5875}
5876
5877static void zcountCommand(redisClient *c) {
5878 genericZrangebyscoreCommand(c,1);
5879}
5880
3c41331e 5881static void zcardCommand(redisClient *c) {
e197b441 5882 robj *o;
5883 zset *zs;
dd88747b 5884
5885 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5886 checkType(c,o,REDIS_ZSET)) return;
5887
5888 zs = o->ptr;
5889 addReplyUlong(c,zs->zsl->length);
e197b441 5890}
5891
6e333bbe 5892static void zscoreCommand(redisClient *c) {
5893 robj *o;
5894 zset *zs;
dd88747b 5895 dictEntry *de;
5896
5897 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5898 checkType(c,o,REDIS_ZSET)) return;
5899
5900 zs = o->ptr;
5901 de = dictFind(zs->dict,c->argv[2]);
5902 if (!de) {
96d8b4ee 5903 addReply(c,shared.nullbulk);
6e333bbe 5904 } else {
dd88747b 5905 double *score = dictGetEntryVal(de);
6e333bbe 5906
dd88747b 5907 addReplyDouble(c,*score);
6e333bbe 5908 }
5909}
5910
798d9e55 5911static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 5912 robj *o;
dd88747b 5913 zset *zs;
5914 zskiplist *zsl;
5915 dictEntry *de;
5916 unsigned long rank;
5917 double *score;
5918
5919 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5920 checkType(c,o,REDIS_ZSET)) return;
5921
5922 zs = o->ptr;
5923 zsl = zs->zsl;
5924 de = dictFind(zs->dict,c->argv[2]);
5925 if (!de) {
69d95c3e
PN
5926 addReply(c,shared.nullbulk);
5927 return;
5928 }
69d95c3e 5929
dd88747b 5930 score = dictGetEntryVal(de);
5931 rank = zslGetRank(zsl, *score, c->argv[2]);
5932 if (rank) {
5933 if (reverse) {
5934 addReplyLong(c, zsl->length - rank);
27b0ccca 5935 } else {
dd88747b 5936 addReplyLong(c, rank-1);
69d95c3e 5937 }
dd88747b 5938 } else {
5939 addReply(c,shared.nullbulk);
978c2c94 5940 }
5941}
5942
798d9e55
PN
5943static void zrankCommand(redisClient *c) {
5944 zrankGenericCommand(c, 0);
5945}
5946
5947static void zrevrankCommand(redisClient *c) {
5948 zrankGenericCommand(c, 1);
5949}
5950
cbba7dd7 5951/* =================================== Hashes =============================== */
978c2c94 5952static void hsetCommand(redisClient *c) {
5953 int update = 0;
5954 robj *o = lookupKeyWrite(c->db,c->argv[1]);
5955
5956 if (o == NULL) {
5957 o = createHashObject();
5958 dictAdd(c->db->dict,c->argv[1],o);
5959 incrRefCount(c->argv[1]);
5960 } else {
5961 if (o->type != REDIS_HASH) {
5962 addReply(c,shared.wrongtypeerr);
5963 return;
5964 }
5965 }
bae2c7ec 5966 /* We want to convert the zipmap into an hash table right now if the
5967 * entry to be added is too big. Note that we check if the object
5968 * is integer encoded before to try fetching the length in the test below.
5969 * This is because integers are small, but currently stringObjectLen()
5970 * performs a slow conversion: not worth it. */
5971 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
5972 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
5973 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
5974 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
5975 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
5976 {
5977 convertToRealHash(o);
5978 }
5979
978c2c94 5980 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
5981 unsigned char *zm = o->ptr;
b1befe6a 5982 robj *valobj = getDecodedObject(c->argv[3]);
978c2c94 5983
5984 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
b1befe6a 5985 valobj->ptr,sdslen(valobj->ptr),&update);
5986 decrRefCount(valobj);
cbba7dd7 5987 o->ptr = zm;
bae2c7ec 5988
e9484a85
PN
5989 /* And here there is the second check for hash conversion. */
5990 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
bae2c7ec 5991 convertToRealHash(o);
978c2c94 5992 } else {
05df7621 5993 c->argv[2] = tryObjectEncoding(c->argv[2]);
bae2c7ec 5994 /* note that c->argv[3] is already encoded, as the latest arg
5995 * of a bulk command is always integer encoded if possible. */
2069d06a 5996 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
978c2c94 5997 incrRefCount(c->argv[2]);
5998 } else {
5999 update = 1;
6000 }
6001 incrRefCount(c->argv[3]);
6002 }
6003 server.dirty++;
6004 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
6005}
6006
01426b05 6007static void hincrbyCommand(redisClient *c) {
01426b05
PN
6008 long long value = 0, incr = 0;
6009 robj *o = lookupKeyWrite(c->db,c->argv[1]);
6010
6011 if (o == NULL) {
6012 o = createHashObject();
6013 dictAdd(c->db->dict,c->argv[1],o);
6014 incrRefCount(c->argv[1]);
6015 } else {
6016 if (o->type != REDIS_HASH) {
6017 addReply(c,shared.wrongtypeerr);
6018 return;
6019 }
6020 }
6021
5e26ae88 6022 incr = strtoll(c->argv[3]->ptr, NULL, 10);
01426b05
PN
6023 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6024 unsigned char *zm = o->ptr;
6025 unsigned char *zval;
6026 unsigned int zvlen;
6027
6028 /* Find value if already present in hash */
6029 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6030 &zval,&zvlen)) {
6031 /* strtoll needs the char* to have a trailing \0, but
6032 * the zipmap doesn't include them. */
6033 sds szval = sdsnewlen(zval, zvlen);
6034 value = strtoll(szval,NULL,10);
6035 sdsfree(szval);
6036 }
6037
6038 value += incr;
6039 sds svalue = sdscatprintf(sdsempty(),"%lld",value);
6040 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
e9484a85 6041 (unsigned char*)svalue,sdslen(svalue),NULL);
01426b05
PN
6042 sdsfree(svalue);
6043 o->ptr = zm;
6044
e9484a85
PN
6045 /* Check if the zipmap needs to be converted. */
6046 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
01426b05
PN
6047 convertToRealHash(o);
6048 } else {
6049 robj *hval;
6050 dictEntry *de;
6051
6052 /* Find value if already present in hash */
6053 de = dictFind(o->ptr,c->argv[2]);
6054 if (de != NULL) {
6055 hval = dictGetEntryVal(de);
6056 if (hval->encoding == REDIS_ENCODING_RAW)
6057 value = strtoll(hval->ptr,NULL,10);
6058 else if (hval->encoding == REDIS_ENCODING_INT)
6059 value = (long)hval->ptr;
6060 else
6061 redisAssert(1 != 1);
6062 }
6063
6064 value += incr;
6065 hval = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
05df7621 6066 hval = tryObjectEncoding(hval);
01426b05
PN
6067 if (dictReplace(o->ptr,c->argv[2],hval)) {
6068 incrRefCount(c->argv[2]);
6069 }
6070 }
6071
6072 server.dirty++;
aa7c2934 6073 addReplyLongLong(c, value);
01426b05
PN
6074}
6075
978c2c94 6076static void hgetCommand(redisClient *c) {
dd88747b 6077 robj *o;
978c2c94 6078
dd88747b 6079 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6080 checkType(c,o,REDIS_HASH)) return;
6081
6082 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6083 unsigned char *zm = o->ptr;
6084 unsigned char *val;
6085 unsigned int vlen;
164ee595 6086 robj *field;
dd88747b 6087
164ee595 6088 field = getDecodedObject(c->argv[2]);
6089 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
dd88747b 6090 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6091 addReplySds(c,sdsnewlen(val,vlen));
6092 addReply(c,shared.crlf);
164ee595 6093 decrRefCount(field);
dd88747b 6094 return;
6095 } else {
6096 addReply(c,shared.nullbulk);
164ee595 6097 decrRefCount(field);
bcd11906 6098 return;
6099 }
dd88747b 6100 } else {
6101 struct dictEntry *de;
bcd11906 6102
dd88747b 6103 de = dictFind(o->ptr,c->argv[2]);
6104 if (de == NULL) {
6105 addReply(c,shared.nullbulk);
978c2c94 6106 } else {
dd88747b 6107 robj *e = dictGetEntryVal(de);
978c2c94 6108
dd88747b 6109 addReplyBulk(c,e);
978c2c94 6110 }
69d95c3e 6111 }
69d95c3e
PN
6112}
6113
07efaf74 6114static void hdelCommand(redisClient *c) {
dd88747b 6115 robj *o;
6116 int deleted = 0;
07efaf74 6117
dd88747b 6118 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6119 checkType(c,o,REDIS_HASH)) return;
07efaf74 6120
dd88747b 6121 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
2a1198b4 6122 robj *field = getDecodedObject(c->argv[2]);
6123
dd88747b 6124 o->ptr = zipmapDel((unsigned char*) o->ptr,
2a1198b4 6125 (unsigned char*) field->ptr,
6126 sdslen(field->ptr), &deleted);
6127 decrRefCount(field);
3ea27d37 6128 if (zipmapLen((unsigned char*) o->ptr) == 0)
6129 deleteKey(c->db,c->argv[1]);
dd88747b 6130 } else {
6131 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
3ea27d37 6132 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6133 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
07efaf74 6134 }
c77169b7 6135 if (deleted) server.dirty++;
dd88747b 6136 addReply(c,deleted ? shared.cone : shared.czero);
07efaf74 6137}
6138
92b27fe9 6139static void hlenCommand(redisClient *c) {
6140 robj *o;
6141 unsigned long len;
6142
dd88747b 6143 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 6144 checkType(c,o,REDIS_HASH)) return;
6145
6146 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6147 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6148 addReplyUlong(c,len);
6149}
6150
78409a0f 6151#define REDIS_GETALL_KEYS 1
6152#define REDIS_GETALL_VALS 2
6153static void genericHgetallCommand(redisClient *c, int flags) {
6154 robj *o, *lenobj;
6155 unsigned long count = 0;
6156
6157 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullmultibulk)) == NULL
6158 || checkType(c,o,REDIS_HASH)) return;
6159
6160 lenobj = createObject(REDIS_STRING,NULL);
6161 addReply(c,lenobj);
6162 decrRefCount(lenobj);
6163
6164 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6165 unsigned char *p = zipmapRewind(o->ptr);
6166 unsigned char *field, *val;
6167 unsigned int flen, vlen;
6168
6169 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6170 robj *aux;
6171
6172 if (flags & REDIS_GETALL_KEYS) {
6173 aux = createStringObject((char*)field,flen);
6174 addReplyBulk(c,aux);
6175 decrRefCount(aux);
6176 count++;
6177 }
6178 if (flags & REDIS_GETALL_VALS) {
6179 aux = createStringObject((char*)val,vlen);
6180 addReplyBulk(c,aux);
6181 decrRefCount(aux);
6182 count++;
6183 }
6184 }
6185 } else {
6186 dictIterator *di = dictGetIterator(o->ptr);
6187 dictEntry *de;
6188
6189 while((de = dictNext(di)) != NULL) {
6190 robj *fieldobj = dictGetEntryKey(de);
6191 robj *valobj = dictGetEntryVal(de);
6192
6193 if (flags & REDIS_GETALL_KEYS) {
6194 addReplyBulk(c,fieldobj);
6195 count++;
6196 }
6197 if (flags & REDIS_GETALL_VALS) {
6198 addReplyBulk(c,valobj);
6199 count++;
6200 }
6201 }
6202 dictReleaseIterator(di);
6203 }
6204 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6205}
6206
6207static void hkeysCommand(redisClient *c) {
6208 genericHgetallCommand(c,REDIS_GETALL_KEYS);
6209}
6210
6211static void hvalsCommand(redisClient *c) {
6212 genericHgetallCommand(c,REDIS_GETALL_VALS);
6213}
6214
6215static void hgetallCommand(redisClient *c) {
6216 genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
6217}
6218
a86f14b1 6219static void hexistsCommand(redisClient *c) {
6220 robj *o;
6221 int exists = 0;
6222
6223 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6224 checkType(c,o,REDIS_HASH)) return;
6225
6226 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6227 robj *field;
6228 unsigned char *zm = o->ptr;
6229
6230 field = getDecodedObject(c->argv[2]);
6231 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6232 decrRefCount(field);
6233 } else {
6234 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6235 }
6236 addReply(c,exists ? shared.cone : shared.czero);
6237}
6238
ada386b2 6239static void convertToRealHash(robj *o) {
6240 unsigned char *key, *val, *p, *zm = o->ptr;
6241 unsigned int klen, vlen;
6242 dict *dict = dictCreate(&hashDictType,NULL);
6243
6244 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6245 p = zipmapRewind(zm);
6246 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6247 robj *keyobj, *valobj;
6248
6249 keyobj = createStringObject((char*)key,klen);
6250 valobj = createStringObject((char*)val,vlen);
05df7621 6251 keyobj = tryObjectEncoding(keyobj);
6252 valobj = tryObjectEncoding(valobj);
ada386b2 6253 dictAdd(dict,keyobj,valobj);
6254 }
6255 o->encoding = REDIS_ENCODING_HT;
6256 o->ptr = dict;
6257 zfree(zm);
6258}
6259
6b47e12e 6260/* ========================= Non type-specific commands ==================== */
6261
ed9b544e 6262static void flushdbCommand(redisClient *c) {
ca37e9cd 6263 server.dirty += dictSize(c->db->dict);
3305306f 6264 dictEmpty(c->db->dict);
6265 dictEmpty(c->db->expires);
ed9b544e 6266 addReply(c,shared.ok);
ed9b544e 6267}
6268
6269static void flushallCommand(redisClient *c) {
ca37e9cd 6270 server.dirty += emptyDb();
ed9b544e 6271 addReply(c,shared.ok);
500ece7c 6272 if (server.bgsavechildpid != -1) {
6273 kill(server.bgsavechildpid,SIGKILL);
6274 rdbRemoveTempFile(server.bgsavechildpid);
6275 }
f78fd11b 6276 rdbSave(server.dbfilename);
ca37e9cd 6277 server.dirty++;
ed9b544e 6278}
6279
56906eef 6280static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 6281 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 6282 so->type = type;
6283 so->pattern = pattern;
6284 return so;
6285}
6286
6287/* Return the value associated to the key with a name obtained
6288 * substituting the first occurence of '*' in 'pattern' with 'subst' */
56906eef 6289static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
ed9b544e 6290 char *p;
6291 sds spat, ssub;
6292 robj keyobj;
6293 int prefixlen, sublen, postfixlen;
ed9b544e 6294 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6295 struct {
f1017b3f 6296 long len;
6297 long free;
ed9b544e 6298 char buf[REDIS_SORTKEY_MAX+1];
6299 } keyname;
6300
28173a49 6301 /* If the pattern is "#" return the substitution object itself in order
6302 * to implement the "SORT ... GET #" feature. */
6303 spat = pattern->ptr;
6304 if (spat[0] == '#' && spat[1] == '\0') {
6305 return subst;
6306 }
6307
6308 /* The substitution object may be specially encoded. If so we create
9d65a1bb 6309 * a decoded object on the fly. Otherwise getDecodedObject will just
6310 * increment the ref count, that we'll decrement later. */
6311 subst = getDecodedObject(subst);
942a3961 6312
ed9b544e 6313 ssub = subst->ptr;
6314 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6315 p = strchr(spat,'*');
ed5a857a 6316 if (!p) {
6317 decrRefCount(subst);
6318 return NULL;
6319 }
ed9b544e 6320
6321 prefixlen = p-spat;
6322 sublen = sdslen(ssub);
6323 postfixlen = sdslen(spat)-(prefixlen+1);
6324 memcpy(keyname.buf,spat,prefixlen);
6325 memcpy(keyname.buf+prefixlen,ssub,sublen);
6326 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6327 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6328 keyname.len = prefixlen+sublen+postfixlen;
6329
dfc5e96c 6330 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
942a3961 6331 decrRefCount(subst);
6332
a4d1ba9a 6333 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
3305306f 6334 return lookupKeyRead(db,&keyobj);
ed9b544e 6335}
6336
6337/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6338 * the additional parameter is not standard but a BSD-specific we have to
6339 * pass sorting parameters via the global 'server' structure */
6340static int sortCompare(const void *s1, const void *s2) {
6341 const redisSortObject *so1 = s1, *so2 = s2;
6342 int cmp;
6343
6344 if (!server.sort_alpha) {
6345 /* Numeric sorting. Here it's trivial as we precomputed scores */
6346 if (so1->u.score > so2->u.score) {
6347 cmp = 1;
6348 } else if (so1->u.score < so2->u.score) {
6349 cmp = -1;
6350 } else {
6351 cmp = 0;
6352 }
6353 } else {
6354 /* Alphanumeric sorting */
6355 if (server.sort_bypattern) {
6356 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6357 /* At least one compare object is NULL */
6358 if (so1->u.cmpobj == so2->u.cmpobj)
6359 cmp = 0;
6360 else if (so1->u.cmpobj == NULL)
6361 cmp = -1;
6362 else
6363 cmp = 1;
6364 } else {
6365 /* We have both the objects, use strcoll */
6366 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6367 }
6368 } else {
6369 /* Compare elements directly */
9d65a1bb 6370 robj *dec1, *dec2;
6371
6372 dec1 = getDecodedObject(so1->obj);
6373 dec2 = getDecodedObject(so2->obj);
6374 cmp = strcoll(dec1->ptr,dec2->ptr);
6375 decrRefCount(dec1);
6376 decrRefCount(dec2);
ed9b544e 6377 }
6378 }
6379 return server.sort_desc ? -cmp : cmp;
6380}
6381
6382/* The SORT command is the most complex command in Redis. Warning: this code
6383 * is optimized for speed and a bit less for readability */
6384static void sortCommand(redisClient *c) {
ed9b544e 6385 list *operations;
6386 int outputlen = 0;
6387 int desc = 0, alpha = 0;
6388 int limit_start = 0, limit_count = -1, start, end;
6389 int j, dontsort = 0, vectorlen;
6390 int getop = 0; /* GET operation counter */
443c6409 6391 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 6392 redisSortObject *vector; /* Resulting vector to sort */
6393
6394 /* Lookup the key to sort. It must be of the right types */
3305306f 6395 sortval = lookupKeyRead(c->db,c->argv[1]);
6396 if (sortval == NULL) {
d922ae65 6397 addReply(c,shared.nullmultibulk);
ed9b544e 6398 return;
6399 }
a5eb649b 6400 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6401 sortval->type != REDIS_ZSET)
6402 {
c937aa89 6403 addReply(c,shared.wrongtypeerr);
ed9b544e 6404 return;
6405 }
6406
6407 /* Create a list of operations to perform for every sorted element.
6408 * Operations can be GET/DEL/INCR/DECR */
6409 operations = listCreate();
092dac2a 6410 listSetFreeMethod(operations,zfree);
ed9b544e 6411 j = 2;
6412
6413 /* Now we need to protect sortval incrementing its count, in the future
6414 * SORT may have options able to overwrite/delete keys during the sorting
6415 * and the sorted key itself may get destroied */
6416 incrRefCount(sortval);
6417
6418 /* The SORT command has an SQL-alike syntax, parse it */
6419 while(j < c->argc) {
6420 int leftargs = c->argc-j-1;
6421 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6422 desc = 0;
6423 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6424 desc = 1;
6425 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6426 alpha = 1;
6427 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6428 limit_start = atoi(c->argv[j+1]->ptr);
6429 limit_count = atoi(c->argv[j+2]->ptr);
6430 j+=2;
443c6409 6431 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6432 storekey = c->argv[j+1];
6433 j++;
ed9b544e 6434 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6435 sortby = c->argv[j+1];
6436 /* If the BY pattern does not contain '*', i.e. it is constant,
6437 * we don't need to sort nor to lookup the weight keys. */
6438 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6439 j++;
6440 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6441 listAddNodeTail(operations,createSortOperation(
6442 REDIS_SORT_GET,c->argv[j+1]));
6443 getop++;
6444 j++;
ed9b544e 6445 } else {
6446 decrRefCount(sortval);
6447 listRelease(operations);
c937aa89 6448 addReply(c,shared.syntaxerr);
ed9b544e 6449 return;
6450 }
6451 j++;
6452 }
6453
6454 /* Load the sorting vector with all the objects to sort */
a5eb649b 6455 switch(sortval->type) {
6456 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6457 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6458 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
dfc5e96c 6459 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
a5eb649b 6460 }
ed9b544e 6461 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 6462 j = 0;
a5eb649b 6463
ed9b544e 6464 if (sortval->type == REDIS_LIST) {
6465 list *list = sortval->ptr;
6208b3a7 6466 listNode *ln;
c7df85a4 6467 listIter li;
6208b3a7 6468
c7df85a4 6469 listRewind(list,&li);
6470 while((ln = listNext(&li))) {
ed9b544e 6471 robj *ele = ln->value;
6472 vector[j].obj = ele;
6473 vector[j].u.score = 0;
6474 vector[j].u.cmpobj = NULL;
ed9b544e 6475 j++;
6476 }
6477 } else {
a5eb649b 6478 dict *set;
ed9b544e 6479 dictIterator *di;
6480 dictEntry *setele;
6481
a5eb649b 6482 if (sortval->type == REDIS_SET) {
6483 set = sortval->ptr;
6484 } else {
6485 zset *zs = sortval->ptr;
6486 set = zs->dict;
6487 }
6488
ed9b544e 6489 di = dictGetIterator(set);
ed9b544e 6490 while((setele = dictNext(di)) != NULL) {
6491 vector[j].obj = dictGetEntryKey(setele);
6492 vector[j].u.score = 0;
6493 vector[j].u.cmpobj = NULL;
6494 j++;
6495 }
6496 dictReleaseIterator(di);
6497 }
dfc5e96c 6498 redisAssert(j == vectorlen);
ed9b544e 6499
6500 /* Now it's time to load the right scores in the sorting vector */
6501 if (dontsort == 0) {
6502 for (j = 0; j < vectorlen; j++) {
6503 if (sortby) {
6504 robj *byval;
6505
3305306f 6506 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
ed9b544e 6507 if (!byval || byval->type != REDIS_STRING) continue;
6508 if (alpha) {
9d65a1bb 6509 vector[j].u.cmpobj = getDecodedObject(byval);
ed9b544e 6510 } else {
942a3961 6511 if (byval->encoding == REDIS_ENCODING_RAW) {
6512 vector[j].u.score = strtod(byval->ptr,NULL);
6513 } else {
9d65a1bb 6514 /* Don't need to decode the object if it's
6515 * integer-encoded (the only encoding supported) so
6516 * far. We can just cast it */
f1017b3f 6517 if (byval->encoding == REDIS_ENCODING_INT) {
942a3961 6518 vector[j].u.score = (long)byval->ptr;
f1017b3f 6519 } else
dfc5e96c 6520 redisAssert(1 != 1);
942a3961 6521 }
ed9b544e 6522 }
6523 } else {
942a3961 6524 if (!alpha) {
6525 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6526 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6527 else {
6528 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6529 vector[j].u.score = (long) vector[j].obj->ptr;
6530 else
dfc5e96c 6531 redisAssert(1 != 1);
942a3961 6532 }
6533 }
ed9b544e 6534 }
6535 }
6536 }
6537
6538 /* We are ready to sort the vector... perform a bit of sanity check
6539 * on the LIMIT option too. We'll use a partial version of quicksort. */
6540 start = (limit_start < 0) ? 0 : limit_start;
6541 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6542 if (start >= vectorlen) {
6543 start = vectorlen-1;
6544 end = vectorlen-2;
6545 }
6546 if (end >= vectorlen) end = vectorlen-1;
6547
6548 if (dontsort == 0) {
6549 server.sort_desc = desc;
6550 server.sort_alpha = alpha;
6551 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 6552 if (sortby && (start != 0 || end != vectorlen-1))
6553 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6554 else
6555 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 6556 }
6557
6558 /* Send command output to the output buffer, performing the specified
6559 * GET/DEL/INCR/DECR operations if any. */
6560 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 6561 if (storekey == NULL) {
6562 /* STORE option not specified, sent the sorting result to client */
6563 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6564 for (j = start; j <= end; j++) {
6565 listNode *ln;
c7df85a4 6566 listIter li;
6567
dd88747b 6568 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 6569 listRewind(operations,&li);
6570 while((ln = listNext(&li))) {
443c6409 6571 redisSortOperation *sop = ln->value;
6572 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6573 vector[j].obj);
6574
6575 if (sop->type == REDIS_SORT_GET) {
6576 if (!val || val->type != REDIS_STRING) {
6577 addReply(c,shared.nullbulk);
6578 } else {
dd88747b 6579 addReplyBulk(c,val);
443c6409 6580 }
6581 } else {
dfc5e96c 6582 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 6583 }
6584 }
ed9b544e 6585 }
443c6409 6586 } else {
6587 robj *listObject = createListObject();
6588 list *listPtr = (list*) listObject->ptr;
6589
6590 /* STORE option specified, set the sorting result as a List object */
6591 for (j = start; j <= end; j++) {
6592 listNode *ln;
c7df85a4 6593 listIter li;
6594
443c6409 6595 if (!getop) {
6596 listAddNodeTail(listPtr,vector[j].obj);
6597 incrRefCount(vector[j].obj);
6598 }
c7df85a4 6599 listRewind(operations,&li);
6600 while((ln = listNext(&li))) {
443c6409 6601 redisSortOperation *sop = ln->value;
6602 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6603 vector[j].obj);
6604
6605 if (sop->type == REDIS_SORT_GET) {
6606 if (!val || val->type != REDIS_STRING) {
6607 listAddNodeTail(listPtr,createStringObject("",0));
6608 } else {
6609 listAddNodeTail(listPtr,val);
6610 incrRefCount(val);
6611 }
ed9b544e 6612 } else {
dfc5e96c 6613 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 6614 }
ed9b544e 6615 }
ed9b544e 6616 }
121796f7 6617 if (dictReplace(c->db->dict,storekey,listObject)) {
6618 incrRefCount(storekey);
6619 }
443c6409 6620 /* Note: we add 1 because the DB is dirty anyway since even if the
6621 * SORT result is empty a new key is set and maybe the old content
6622 * replaced. */
6623 server.dirty += 1+outputlen;
6624 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 6625 }
6626
6627 /* Cleanup */
6628 decrRefCount(sortval);
6629 listRelease(operations);
6630 for (j = 0; j < vectorlen; j++) {
6631 if (sortby && alpha && vector[j].u.cmpobj)
6632 decrRefCount(vector[j].u.cmpobj);
6633 }
6634 zfree(vector);
6635}
6636
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, 1.50T, and so forth.
 *
 * 's' must point to a buffer large enough for the result (the longest
 * possible output, for n == ULLONG_MAX, is "16777216.00T" plus the
 * terminator). The string is always written: amounts of one terabyte or
 * more are expressed in terabytes instead of leaving 's' untouched. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        /* Kilobytes */
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        /* Megabytes */
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        /* Gigabytes */
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        /* Terabytes: catch-all so that the buffer is never left unset. */
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
6657
1c85b79f 6658/* Create the string returned by the INFO command. This is decoupled
6659 * by the INFO command itself as we need to report the same information
6660 * on memory corruption problems. */
6661static sds genRedisInfoString(void) {
ed9b544e 6662 sds info;
6663 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 6664 int j;
ec6c7a1d 6665 char hmem[64];
55a8298f 6666
b72f6a4b 6667 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 6668 info = sdscatprintf(sdsempty(),
6669 "redis_version:%s\r\n"
f1017b3f 6670 "arch_bits:%s\r\n"
7a932b74 6671 "multiplexing_api:%s\r\n"
0d7170a4 6672 "process_id:%ld\r\n"
682ac724 6673 "uptime_in_seconds:%ld\r\n"
6674 "uptime_in_days:%ld\r\n"
ed9b544e 6675 "connected_clients:%d\r\n"
6676 "connected_slaves:%d\r\n"
f86a74e9 6677 "blocked_clients:%d\r\n"
5fba9f71 6678 "used_memory:%zu\r\n"
ec6c7a1d 6679 "used_memory_human:%s\r\n"
ed9b544e 6680 "changes_since_last_save:%lld\r\n"
be2bb6b0 6681 "bgsave_in_progress:%d\r\n"
682ac724 6682 "last_save_time:%ld\r\n"
b3fad521 6683 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 6684 "total_connections_received:%lld\r\n"
6685 "total_commands_processed:%lld\r\n"
2a6a2ed1 6686 "expired_keys:%lld\r\n"
55a8298f 6687 "hash_max_zipmap_entries:%ld\r\n"
6688 "hash_max_zipmap_value:%ld\r\n"
ffc6b7f8 6689 "pubsub_channels:%ld\r\n"
6690 "pubsub_patterns:%u\r\n"
7d98e08c 6691 "vm_enabled:%d\r\n"
a0f643ea 6692 "role:%s\r\n"
ed9b544e 6693 ,REDIS_VERSION,
f1017b3f 6694 (sizeof(long) == 8) ? "64" : "32",
7a932b74 6695 aeGetApiName(),
0d7170a4 6696 (long) getpid(),
a0f643ea 6697 uptime,
6698 uptime/(3600*24),
ed9b544e 6699 listLength(server.clients)-listLength(server.slaves),
6700 listLength(server.slaves),
d5d55fc3 6701 server.blpop_blocked_clients,
b72f6a4b 6702 zmalloc_used_memory(),
ec6c7a1d 6703 hmem,
ed9b544e 6704 server.dirty,
9d65a1bb 6705 server.bgsavechildpid != -1,
ed9b544e 6706 server.lastsave,
b3fad521 6707 server.bgrewritechildpid != -1,
ed9b544e 6708 server.stat_numconnections,
6709 server.stat_numcommands,
2a6a2ed1 6710 server.stat_expiredkeys,
55a8298f 6711 server.hash_max_zipmap_entries,
6712 server.hash_max_zipmap_value,
ffc6b7f8 6713 dictSize(server.pubsub_channels),
6714 listLength(server.pubsub_patterns),
7d98e08c 6715 server.vm_enabled != 0,
a0f643ea 6716 server.masterhost == NULL ? "master" : "slave"
ed9b544e 6717 );
a0f643ea 6718 if (server.masterhost) {
6719 info = sdscatprintf(info,
6720 "master_host:%s\r\n"
6721 "master_port:%d\r\n"
6722 "master_link_status:%s\r\n"
6723 "master_last_io_seconds_ago:%d\r\n"
6724 ,server.masterhost,
6725 server.masterport,
6726 (server.replstate == REDIS_REPL_CONNECTED) ?
6727 "up" : "down",
f72b934d 6728 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 6729 );
6730 }
7d98e08c 6731 if (server.vm_enabled) {
1064ef87 6732 lockThreadedIO();
7d98e08c 6733 info = sdscatprintf(info,
6734 "vm_conf_max_memory:%llu\r\n"
6735 "vm_conf_page_size:%llu\r\n"
6736 "vm_conf_pages:%llu\r\n"
6737 "vm_stats_used_pages:%llu\r\n"
6738 "vm_stats_swapped_objects:%llu\r\n"
6739 "vm_stats_swappin_count:%llu\r\n"
6740 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 6741 "vm_stats_io_newjobs_len:%lu\r\n"
6742 "vm_stats_io_processing_len:%lu\r\n"
6743 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 6744 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 6745 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 6746 ,(unsigned long long) server.vm_max_memory,
6747 (unsigned long long) server.vm_page_size,
6748 (unsigned long long) server.vm_pages,
6749 (unsigned long long) server.vm_stats_used_pages,
6750 (unsigned long long) server.vm_stats_swapped_objects,
6751 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 6752 (unsigned long long) server.vm_stats_swapouts,
6753 (unsigned long) listLength(server.io_newjobs),
6754 (unsigned long) listLength(server.io_processing),
6755 (unsigned long) listLength(server.io_processed),
d5d55fc3 6756 (unsigned long) server.io_active_threads,
6757 (unsigned long) server.vm_blocked_clients
7d98e08c 6758 );
1064ef87 6759 unlockThreadedIO();
7d98e08c 6760 }
c3cb078d 6761 for (j = 0; j < server.dbnum; j++) {
6762 long long keys, vkeys;
6763
6764 keys = dictSize(server.db[j].dict);
6765 vkeys = dictSize(server.db[j].expires);
6766 if (keys || vkeys) {
9d65a1bb 6767 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 6768 j, keys, vkeys);
6769 }
6770 }
1c85b79f 6771 return info;
6772}
6773
6774static void infoCommand(redisClient *c) {
6775 sds info = genRedisInfoString();
83c6a618 6776 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6777 (unsigned long)sdslen(info)));
ed9b544e 6778 addReplySds(c,info);
70003d28 6779 addReply(c,shared.crlf);
ed9b544e 6780}
6781
3305306f 6782static void monitorCommand(redisClient *c) {
6783 /* ignore MONITOR if aleady slave or in monitor mode */
6784 if (c->flags & REDIS_SLAVE) return;
6785
6786 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6787 c->slaveseldb = 0;
6b47e12e 6788 listAddNodeTail(server.monitors,c);
3305306f 6789 addReply(c,shared.ok);
6790}
6791
6792/* ================================= Expire ================================= */
6793static int removeExpire(redisDb *db, robj *key) {
6794 if (dictDelete(db->expires,key) == DICT_OK) {
6795 return 1;
6796 } else {
6797 return 0;
6798 }
6799}
6800
6801static int setExpire(redisDb *db, robj *key, time_t when) {
6802 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6803 return 0;
6804 } else {
6805 incrRefCount(key);
6806 return 1;
6807 }
6808}
6809
bb32ede5 6810/* Return the expire time of the specified key, or -1 if no expire
6811 * is associated with this key (i.e. the key is non volatile) */
6812static time_t getExpire(redisDb *db, robj *key) {
6813 dictEntry *de;
6814
6815 /* No expire? return ASAP */
6816 if (dictSize(db->expires) == 0 ||
6817 (de = dictFind(db->expires,key)) == NULL) return -1;
6818
6819 return (time_t) dictGetEntryVal(de);
6820}
6821
3305306f 6822static int expireIfNeeded(redisDb *db, robj *key) {
6823 time_t when;
6824 dictEntry *de;
6825
6826 /* No expire? return ASAP */
6827 if (dictSize(db->expires) == 0 ||
6828 (de = dictFind(db->expires,key)) == NULL) return 0;
6829
6830 /* Lookup the expire */
6831 when = (time_t) dictGetEntryVal(de);
6832 if (time(NULL) <= when) return 0;
6833
6834 /* Delete the key */
6835 dictDelete(db->expires,key);
2a6a2ed1 6836 server.stat_expiredkeys++;
3305306f 6837 return dictDelete(db->dict,key) == DICT_OK;
6838}
6839
6840static int deleteIfVolatile(redisDb *db, robj *key) {
6841 dictEntry *de;
6842
6843 /* No expire? return ASAP */
6844 if (dictSize(db->expires) == 0 ||
6845 (de = dictFind(db->expires,key)) == NULL) return 0;
6846
6847 /* Delete the key */
0c66a471 6848 server.dirty++;
2a6a2ed1 6849 server.stat_expiredkeys++;
3305306f 6850 dictDelete(db->expires,key);
6851 return dictDelete(db->dict,key) == DICT_OK;
6852}
6853
802e8373 6854static void expireGenericCommand(redisClient *c, robj *key, time_t seconds) {
3305306f 6855 dictEntry *de;
3305306f 6856
802e8373 6857 de = dictFind(c->db->dict,key);
3305306f 6858 if (de == NULL) {
6859 addReply(c,shared.czero);
6860 return;
6861 }
43e5ccdf 6862 if (seconds < 0) {
6863 if (deleteKey(c->db,key)) server.dirty++;
6864 addReply(c, shared.cone);
3305306f 6865 return;
6866 } else {
6867 time_t when = time(NULL)+seconds;
802e8373 6868 if (setExpire(c->db,key,when)) {
3305306f 6869 addReply(c,shared.cone);
77423026 6870 server.dirty++;
6871 } else {
3305306f 6872 addReply(c,shared.czero);
77423026 6873 }
3305306f 6874 return;
6875 }
6876}
6877
802e8373 6878static void expireCommand(redisClient *c) {
6879 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10));
6880}
6881
6882static void expireatCommand(redisClient *c) {
6883 expireGenericCommand(c,c->argv[1],strtol(c->argv[2]->ptr,NULL,10)-time(NULL));
6884}
6885
fd88489a 6886static void ttlCommand(redisClient *c) {
6887 time_t expire;
6888 int ttl = -1;
6889
6890 expire = getExpire(c->db,c->argv[1]);
6891 if (expire != -1) {
6892 ttl = (int) (expire-time(NULL));
6893 if (ttl < 0) ttl = -1;
6894 }
6895 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
6896}
6897
6e469882 6898/* ================================ MULTI/EXEC ============================== */
6899
6900/* Client state initialization for MULTI/EXEC */
6901static void initClientMultiState(redisClient *c) {
6902 c->mstate.commands = NULL;
6903 c->mstate.count = 0;
6904}
6905
6906/* Release all the resources associated with MULTI/EXEC state */
6907static void freeClientMultiState(redisClient *c) {
6908 int j;
6909
6910 for (j = 0; j < c->mstate.count; j++) {
6911 int i;
6912 multiCmd *mc = c->mstate.commands+j;
6913
6914 for (i = 0; i < mc->argc; i++)
6915 decrRefCount(mc->argv[i]);
6916 zfree(mc->argv);
6917 }
6918 zfree(c->mstate.commands);
6919}
6920
6921/* Add a new command into the MULTI commands queue */
6922static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
6923 multiCmd *mc;
6924 int j;
6925
6926 c->mstate.commands = zrealloc(c->mstate.commands,
6927 sizeof(multiCmd)*(c->mstate.count+1));
6928 mc = c->mstate.commands+c->mstate.count;
6929 mc->cmd = cmd;
6930 mc->argc = c->argc;
6931 mc->argv = zmalloc(sizeof(robj*)*c->argc);
6932 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
6933 for (j = 0; j < c->argc; j++)
6934 incrRefCount(mc->argv[j]);
6935 c->mstate.count++;
6936}
6937
6938static void multiCommand(redisClient *c) {
6939 c->flags |= REDIS_MULTI;
36c548f0 6940 addReply(c,shared.ok);
6e469882 6941}
6942
18b6cb76
DJ
6943static void discardCommand(redisClient *c) {
6944 if (!(c->flags & REDIS_MULTI)) {
6945 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
6946 return;
6947 }
6948
6949 freeClientMultiState(c);
6950 initClientMultiState(c);
6951 c->flags &= (~REDIS_MULTI);
6952 addReply(c,shared.ok);
6953}
6954
6e469882 6955static void execCommand(redisClient *c) {
6956 int j;
6957 robj **orig_argv;
6958 int orig_argc;
6959
6960 if (!(c->flags & REDIS_MULTI)) {
6961 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
6962 return;
6963 }
6964
6965 orig_argv = c->argv;
6966 orig_argc = c->argc;
6967 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
6968 for (j = 0; j < c->mstate.count; j++) {
6969 c->argc = c->mstate.commands[j].argc;
6970 c->argv = c->mstate.commands[j].argv;
6971 call(c,c->mstate.commands[j].cmd);
6972 }
6973 c->argv = orig_argv;
6974 c->argc = orig_argc;
6975 freeClientMultiState(c);
6976 initClientMultiState(c);
6977 c->flags &= (~REDIS_MULTI);
6978}
6979
4409877e 6980/* =========================== Blocking Operations ========================= */
6981
6982/* Currently Redis blocking operations support is limited to list POP ops,
6983 * so the current implementation is not fully generic, but it is also not
6984 * completely specific so it will not require a rewrite to support new
6985 * kind of blocking operations in the future.
6986 *
6987 * Still it's important to note that list blocking operations can be already
6988 * used as a notification mechanism in order to implement other blocking
6989 * operations at application level, so there must be a very strong evidence
6990 * of usefulness and generality before new blocking operations are implemented.
6991 *
 * This is how the current blocking POP works, using BLPOP as an example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   when there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
7006 *
7007 * The above comment and the source code should be enough in order to understand
7008 * the implementation and modify / fix it later.
7009 */
7010
7011/* Set a client in blocking mode for the specified key, with the specified
7012 * timeout */
b177fd30 7013static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 7014 dictEntry *de;
7015 list *l;
b177fd30 7016 int j;
4409877e 7017
b177fd30 7018 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7019 c->blockingkeysnum = numkeys;
4409877e 7020 c->blockingto = timeout;
b177fd30 7021 for (j = 0; j < numkeys; j++) {
7022 /* Add the key in the client structure, to map clients -> keys */
7023 c->blockingkeys[j] = keys[j];
7024 incrRefCount(keys[j]);
4409877e 7025
b177fd30 7026 /* And in the other "side", to map keys -> clients */
7027 de = dictFind(c->db->blockingkeys,keys[j]);
7028 if (de == NULL) {
7029 int retval;
7030
7031 /* For every key we take a list of clients blocked for it */
7032 l = listCreate();
7033 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7034 incrRefCount(keys[j]);
7035 assert(retval == DICT_OK);
7036 } else {
7037 l = dictGetEntryVal(de);
7038 }
7039 listAddNodeTail(l,c);
4409877e 7040 }
b177fd30 7041 /* Mark the client as a blocked client */
4409877e 7042 c->flags |= REDIS_BLOCKED;
d5d55fc3 7043 server.blpop_blocked_clients++;
4409877e 7044}
7045
7046/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 7047static void unblockClientWaitingData(redisClient *c) {
4409877e 7048 dictEntry *de;
7049 list *l;
b177fd30 7050 int j;
4409877e 7051
b177fd30 7052 assert(c->blockingkeys != NULL);
7053 /* The client may wait for multiple keys, so unblock it for every key. */
7054 for (j = 0; j < c->blockingkeysnum; j++) {
7055 /* Remove this client from the list of clients waiting for this key. */
7056 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7057 assert(de != NULL);
7058 l = dictGetEntryVal(de);
7059 listDelNode(l,listSearchKey(l,c));
7060 /* If the list is empty we need to remove it to avoid wasting memory */
7061 if (listLength(l) == 0)
7062 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7063 decrRefCount(c->blockingkeys[j]);
7064 }
7065 /* Cleanup the client structure */
7066 zfree(c->blockingkeys);
7067 c->blockingkeys = NULL;
4409877e 7068 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 7069 server.blpop_blocked_clients--;
5921aa36 7070 /* We want to process data if there is some command waiting
b0d8747d 7071 * in the input buffer. Note that this is safe even if
7072 * unblockClientWaitingData() gets called from freeClient() because
7073 * freeClient() will be smart enough to call this function
7074 * *after* c->querybuf was set to NULL. */
4409877e 7075 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7076}
7077
7078/* This should be called from any function PUSHing into lists.
7079 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7080 * 'ele' is the element pushed.
7081 *
7082 * If the function returns 0 there was no client waiting for a list push
7083 * against this key.
7084 *
7085 * If the function returns 1 there was a client waiting for a list push
7086 * against this key, the element was passed to this client thus it's not
7087 * needed to actually add it to the list and the caller should return asap. */
7088static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7089 struct dictEntry *de;
7090 redisClient *receiver;
7091 list *l;
7092 listNode *ln;
7093
7094 de = dictFind(c->db->blockingkeys,key);
7095 if (de == NULL) return 0;
7096 l = dictGetEntryVal(de);
7097 ln = listFirst(l);
7098 assert(ln != NULL);
7099 receiver = ln->value;
4409877e 7100
b177fd30 7101 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 7102 addReplyBulk(receiver,key);
7103 addReplyBulk(receiver,ele);
b0d8747d 7104 unblockClientWaitingData(receiver);
4409877e 7105 return 1;
7106}
7107
7108/* Blocking RPOP/LPOP */
7109static void blockingPopGenericCommand(redisClient *c, int where) {
7110 robj *o;
7111 time_t timeout;
b177fd30 7112 int j;
4409877e 7113
b177fd30 7114 for (j = 1; j < c->argc-1; j++) {
7115 o = lookupKeyWrite(c->db,c->argv[j]);
7116 if (o != NULL) {
7117 if (o->type != REDIS_LIST) {
7118 addReply(c,shared.wrongtypeerr);
4409877e 7119 return;
b177fd30 7120 } else {
7121 list *list = o->ptr;
7122 if (listLength(list) != 0) {
7123 /* If the list contains elements fall back to the usual
7124 * non-blocking POP operation */
7125 robj *argv[2], **orig_argv;
7126 int orig_argc;
7127
7128 /* We need to alter the command arguments before to call
7129 * popGenericCommand() as the command takes a single key. */
7130 orig_argv = c->argv;
7131 orig_argc = c->argc;
7132 argv[1] = c->argv[j];
7133 c->argv = argv;
7134 c->argc = 2;
7135
7136 /* Also the return value is different, we need to output
7137 * the multi bulk reply header and the key name. The
7138 * "real" command will add the last element (the value)
7139 * for us. If this souds like an hack to you it's just
7140 * because it is... */
7141 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 7142 addReplyBulk(c,argv[1]);
b177fd30 7143 popGenericCommand(c,where);
7144
7145 /* Fix the client structure with the original stuff */
7146 c->argv = orig_argv;
7147 c->argc = orig_argc;
7148 return;
7149 }
4409877e 7150 }
7151 }
7152 }
7153 /* If the list is empty or the key does not exists we must block */
b177fd30 7154 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 7155 if (timeout > 0) timeout += time(NULL);
b177fd30 7156 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 7157}
7158
7159static void blpopCommand(redisClient *c) {
7160 blockingPopGenericCommand(c,REDIS_HEAD);
7161}
7162
7163static void brpopCommand(redisClient *c) {
7164 blockingPopGenericCommand(c,REDIS_TAIL);
7165}
7166
ed9b544e 7167/* =============================== Replication ============================= */
7168
a4d1ba9a 7169static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7170 ssize_t nwritten, ret = size;
7171 time_t start = time(NULL);
7172
7173 timeout++;
7174 while(size) {
7175 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7176 nwritten = write(fd,ptr,size);
7177 if (nwritten == -1) return -1;
7178 ptr += nwritten;
7179 size -= nwritten;
7180 }
7181 if ((time(NULL)-start) > timeout) {
7182 errno = ETIMEDOUT;
7183 return -1;
7184 }
7185 }
7186 return ret;
7187}
7188
a4d1ba9a 7189static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7190 ssize_t nread, totread = 0;
7191 time_t start = time(NULL);
7192
7193 timeout++;
7194 while(size) {
7195 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7196 nread = read(fd,ptr,size);
7197 if (nread == -1) return -1;
7198 ptr += nread;
7199 size -= nread;
7200 totread += nread;
7201 }
7202 if ((time(NULL)-start) > timeout) {
7203 errno = ETIMEDOUT;
7204 return -1;
7205 }
7206 }
7207 return totread;
7208}
7209
/* Read a line from 'fd' one byte at a time, storing at most 'size'-1
 * characters plus a nul terminator into 'ptr'.
 *
 * Reading stops at the first '\n', which is not stored; a '\r' immediately
 * preceding it is replaced with a nul byte. If the buffer fills up before a
 * newline is seen the (truncated, nul terminated) content read so far is
 * returned.
 *
 * Returns the number of characters stored before the newline (a trailing
 * '\r' is still counted), or -1 if syncRead() fails or 'size' is not
 * positive (errno set to EINVAL in the latter case). 'timeout' is applied
 * to every single byte read. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) {
        errno = EINVAL;
        return -1;
    }
    *ptr = '\0';
    size--; /* Reserve room for the nul terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* Account for the byte just stored: without this the loop never
             * terminates on long lines and writes past the buffer. */
            size--;
        }
    }
    return nread;
}
7230
7231static void syncCommand(redisClient *c) {
40d224a9 7232 /* ignore SYNC if aleady slave or in monitor mode */
7233 if (c->flags & REDIS_SLAVE) return;
7234
7235 /* SYNC can't be issued when the server has pending data to send to
7236 * the client about already issued commands. We need a fresh reply
7237 * buffer registering the differences between the BGSAVE and the current
7238 * dataset, so that we can copy to other slaves if needed. */
7239 if (listLength(c->reply) != 0) {
7240 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7241 return;
7242 }
7243
7244 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7245 /* Here we need to check if there is a background saving operation
7246 * in progress, or if it is required to start one */
9d65a1bb 7247 if (server.bgsavechildpid != -1) {
40d224a9 7248 /* Ok a background save is in progress. Let's check if it is a good
7249 * one for replication, i.e. if there is another slave that is
7250 * registering differences since the server forked to save */
7251 redisClient *slave;
7252 listNode *ln;
c7df85a4 7253 listIter li;
40d224a9 7254
c7df85a4 7255 listRewind(server.slaves,&li);
7256 while((ln = listNext(&li))) {
40d224a9 7257 slave = ln->value;
7258 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 7259 }
7260 if (ln) {
7261 /* Perfect, the server is already registering differences for
7262 * another slave. Set the right state, and copy the buffer. */
7263 listRelease(c->reply);
7264 c->reply = listDup(slave->reply);
40d224a9 7265 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7266 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7267 } else {
7268 /* No way, we need to wait for the next BGSAVE in order to
7269 * register differences */
7270 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7271 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7272 }
7273 } else {
7274 /* Ok we don't have a BGSAVE in progress, let's start one */
7275 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7276 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7277 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7278 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7279 return;
7280 }
7281 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7282 }
6208b3a7 7283 c->repldbfd = -1;
40d224a9 7284 c->flags |= REDIS_SLAVE;
7285 c->slaveseldb = 0;
6b47e12e 7286 listAddNodeTail(server.slaves,c);
40d224a9 7287 return;
7288}
7289
6208b3a7 7290static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7291 redisClient *slave = privdata;
7292 REDIS_NOTUSED(el);
7293 REDIS_NOTUSED(mask);
7294 char buf[REDIS_IOBUF_LEN];
7295 ssize_t nwritten, buflen;
7296
7297 if (slave->repldboff == 0) {
7298 /* Write the bulk write count before to transfer the DB. In theory here
7299 * we don't know how much room there is in the output buffer of the
7300 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7301 * operations) will never be smaller than the few bytes we need. */
7302 sds bulkcount;
7303
7304 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7305 slave->repldbsize);
7306 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7307 {
7308 sdsfree(bulkcount);
7309 freeClient(slave);
7310 return;
7311 }
7312 sdsfree(bulkcount);
7313 }
7314 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7315 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7316 if (buflen <= 0) {
7317 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7318 (buflen == 0) ? "premature EOF" : strerror(errno));
7319 freeClient(slave);
7320 return;
7321 }
7322 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 7323 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 7324 strerror(errno));
7325 freeClient(slave);
7326 return;
7327 }
7328 slave->repldboff += nwritten;
7329 if (slave->repldboff == slave->repldbsize) {
7330 close(slave->repldbfd);
7331 slave->repldbfd = -1;
7332 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7333 slave->replstate = REDIS_REPL_ONLINE;
7334 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 7335 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 7336 freeClient(slave);
7337 return;
7338 }
7339 addReplySds(slave,sdsempty());
7340 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7341 }
7342}
ed9b544e 7343
a3b21203 7344/* This function is called at the end of every backgrond saving.
7345 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7346 * otherwise REDIS_ERR is passed to the function.
7347 *
7348 * The goal of this function is to handle slaves waiting for a successful
7349 * background saving in order to perform non-blocking synchronization. */
7350static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 7351 listNode *ln;
7352 int startbgsave = 0;
c7df85a4 7353 listIter li;
ed9b544e 7354
c7df85a4 7355 listRewind(server.slaves,&li);
7356 while((ln = listNext(&li))) {
6208b3a7 7357 redisClient *slave = ln->value;
ed9b544e 7358
6208b3a7 7359 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7360 startbgsave = 1;
7361 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7362 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 7363 struct redis_stat buf;
6208b3a7 7364
7365 if (bgsaveerr != REDIS_OK) {
7366 freeClient(slave);
7367 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7368 continue;
7369 }
7370 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 7371 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 7372 freeClient(slave);
7373 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7374 continue;
7375 }
7376 slave->repldboff = 0;
7377 slave->repldbsize = buf.st_size;
7378 slave->replstate = REDIS_REPL_SEND_BULK;
7379 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 7380 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 7381 freeClient(slave);
7382 continue;
7383 }
7384 }
ed9b544e 7385 }
6208b3a7 7386 if (startbgsave) {
7387 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 7388 listIter li;
7389
7390 listRewind(server.slaves,&li);
6208b3a7 7391 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 7392 while((ln = listNext(&li))) {
6208b3a7 7393 redisClient *slave = ln->value;
ed9b544e 7394
6208b3a7 7395 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7396 freeClient(slave);
7397 }
7398 }
7399 }
ed9b544e 7400}
7401
7402static int syncWithMaster(void) {
d0ccebcf 7403 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 7404 long dumpsize;
ed9b544e 7405 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 7406 int dfd, maxtries = 5;
ed9b544e 7407
7408 if (fd == -1) {
7409 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7410 strerror(errno));
7411 return REDIS_ERR;
7412 }
d0ccebcf 7413
7414 /* AUTH with the master if required. */
7415 if(server.masterauth) {
7416 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7417 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7418 close(fd);
7419 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7420 strerror(errno));
7421 return REDIS_ERR;
7422 }
7423 /* Read the AUTH result. */
7424 if (syncReadLine(fd,buf,1024,3600) == -1) {
7425 close(fd);
7426 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7427 strerror(errno));
7428 return REDIS_ERR;
7429 }
7430 if (buf[0] != '+') {
7431 close(fd);
7432 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7433 return REDIS_ERR;
7434 }
7435 }
7436
ed9b544e 7437 /* Issue the SYNC command */
7438 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7439 close(fd);
7440 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7441 strerror(errno));
7442 return REDIS_ERR;
7443 }
7444 /* Read the bulk write count */
8c4d91fc 7445 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 7446 close(fd);
7447 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7448 strerror(errno));
7449 return REDIS_ERR;
7450 }
4aa701c1 7451 if (buf[0] != '$') {
7452 close(fd);
7453 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7454 return REDIS_ERR;
7455 }
18e61fa2 7456 dumpsize = strtol(buf+1,NULL,10);
7457 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 7458 /* Read the bulk write data on a temp file */
8c5abee8 7459 while(maxtries--) {
7460 snprintf(tmpfile,256,
7461 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7462 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7463 if (dfd != -1) break;
5de9ad7c 7464 sleep(1);
8c5abee8 7465 }
ed9b544e 7466 if (dfd == -1) {
7467 close(fd);
7468 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7469 return REDIS_ERR;
7470 }
7471 while(dumpsize) {
7472 int nread, nwritten;
7473
7474 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7475 if (nread == -1) {
7476 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7477 strerror(errno));
7478 close(fd);
7479 close(dfd);
7480 return REDIS_ERR;
7481 }
7482 nwritten = write(dfd,buf,nread);
7483 if (nwritten == -1) {
7484 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7485 close(fd);
7486 close(dfd);
7487 return REDIS_ERR;
7488 }
7489 dumpsize -= nread;
7490 }
7491 close(dfd);
7492 if (rename(tmpfile,server.dbfilename) == -1) {
7493 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7494 unlink(tmpfile);
7495 close(fd);
7496 return REDIS_ERR;
7497 }
7498 emptyDb();
f78fd11b 7499 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 7500 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7501 close(fd);
7502 return REDIS_ERR;
7503 }
7504 server.master = createClient(fd);
7505 server.master->flags |= REDIS_MASTER;
179b3952 7506 server.master->authenticated = 1;
ed9b544e 7507 server.replstate = REDIS_REPL_CONNECTED;
7508 return REDIS_OK;
7509}
7510
321b0e13 7511static void slaveofCommand(redisClient *c) {
7512 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7513 !strcasecmp(c->argv[2]->ptr,"one")) {
7514 if (server.masterhost) {
7515 sdsfree(server.masterhost);
7516 server.masterhost = NULL;
7517 if (server.master) freeClient(server.master);
7518 server.replstate = REDIS_REPL_NONE;
7519 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7520 }
7521 } else {
7522 sdsfree(server.masterhost);
7523 server.masterhost = sdsdup(c->argv[1]->ptr);
7524 server.masterport = atoi(c->argv[2]->ptr);
7525 if (server.master) freeClient(server.master);
7526 server.replstate = REDIS_REPL_CONNECT;
7527 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7528 server.masterhost, server.masterport);
7529 }
7530 addReply(c,shared.ok);
7531}
7532
3fd78bcd 7533/* ============================ Maxmemory directive ======================== */
7534
a5819310 7535/* Try to free one object form the pre-allocated objects free list.
7536 * This is useful under low mem conditions as by default we take 1 million
7537 * free objects allocated. On success REDIS_OK is returned, otherwise
7538 * REDIS_ERR. */
7539static int tryFreeOneObjectFromFreelist(void) {
f870935d 7540 robj *o;
7541
a5819310 7542 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7543 if (listLength(server.objfreelist)) {
7544 listNode *head = listFirst(server.objfreelist);
7545 o = listNodeValue(head);
7546 listDelNode(server.objfreelist,head);
7547 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7548 zfree(o);
7549 return REDIS_OK;
7550 } else {
7551 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7552 return REDIS_ERR;
7553 }
f870935d 7554}
7555
3fd78bcd 7556/* This function gets called when 'maxmemory' is set on the config file to limit
7557 * the max memory used by the server, and we are out of memory.
7558 * This function will try to, in order:
7559 *
7560 * - Free objects from the free list
7561 * - Try to remove keys with an EXPIRE set
7562 *
7563 * It is not possible to free enough memory to reach used-memory < maxmemory
7564 * the server will start refusing commands that will enlarge even more the
7565 * memory usage.
7566 */
7567static void freeMemoryIfNeeded(void) {
7568 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 7569 int j, k, freed = 0;
7570
7571 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7572 for (j = 0; j < server.dbnum; j++) {
7573 int minttl = -1;
7574 robj *minkey = NULL;
7575 struct dictEntry *de;
7576
7577 if (dictSize(server.db[j].expires)) {
7578 freed = 1;
7579 /* From a sample of three keys drop the one nearest to
7580 * the natural expire */
7581 for (k = 0; k < 3; k++) {
7582 time_t t;
7583
7584 de = dictGetRandomKey(server.db[j].expires);
7585 t = (time_t) dictGetEntryVal(de);
7586 if (minttl == -1 || t < minttl) {
7587 minkey = dictGetEntryKey(de);
7588 minttl = t;
3fd78bcd 7589 }
3fd78bcd 7590 }
a5819310 7591 deleteKey(server.db+j,minkey);
3fd78bcd 7592 }
3fd78bcd 7593 }
a5819310 7594 if (!freed) return; /* nothing to free... */
3fd78bcd 7595 }
7596}
7597
f80dff62 7598/* ============================== Append Only file ========================== */
7599
7600static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7601 sds buf = sdsempty();
7602 int j;
7603 ssize_t nwritten;
7604 time_t now;
7605 robj *tmpargv[3];
7606
7607 /* The DB this command was targetting is not the same as the last command
7608 * we appendend. To issue a SELECT command is needed. */
7609 if (dictid != server.appendseldb) {
7610 char seldb[64];
7611
7612 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 7613 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 7614 (unsigned long)strlen(seldb),seldb);
f80dff62 7615 server.appendseldb = dictid;
7616 }
7617
7618 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7619 * EXPIREs into EXPIREATs calls */
7620 if (cmd->proc == expireCommand) {
7621 long when;
7622
7623 tmpargv[0] = createStringObject("EXPIREAT",8);
7624 tmpargv[1] = argv[1];
7625 incrRefCount(argv[1]);
7626 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7627 tmpargv[2] = createObject(REDIS_STRING,
7628 sdscatprintf(sdsempty(),"%ld",when));
7629 argv = tmpargv;
7630 }
7631
7632 /* Append the actual command */
7633 buf = sdscatprintf(buf,"*%d\r\n",argc);
7634 for (j = 0; j < argc; j++) {
7635 robj *o = argv[j];
7636
9d65a1bb 7637 o = getDecodedObject(o);
83c6a618 7638 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
f80dff62 7639 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7640 buf = sdscatlen(buf,"\r\n",2);
9d65a1bb 7641 decrRefCount(o);
f80dff62 7642 }
7643
7644 /* Free the objects from the modified argv for EXPIREAT */
7645 if (cmd->proc == expireCommand) {
7646 for (j = 0; j < 3; j++)
7647 decrRefCount(argv[j]);
7648 }
7649
7650 /* We want to perform a single write. This should be guaranteed atomic
7651 * at least if the filesystem we are writing is a real physical one.
7652 * While this will save us against the server being killed I don't think
7653 * there is much to do about the whole server stopping for power problems
7654 * or alike */
7655 nwritten = write(server.appendfd,buf,sdslen(buf));
7656 if (nwritten != (signed)sdslen(buf)) {
7657 /* Ooops, we are in troubles. The best thing to do for now is
7658 * to simply exit instead to give the illusion that everything is
7659 * working as expected. */
7660 if (nwritten == -1) {
7661 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7662 } else {
7663 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7664 }
7665 exit(1);
7666 }
85a83172 7667 /* If a background append only file rewriting is in progress we want to
7668 * accumulate the differences between the child DB and the current one
7669 * in a buffer, so that when the child process will do its work we
7670 * can append the differences to the new append only file. */
7671 if (server.bgrewritechildpid != -1)
7672 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7673
7674 sdsfree(buf);
f80dff62 7675 now = time(NULL);
7676 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7677 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7678 now-server.lastfsync > 1))
7679 {
7680 fsync(server.appendfd); /* Let's try to get this data on the disk */
7681 server.lastfsync = now;
7682 }
7683}
7684
7685/* In Redis commands are always executed in the context of a client, so in
7686 * order to load the append only file we need to create a fake client. */
7687static struct redisClient *createFakeClient(void) {
7688 struct redisClient *c = zmalloc(sizeof(*c));
7689
7690 selectDb(c,0);
7691 c->fd = -1;
7692 c->querybuf = sdsempty();
7693 c->argc = 0;
7694 c->argv = NULL;
7695 c->flags = 0;
9387d17d 7696 /* We set the fake client as a slave waiting for the synchronization
7697 * so that Redis will not try to send replies to this client. */
7698 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 7699 c->reply = listCreate();
7700 listSetFreeMethod(c->reply,decrRefCount);
7701 listSetDupMethod(c->reply,dupClientReplyValue);
7702 return c;
7703}
7704
7705static void freeFakeClient(struct redisClient *c) {
7706 sdsfree(c->querybuf);
7707 listRelease(c->reply);
7708 zfree(c);
7709}
7710
7711/* Replay the append log file. On error REDIS_OK is returned. On non fatal
7712 * error (the append only file is zero-length) REDIS_ERR is returned. On
7713 * fatal error an error message is logged and the program exists. */
7714int loadAppendOnlyFile(char *filename) {
7715 struct redisClient *fakeClient;
7716 FILE *fp = fopen(filename,"r");
7717 struct redis_stat sb;
b492cf00 7718 unsigned long long loadedkeys = 0;
f80dff62 7719
7720 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7721 return REDIS_ERR;
7722
7723 if (fp == NULL) {
7724 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7725 exit(1);
7726 }
7727
7728 fakeClient = createFakeClient();
7729 while(1) {
7730 int argc, j;
7731 unsigned long len;
7732 robj **argv;
7733 char buf[128];
7734 sds argsds;
7735 struct redisCommand *cmd;
7736
7737 if (fgets(buf,sizeof(buf),fp) == NULL) {
7738 if (feof(fp))
7739 break;
7740 else
7741 goto readerr;
7742 }
7743 if (buf[0] != '*') goto fmterr;
7744 argc = atoi(buf+1);
7745 argv = zmalloc(sizeof(robj*)*argc);
7746 for (j = 0; j < argc; j++) {
7747 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7748 if (buf[0] != '$') goto fmterr;
7749 len = strtol(buf+1,NULL,10);
7750 argsds = sdsnewlen(NULL,len);
0f151ef1 7751 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 7752 argv[j] = createObject(REDIS_STRING,argsds);
7753 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7754 }
7755
7756 /* Command lookup */
7757 cmd = lookupCommand(argv[0]->ptr);
7758 if (!cmd) {
7759 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7760 exit(1);
7761 }
bdcb92f2 7762 /* Try object encoding */
f80dff62 7763 if (cmd->flags & REDIS_CMD_BULK)
05df7621 7764 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 7765 /* Run the command in the context of a fake client */
7766 fakeClient->argc = argc;
7767 fakeClient->argv = argv;
7768 cmd->proc(fakeClient);
7769 /* Discard the reply objects list from the fake client */
7770 while(listLength(fakeClient->reply))
7771 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7772 /* Clean up, ready for the next command */
7773 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7774 zfree(argv);
b492cf00 7775 /* Handle swapping while loading big datasets when VM is on */
7776 loadedkeys++;
7777 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7778 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 7779 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 7780 }
7781 }
f80dff62 7782 }
7783 fclose(fp);
7784 freeFakeClient(fakeClient);
7785 return REDIS_OK;
7786
7787readerr:
7788 if (feof(fp)) {
7789 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7790 } else {
7791 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7792 }
7793 exit(1);
7794fmterr:
7795 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7796 exit(1);
7797}
7798
9d65a1bb 7799/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
9c8e3cee 7800static int fwriteBulkObject(FILE *fp, robj *obj) {
9d65a1bb 7801 char buf[128];
b9bc0eef 7802 int decrrc = 0;
7803
f2d9f50f 7804 /* Avoid the incr/decr ref count business if possible to help
7805 * copy-on-write (we are often in a child process when this function
7806 * is called).
7807 * Also makes sure that key objects don't get incrRefCount-ed when VM
7808 * is enabled */
7809 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 7810 obj = getDecodedObject(obj);
7811 decrrc = 1;
7812 }
9d65a1bb 7813 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
7814 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 7815 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
7816 goto err;
9d65a1bb 7817 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 7818 if (decrrc) decrRefCount(obj);
9d65a1bb 7819 return 1;
7820err:
b9bc0eef 7821 if (decrrc) decrRefCount(obj);
9d65a1bb 7822 return 0;
7823}
7824
/* Write the binary-safe string 's' of 'len' bytes to 'fp' using the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if any of the writes failed (never -1). */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned long, so the matching conversion is %lu (it used to
     * be %ld, which is a type mismatch). */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* fwrite() with a zero size returns 0 even without errors, so the
     * payload is written only when there is one. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
7836
/* Write the double 'd', formatted with "%.17g", to 'fp' using the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t plen;

    /* The payload already carries its trailing CRLF, which must not be
     * counted in the announced length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,plen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
7847
/* Write the long 'l', formatted in decimal, to 'fp' using the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t plen;

    /* The payload already carries its trailing CRLF, which must not be
     * counted in the announced length. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,plen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
7858
7859/* Write a sequence of commands able to fully rebuild the dataset into
7860 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
7861static int rewriteAppendOnlyFile(char *filename) {
7862 dictIterator *di = NULL;
7863 dictEntry *de;
7864 FILE *fp;
7865 char tmpfile[256];
7866 int j;
7867 time_t now = time(NULL);
7868
7869 /* Note that we have to use a different temp name here compared to the
7870 * one used by rewriteAppendOnlyFileBackground() function. */
7871 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
7872 fp = fopen(tmpfile,"w");
7873 if (!fp) {
7874 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
7875 return REDIS_ERR;
7876 }
7877 for (j = 0; j < server.dbnum; j++) {
7878 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
7879 redisDb *db = server.db+j;
7880 dict *d = db->dict;
7881 if (dictSize(d) == 0) continue;
7882 di = dictGetIterator(d);
7883 if (!di) {
7884 fclose(fp);
7885 return REDIS_ERR;
7886 }
7887
7888 /* SELECT the new DB */
7889 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 7890 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 7891
7892 /* Iterate this DB writing every entry */
7893 while((de = dictNext(di)) != NULL) {
e7546c63 7894 robj *key, *o;
7895 time_t expiretime;
7896 int swapped;
7897
7898 key = dictGetEntryKey(de);
b9bc0eef 7899 /* If the value for this key is swapped, load a preview in memory.
7900 * We use a "swapped" flag to remember if we need to free the
7901 * value object instead to just increment the ref count anyway
7902 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 7903 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
7904 key->storage == REDIS_VM_SWAPPING) {
e7546c63 7905 o = dictGetEntryVal(de);
7906 swapped = 0;
7907 } else {
7908 o = vmPreviewObject(key);
e7546c63 7909 swapped = 1;
7910 }
7911 expiretime = getExpire(db,key);
9d65a1bb 7912
7913 /* Save the key and associated value */
9d65a1bb 7914 if (o->type == REDIS_STRING) {
7915 /* Emit a SET command */
7916 char cmd[]="*3\r\n$3\r\nSET\r\n";
7917 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7918 /* Key and value */
9c8e3cee 7919 if (fwriteBulkObject(fp,key) == 0) goto werr;
7920 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 7921 } else if (o->type == REDIS_LIST) {
7922 /* Emit the RPUSHes needed to rebuild the list */
7923 list *list = o->ptr;
7924 listNode *ln;
c7df85a4 7925 listIter li;
9d65a1bb 7926
c7df85a4 7927 listRewind(list,&li);
7928 while((ln = listNext(&li))) {
9d65a1bb 7929 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
7930 robj *eleobj = listNodeValue(ln);
7931
7932 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 7933 if (fwriteBulkObject(fp,key) == 0) goto werr;
7934 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 7935 }
7936 } else if (o->type == REDIS_SET) {
7937 /* Emit the SADDs needed to rebuild the set */
7938 dict *set = o->ptr;
7939 dictIterator *di = dictGetIterator(set);
7940 dictEntry *de;
7941
7942 while((de = dictNext(di)) != NULL) {
7943 char cmd[]="*3\r\n$4\r\nSADD\r\n";
7944 robj *eleobj = dictGetEntryKey(de);
7945
7946 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 7947 if (fwriteBulkObject(fp,key) == 0) goto werr;
7948 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 7949 }
7950 dictReleaseIterator(di);
7951 } else if (o->type == REDIS_ZSET) {
7952 /* Emit the ZADDs needed to rebuild the sorted set */
7953 zset *zs = o->ptr;
7954 dictIterator *di = dictGetIterator(zs->dict);
7955 dictEntry *de;
7956
7957 while((de = dictNext(di)) != NULL) {
7958 char cmd[]="*4\r\n$4\r\nZADD\r\n";
7959 robj *eleobj = dictGetEntryKey(de);
7960 double *score = dictGetEntryVal(de);
7961
7962 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 7963 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 7964 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 7965 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 7966 }
7967 dictReleaseIterator(di);
9c8e3cee 7968 } else if (o->type == REDIS_HASH) {
7969 char cmd[]="*4\r\n$4\r\nHSET\r\n";
7970
7971 /* Emit the HSETs needed to rebuild the hash */
7972 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7973 unsigned char *p = zipmapRewind(o->ptr);
7974 unsigned char *field, *val;
7975 unsigned int flen, vlen;
7976
7977 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
7978 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7979 if (fwriteBulkObject(fp,key) == 0) goto werr;
7980 if (fwriteBulkString(fp,(char*)field,flen) == -1)
7981 return -1;
7982 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
7983 return -1;
7984 }
7985 } else {
7986 dictIterator *di = dictGetIterator(o->ptr);
7987 dictEntry *de;
7988
7989 while((de = dictNext(di)) != NULL) {
7990 robj *field = dictGetEntryKey(de);
7991 robj *val = dictGetEntryVal(de);
7992
7993 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
7994 if (fwriteBulkObject(fp,key) == 0) goto werr;
7995 if (fwriteBulkObject(fp,field) == -1) return -1;
7996 if (fwriteBulkObject(fp,val) == -1) return -1;
7997 }
7998 dictReleaseIterator(di);
7999 }
9d65a1bb 8000 } else {
78409a0f 8001 redisAssert(0);
9d65a1bb 8002 }
8003 /* Save the expire time */
8004 if (expiretime != -1) {
e96e4fbf 8005 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 8006 /* If this key is already expired skip it */
8007 if (expiretime < now) continue;
8008 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8009 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8010 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8011 }
b9bc0eef 8012 if (swapped) decrRefCount(o);
9d65a1bb 8013 }
8014 dictReleaseIterator(di);
8015 }
8016
8017 /* Make sure data will not remain on the OS's output buffers */
8018 fflush(fp);
8019 fsync(fileno(fp));
8020 fclose(fp);
8021
8022 /* Use RENAME to make sure the DB file is changed atomically only
8023 * if the generate DB file is ok. */
8024 if (rename(tmpfile,filename) == -1) {
8025 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8026 unlink(tmpfile);
8027 return REDIS_ERR;
8028 }
8029 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8030 return REDIS_OK;
8031
8032werr:
8033 fclose(fp);
8034 unlink(tmpfile);
e96e4fbf 8035 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 8036 if (di) dictReleaseIterator(di);
8037 return REDIS_ERR;
8038}
8039
8040/* This is how rewriting of the append only file in background works:
8041 *
8042 * 1) The user calls BGREWRITEAOF
8043 * 2) Redis calls this function, that forks():
8044 * 2a) the child rewrite the append only file in a temp file.
8045 * 2b) the parent accumulates differences in server.bgrewritebuf.
8046 * 3) When the child finished '2a' exists.
8047 * 4) The parent will trap the exit code, if it's OK, will append the
8048 * data accumulated into server.bgrewritebuf into the temp file, and
8049 * finally will rename(2) the temp file in the actual file name.
8050 * The the new file is reopened as the new append only file. Profit!
8051 */
8052static int rewriteAppendOnlyFileBackground(void) {
8053 pid_t childpid;
8054
8055 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 8056 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 8057 if ((childpid = fork()) == 0) {
8058 /* Child */
8059 char tmpfile[256];
9d65a1bb 8060
054e426d 8061 if (server.vm_enabled) vmReopenSwapFile();
8062 close(server.fd);
9d65a1bb 8063 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8064 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 8065 _exit(0);
9d65a1bb 8066 } else {
478c2c6f 8067 _exit(1);
9d65a1bb 8068 }
8069 } else {
8070 /* Parent */
8071 if (childpid == -1) {
8072 redisLog(REDIS_WARNING,
8073 "Can't rewrite append only file in background: fork: %s",
8074 strerror(errno));
8075 return REDIS_ERR;
8076 }
8077 redisLog(REDIS_NOTICE,
8078 "Background append only file rewriting started by pid %d",childpid);
8079 server.bgrewritechildpid = childpid;
884d4b39 8080 updateDictResizePolicy();
85a83172 8081 /* We set appendseldb to -1 in order to force the next call to the
8082 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8083 * accumulated by the parent into server.bgrewritebuf will start
8084 * with a SELECT statement and it will be safe to merge. */
8085 server.appendseldb = -1;
9d65a1bb 8086 return REDIS_OK;
8087 }
8088 return REDIS_OK; /* unreached */
8089}
8090
8091static void bgrewriteaofCommand(redisClient *c) {
8092 if (server.bgrewritechildpid != -1) {
8093 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8094 return;
8095 }
8096 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 8097 char *status = "+Background append only file rewriting started\r\n";
8098 addReplySds(c,sdsnew(status));
9d65a1bb 8099 } else {
8100 addReply(c,shared.err);
8101 }
8102}
8103
/* Remove the temp file used by the background rewrite child with the
 * specified pid. The file name is built with the same pattern used by the
 * child ("temp-rewriteaof-bg-<pid>.aof"). Errors from unlink(2) are
 * ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",(int) childpid);
    unlink(path);
}
8110
996cb5f7 8111/* Virtual Memory is composed mainly of two subsystems:
8112 * - Blocking Virtual Memory
8113 * - Threaded Virtual Memory I/O
8114 * The two parts are not fully decoupled, but functions are split among two
8115 * different sections of the source code (delimited by comments) in order to
8116 * make more clear what functionality is about the blocking VM and what about
8117 * the threaded (not blocking) VM.
8118 *
8119 * Redis VM design:
8120 *
8121 * Redis VM is a blocking VM (one that blocks reading swapped values from
8122 * disk into memory when a value swapped out is needed in memory) that is made
8123 * unblocking by trying to examine the command argument vector in order to
8124 * load in background values that will likely be needed in order to exec
8125 * the command. The command is executed only once all the relevant keys
8126 * are loaded into memory.
8127 *
8128 * This is basically almost as simple as a blocking VM, but almost as parallel
8129 * as a fully non-blocking VM.
8130 */
8131
8132/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 8133
8134/* substitute the first occurrence of '%p' with the process pid in the
8135 * swap file name. */
8136static void expandVmSwapFilename(void) {
8137 char *p = strstr(server.vm_swap_file,"%p");
8138 sds new;
8139
8140 if (!p) return;
8141 new = sdsempty();
8142 *p = '\0';
8143 new = sdscat(new,server.vm_swap_file);
8144 new = sdscatprintf(new,"%ld",(long) getpid());
8145 new = sdscat(new,p+2);
8146 zfree(server.vm_swap_file);
8147 server.vm_swap_file = new;
8148}
8149
75680a3c 8150static void vmInit(void) {
8151 off_t totsize;
996cb5f7 8152 int pipefds[2];
bcaa7a4f 8153 size_t stacksize;
75680a3c 8154
4ad37480 8155 if (server.vm_max_threads != 0)
8156 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8157
054e426d 8158 expandVmSwapFilename();
8159 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
6fa987e3 8160 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8161 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8162 }
75680a3c 8163 if (server.vm_fp == NULL) {
6fa987e3 8164 redisLog(REDIS_WARNING,
8165 "Impossible to open the swap file: %s. Exiting.",
8166 strerror(errno));
75680a3c 8167 exit(1);
8168 }
8169 server.vm_fd = fileno(server.vm_fp);
8170 server.vm_next_page = 0;
8171 server.vm_near_pages = 0;
7d98e08c 8172 server.vm_stats_used_pages = 0;
8173 server.vm_stats_swapped_objects = 0;
8174 server.vm_stats_swapouts = 0;
8175 server.vm_stats_swapins = 0;
75680a3c 8176 totsize = server.vm_pages*server.vm_page_size;
8177 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8178 if (ftruncate(server.vm_fd,totsize) == -1) {
8179 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8180 strerror(errno));
8181 exit(1);
8182 } else {
8183 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8184 }
7d30035d 8185 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 8186 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 8187 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 8188 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 8189
996cb5f7 8190 /* Initialize threaded I/O (used by Virtual Memory) */
8191 server.io_newjobs = listCreate();
8192 server.io_processing = listCreate();
8193 server.io_processed = listCreate();
d5d55fc3 8194 server.io_ready_clients = listCreate();
92f8e882 8195 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 8196 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8197 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 8198 server.io_active_threads = 0;
996cb5f7 8199 if (pipe(pipefds) == -1) {
8200 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8201 ,strerror(errno));
8202 exit(1);
8203 }
8204 server.io_ready_pipe_read = pipefds[0];
8205 server.io_ready_pipe_write = pipefds[1];
8206 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 8207 /* LZF requires a lot of stack */
8208 pthread_attr_init(&server.io_threads_attr);
8209 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8210 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8211 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 8212 /* Listen for events in the threaded I/O pipe */
8213 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8214 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8215 oom("creating file event");
75680a3c 8216}
8217
06224fec 8218/* Mark the page as used */
8219static void vmMarkPageUsed(off_t page) {
8220 off_t byte = page/8;
8221 int bit = page&7;
970e10bb 8222 redisAssert(vmFreePage(page) == 1);
06224fec 8223 server.vm_bitmap[byte] |= 1<<bit;
8224}
8225
8226/* Mark N contiguous pages as used, with 'page' being the first. */
8227static void vmMarkPagesUsed(off_t page, off_t count) {
8228 off_t j;
8229
8230 for (j = 0; j < count; j++)
7d30035d 8231 vmMarkPageUsed(page+j);
7d98e08c 8232 server.vm_stats_used_pages += count;
7c775e09 8233 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8234 (long long)count, (long long)page);
06224fec 8235}
8236
8237/* Mark the page as free */
8238static void vmMarkPageFree(off_t page) {
8239 off_t byte = page/8;
8240 int bit = page&7;
970e10bb 8241 redisAssert(vmFreePage(page) == 0);
06224fec 8242 server.vm_bitmap[byte] &= ~(1<<bit);
8243}
8244
8245/* Mark N contiguous pages as free, with 'page' being the first. */
8246static void vmMarkPagesFree(off_t page, off_t count) {
8247 off_t j;
8248
8249 for (j = 0; j < count; j++)
7d30035d 8250 vmMarkPageFree(page+j);
7d98e08c 8251 server.vm_stats_used_pages -= count;
7c775e09 8252 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8253 (long long)count, (long long)page);
06224fec 8254}
8255
8256/* Test if the page is free */
8257static int vmFreePage(off_t page) {
8258 off_t byte = page/8;
8259 int bit = page&7;
7d30035d 8260 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 8261}
8262
8263/* Find N contiguous free pages storing the first page of the cluster in *first.
3a66edc7 8264 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
8265 * REDIS_ERR is returned.
06224fec 8266 *
8267 * This function uses a simple algorithm: we try to allocate
8268 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8269 * again from the start of the swap file searching for free spaces.
8270 *
8271 * If it looks pretty clear that there are no free pages near our offset
8272 * we try to find less populated places doing a forward jump of
8273 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8274 * without hurry, and then we jump again and so forth...
8275 *
8276 * This function can be improved using a free list to avoid to guess
8277 * too much, since we could collect data about freed pages.
8278 *
8279 * note: I implemented this function just after watching an episode of
8280 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8281 */
c7df85a4 8282static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 8283 off_t base, offset = 0, since_jump = 0, numfree = 0;
8284
8285 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8286 server.vm_near_pages = 0;
8287 server.vm_next_page = 0;
8288 }
8289 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8290 base = server.vm_next_page;
8291
8292 while(offset < server.vm_pages) {
8293 off_t this = base+offset;
8294
8295 /* If we overflow, restart from page zero */
8296 if (this >= server.vm_pages) {
8297 this -= server.vm_pages;
8298 if (this == 0) {
8299 /* Just overflowed, what we found on tail is no longer
8300 * interesting, as it's no longer contiguous. */
8301 numfree = 0;
8302 }
8303 }
8304 if (vmFreePage(this)) {
8305 /* This is a free page */
8306 numfree++;
8307 /* Already got N free pages? Return to the caller, with success */
8308 if (numfree == n) {
7d30035d 8309 *first = this-(n-1);
8310 server.vm_next_page = this+1;
7c775e09 8311 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 8312 return REDIS_OK;
06224fec 8313 }
8314 } else {
8315 /* The current one is not a free page */
8316 numfree = 0;
8317 }
8318
8319 /* Fast-forward if the current page is not free and we already
8320 * searched enough near this place. */
8321 since_jump++;
8322 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8323 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8324 since_jump = 0;
8325 /* Note that even if we rewind after the jump, we are don't need
8326 * to make sure numfree is set to zero as we only jump *if* it
8327 * is set to zero. */
8328 } else {
8329 /* Otherwise just check the next page */
8330 offset++;
8331 }
8332 }
3a66edc7 8333 return REDIS_ERR;
8334}
8335
a5819310 8336/* Write the specified object at the specified page of the swap file */
8337static int vmWriteObjectOnSwap(robj *o, off_t page) {
8338 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8339 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8340 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8341 redisLog(REDIS_WARNING,
9ebed7cf 8342 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 8343 strerror(errno));
8344 return REDIS_ERR;
8345 }
8346 rdbSaveObject(server.vm_fp,o);
ba76a8f9 8347 fflush(server.vm_fp);
a5819310 8348 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8349 return REDIS_OK;
8350}
8351
3a66edc7 8352/* Swap the 'val' object relative to 'key' into disk. Store all the information
8353 * needed to later retrieve the object into the key object.
8354 * If we can't find enough contiguous empty pages to swap the object on disk
8355 * REDIS_ERR is returned. */
a69a0c9c 8356static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 8357 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 8358 off_t page;
8359
8360 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 8361 assert(key->refcount == 1);
3a66edc7 8362 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 8363 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 8364 key->vm.page = page;
8365 key->vm.usedpages = pages;
8366 key->storage = REDIS_VM_SWAPPED;
d894161b 8367 key->vtype = val->type;
3a66edc7 8368 decrRefCount(val); /* Deallocate the object from memory. */
8369 vmMarkPagesUsed(page,pages);
7d30035d 8370 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8371 (unsigned char*) key->ptr,
8372 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 8373 server.vm_stats_swapped_objects++;
8374 server.vm_stats_swapouts++;
3a66edc7 8375 return REDIS_OK;
8376}
8377
a5819310 8378static robj *vmReadObjectFromSwap(off_t page, int type) {
8379 robj *o;
3a66edc7 8380
a5819310 8381 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8382 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 8383 redisLog(REDIS_WARNING,
d5d55fc3 8384 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 8385 strerror(errno));
478c2c6f 8386 _exit(1);
3a66edc7 8387 }
a5819310 8388 o = rdbLoadObject(type,server.vm_fp);
8389 if (o == NULL) {
d5d55fc3 8390 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 8391 _exit(1);
3a66edc7 8392 }
a5819310 8393 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8394 return o;
8395}
8396
8397/* Load the value object relative to the 'key' object from swap to memory.
8398 * The newly allocated object is returned.
8399 *
8400 * If preview is true the unserialized object is returned to the caller but
8401 * no changes are made to the key object, nor the pages are marked as freed */
8402static robj *vmGenericLoadObject(robj *key, int preview) {
8403 robj *val;
8404
d5d55fc3 8405 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 8406 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 8407 if (!preview) {
8408 key->storage = REDIS_VM_MEMORY;
8409 key->vm.atime = server.unixtime;
8410 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8411 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8412 (unsigned char*) key->ptr);
7d98e08c 8413 server.vm_stats_swapped_objects--;
38aba9a1 8414 } else {
8415 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8416 (unsigned char*) key->ptr);
7e69548d 8417 }
7d98e08c 8418 server.vm_stats_swapins++;
3a66edc7 8419 return val;
06224fec 8420}
8421
7e69548d 8422/* Plain object loading, from swap to memory */
8423static robj *vmLoadObject(robj *key) {
996cb5f7 8424 /* If we are loading the object in background, stop it, we
8425 * need to load this object synchronously ASAP. */
8426 if (key->storage == REDIS_VM_LOADING)
8427 vmCancelThreadedIOJob(key);
7e69548d 8428 return vmGenericLoadObject(key,0);
8429}
8430
8431/* Just load the value on disk, without to modify the key.
8432 * This is useful when we want to perform some operation on the value
8433 * without to really bring it from swap to memory, like while saving the
8434 * dataset or rewriting the append only log. */
8435static robj *vmPreviewObject(robj *key) {
8436 return vmGenericLoadObject(key,1);
8437}
8438
4ef8de8a 8439/* How a good candidate is this object for swapping?
8440 * The better candidate it is, the greater the returned value.
8441 *
8442 * Currently we try to perform a fast estimation of the object size in
8443 * memory, and combine it with aging informations.
8444 *
8445 * Basically swappability = idle-time * log(estimated size)
8446 *
8447 * Bigger objects are preferred over smaller objects, but not
8448 * proportionally, this is why we use the logarithm. This algorithm is
8449 * just a first try and will probably be tuned later. */
8450static double computeObjectSwappability(robj *o) {
8451 time_t age = server.unixtime - o->vm.atime;
8452 long asize = 0;
8453 list *l;
8454 dict *d;
8455 struct dictEntry *de;
8456 int z;
8457
8458 if (age <= 0) return 0;
8459 switch(o->type) {
8460 case REDIS_STRING:
8461 if (o->encoding != REDIS_ENCODING_RAW) {
8462 asize = sizeof(*o);
8463 } else {
8464 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8465 }
8466 break;
8467 case REDIS_LIST:
8468 l = o->ptr;
8469 listNode *ln = listFirst(l);
8470
8471 asize = sizeof(list);
8472 if (ln) {
8473 robj *ele = ln->value;
8474 long elesize;
8475
8476 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8477 (sizeof(*o)+sdslen(ele->ptr)) :
8478 sizeof(*o);
8479 asize += (sizeof(listNode)+elesize)*listLength(l);
8480 }
8481 break;
8482 case REDIS_SET:
8483 case REDIS_ZSET:
8484 z = (o->type == REDIS_ZSET);
8485 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8486
8487 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8488 if (z) asize += sizeof(zset)-sizeof(dict);
8489 if (dictSize(d)) {
8490 long elesize;
8491 robj *ele;
8492
8493 de = dictGetRandomKey(d);
8494 ele = dictGetEntryKey(de);
8495 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8496 (sizeof(*o)+sdslen(ele->ptr)) :
8497 sizeof(*o);
8498 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8499 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8500 }
8501 break;
a97b9060 8502 case REDIS_HASH:
8503 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8504 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8505 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8506 unsigned int klen, vlen;
8507 unsigned char *key, *val;
8508
8509 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8510 klen = 0;
8511 vlen = 0;
8512 }
8513 asize = len*(klen+vlen+3);
8514 } else if (o->encoding == REDIS_ENCODING_HT) {
8515 d = o->ptr;
8516 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8517 if (dictSize(d)) {
8518 long elesize;
8519 robj *ele;
8520
8521 de = dictGetRandomKey(d);
8522 ele = dictGetEntryKey(de);
8523 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8524 (sizeof(*o)+sdslen(ele->ptr)) :
8525 sizeof(*o);
8526 ele = dictGetEntryVal(de);
8527 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8528 (sizeof(*o)+sdslen(ele->ptr)) :
8529 sizeof(*o);
8530 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8531 }
8532 }
8533 break;
4ef8de8a 8534 }
c8c72447 8535 return (double)age*log(1+asize);
4ef8de8a 8536}
8537
8538/* Try to swap an object that's a good candidate for swapping.
8539 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 8540 * to swap any object at all.
8541 *
8542 * If 'usethreaded' is true, Redis will try to swap the object in background
8543 * using I/O threads. */
8544static int vmSwapOneObject(int usethreads) {
4ef8de8a 8545 int j, i;
8546 struct dictEntry *best = NULL;
8547 double best_swappability = 0;
b9bc0eef 8548 redisDb *best_db = NULL;
4ef8de8a 8549 robj *key, *val;
8550
8551 for (j = 0; j < server.dbnum; j++) {
8552 redisDb *db = server.db+j;
b72f6a4b 8553 /* Why maxtries is set to 100?
8554 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8555 * are swappable objects */
b0d8747d 8556 int maxtries = 100;
4ef8de8a 8557
8558 if (dictSize(db->dict) == 0) continue;
8559 for (i = 0; i < 5; i++) {
8560 dictEntry *de;
8561 double swappability;
8562
e3cadb8a 8563 if (maxtries) maxtries--;
4ef8de8a 8564 de = dictGetRandomKey(db->dict);
8565 key = dictGetEntryKey(de);
8566 val = dictGetEntryVal(de);
1064ef87 8567 /* Only swap objects that are currently in memory.
8568 *
8569 * Also don't swap shared objects if threaded VM is on, as we
8570 * try to ensure that the main thread does not touch the
8571 * object while the I/O thread is using it, but we can't
8572 * control other keys without adding additional mutex. */
8573 if (key->storage != REDIS_VM_MEMORY ||
8574 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 8575 if (maxtries) i--; /* don't count this try */
8576 continue;
8577 }
4ef8de8a 8578 swappability = computeObjectSwappability(val);
8579 if (!best || swappability > best_swappability) {
8580 best = de;
8581 best_swappability = swappability;
b9bc0eef 8582 best_db = db;
4ef8de8a 8583 }
8584 }
8585 }
7c775e09 8586 if (best == NULL) return REDIS_ERR;
4ef8de8a 8587 key = dictGetEntryKey(best);
8588 val = dictGetEntryVal(best);
8589
e3cadb8a 8590 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 8591 key->ptr, best_swappability);
8592
8593 /* Unshare the key if needed */
8594 if (key->refcount > 1) {
8595 robj *newkey = dupStringObject(key);
8596 decrRefCount(key);
8597 key = dictGetEntryKey(best) = newkey;
8598 }
8599 /* Swap it */
a69a0c9c 8600 if (usethreads) {
b9bc0eef 8601 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 8602 return REDIS_OK;
8603 } else {
a69a0c9c 8604 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8605 dictGetEntryVal(best) = NULL;
8606 return REDIS_OK;
8607 } else {
8608 return REDIS_ERR;
8609 }
4ef8de8a 8610 }
8611}
8612
/* Swap one object out synchronously. See vmSwapOneObject() for the
 * return value. */
static int vmSwapOneObjectBlocking() {
    const int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
8616
/* Swap one object out using the threaded I/O path. See vmSwapOneObject()
 * for the return value. */
static int vmSwapOneObjectThreaded() {
    const int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
8620
7e69548d 8621/* Return true if it's safe to swap out objects in a given moment.
8622 * Basically we don't want to swap objects out while there is a BGSAVE
8623 * or a BGAEOREWRITE running in backgroud. */
8624static int vmCanSwapOut(void) {
8625 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8626}
8627
1b03836c 8628/* Delete a key if swapped. Returns 1 if the key was found, was swapped
8629 * and was deleted. Otherwise 0 is returned. */
8630static int deleteIfSwapped(redisDb *db, robj *key) {
8631 dictEntry *de;
8632 robj *foundkey;
8633
8634 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8635 foundkey = dictGetEntryKey(de);
8636 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8637 deleteKey(db,key);
8638 return 1;
8639}
8640
996cb5f7 8641/* =================== Virtual Memory - Threaded I/O ======================= */
8642
b9bc0eef 8643static void freeIOJob(iojob *j) {
d5d55fc3 8644 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8645 j->type == REDIS_IOJOB_DO_SWAP ||
8646 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 8647 decrRefCount(j->val);
78ebe4c8 8648 /* We don't decrRefCount the j->key field as we did't incremented
8649 * the count creating IO Jobs. This is because the key field here is
8650 * just used as an indentifier and if a key is removed the Job should
8651 * never be touched again. */
b9bc0eef 8652 zfree(j);
8653}
8654
996cb5f7 8655/* Every time a thread finished a Job, it writes a byte into the write side
8656 * of an unix pipe in order to "awake" the main thread, and this function
8657 * is called. */
8658static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8659 int mask)
8660{
8661 char buf[1];
b0d8747d 8662 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 8663 REDIS_NOTUSED(el);
8664 REDIS_NOTUSED(mask);
8665 REDIS_NOTUSED(privdata);
8666
8667 /* For every byte we read in the read side of the pipe, there is one
8668 * I/O job completed to process. */
8669 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 8670 iojob *j;
8671 listNode *ln;
8672 robj *key;
8673 struct dictEntry *de;
8674
996cb5f7 8675 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 8676
8677 /* Get the processed element (the oldest one) */
8678 lockThreadedIO();
1064ef87 8679 assert(listLength(server.io_processed) != 0);
f6c0bba8 8680 if (toprocess == -1) {
8681 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8682 if (toprocess <= 0) toprocess = 1;
8683 }
b9bc0eef 8684 ln = listFirst(server.io_processed);
8685 j = ln->value;
8686 listDelNode(server.io_processed,ln);
8687 unlockThreadedIO();
8688 /* If this job is marked as canceled, just ignore it */
8689 if (j->canceled) {
8690 freeIOJob(j);
8691 continue;
8692 }
8693 /* Post process it in the main thread, as there are things we
8694 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 8695 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 8696 de = dictFind(j->db->dict,j->key);
8697 assert(de != NULL);
8698 key = dictGetEntryKey(de);
8699 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 8700 redisDb *db;
8701
b9bc0eef 8702 /* Key loaded, bring it at home */
8703 key->storage = REDIS_VM_MEMORY;
8704 key->vm.atime = server.unixtime;
8705 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8706 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8707 (unsigned char*) key->ptr);
8708 server.vm_stats_swapped_objects--;
8709 server.vm_stats_swapins++;
d5d55fc3 8710 dictGetEntryVal(de) = j->val;
8711 incrRefCount(j->val);
8712 db = j->db;
b9bc0eef 8713 freeIOJob(j);
d5d55fc3 8714 /* Handle clients waiting for this key to be loaded. */
8715 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 8716 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8717 /* Now we know the amount of pages required to swap this object.
8718 * Let's find some space for it, and queue this task again
8719 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 8720 if (!vmCanSwapOut() ||
8721 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8722 {
8723 /* Ooops... no space or we can't swap as there is
8724 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 8725 freeIOJob(j);
054e426d 8726 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 8727 } else {
c7df85a4 8728 /* Note that we need to mark this pages as used now,
8729 * if the job will be canceled, we'll mark them as freed
8730 * again. */
8731 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 8732 j->type = REDIS_IOJOB_DO_SWAP;
8733 lockThreadedIO();
8734 queueIOJob(j);
8735 unlockThreadedIO();
8736 }
8737 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8738 robj *val;
8739
8740 /* Key swapped. We can finally free some memory. */
6c96ba7d 8741 if (key->storage != REDIS_VM_SWAPPING) {
8742 printf("key->storage: %d\n",key->storage);
8743 printf("key->name: %s\n",(char*)key->ptr);
8744 printf("key->refcount: %d\n",key->refcount);
8745 printf("val: %p\n",(void*)j->val);
8746 printf("val->type: %d\n",j->val->type);
8747 printf("val->ptr: %s\n",(char*)j->val->ptr);
8748 }
8749 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 8750 val = dictGetEntryVal(de);
8751 key->vm.page = j->page;
8752 key->vm.usedpages = j->pages;
8753 key->storage = REDIS_VM_SWAPPED;
8754 key->vtype = j->val->type;
8755 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 8756 dictGetEntryVal(de) = NULL;
b9bc0eef 8757 redisLog(REDIS_DEBUG,
8758 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8759 (unsigned char*) key->ptr,
8760 (unsigned long long) j->page, (unsigned long long) j->pages);
8761 server.vm_stats_swapped_objects++;
8762 server.vm_stats_swapouts++;
8763 freeIOJob(j);
f11b8647 8764 /* Put a few more swap requests in queue if we are still
8765 * out of memory */
b0d8747d 8766 if (trytoswap && vmCanSwapOut() &&
8767 zmalloc_used_memory() > server.vm_max_memory)
8768 {
f11b8647 8769 int more = 1;
8770 while(more) {
8771 lockThreadedIO();
8772 more = listLength(server.io_newjobs) <
8773 (unsigned) server.vm_max_threads;
8774 unlockThreadedIO();
8775 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 8776 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8777 trytoswap = 0;
8778 break;
8779 }
f11b8647 8780 }
8781 }
b9bc0eef 8782 }
c953f24b 8783 processed++;
f6c0bba8 8784 if (processed == toprocess) return;
996cb5f7 8785 }
8786 if (retval < 0 && errno != EAGAIN) {
8787 redisLog(REDIS_WARNING,
8788 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8789 strerror(errno));
8790 }
8791}
8792
8793static void lockThreadedIO(void) {
8794 pthread_mutex_lock(&server.io_mutex);
8795}
8796
8797static void unlockThreadedIO(void) {
8798 pthread_mutex_unlock(&server.io_mutex);
8799}
8800
8801/* Remove the specified object from the threaded I/O queue if still not
8802 * processed, otherwise make sure to flag it as canceled. */
8803static void vmCancelThreadedIOJob(robj *o) {
8804 list *lists[3] = {
6c96ba7d 8805 server.io_newjobs, /* 0 */
8806 server.io_processing, /* 1 */
8807 server.io_processed /* 2 */
996cb5f7 8808 };
8809 int i;
8810
8811 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 8812again:
996cb5f7 8813 lockThreadedIO();
8814 /* Search for a matching key in one of the queues */
8815 for (i = 0; i < 3; i++) {
8816 listNode *ln;
c7df85a4 8817 listIter li;
996cb5f7 8818
c7df85a4 8819 listRewind(lists[i],&li);
8820 while ((ln = listNext(&li)) != NULL) {
996cb5f7 8821 iojob *job = ln->value;
8822
6c96ba7d 8823 if (job->canceled) continue; /* Skip this, already canceled. */
78ebe4c8 8824 if (job->key == o) {
970e10bb 8825 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
8826 (void*)job, (char*)o->ptr, job->type, i);
427a2153 8827 /* Mark the pages as free since the swap didn't happened
8828 * or happened but is now discarded. */
970e10bb 8829 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 8830 vmMarkPagesFree(job->page,job->pages);
8831 /* Cancel the job. It depends on the list the job is
8832 * living in. */
996cb5f7 8833 switch(i) {
8834 case 0: /* io_newjobs */
6c96ba7d 8835 /* If the job was yet not processed the best thing to do
996cb5f7 8836 * is to remove it from the queue at all */
6c96ba7d 8837 freeIOJob(job);
996cb5f7 8838 listDelNode(lists[i],ln);
8839 break;
8840 case 1: /* io_processing */
d5d55fc3 8841 /* Oh Shi- the thread is messing with the Job:
8842 *
8843 * Probably it's accessing the object if this is a
8844 * PREPARE_SWAP or DO_SWAP job.
8845 * If it's a LOAD job it may be reading from disk and
8846 * if we don't wait for the job to terminate before to
8847 * cancel it, maybe in a few microseconds data can be
8848 * corrupted in this pages. So the short story is:
8849 *
8850 * Better to wait for the job to move into the
8851 * next queue (processed)... */
8852
8853 /* We try again and again until the job is completed. */
8854 unlockThreadedIO();
8855 /* But let's wait some time for the I/O thread
8856 * to finish with this job. After all this condition
8857 * should be very rare. */
8858 usleep(1);
8859 goto again;
996cb5f7 8860 case 2: /* io_processed */
2e111efe 8861 /* The job was already processed, that's easy...
8862 * just mark it as canceled so that we'll ignore it
8863 * when processing completed jobs. */
996cb5f7 8864 job->canceled = 1;
8865 break;
8866 }
c7df85a4 8867 /* Finally we have to adjust the storage type of the object
8868 * in order to "UNDO" the operaiton. */
996cb5f7 8869 if (o->storage == REDIS_VM_LOADING)
8870 o->storage = REDIS_VM_SWAPPED;
8871 else if (o->storage == REDIS_VM_SWAPPING)
8872 o->storage = REDIS_VM_MEMORY;
8873 unlockThreadedIO();
8874 return;
8875 }
8876 }
8877 }
8878 unlockThreadedIO();
8879 assert(1 != 1); /* We should never reach this */
8880}
8881
b9bc0eef 8882static void *IOThreadEntryPoint(void *arg) {
8883 iojob *j;
8884 listNode *ln;
8885 REDIS_NOTUSED(arg);
8886
8887 pthread_detach(pthread_self());
8888 while(1) {
8889 /* Get a new job to process */
8890 lockThreadedIO();
8891 if (listLength(server.io_newjobs) == 0) {
8892 /* No new jobs in queue, exit. */
9ebed7cf 8893 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
8894 (long) pthread_self());
b9bc0eef 8895 server.io_active_threads--;
8896 unlockThreadedIO();
8897 return NULL;
8898 }
8899 ln = listFirst(server.io_newjobs);
8900 j = ln->value;
8901 listDelNode(server.io_newjobs,ln);
8902 /* Add the job in the processing queue */
8903 j->thread = pthread_self();
8904 listAddNodeTail(server.io_processing,j);
8905 ln = listLast(server.io_processing); /* We use ln later to remove it */
8906 unlockThreadedIO();
9ebed7cf 8907 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
8908 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 8909
8910 /* Process the Job */
8911 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 8912 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 8913 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8914 FILE *fp = fopen("/dev/null","w+");
8915 j->pages = rdbSavedObjectPages(j->val,fp);
8916 fclose(fp);
8917 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 8918 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
8919 j->canceled = 1;
b9bc0eef 8920 }
8921
8922 /* Done: insert the job into the processed queue */
9ebed7cf 8923 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
8924 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 8925 lockThreadedIO();
8926 listDelNode(server.io_processing,ln);
8927 listAddNodeTail(server.io_processed,j);
8928 unlockThreadedIO();
8929
8930 /* Signal the main thread there is new stuff to process */
8931 assert(write(server.io_ready_pipe_write,"x",1) == 1);
8932 }
8933 return NULL; /* never reached */
8934}
8935
8936static void spawnIOThread(void) {
8937 pthread_t thread;
478c2c6f 8938 sigset_t mask, omask;
a97b9060 8939 int err;
b9bc0eef 8940
478c2c6f 8941 sigemptyset(&mask);
8942 sigaddset(&mask,SIGCHLD);
8943 sigaddset(&mask,SIGHUP);
8944 sigaddset(&mask,SIGPIPE);
8945 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 8946 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
8947 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
8948 strerror(err));
8949 usleep(1000000);
8950 }
478c2c6f 8951 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 8952 server.io_active_threads++;
8953}
8954
4ee9488d 8955/* We need to wait for the last thread to exit before we are able to
8956 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 8957static void waitEmptyIOJobsQueue(void) {
4ee9488d 8958 while(1) {
76b7233a 8959 int io_processed_len;
8960
4ee9488d 8961 lockThreadedIO();
054e426d 8962 if (listLength(server.io_newjobs) == 0 &&
8963 listLength(server.io_processing) == 0 &&
8964 server.io_active_threads == 0)
8965 {
4ee9488d 8966 unlockThreadedIO();
8967 return;
8968 }
76b7233a 8969 /* While waiting for empty jobs queue condition we post-process some
8970 * finshed job, as I/O threads may be hanging trying to write against
8971 * the io_ready_pipe_write FD but there are so much pending jobs that
8972 * it's blocking. */
8973 io_processed_len = listLength(server.io_processed);
4ee9488d 8974 unlockThreadedIO();
76b7233a 8975 if (io_processed_len) {
8976 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
8977 usleep(1000); /* 1 millisecond */
8978 } else {
8979 usleep(10000); /* 10 milliseconds */
8980 }
4ee9488d 8981 }
8982}
8983
054e426d 8984static void vmReopenSwapFile(void) {
478c2c6f 8985 /* Note: we don't close the old one as we are in the child process
8986 * and don't want to mess at all with the original file object. */
054e426d 8987 server.vm_fp = fopen(server.vm_swap_file,"r+b");
8988 if (server.vm_fp == NULL) {
8989 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
8990 server.vm_swap_file);
478c2c6f 8991 _exit(1);
054e426d 8992 }
8993 server.vm_fd = fileno(server.vm_fp);
8994}
8995
b9bc0eef 8996/* This function must be called while with threaded IO locked */
8997static void queueIOJob(iojob *j) {
6c96ba7d 8998 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
8999 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 9000 listAddNodeTail(server.io_newjobs,j);
9001 if (server.io_active_threads < server.vm_max_threads)
9002 spawnIOThread();
9003}
9004
9005static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9006 iojob *j;
9007
9008 assert(key->storage == REDIS_VM_MEMORY);
9009 assert(key->refcount == 1);
9010
9011 j = zmalloc(sizeof(*j));
9012 j->type = REDIS_IOJOB_PREPARE_SWAP;
9013 j->db = db;
78ebe4c8 9014 j->key = key;
b9bc0eef 9015 j->val = val;
9016 incrRefCount(val);
9017 j->canceled = 0;
9018 j->thread = (pthread_t) -1;
f11b8647 9019 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 9020
9021 lockThreadedIO();
9022 queueIOJob(j);
9023 unlockThreadedIO();
9024 return REDIS_OK;
9025}
9026
b0d8747d 9027/* ============ Virtual Memory - Blocking clients on missing keys =========== */
9028
d5d55fc3 9029/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9030 * If there is not already a job loading the key, it is craeted.
9031 * The key is added to the io_keys list in the client structure, and also
9032 * in the hash table mapping swapped keys to waiting clients, that is,
9033 * server.io_waited_keys. */
9034static int waitForSwappedKey(redisClient *c, robj *key) {
9035 struct dictEntry *de;
9036 robj *o;
9037 list *l;
9038
9039 /* If the key does not exist or is already in RAM we don't need to
9040 * block the client at all. */
9041 de = dictFind(c->db->dict,key);
9042 if (de == NULL) return 0;
9043 o = dictGetEntryKey(de);
9044 if (o->storage == REDIS_VM_MEMORY) {
9045 return 0;
9046 } else if (o->storage == REDIS_VM_SWAPPING) {
9047 /* We were swapping the key, undo it! */
9048 vmCancelThreadedIOJob(o);
9049 return 0;
9050 }
9051
9052 /* OK: the key is either swapped, or being loaded just now. */
9053
9054 /* Add the key to the list of keys this client is waiting for.
9055 * This maps clients to keys they are waiting for. */
9056 listAddNodeTail(c->io_keys,key);
9057 incrRefCount(key);
9058
9059 /* Add the client to the swapped keys => clients waiting map. */
9060 de = dictFind(c->db->io_keys,key);
9061 if (de == NULL) {
9062 int retval;
9063
9064 /* For every key we take a list of clients blocked for it */
9065 l = listCreate();
9066 retval = dictAdd(c->db->io_keys,key,l);
9067 incrRefCount(key);
9068 assert(retval == DICT_OK);
9069 } else {
9070 l = dictGetEntryVal(de);
9071 }
9072 listAddNodeTail(l,c);
9073
9074 /* Are we already loading the key from disk? If not create a job */
9075 if (o->storage == REDIS_VM_SWAPPED) {
9076 iojob *j;
9077
9078 o->storage = REDIS_VM_LOADING;
9079 j = zmalloc(sizeof(*j));
9080 j->type = REDIS_IOJOB_LOAD;
9081 j->db = c->db;
78ebe4c8 9082 j->key = o;
d5d55fc3 9083 j->key->vtype = o->vtype;
9084 j->page = o->vm.page;
9085 j->val = NULL;
9086 j->canceled = 0;
9087 j->thread = (pthread_t) -1;
9088 lockThreadedIO();
9089 queueIOJob(j);
9090 unlockThreadedIO();
9091 }
9092 return 1;
9093}
9094
76583ea4
PN
9095/* Preload keys needed for the ZUNION and ZINTER commands. */
9096static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9097 int i, num;
9098 num = atoi(c->argv[2]->ptr);
9099 for (i = 0; i < num; i++) {
9100 waitForSwappedKey(c,c->argv[3+i]);
9101 }
9102}
9103
b0d8747d 9104/* Is this client attempting to run a command against swapped keys?
d5d55fc3 9105 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 9106 *
d5d55fc3 9107 * The important idea about this function is that it can fail! If keys will
9108 * still be swapped when the client is resumed, this key lookups will
9109 * just block loading keys from disk. In practical terms this should only
9110 * happen with SORT BY command or if there is a bug in this function.
9111 *
9112 * Return 1 if the client is marked as blocked, 0 if the client can
9113 * continue as the keys it is going to access appear to be in memory. */
9114static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
7c775e09 9115 int j, last;
9116
76583ea4
PN
9117 if (cmd->vm_preload_proc != NULL) {
9118 cmd->vm_preload_proc(c);
9119 } else {
9120 if (cmd->vm_firstkey == 0) return 0;
9121 last = cmd->vm_lastkey;
9122 if (last < 0) last = c->argc+last;
9123 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9124 waitForSwappedKey(c,c->argv[j]);
9125 }
9126
d5d55fc3 9127 /* If the client was blocked for at least one key, mark it as blocked. */
9128 if (listLength(c->io_keys)) {
9129 c->flags |= REDIS_IO_WAIT;
9130 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9131 server.vm_blocked_clients++;
9132 return 1;
9133 } else {
9134 return 0;
9135 }
9136}
9137
9138/* Remove the 'key' from the list of blocked keys for a given client.
9139 *
9140 * The function returns 1 when there are no longer blocking keys after
9141 * the current one was removed (and the client can be unblocked). */
9142static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9143 list *l;
9144 listNode *ln;
9145 listIter li;
9146 struct dictEntry *de;
9147
9148 /* Remove the key from the list of keys this client is waiting for. */
9149 listRewind(c->io_keys,&li);
9150 while ((ln = listNext(&li)) != NULL) {
9151 if (compareStringObjects(ln->value,key) == 0) {
9152 listDelNode(c->io_keys,ln);
9153 break;
9154 }
9155 }
9156 assert(ln != NULL);
9157
9158 /* Remove the client form the key => waiting clients map. */
9159 de = dictFind(c->db->io_keys,key);
9160 assert(de != NULL);
9161 l = dictGetEntryVal(de);
9162 ln = listSearchKey(l,c);
9163 assert(ln != NULL);
9164 listDelNode(l,ln);
9165 if (listLength(l) == 0)
9166 dictDelete(c->db->io_keys,key);
9167
9168 return listLength(c->io_keys) == 0;
9169}
9170
9171static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9172 struct dictEntry *de;
9173 list *l;
9174 listNode *ln;
9175 int len;
9176
9177 de = dictFind(db->io_keys,key);
9178 if (!de) return;
9179
9180 l = dictGetEntryVal(de);
9181 len = listLength(l);
9182 /* Note: we can't use something like while(listLength(l)) as the list
9183 * can be freed by the calling function when we remove the last element. */
9184 while (len--) {
9185 ln = listFirst(l);
9186 redisClient *c = ln->value;
9187
9188 if (dontWaitForSwappedKey(c,key)) {
9189 /* Put the client in the list of clients ready to go as we
9190 * loaded all the keys about it. */
9191 listAddNodeTail(server.io_ready_clients,c);
9192 }
9193 }
b0d8747d 9194}
b0d8747d 9195
500ece7c 9196/* =========================== Remote Configuration ========================= */
9197
9198static void configSetCommand(redisClient *c) {
9199 robj *o = getDecodedObject(c->argv[3]);
9200 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9201 zfree(server.dbfilename);
9202 server.dbfilename = zstrdup(o->ptr);
9203 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9204 zfree(server.requirepass);
9205 server.requirepass = zstrdup(o->ptr);
9206 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9207 zfree(server.masterauth);
9208 server.masterauth = zstrdup(o->ptr);
9209 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9210 server.maxmemory = strtoll(o->ptr, NULL, 10);
9211 } else {
9212 addReplySds(c,sdscatprintf(sdsempty(),
9213 "-ERR not supported CONFIG parameter %s\r\n",
9214 (char*)c->argv[2]->ptr));
9215 decrRefCount(o);
9216 return;
9217 }
9218 decrRefCount(o);
9219 addReply(c,shared.ok);
9220}
9221
9222static void configGetCommand(redisClient *c) {
9223 robj *o = getDecodedObject(c->argv[2]);
9224 robj *lenobj = createObject(REDIS_STRING,NULL);
9225 char *pattern = o->ptr;
9226 int matches = 0;
9227
9228 addReply(c,lenobj);
9229 decrRefCount(lenobj);
9230
9231 if (stringmatch(pattern,"dbfilename",0)) {
9232 addReplyBulkCString(c,"dbfilename");
9233 addReplyBulkCString(c,server.dbfilename);
9234 matches++;
9235 }
9236 if (stringmatch(pattern,"requirepass",0)) {
9237 addReplyBulkCString(c,"requirepass");
9238 addReplyBulkCString(c,server.requirepass);
9239 matches++;
9240 }
9241 if (stringmatch(pattern,"masterauth",0)) {
9242 addReplyBulkCString(c,"masterauth");
9243 addReplyBulkCString(c,server.masterauth);
9244 matches++;
9245 }
9246 if (stringmatch(pattern,"maxmemory",0)) {
9247 char buf[128];
9248
9249 snprintf(buf,128,"%llu\n",server.maxmemory);
9250 addReplyBulkCString(c,"maxmemory");
9251 addReplyBulkCString(c,buf);
9252 matches++;
9253 }
9254 decrRefCount(o);
9255 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9256}
9257
9258static void configCommand(redisClient *c) {
9259 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9260 if (c->argc != 4) goto badarity;
9261 configSetCommand(c);
9262 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9263 if (c->argc != 3) goto badarity;
9264 configGetCommand(c);
9265 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9266 if (c->argc != 2) goto badarity;
9267 server.stat_numcommands = 0;
9268 server.stat_numconnections = 0;
9269 server.stat_expiredkeys = 0;
9270 server.stat_starttime = time(NULL);
9271 addReply(c,shared.ok);
9272 } else {
9273 addReplySds(c,sdscatprintf(sdsempty(),
9274 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9275 }
9276 return;
9277
9278badarity:
9279 addReplySds(c,sdscatprintf(sdsempty(),
9280 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9281 (char*) c->argv[1]->ptr));
9282}
9283
befec3cd 9284/* =========================== Pubsub implementation ======================== */
9285
ffc6b7f8 9286static void freePubsubPattern(void *p) {
9287 pubsubPattern *pat = p;
9288
9289 decrRefCount(pat->pattern);
9290 zfree(pat);
9291}
9292
9293static int listMatchPubsubPattern(void *a, void *b) {
9294 pubsubPattern *pa = a, *pb = b;
9295
9296 return (pa->client == pb->client) &&
9297 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9298}
9299
9300/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9301 * 0 if the client was already subscribed to that channel. */
9302static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 9303 struct dictEntry *de;
9304 list *clients = NULL;
9305 int retval = 0;
9306
ffc6b7f8 9307 /* Add the channel to the client -> channels hash table */
9308 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 9309 retval = 1;
ffc6b7f8 9310 incrRefCount(channel);
9311 /* Add the client to the channel -> list of clients hash table */
9312 de = dictFind(server.pubsub_channels,channel);
befec3cd 9313 if (de == NULL) {
9314 clients = listCreate();
ffc6b7f8 9315 dictAdd(server.pubsub_channels,channel,clients);
9316 incrRefCount(channel);
befec3cd 9317 } else {
9318 clients = dictGetEntryVal(de);
9319 }
9320 listAddNodeTail(clients,c);
9321 }
9322 /* Notify the client */
9323 addReply(c,shared.mbulk3);
9324 addReply(c,shared.subscribebulk);
ffc6b7f8 9325 addReplyBulk(c,channel);
9326 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 9327 return retval;
9328}
9329
ffc6b7f8 9330/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9331 * 0 if the client was not subscribed to the specified channel. */
9332static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 9333 struct dictEntry *de;
9334 list *clients;
9335 listNode *ln;
9336 int retval = 0;
9337
ffc6b7f8 9338 /* Remove the channel from the client -> channels hash table */
9339 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 9340 we have in the hash tables. Protect it... */
ffc6b7f8 9341 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 9342 retval = 1;
ffc6b7f8 9343 /* Remove the client from the channel -> clients list hash table */
9344 de = dictFind(server.pubsub_channels,channel);
befec3cd 9345 assert(de != NULL);
9346 clients = dictGetEntryVal(de);
9347 ln = listSearchKey(clients,c);
9348 assert(ln != NULL);
9349 listDelNode(clients,ln);
ff767a75 9350 if (listLength(clients) == 0) {
9351 /* Free the list and associated hash entry at all if this was
9352 * the latest client, so that it will be possible to abuse
ffc6b7f8 9353 * Redis PUBSUB creating millions of channels. */
9354 dictDelete(server.pubsub_channels,channel);
ff767a75 9355 }
befec3cd 9356 }
9357 /* Notify the client */
9358 if (notify) {
9359 addReply(c,shared.mbulk3);
9360 addReply(c,shared.unsubscribebulk);
ffc6b7f8 9361 addReplyBulk(c,channel);
9362 addReplyLong(c,dictSize(c->pubsub_channels)+
9363 listLength(c->pubsub_patterns));
9364
9365 }
9366 decrRefCount(channel); /* it is finally safe to release it */
9367 return retval;
9368}
9369
9370/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9371static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9372 int retval = 0;
9373
9374 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9375 retval = 1;
9376 pubsubPattern *pat;
9377 listAddNodeTail(c->pubsub_patterns,pattern);
9378 incrRefCount(pattern);
9379 pat = zmalloc(sizeof(*pat));
9380 pat->pattern = getDecodedObject(pattern);
9381 pat->client = c;
9382 listAddNodeTail(server.pubsub_patterns,pat);
9383 }
9384 /* Notify the client */
9385 addReply(c,shared.mbulk3);
9386 addReply(c,shared.psubscribebulk);
9387 addReplyBulk(c,pattern);
9388 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9389 return retval;
9390}
9391
9392/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9393 * 0 if the client was not subscribed to the specified channel. */
9394static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9395 listNode *ln;
9396 pubsubPattern pat;
9397 int retval = 0;
9398
9399 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9400 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9401 retval = 1;
9402 listDelNode(c->pubsub_patterns,ln);
9403 pat.client = c;
9404 pat.pattern = pattern;
9405 ln = listSearchKey(server.pubsub_patterns,&pat);
9406 listDelNode(server.pubsub_patterns,ln);
9407 }
9408 /* Notify the client */
9409 if (notify) {
9410 addReply(c,shared.mbulk3);
9411 addReply(c,shared.punsubscribebulk);
9412 addReplyBulk(c,pattern);
9413 addReplyLong(c,dictSize(c->pubsub_channels)+
9414 listLength(c->pubsub_patterns));
befec3cd 9415 }
ffc6b7f8 9416 decrRefCount(pattern);
befec3cd 9417 return retval;
9418}
9419
ffc6b7f8 9420/* Unsubscribe from all the channels. Return the number of channels the
9421 * client was subscribed from. */
9422static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9423 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 9424 dictEntry *de;
9425 int count = 0;
9426
9427 while((de = dictNext(di)) != NULL) {
ffc6b7f8 9428 robj *channel = dictGetEntryKey(de);
befec3cd 9429
ffc6b7f8 9430 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 9431 }
9432 dictReleaseIterator(di);
9433 return count;
9434}
9435
ffc6b7f8 9436/* Unsubscribe from all the patterns. Return the number of patterns the
9437 * client was subscribed from. */
9438static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9439 listNode *ln;
9440 listIter li;
9441 int count = 0;
9442
9443 listRewind(c->pubsub_patterns,&li);
9444 while ((ln = listNext(&li)) != NULL) {
9445 robj *pattern = ln->value;
9446
9447 count += pubsubUnsubscribePattern(c,pattern,notify);
9448 }
9449 return count;
9450}
9451
befec3cd 9452/* Publish a message */
ffc6b7f8 9453static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 9454 int receivers = 0;
9455 struct dictEntry *de;
ffc6b7f8 9456 listNode *ln;
9457 listIter li;
befec3cd 9458
ffc6b7f8 9459 /* Send to clients listening for that channel */
9460 de = dictFind(server.pubsub_channels,channel);
befec3cd 9461 if (de) {
9462 list *list = dictGetEntryVal(de);
9463 listNode *ln;
9464 listIter li;
9465
9466 listRewind(list,&li);
9467 while ((ln = listNext(&li)) != NULL) {
9468 redisClient *c = ln->value;
9469
9470 addReply(c,shared.mbulk3);
9471 addReply(c,shared.messagebulk);
ffc6b7f8 9472 addReplyBulk(c,channel);
befec3cd 9473 addReplyBulk(c,message);
9474 receivers++;
9475 }
9476 }
ffc6b7f8 9477 /* Send to clients listening to matching channels */
9478 if (listLength(server.pubsub_patterns)) {
9479 listRewind(server.pubsub_patterns,&li);
9480 channel = getDecodedObject(channel);
9481 while ((ln = listNext(&li)) != NULL) {
9482 pubsubPattern *pat = ln->value;
9483
9484 if (stringmatchlen((char*)pat->pattern->ptr,
9485 sdslen(pat->pattern->ptr),
9486 (char*)channel->ptr,
9487 sdslen(channel->ptr),0)) {
9488 addReply(pat->client,shared.mbulk3);
9489 addReply(pat->client,shared.messagebulk);
9490 addReplyBulk(pat->client,channel);
9491 addReplyBulk(pat->client,message);
9492 receivers++;
9493 }
9494 }
9495 decrRefCount(channel);
9496 }
befec3cd 9497 return receivers;
9498}
9499
9500static void subscribeCommand(redisClient *c) {
9501 int j;
9502
9503 for (j = 1; j < c->argc; j++)
ffc6b7f8 9504 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 9505}
9506
9507static void unsubscribeCommand(redisClient *c) {
9508 if (c->argc == 1) {
ffc6b7f8 9509 pubsubUnsubscribeAllChannels(c,1);
9510 return;
9511 } else {
9512 int j;
9513
9514 for (j = 1; j < c->argc; j++)
9515 pubsubUnsubscribeChannel(c,c->argv[j],1);
9516 }
9517}
9518
9519static void psubscribeCommand(redisClient *c) {
9520 int j;
9521
9522 for (j = 1; j < c->argc; j++)
9523 pubsubSubscribePattern(c,c->argv[j]);
9524}
9525
9526static void punsubscribeCommand(redisClient *c) {
9527 if (c->argc == 1) {
9528 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 9529 return;
9530 } else {
9531 int j;
9532
9533 for (j = 1; j < c->argc; j++)
ffc6b7f8 9534 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 9535 }
9536}
9537
9538static void publishCommand(redisClient *c) {
9539 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9540 addReplyLong(c,receivers);
9541}
9542
7f957c92 9543/* ================================= Debugging ============================== */
9544
9545static void debugCommand(redisClient *c) {
9546 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9547 *((char*)-1) = 'x';
210e29f7 9548 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9549 if (rdbSave(server.dbfilename) != REDIS_OK) {
9550 addReply(c,shared.err);
9551 return;
9552 }
9553 emptyDb();
9554 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9555 addReply(c,shared.err);
9556 return;
9557 }
9558 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9559 addReply(c,shared.ok);
71c2b467 9560 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9561 emptyDb();
9562 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9563 addReply(c,shared.err);
9564 return;
9565 }
9566 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9567 addReply(c,shared.ok);
333298da 9568 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9569 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9570 robj *key, *val;
9571
9572 if (!de) {
9573 addReply(c,shared.nokeyerr);
9574 return;
9575 }
9576 key = dictGetEntryKey(de);
9577 val = dictGetEntryVal(de);
59146ef3 9578 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9579 key->storage == REDIS_VM_SWAPPING)) {
07efaf74 9580 char *strenc;
9581 char buf[128];
9582
9583 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9584 strenc = strencoding[val->encoding];
9585 } else {
9586 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9587 strenc = buf;
9588 }
ace06542 9589 addReplySds(c,sdscatprintf(sdsempty(),
9590 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 9591 "encoding:%s serializedlength:%lld\r\n",
682ac724 9592 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 9593 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 9594 } else {
9595 addReplySds(c,sdscatprintf(sdsempty(),
9596 "+Key at:%p refcount:%d, value swapped at: page %llu "
9597 "using %llu pages\r\n",
9598 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9599 (unsigned long long) key->vm.usedpages));
9600 }
78ebe4c8 9601 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
9602 lookupKeyRead(c->db,c->argv[2]);
9603 addReply(c,shared.ok);
7d30035d 9604 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9605 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9606 robj *key, *val;
9607
9608 if (!server.vm_enabled) {
9609 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9610 return;
9611 }
9612 if (!de) {
9613 addReply(c,shared.nokeyerr);
9614 return;
9615 }
9616 key = dictGetEntryKey(de);
9617 val = dictGetEntryVal(de);
4ef8de8a 9618 /* If the key is shared we want to create a copy */
9619 if (key->refcount > 1) {
9620 robj *newkey = dupStringObject(key);
9621 decrRefCount(key);
9622 key = dictGetEntryKey(de) = newkey;
9623 }
9624 /* Swap it */
7d30035d 9625 if (key->storage != REDIS_VM_MEMORY) {
9626 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 9627 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 9628 dictGetEntryVal(de) = NULL;
9629 addReply(c,shared.ok);
9630 } else {
9631 addReply(c,shared.err);
9632 }
7f957c92 9633 } else {
333298da 9634 addReplySds(c,sdsnew(
bdcb92f2 9635 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 9636 }
9637}
56906eef 9638
6c96ba7d 9639static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 9640 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
6c96ba7d 9641 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
dfc5e96c 9642#ifdef HAVE_BACKTRACE
9643 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9644 *((char*)-1) = 'x';
9645#endif
9646}
9647
bcfc686d 9648/* =================================== Main! ================================ */
56906eef 9649
bcfc686d 9650#ifdef __linux__
/* Read the current overcommit policy from /proc/sys/vm/overcommit_memory.
 * Returns the value converted with atoi(), or -1 if the file can't be
 * opened or read. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    char *got;
    FILE *proc = fopen("/proc/sys/vm/overcommit_memory","r");

    if (proc == NULL) return -1;
    got = fgets(line,64,proc);
    fclose(proc);
    if (got == NULL) return -1;

    return atoi(line);
}
9664
9665void linuxOvercommitMemoryWarning(void) {
9666 if (linuxOvercommitMemoryValue() == 0) {
9667 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9668 }
9669}
9670#endif /* __linux__ */
9671
9672static void daemonize(void) {
9673 int fd;
9674 FILE *fp;
9675
9676 if (fork() != 0) exit(0); /* parent exits */
9677 setsid(); /* create a new session */
9678
9679 /* Every output goes to /dev/null. If Redis is daemonized but
9680 * the 'logfile' is set to 'stdout' in the configuration file
9681 * it will not log at all. */
9682 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9683 dup2(fd, STDIN_FILENO);
9684 dup2(fd, STDOUT_FILENO);
9685 dup2(fd, STDERR_FILENO);
9686 if (fd > STDERR_FILENO) close(fd);
9687 }
9688 /* Try to write the pid file */
9689 fp = fopen(server.pidfile,"w");
9690 if (fp) {
9691 fprintf(fp,"%d\n",getpid());
9692 fclose(fp);
56906eef 9693 }
56906eef 9694}
9695
42ab0172
AO
9696static void version() {
9697 printf("Redis server version %s\n", REDIS_VERSION);
9698 exit(0);
9699}
9700
723fb69b
AO
/* Print the command line usage on the standard error and exit with
 * status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
9706
bcfc686d 9707int main(int argc, char **argv) {
9651a787 9708 time_t start;
9709
bcfc686d 9710 initServerConfig();
9711 if (argc == 2) {
44efe66e 9712 if (strcmp(argv[1], "-v") == 0 ||
9713 strcmp(argv[1], "--version") == 0) version();
9714 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 9715 resetServerSaveParams();
9716 loadServerConfig(argv[1]);
723fb69b
AO
9717 } else if ((argc > 2)) {
9718 usage();
bcfc686d 9719 } else {
9720 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9721 }
bcfc686d 9722 if (server.daemonize) daemonize();
71c54b21 9723 initServer();
bcfc686d 9724 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9725#ifdef __linux__
9726 linuxOvercommitMemoryWarning();
9727#endif
9651a787 9728 start = time(NULL);
bcfc686d 9729 if (server.appendonly) {
9730 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 9731 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 9732 } else {
9733 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 9734 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 9735 }
bcfc686d 9736 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 9737 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 9738 aeMain(server.el);
9739 aeDeleteEventLoop(server.el);
9740 return 0;
9741}
9742
9743/* ============================= Backtrace support ========================= */
9744
9745#ifdef HAVE_BACKTRACE
9746static char *findFuncName(void *pointer, unsigned long *offset);
9747
/* Return the instruction pointer stored in the signal context 'uc', that is
 * the address the interrupted code was executing. The register lives in a
 * different place of ucontext_t on every platform, hence the #if ladder.
 * Returns NULL on platforms not handled here. */
static void *getMcontextEip(ucontext_t *uc) {
#if defined(__FreeBSD__)
    return (void*) uc->uc_mcontext.mc_eip;
#elif defined(__dietlibc__)
    return (void*) uc->uc_mcontext.eip;
#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
  #if __x86_64__
    return (void*) uc->uc_mcontext->__ss.__rip;
  #else
    return (void*) uc->uc_mcontext->__ss.__eip;
  #endif
#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
  #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
    return (void*) uc->uc_mcontext->__ss.__rip;
  #else
    return (void*) uc->uc_mcontext->__ss.__eip;
  #endif
#elif defined(__i386__)
    /* Linux 32 bit: gregs[14] is the slot glibc names REG_EIP. The numeric
     * index is used because the REG_* names are only visible when
     * _GNU_SOURCE is defined. */
    return (void*) uc->uc_mcontext.gregs[14];
#elif defined(__X86_64__) || defined(__x86_64__)
    /* Linux 64 bit: there is no EIP here, the instruction pointer is RIP
     * and it is stored in gregs[16] (REG_RIP). */
    return (void*) uc->uc_mcontext.gregs[16];
#elif defined(__ia64__) /* Linux IA64 */
    return (void*) uc->uc_mcontext.sc_ip;
#else
    (void) uc; /* unused on unknown platforms */
    return NULL;
#endif
}
9773
9774static void segvHandler(int sig, siginfo_t *info, void *secret) {
9775 void *trace[100];
9776 char **messages = NULL;
9777 int i, trace_size = 0;
9778 unsigned long offset=0;
56906eef 9779 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 9780 sds infostring;
56906eef 9781 REDIS_NOTUSED(info);
9782
9783 redisLog(REDIS_WARNING,
9784 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 9785 infostring = genRedisInfoString();
9786 redisLog(REDIS_WARNING, "%s",infostring);
9787 /* It's not safe to sdsfree() the returned string under memory
9788 * corruption conditions. Let it leak as we are going to abort */
56906eef 9789
9790 trace_size = backtrace(trace, 100);
de96dbfe 9791 /* overwrite sigaction with caller's address */
b91cf5ef 9792 if (getMcontextEip(uc) != NULL) {
9793 trace[1] = getMcontextEip(uc);
9794 }
56906eef 9795 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 9796
d76412d1 9797 for (i=1; i<trace_size; ++i) {
56906eef 9798 char *fn = findFuncName(trace[i], &offset), *p;
9799
9800 p = strchr(messages[i],'+');
9801 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9802 redisLog(REDIS_WARNING,"%s", messages[i]);
9803 } else {
9804 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
9805 }
9806 }
b177fd30 9807 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 9808 _exit(0);
fe3bbfbe 9809}
56906eef 9810
9811static void setupSigSegvAction(void) {
9812 struct sigaction act;
9813
9814 sigemptyset (&act.sa_mask);
9815 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
9816 * is used. Otherwise, sa_handler is used */
9817 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
9818 act.sa_sigaction = segvHandler;
9819 sigaction (SIGSEGV, &act, NULL);
9820 sigaction (SIGBUS, &act, NULL);
12fea928 9821 sigaction (SIGFPE, &act, NULL);
9822 sigaction (SIGILL, &act, NULL);
9823 sigaction (SIGBUS, &act, NULL);
e65fdc78 9824 return;
56906eef 9825}
e65fdc78 9826
bcfc686d 9827#include "staticsymbols.h"
9828/* This function try to convert a pointer into a function name. It's used in
9829 * oreder to provide a backtrace under segmentation fault that's able to
9830 * display functions declared as static (otherwise the backtrace is useless). */
9831static char *findFuncName(void *pointer, unsigned long *offset){
9832 int i, ret = -1;
9833 unsigned long off, minoff = 0;
ed9b544e 9834
bcfc686d 9835 /* Try to match against the Symbol with the smallest offset */
9836 for (i=0; symsTable[i].pointer; i++) {
9837 unsigned long lp = (unsigned long) pointer;
0bc03378 9838
bcfc686d 9839 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
9840 off=lp-symsTable[i].pointer;
9841 if (ret < 0 || off < minoff) {
9842 minoff=off;
9843 ret=i;
9844 }
9845 }
0bc03378 9846 }
bcfc686d 9847 if (ret == -1) return NULL;
9848 *offset = minoff;
9849 return symsTable[ret].name;
0bc03378 9850}
bcfc686d 9851#else /* HAVE_BACKTRACE */
/* Variant used when HAVE_BACKTRACE is not defined: no crash handler is
 * installed, so the function is intentionally empty. */
static void setupSigSegvAction(void) {
    /* nothing to do */
}
bcfc686d 9854#endif /* HAVE_BACKTRACE */
0bc03378 9855
ed9b544e 9856
ed9b544e 9857
bcfc686d 9858/* The End */
9859
9860
ed9b544e 9861