]> git.saurik.com Git - redis.git/blame - redis.c
Incrementally rehashing hash table! Thanks to Derek Collison and Pieter Noordhuis...
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
cac154c5 30#define REDIS_VERSION "1.3.8"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
c9468bcf 40#define __USE_POSIX199309
54bac49d 41#define __USE_UNIX98
ed9b544e 42#include <signal.h>
fbf9bcdb 43
44#ifdef HAVE_BACKTRACE
c9468bcf 45#include <execinfo.h>
46#include <ucontext.h>
fbf9bcdb 47#endif /* HAVE_BACKTRACE */
48
ed9b544e 49#include <sys/wait.h>
50#include <errno.h>
51#include <assert.h>
52#include <ctype.h>
53#include <stdarg.h>
54#include <inttypes.h>
55#include <arpa/inet.h>
56#include <sys/stat.h>
57#include <fcntl.h>
58#include <sys/time.h>
59#include <sys/resource.h>
2895e862 60#include <sys/uio.h>
f78fd11b 61#include <limits.h>
a7866db6 62#include <math.h>
92f8e882 63#include <pthread.h>
0bc1b2f6 64
65#if defined(__sun)
5043dff3 66#include "solarisfixes.h"
67#endif
ed9b544e 68
c9468bcf 69#include "redis.h"
ed9b544e 70#include "ae.h" /* Event driven programming library */
71#include "sds.h" /* Dynamic safe strings */
72#include "anet.h" /* Networking the easy way */
73#include "dict.h" /* Hash tables */
74#include "adlist.h" /* Linked lists */
75#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 76#include "lzf.h" /* LZF compression library */
77#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
5234952b 78#include "zipmap.h"
ed9b544e 79
80/* Error codes */
81#define REDIS_OK 0
82#define REDIS_ERR -1
83
84/* Static server configuration */
85#define REDIS_SERVERPORT 6379 /* TCP port */
86#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 87#define REDIS_IOBUF_LEN 1024
ed9b544e 88#define REDIS_LOADBUF_LEN 1024
248ea310 89#define REDIS_STATIC_ARGS 8
ed9b544e 90#define REDIS_DEFAULT_DBNUM 16
91#define REDIS_CONFIGLINE_MAX 1024
92#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
1763929f 94#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* try to expire 10 keys/loop */
6f376729 95#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 96#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99#define REDIS_WRITEV_THRESHOLD 3
100/* Max number of iovecs used for each writev call */
101#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 102
103/* Hash table parameters */
104#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 105
106/* Command flags */
3fd78bcd 107#define REDIS_CMD_BULK 1 /* Bulk write command */
108#define REDIS_CMD_INLINE 2 /* Inline command */
109/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113#define REDIS_CMD_DENYOOM 4
4005fef1 114#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 115
116/* Object types */
117#define REDIS_STRING 0
118#define REDIS_LIST 1
119#define REDIS_SET 2
1812e024 120#define REDIS_ZSET 3
121#define REDIS_HASH 4
f78fd11b 122
5234952b 123/* Objects encoding. Some kind of objects like Strings and Hashes can be
124 * internally represented in multiple ways. The 'encoding' field of the object
125 * is set to one of these values for this object. */
942a3961 126#define REDIS_ENCODING_RAW 0 /* Raw representation */
127#define REDIS_ENCODING_INT 1 /* Encoded as integer */
5234952b 128#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
129#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
942a3961 130
/* Human readable names of the object encodings. The entries are listed in the
 * same order as the REDIS_ENCODING_* values (RAW=0, INT=1, ZIPMAP=2, HT=3), so
 * strencoding[encoding] names that encoding: keep the two in sync when a new
 * encoding is added. */
07efaf74 131static char* strencoding[] = {
132 "raw", "int", "zipmap", "hashtable"
133};
134
f78fd11b 135/* Object types only used for dumping to disk */
bb32ede5 136#define REDIS_EXPIRETIME 253
ed9b544e 137#define REDIS_SELECTDB 254
138#define REDIS_EOF 255
139
f78fd11b 140/* Defines related to the dump file format. To store 32 bits lengths for short
141 * keys requires a lot of space, so we check the most significant 2 bits of
142 * the first byte to interpret the length:
143 *
144 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
145 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
146 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 147 * 11|000000 this means: specially encoded object will follow. The six bits
148 * number specify the kind of object that follows.
149 * See the REDIS_RDB_ENC_* defines.
f78fd11b 150 *
10c43610 151 * Lengths up to 63 are stored using a single byte, most DB keys, and many
152 * values, will fit inside. */
f78fd11b 153#define REDIS_RDB_6BITLEN 0
154#define REDIS_RDB_14BITLEN 1
155#define REDIS_RDB_32BITLEN 2
17be1a4a 156#define REDIS_RDB_ENCVAL 3
f78fd11b 157#define REDIS_RDB_LENERR UINT_MAX
158
a4d1ba9a 159/* When a length of a string object stored on disk has the first two bits
160 * set, the remaining six bits specify a special encoding for the object
161 * according to the following defines: */
162#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
163#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
164#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 165#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
a4d1ba9a 166
75680a3c 167/* Virtual memory object->where field. */
168#define REDIS_VM_MEMORY 0 /* The object is on memory */
169#define REDIS_VM_SWAPPED 1 /* The object is on disk */
170#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
171#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
172
06224fec 173/* Virtual memory static configuration stuff.
174 * Check vmFindContiguousPages() to know more about these magic numbers. */
175#define REDIS_VM_MAX_NEAR_PAGES 65536
176#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 177#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 178#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
f6c0bba8 179/* The following is the *percentage* of completed I/O jobs to process when the
180 * handler is called. While Virtual Memory I/O operations are performed by
181 * threads, these operations must be processed by the main thread when completed
182 * in order to take effect. */
c953f24b 183#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 184
ed9b544e 185/* Client flags */
d5d55fc3 186#define REDIS_SLAVE 1 /* This client is a slave server */
187#define REDIS_MASTER 2 /* This client is a master server */
188#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
189#define REDIS_MULTI 8 /* This client is in a MULTI context */
190#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
191#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
ed9b544e 192
40d224a9 193/* Slave replication state - slave side */
ed9b544e 194#define REDIS_REPL_NONE 0 /* No active replication */
195#define REDIS_REPL_CONNECT 1 /* Must connect to master */
196#define REDIS_REPL_CONNECTED 2 /* Connected to master */
197
40d224a9 198/* Slave replication state - from the point of view of master
199 * Note that in SEND_BULK and ONLINE state the slave receives new updates
200 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
201 * to start the next background saving in order to send updates to it. */
202#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
203#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
204#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
205#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
206
ed9b544e 207/* List related stuff */
208#define REDIS_HEAD 0
209#define REDIS_TAIL 1
210
211/* Sort operations */
212#define REDIS_SORT_GET 0
443c6409 213#define REDIS_SORT_ASC 1
214#define REDIS_SORT_DESC 2
ed9b544e 215#define REDIS_SORTKEY_MAX 1024
216
217/* Log levels */
218#define REDIS_DEBUG 0
f870935d 219#define REDIS_VERBOSE 1
220#define REDIS_NOTICE 2
221#define REDIS_WARNING 3
ed9b544e 222
223/* Anti-warning macro... */
224#define REDIS_NOTUSED(V) ((void) V)
225
6b47e12e 226#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
227#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 228
48f0308a 229/* Append only defines */
230#define APPENDFSYNC_NO 0
231#define APPENDFSYNC_ALWAYS 1
232#define APPENDFSYNC_EVERYSEC 2
233
cbba7dd7 234/* Hashes related defaults */
235#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
236#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
237
dfc5e96c 238/* We can print the stacktrace, so our assert is defined this way: */
/* redisAssert(_e) evaluates _e exactly once. When it is false, _redisAssert()
 * is called with the stringified expression, the current source file and the
 * current line, and then the process is terminated with _exit(1). The whole
 * macro is a single expression of type void, so it can be used wherever a
 * statement expression is allowed. */
478c2c6f 239#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
6c96ba7d 240static void _redisAssert(char *estr, char *file, int line);
dfc5e96c 241
ed9b544e 242/*================================= Data types ============================== */
243
244/* A redis object, that is a type able to hold a string / list / set */
75680a3c 245
246/* The VM object structure: the per-object swap bookkeeping that is embedded
 * as the last member ('vm') of struct redisObject.
 *
 * NOTE(review): the declarator after the closing brace also defines a
 * file-scope variable named 'vm' of this type. Nothing in the visible part of
 * the file uses it, so it looks accidental -- confirm before removing it. */
247struct redisObjectVM {
3a66edc7 248 off_t page; /* the page at which the object is stored on disk */
249 off_t usedpages; /* number of pages used on disk */
250 time_t atime; /* Last access time */
75680a3c 251} vm;
252
253/* The actual Redis Object: a reference counted, typed value. The short name
 * 'robj' is the typedef used everywhere else in the file.
 * The 'vm' member must stay the last one: as explained below, objects may be
 * allocated without it when VM is disabled. */
ed9b544e 254typedef struct redisObject {
ed9b544e 255 void *ptr; /* The payload; presumably its meaning depends on type/encoding */
942a3961 256 unsigned char type; /* One of the "Object types" defines: REDIS_STRING, ... */
257 unsigned char encoding; /* One of the REDIS_ENCODING_* defines */
d894161b 258 unsigned char storage; /* If this object is a key, where is the value?
259 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
260 unsigned char vtype; /* If this object is a key, and value is swapped out,
261 * this is the type of the swapped out object. */
ed9b544e 262 int refcount; /* Reference count, see incrRefCount() / decrRefCount() */
75680a3c 263 /* VM fields, these are only allocated if VM is active, otherwise the
264 * object allocation function will just allocate
265 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
266 * Redis without VM active will not have any overhead. */
267 struct redisObjectVM vm;
ed9b544e 268} robj;
269
/* Macro used to initialize a Redis string object allocated on the stack.
 *
 * _var is the robj (not a pointer to it) to initialize, _ptr is the value
 * stored in its 'ptr' field. The object is set up as a raw encoded
 * REDIS_STRING with a reference count of 1; the 'storage' field is set to
 * REDIS_VM_MEMORY only when server.vm_enabled is true. No other field is
 * touched.
 *
 * Both arguments are parenthesized in the expansion, so expressions such as
 * '*objptr' or 'buf+1' can be passed safely. The expansion deliberately does
 * NOT end with a semicolon: the caller supplies it, which makes the macro
 * behave like a single statement even as the body of an if/else.
 *
 * Note that this macro is taken near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
281
/* A single Redis database: the key space plus the auxiliary dictionaries
 * indexed by key. The server holds its databases behind redisServer.db and
 * every client points at one of them through redisClient.db. */
3305306f 282typedef struct redisDb {
4409877e 283 dict *dict; /* The keyspace for this DB */
284 dict *expires; /* Timeout of keys with a timeout set */
285 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 286 dict *io_keys; /* Keys with clients waiting for VM I/O */
3305306f 287 int id; /* presumably the index of this DB -- TODO confirm */
288} redisDb;
289
6e469882 290/* Client MULTI/EXEC state */
/* One command stored in a client's MULTI state: its argument vector, the
 * number of arguments and the command table entry it refers to. */
291typedef struct multiCmd {
292 robj **argv;
293 int argc;
294 struct redisCommand *cmd;
295} multiCmd;
296
/* The per-client collection of multiCmd entries (see redisClient.mstate). */
297typedef struct multiState {
298 multiCmd *commands; /* Array of MULTI commands */
299 int count; /* Total number of MULTI commands */
300} multiState;
301
ed9b544e 302/* With multiplexing we need to take per-client state.
303 * Clients are taken in a linked list. */
304typedef struct redisClient {
305 int fd; /* presumably the client socket descriptor -- TODO confirm */
3305306f 306 redisDb *db; /* presumably the currently selected DB -- TODO confirm */
ed9b544e 307 int dictid;
308 sds querybuf; /* presumably the buffer of pending input -- TODO confirm */
e8a74421 309 robj **argv, **mbargv;
310 int argc, mbargc;
40d224a9 311 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 312 int multibulk; /* multi bulk command format active */
ed9b544e 313 list *reply; /* presumably queued reply objects, see addReply() -- TODO confirm */
314 int sentlen;
315 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 316 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 317 int slaveseldb; /* slave selected db, if this client is a slave */
318 int authenticated; /* when requirepass is non-NULL */
319 int replstate; /* replication state if this is a slave */
320 int repldbfd; /* replication DB file descriptor */
/* NOTE(review): repldboff is a long while repldbsize is an off_t; where off_t
 * is wider than long the offset cannot represent every position of the file. */
6e469882 321 long repldboff; /* replication DB file offset */
40d224a9 322 off_t repldbsize; /* replication DB file size */
6e469882 323 multiState mstate; /* MULTI/EXEC state */
d5d55fc3 324 robj **blockingkeys; /* The key we are waiting to terminate a blocking
4409877e 325 * operation such as BLPOP. Otherwise NULL. */
b177fd30 326 int blockingkeysnum; /* Number of blocking keys */
4409877e 327 time_t blockingto; /* Blocking operation timeout. If UNIX current time
328 * is >= blockingto then the operation timed out. */
92f8e882 329 list *io_keys; /* Keys this client is waiting to be loaded from the
330 * swap file in order to continue. */
ffc6b7f8 331 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
332 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 333} redisClient;
334
/* A (seconds, changes) pair. The server keeps an array of these in
 * redisServer.saveparams, with redisServer.saveparamslen entries. */
335struct saveparam {
336 time_t seconds;
337 int changes;
338};
339
340/* Global server state structure */
/* A single instance of this structure, 'server', is defined in the Globals
 * section of this file. The fields are grouped by the section comments
 * below: runtime state, stats, configuration, replication, sort state,
 * virtual memory, pubsub. */
341struct redisServer {
342 int port;
343 int fd;
3305306f 344 redisDb *db;
ed9b544e 345 long long dirty; /* changes to DB from the last save */
346 list *clients;
87eca727 347 list *slaves, *monitors;
ed9b544e 348 char neterr[ANET_ERR_LEN];
349 aeEventLoop *el;
350 int cronloops; /* number of times the cron function run */
351 list *objfreelist; /* A list of freed objects to avoid malloc() */
352 time_t lastsave; /* Unix time of last save succeeded */
ed9b544e 353 /* Fields used only for stats */
354 time_t stat_starttime; /* server start time */
355 long long stat_numcommands; /* number of processed commands */
356 long long stat_numconnections; /* number of connections received */
2a6a2ed1 357 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 358 /* Configuration */
359 int verbosity;
360 int glueoutputbuf;
361 int maxidletime;
362 int dbnum;
363 int daemonize;
44b38ef4 364 int appendonly;
48f0308a 365 int appendfsync; /* presumably one of the APPENDFSYNC_* defines -- TODO confirm */
366 time_t lastfsync;
44b38ef4 367 int appendfd;
368 int appendseldb;
ed329fcf 369 char *pidfile;
9f3c422c 370 pid_t bgsavechildpid;
9d65a1bb 371 pid_t bgrewritechildpid;
372 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
ed9b544e 373 struct saveparam *saveparams; /* array of saveparamslen entries */
374 int saveparamslen;
375 char *logfile;
376 char *bindaddr;
377 char *dbfilename;
44b38ef4 378 char *appendfilename;
abcb223e 379 char *requirepass;
10c43610 380 int shareobjects;
121f70cf 381 int rdbcompression;
ed9b544e 382 /* Replication related */
383 int isslave;
d0ccebcf 384 char *masterauth;
ed9b544e 385 char *masterhost;
386 int masterport;
40d224a9 387 redisClient *master; /* client that is master for this slave */
ed9b544e 388 int replstate;
285add55 389 unsigned int maxclients;
4ef8de8a 390 unsigned long long maxmemory;
d5d55fc3 391 unsigned int blpop_blocked_clients;
392 unsigned int vm_blocked_clients;
ed9b544e 393 /* Sort parameters - qsort_r() is only available under BSD so we
394 * have to take this state global, in order to pass it to sortCompare() */
395 int sort_desc;
396 int sort_alpha;
397 int sort_bypattern;
75680a3c 398 /* Virtual memory configuration */
399 int vm_enabled;
054e426d 400 char *vm_swap_file;
75680a3c 401 off_t vm_page_size;
402 off_t vm_pages;
4ef8de8a 403 unsigned long long vm_max_memory;
cbba7dd7 404 /* Hashes config */
405 size_t hash_max_zipmap_entries;
406 size_t hash_max_zipmap_value;
75680a3c 407 /* Virtual memory state */
408 FILE *vm_fp;
409 int vm_fd;
410 off_t vm_next_page; /* Next probably empty page */
411 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 412 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 413 time_t unixtime; /* Unix time sampled every second. */
92f8e882 414 /* Virtual memory I/O threads stuff */
92f8e882 415 /* An I/O thread processes an element taken from the io_jobs queue and
996cb5f7 416 * puts the result of the operation in the io_done list. While the
417 * job is being processed, it's put on io_processing queue.
 * NOTE(review): there are no fields named io_jobs / io_done; the lists
 * below are io_newjobs and io_processed, which are presumably the ones
 * meant here -- confirm and align the wording. */
418 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
419 list *io_processing; /* List of VM I/O jobs being processed */
420 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 421 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 422 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 423 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
424 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 425 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 426 int io_active_threads; /* Number of running I/O threads */
427 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 428 /* Our main thread is blocked on the event loop, looking for sockets ready
429 * to be read or written, so when a threaded I/O operation is ready to be
430 * processed by the main thread, the I/O thread will use a unix pipe to
431 * awake the main thread. The followings are the two pipe FDs. */
432 int io_ready_pipe_read;
433 int io_ready_pipe_write;
7d98e08c 434 /* Virtual memory stats */
435 unsigned long long vm_stats_used_pages;
436 unsigned long long vm_stats_swapped_objects;
437 unsigned long long vm_stats_swapouts;
438 unsigned long long vm_stats_swapins;
befec3cd 439 /* Pubsub */
ffc6b7f8 440 dict *pubsub_channels; /* Map channels to list of subscribed clients */
441 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 442 /* Misc */
b9bc0eef 443 FILE *devnull;
ed9b544e 444};
445
/* Associates a client with a pattern object. Presumably this is the element
 * type of the pubsub_patterns lists (see freePubsubPattern() and
 * listMatchPubsubPattern()) -- TODO confirm. */
ffc6b7f8 446typedef struct pubsubPattern {
447 redisClient *client;
448 robj *pattern;
449} pubsubPattern;
450
/* Signature shared by every command implementation (the *Command functions
 * declared above) and by the VM preload hooks: the only argument is the
 * client issuing the command. */
ed9b544e 451typedef void redisCommandProc(redisClient *c);
/* One entry of the command table. cmdTable initializes these entries
 * positionally, so the order of the fields must not change without updating
 * every row of the table. */
452struct redisCommand {
453 char *name;
454 redisCommandProc *proc;
455 int arity; /* cmdTable uses both positive and negative values here;
 * presumably negative means "at least N arguments" -- TODO confirm */
456 int flags; /* Bitwise OR of the REDIS_CMD_* command flags */
76583ea4
PN
457 /* Use a function to determine which keys need to be loaded
458 * in the background prior to executing this command. Takes precedence
459 * over vm_firstkey and others, ignored when NULL */
460 redisCommandProc *vm_preload_proc;
7c775e09 461 /* What keys should be loaded in background when calling this command? */
462 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
463 int vm_lastkey; /* The last argument that's a key */
464 int vm_keystep; /* The step between first and last key */
ed9b544e 465};
466
/* A (name, address) pair. Presumably used to map function addresses back to
 * symbol names when printing a stack trace -- TODO confirm against the users
 * of this structure. */
de96dbfe 467struct redisFunctionSym {
468 char *name;
56906eef 469 unsigned long pointer; /* the address, stored as an integer */
de96dbfe 470};
471
/* An element handled by SORT: the object itself plus the value it is compared
 * by. Only one member of the union is meaningful at a time; presumably 'score'
 * for numeric sorts and 'cmpobj' otherwise -- TODO confirm in sortCompare(). */
ed9b544e 472typedef struct _redisSortObject {
473 robj *obj;
474 union {
475 double score;
476 robj *cmpobj;
477 } u;
478} redisSortObject;
479
/* An operation attached to a SORT call together with its pattern.
 * 'type' is presumably one of the REDIS_SORT_* defines -- TODO confirm. */
480typedef struct _redisSortOperation {
481 int type;
482 robj *pattern;
483} redisSortOperation;
484
6b47e12e 485/* ZSETs use a specialized version of Skiplists */
486
/* A node of the skiplist used by sorted sets: it holds an object and its
 * score. 'forward' and 'span' are pointers to arrays; presumably there is one
 * entry per level of the node (at most ZSKIPLIST_MAXLEVEL) -- TODO confirm in
 * the node allocation code. */
487typedef struct zskiplistNode {
488 struct zskiplistNode **forward;
e3870fab 489 struct zskiplistNode *backward;
912b9165 490 unsigned int *span;
6b47e12e 491 double score;
492 robj *obj;
493} zskiplistNode;
494
/* The skiplist itself: header and tail nodes, a length and a level.
 * Presumably 'length' is the number of elements and 'level' the highest level
 * currently in use -- TODO confirm in zslCreate() / zslInsert(). */
495typedef struct zskiplist {
e3870fab 496 struct zskiplistNode *header, *tail;
d13f767c 497 unsigned long length;
6b47e12e 498 int level;
499} zskiplist;
500
/* A sorted set value: a hash table paired with a skiplist. Presumably both
 * index the same elements, the dict by member and the skiplist by score --
 * TODO confirm in the ZSET commands. */
1812e024 501typedef struct zset {
502 dict *dict;
6b47e12e 503 zskiplist *zsl;
1812e024 504} zset;
505
6b47e12e 506/* Our shared "common" objects */
507
/* Number of slots in the shared.integers[] array below. */
05df7621 508#define REDIS_SHARED_INTEGERS 10000
/* The set of shared objects, with its single file-scope instance 'shared'.
 * Judging by the names these are presumably pre-built protocol replies and
 * small integers, created once and reused -- TODO confirm where 'shared' is
 * initialized. */
ed9b544e 509struct sharedObjectsStruct {
c937aa89 510 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 511 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 512 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
513 *outofrangeerr, *plus,
ed9b544e 514 *select0, *select1, *select2, *select3, *select4,
befec3cd 515 *select5, *select6, *select7, *select8, *select9,
ffc6b7f8 516 *messagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
05df7621 517 *psubscribebulk, *punsubscribebulk, *integers[REDIS_SHARED_INTEGERS];
ed9b544e 518} shared;
519
a7866db6 520/* Global vars that are actually used as constants. The following double
521 * values are used for double on-disk serialization, and are initialized
522 * at runtime to avoid strange compiler optimizations. */
523
524static double R_Zero, R_PosInf, R_NegInf, R_Nan;
525
92f8e882 526/* VM threaded I/O request message */
/* The three request types stored in iojob.type. */
b9bc0eef 527#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
528#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
529#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
/* A single unit of work for the VM I/O threads. The server keeps these jobs
 * in the io_newjobs / io_processing / io_processed lists of redisServer. */
d5d55fc3 530typedef struct iojob {
996cb5f7 531 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 532 redisDb *db;/* Redis database */
92f8e882 533 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 534 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
92f8e882 535 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
536 off_t page; /* Swap page where to read/write the object */
248ea310 537 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 538 int canceled; /* True if this command was canceled by blocking side of VM */
539 pthread_t thread; /* ID of the thread processing this entry */
540} iojob;
92f8e882 541
ed9b544e 542/*================================ Prototypes =============================== */
543
544static void freeStringObject(robj *o);
545static void freeListObject(robj *o);
546static void freeSetObject(robj *o);
547static void decrRefCount(void *o);
548static robj *createObject(int type, void *ptr);
549static void freeClient(redisClient *c);
f78fd11b 550static int rdbLoad(char *filename);
ed9b544e 551static void addReply(redisClient *c, robj *obj);
552static void addReplySds(redisClient *c, sds s);
553static void incrRefCount(robj *o);
f78fd11b 554static int rdbSaveBackground(char *filename);
ed9b544e 555static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 556static robj *dupStringObject(robj *o);
248ea310 557static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
44b38ef4 558static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 559static int syncWithMaster(void);
05df7621 560static robj *tryObjectEncoding(robj *o);
9d65a1bb 561static robj *getDecodedObject(robj *o);
3305306f 562static int removeExpire(redisDb *db, robj *key);
563static int expireIfNeeded(redisDb *db, robj *key);
564static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 565static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 566static int deleteKey(redisDb *db, robj *key);
bb32ede5 567static time_t getExpire(redisDb *db, robj *key);
568static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 569static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 570static void freeMemoryIfNeeded(void);
de96dbfe 571static int processCommand(redisClient *c);
56906eef 572static void setupSigSegvAction(void);
a3b21203 573static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 574static void aofRemoveTempFile(pid_t childpid);
0ea663ea 575static size_t stringObjectLen(robj *o);
638e42ac 576static void processInputBuffer(redisClient *c);
6b47e12e 577static zskiplist *zslCreate(void);
fd8ccf44 578static void zslFree(zskiplist *zsl);
2b59cfdf 579static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 580static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 581static void initClientMultiState(redisClient *c);
582static void freeClientMultiState(redisClient *c);
583static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 584static void unblockClientWaitingData(redisClient *c);
4409877e 585static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 586static void vmInit(void);
a35ddf12 587static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 588static robj *vmLoadObject(robj *key);
7e69548d 589static robj *vmPreviewObject(robj *key);
a69a0c9c 590static int vmSwapOneObjectBlocking(void);
591static int vmSwapOneObjectThreaded(void);
7e69548d 592static int vmCanSwapOut(void);
a5819310 593static int tryFreeOneObjectFromFreelist(void);
996cb5f7 594static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
595static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
596static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 597static void lockThreadedIO(void);
598static void unlockThreadedIO(void);
599static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
600static void freeIOJob(iojob *j);
601static void queueIOJob(iojob *j);
a5819310 602static int vmWriteObjectOnSwap(robj *o, off_t page);
603static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 604static void waitEmptyIOJobsQueue(void);
605static void vmReopenSwapFile(void);
970e10bb 606static int vmFreePage(off_t page);
76583ea4 607static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
d5d55fc3 608static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
609static int dontWaitForSwappedKey(redisClient *c, robj *key);
610static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
611static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
612static struct redisCommand *lookupCommand(char *name);
613static void call(redisClient *c, struct redisCommand *cmd);
614static void resetClient(redisClient *c);
ada386b2 615static void convertToRealHash(robj *o);
ffc6b7f8 616static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
617static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
618static void freePubsubPattern(void *p);
619static int listMatchPubsubPattern(void *a, void *b);
620static int compareStringObjects(robj *a, robj *b);
befec3cd 621static void usage();
ed9b544e 622
abcb223e 623static void authCommand(redisClient *c);
ed9b544e 624static void pingCommand(redisClient *c);
625static void echoCommand(redisClient *c);
626static void setCommand(redisClient *c);
627static void setnxCommand(redisClient *c);
628static void getCommand(redisClient *c);
629static void delCommand(redisClient *c);
630static void existsCommand(redisClient *c);
631static void incrCommand(redisClient *c);
632static void decrCommand(redisClient *c);
633static void incrbyCommand(redisClient *c);
634static void decrbyCommand(redisClient *c);
635static void selectCommand(redisClient *c);
636static void randomkeyCommand(redisClient *c);
637static void keysCommand(redisClient *c);
638static void dbsizeCommand(redisClient *c);
639static void lastsaveCommand(redisClient *c);
640static void saveCommand(redisClient *c);
641static void bgsaveCommand(redisClient *c);
9d65a1bb 642static void bgrewriteaofCommand(redisClient *c);
ed9b544e 643static void shutdownCommand(redisClient *c);
644static void moveCommand(redisClient *c);
645static void renameCommand(redisClient *c);
646static void renamenxCommand(redisClient *c);
647static void lpushCommand(redisClient *c);
648static void rpushCommand(redisClient *c);
649static void lpopCommand(redisClient *c);
650static void rpopCommand(redisClient *c);
651static void llenCommand(redisClient *c);
652static void lindexCommand(redisClient *c);
653static void lrangeCommand(redisClient *c);
654static void ltrimCommand(redisClient *c);
655static void typeCommand(redisClient *c);
656static void lsetCommand(redisClient *c);
657static void saddCommand(redisClient *c);
658static void sremCommand(redisClient *c);
a4460ef4 659static void smoveCommand(redisClient *c);
ed9b544e 660static void sismemberCommand(redisClient *c);
661static void scardCommand(redisClient *c);
12fea928 662static void spopCommand(redisClient *c);
2abb95a9 663static void srandmemberCommand(redisClient *c);
ed9b544e 664static void sinterCommand(redisClient *c);
665static void sinterstoreCommand(redisClient *c);
40d224a9 666static void sunionCommand(redisClient *c);
667static void sunionstoreCommand(redisClient *c);
f4f56e1d 668static void sdiffCommand(redisClient *c);
669static void sdiffstoreCommand(redisClient *c);
ed9b544e 670static void syncCommand(redisClient *c);
671static void flushdbCommand(redisClient *c);
672static void flushallCommand(redisClient *c);
673static void sortCommand(redisClient *c);
674static void lremCommand(redisClient *c);
0f5f7e9a 675static void rpoplpushcommand(redisClient *c);
ed9b544e 676static void infoCommand(redisClient *c);
70003d28 677static void mgetCommand(redisClient *c);
87eca727 678static void monitorCommand(redisClient *c);
3305306f 679static void expireCommand(redisClient *c);
802e8373 680static void expireatCommand(redisClient *c);
f6b141c5 681static void getsetCommand(redisClient *c);
fd88489a 682static void ttlCommand(redisClient *c);
321b0e13 683static void slaveofCommand(redisClient *c);
7f957c92 684static void debugCommand(redisClient *c);
f6b141c5 685static void msetCommand(redisClient *c);
686static void msetnxCommand(redisClient *c);
fd8ccf44 687static void zaddCommand(redisClient *c);
7db723ad 688static void zincrbyCommand(redisClient *c);
cc812361 689static void zrangeCommand(redisClient *c);
50c55df5 690static void zrangebyscoreCommand(redisClient *c);
f44dd428 691static void zcountCommand(redisClient *c);
e3870fab 692static void zrevrangeCommand(redisClient *c);
3c41331e 693static void zcardCommand(redisClient *c);
1b7106e7 694static void zremCommand(redisClient *c);
6e333bbe 695static void zscoreCommand(redisClient *c);
1807985b 696static void zremrangebyscoreCommand(redisClient *c);
6e469882 697static void multiCommand(redisClient *c);
698static void execCommand(redisClient *c);
18b6cb76 699static void discardCommand(redisClient *c);
4409877e 700static void blpopCommand(redisClient *c);
701static void brpopCommand(redisClient *c);
4b00bebd 702static void appendCommand(redisClient *c);
39191553 703static void substrCommand(redisClient *c);
69d95c3e 704static void zrankCommand(redisClient *c);
798d9e55 705static void zrevrankCommand(redisClient *c);
978c2c94 706static void hsetCommand(redisClient *c);
707static void hgetCommand(redisClient *c);
09aeb579
PN
708static void hmsetCommand(redisClient *c);
709static void hmgetCommand(redisClient *c);
07efaf74 710static void hdelCommand(redisClient *c);
92b27fe9 711static void hlenCommand(redisClient *c);
9212eafd 712static void zremrangebyrankCommand(redisClient *c);
2830ca53
PN
713static void zunionCommand(redisClient *c);
714static void zinterCommand(redisClient *c);
78409a0f 715static void hkeysCommand(redisClient *c);
716static void hvalsCommand(redisClient *c);
717static void hgetallCommand(redisClient *c);
a86f14b1 718static void hexistsCommand(redisClient *c);
500ece7c 719static void configCommand(redisClient *c);
01426b05 720static void hincrbyCommand(redisClient *c);
befec3cd 721static void subscribeCommand(redisClient *c);
722static void unsubscribeCommand(redisClient *c);
ffc6b7f8 723static void psubscribeCommand(redisClient *c);
724static void punsubscribeCommand(redisClient *c);
befec3cd 725static void publishCommand(redisClient *c);
f6b141c5 726
ed9b544e 727/*================================= Globals ================================= */
728
729/* Global vars */
730static struct redisServer server; /* server global state */
/* Command table: one entry per command the server understands, terminated
 * by an all-NULL/zero sentinel entry. Reading the entries below, each one
 * holds, in order: the command name, the handler function, an integer
 * (presumably the arity, a negative value meaning "at least N" arguments --
 * TODO confirm against struct redisCommand), the REDIS_CMD_* flags, a
 * function pointer that is NULL for every command except zunion/zinter, and
 * three integers (presumably first key, last key and key step used to locate
 * the key arguments -- TODO confirm against struct redisCommand). */
731static struct redisCommand cmdTable[] = {
76583ea4
PN
732 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
733 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
734 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
735 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
736 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
737 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
738 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
739 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
740 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
741 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
742 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
743 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
744 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
745 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
746 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
748 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
749 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
750 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
751 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
754 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
755 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
756 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
757 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
758 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
759 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
760 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
761 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
762 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
763 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
764 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
765 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
766 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
767 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
768 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
769 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
770 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
771 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
772 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
775 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
776 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
781 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
782 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
783 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
784 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 785 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
d33278d1 786 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 787 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
01426b05 788 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
789 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
790 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
791 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
792 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
793 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 794 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
795 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
798 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
799 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
800 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
801 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
802 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
806 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
807 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
808 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
809 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
810 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
811 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
812 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
814 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
815 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
816 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
818 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
958cd5f3 819 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
820 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
823 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
824 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
825 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
826 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
828 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 830 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 831 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 833 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 835 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
76583ea4 836 {NULL,NULL,0,0,NULL,0,0,0}
ed9b544e 837};
bcfc686d 838
ed9b544e 839/*============================ Utility functions ============================ */
840
/* Glob-style pattern matching on length-delimited buffers.
 *
 * pattern/patternLen: the glob pattern and its length in bytes.
 * string/stringLen:   the subject and its length in bytes.
 * nocase:             when non-zero, letters are compared after tolower().
 *
 * Supported syntax: '*' (any run of bytes, possibly empty), '?' (exactly one
 * byte), '[...]' (character class, with '^' negation, 'a-z' ranges and '\'
 * escapes) and '\' to take the next pattern byte literally.
 *
 * Returns 1 when the whole string matches the whole pattern, 0 otherwise.
 *
 * Neither buffer needs to be NUL terminated: every read is bounded by the
 * corresponding length, and an exhausted string never matches a pattern
 * element that needs to consume one byte. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*', never looking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try the rest of the pattern at every remaining position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A class always consumes one byte of the string: without this
             * guard string[0] would be read out of bounds and stringLen
             * would become negative. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * pattern++ after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    /* Escaped byte inside the class. A trailing '\' with
                     * nothing after it is handled as a literal below. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs one byte of the string to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* Trailing '*' can match the empty remainder; stop at the
             * pattern end instead of relying on a NUL terminator. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
963
/* Convenience wrapper around stringmatchlen() for NUL terminated C strings:
 * both lengths are computed with strlen(). Returns what stringmatchlen()
 * returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern, plen, string, slen, nocase);
}
967
56906eef 968static void redisLog(int level, const char *fmt, ...) {
ed9b544e 969 va_list ap;
970 FILE *fp;
971
972 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
973 if (!fp) return;
974
975 va_start(ap, fmt);
976 if (level >= server.verbosity) {
6766f45e 977 char *c = ".-*#";
1904ecc1 978 char buf[64];
979 time_t now;
980
981 now = time(NULL);
6c9385e0 982 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 983 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 984 vfprintf(fp, fmt, ap);
985 fprintf(fp,"\n");
986 fflush(fp);
987 }
988 va_end(ap);
989
990 if (server.logfile) fclose(fp);
991}
992
993/*====================== Hash table type implementation ==================== */
994
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and Redis objects as values (objects can hold SDS strings,
 * lists, sets). */
998
/* Dict destructor callback that releases 'val' with zfree(). The private
 * data pointer is not used. */
static void dictVanillaFree(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    zfree(val);
}
1004
4409877e 1005static void dictListDestructor(void *privdata, void *val)
1006{
1007 DICT_NOTUSED(privdata);
1008 listRelease((list*)val);
1009}
1010
ed9b544e 1011static int sdsDictKeyCompare(void *privdata, const void *key1,
1012 const void *key2)
1013{
1014 int l1,l2;
1015 DICT_NOTUSED(privdata);
1016
1017 l1 = sdslen((sds)key1);
1018 l2 = sdslen((sds)key2);
1019 if (l1 != l2) return 0;
1020 return memcmp(key1, key2, l1) == 0;
1021}
1022
1023static void dictRedisObjectDestructor(void *privdata, void *val)
1024{
1025 DICT_NOTUSED(privdata);
1026
a35ddf12 1027 if (val == NULL) return; /* Values of swapped out keys as set to NULL */
ed9b544e 1028 decrRefCount(val);
1029}
1030
942a3961 1031static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1032 const void *key2)
1033{
1034 const robj *o1 = key1, *o2 = key2;
1035 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1036}
1037
942a3961 1038static unsigned int dictObjHash(const void *key) {
ed9b544e 1039 const robj *o = key;
1040 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1041}
1042
942a3961 1043static int dictEncObjKeyCompare(void *privdata, const void *key1,
1044 const void *key2)
1045{
9d65a1bb 1046 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1047 int cmp;
942a3961 1048
2a1198b4 1049 if (o1->encoding == REDIS_ENCODING_INT &&
1050 o2->encoding == REDIS_ENCODING_INT &&
db5946fc 1051 o1->ptr == o2->ptr) return 1;
2a1198b4 1052
9d65a1bb 1053 o1 = getDecodedObject(o1);
1054 o2 = getDecodedObject(o2);
1055 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1056 decrRefCount(o1);
1057 decrRefCount(o2);
1058 return cmp;
942a3961 1059}
1060
1061static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1062 robj *o = (robj*) key;
942a3961 1063
ed9e4966 1064 if (o->encoding == REDIS_ENCODING_RAW) {
1065 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1066 } else {
1067 if (o->encoding == REDIS_ENCODING_INT) {
1068 char buf[32];
1069 int len;
1070
1071 len = snprintf(buf,32,"%ld",(long)o->ptr);
1072 return dictGenHashFunction((unsigned char*)buf, len);
1073 } else {
1074 unsigned int hash;
1075
1076 o = getDecodedObject(o);
1077 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1078 decrRefCount(o);
1079 return hash;
1080 }
1081 }
942a3961 1082}
1083
f2d9f50f 1084/* Sets type and expires */
ed9b544e 1085static dictType setDictType = {
942a3961 1086 dictEncObjHash, /* hash function */
ed9b544e 1087 NULL, /* key dup */
1088 NULL, /* val dup */
942a3961 1089 dictEncObjKeyCompare, /* key compare */
ed9b544e 1090 dictRedisObjectDestructor, /* key destructor */
1091 NULL /* val destructor */
1092};
1093
f2d9f50f 1094/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1812e024 1095static dictType zsetDictType = {
1096 dictEncObjHash, /* hash function */
1097 NULL, /* key dup */
1098 NULL, /* val dup */
1099 dictEncObjKeyCompare, /* key compare */
1100 dictRedisObjectDestructor, /* key destructor */
da0a1620 1101 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1102};
1103
f2d9f50f 1104/* Db->dict */
5234952b 1105static dictType dbDictType = {
942a3961 1106 dictObjHash, /* hash function */
ed9b544e 1107 NULL, /* key dup */
1108 NULL, /* val dup */
942a3961 1109 dictObjKeyCompare, /* key compare */
ed9b544e 1110 dictRedisObjectDestructor, /* key destructor */
1111 dictRedisObjectDestructor /* val destructor */
1112};
1113
f2d9f50f 1114/* Db->expires */
1115static dictType keyptrDictType = {
1116 dictObjHash, /* hash function */
1117 NULL, /* key dup */
1118 NULL, /* val dup */
1119 dictObjKeyCompare, /* key compare */
1120 dictRedisObjectDestructor, /* key destructor */
1121 NULL /* val destructor */
1122};
1123
5234952b 1124/* Hash type hash table (note that small hashes are represented with zimpaps) */
1125static dictType hashDictType = {
1126 dictEncObjHash, /* hash function */
1127 NULL, /* key dup */
1128 NULL, /* val dup */
1129 dictEncObjKeyCompare, /* key compare */
1130 dictRedisObjectDestructor, /* key destructor */
1131 dictRedisObjectDestructor /* val destructor */
1132};
1133
4409877e 1134/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1135 * lists as values. It's used for blocking operations (BLPOP) and to
1136 * map swapped keys to a list of clients waiting for this keys to be loaded. */
4409877e 1137static dictType keylistDictType = {
1138 dictObjHash, /* hash function */
1139 NULL, /* key dup */
1140 NULL, /* val dup */
1141 dictObjKeyCompare, /* key compare */
1142 dictRedisObjectDestructor, /* key destructor */
1143 dictListDestructor /* val destructor */
1144};
1145
42ab0172
AO
1146static void version();
1147
ed9b544e 1148/* ========================= Random utility functions ======================= */
1149
1150/* Redis generally does not try to recover from out of memory conditions
1151 * when allocating objects or strings, it is not clear if it will be possible
1152 * to report this condition to the client since the networking layer itself
1153 * is based on heap allocation for send buffers, so we simply abort.
1154 * At least the code will be simpler to read... */
1155static void oom(const char *msg) {
71c54b21 1156 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1157 sleep(1);
1158 abort();
1159}
1160
1161/* ====================== Redis server networking stuff ===================== */
56906eef 1162static void closeTimedoutClients(void) {
ed9b544e 1163 redisClient *c;
ed9b544e 1164 listNode *ln;
1165 time_t now = time(NULL);
c7df85a4 1166 listIter li;
ed9b544e 1167
c7df85a4 1168 listRewind(server.clients,&li);
1169 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1170 c = listNodeValue(ln);
f86a74e9 1171 if (server.maxidletime &&
1172 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1173 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1174 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1175 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1176 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1177 {
f870935d 1178 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1179 freeClient(c);
f86a74e9 1180 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1181 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1182 addReply(c,shared.nullmultibulk);
b0d8747d 1183 unblockClientWaitingData(c);
f86a74e9 1184 }
ed9b544e 1185 }
1186 }
ed9b544e 1187}
1188
12fea928 1189static int htNeedsResize(dict *dict) {
1190 long long size, used;
1191
1192 size = dictSlots(dict);
1193 used = dictSize(dict);
1194 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1195 (used*100/size < REDIS_HT_MINFILL));
1196}
1197
0bc03378 1198/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1199 * we resize the hash table to save memory */
56906eef 1200static void tryResizeHashTables(void) {
0bc03378 1201 int j;
1202
1203 for (j = 0; j < server.dbnum; j++) {
5413c40d 1204 if (htNeedsResize(server.db[j].dict))
0bc03378 1205 dictResize(server.db[j].dict);
12fea928 1206 if (htNeedsResize(server.db[j].expires))
1207 dictResize(server.db[j].expires);
0bc03378 1208 }
1209}
1210
9d65a1bb 1211/* A background saving child (BGSAVE) terminated its work. Handle this. */
1212void backgroundSaveDoneHandler(int statloc) {
1213 int exitcode = WEXITSTATUS(statloc);
1214 int bysignal = WIFSIGNALED(statloc);
1215
1216 if (!bysignal && exitcode == 0) {
1217 redisLog(REDIS_NOTICE,
1218 "Background saving terminated with success");
1219 server.dirty = 0;
1220 server.lastsave = time(NULL);
1221 } else if (!bysignal && exitcode != 0) {
1222 redisLog(REDIS_WARNING, "Background saving error");
1223 } else {
1224 redisLog(REDIS_WARNING,
454eea7c 1225 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1226 rdbRemoveTempFile(server.bgsavechildpid);
1227 }
1228 server.bgsavechildpid = -1;
1229 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1230 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1231 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1232}
1233
1234/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1235 * Handle this. */
1236void backgroundRewriteDoneHandler(int statloc) {
1237 int exitcode = WEXITSTATUS(statloc);
1238 int bysignal = WIFSIGNALED(statloc);
1239
1240 if (!bysignal && exitcode == 0) {
1241 int fd;
1242 char tmpfile[256];
1243
1244 redisLog(REDIS_NOTICE,
1245 "Background append only file rewriting terminated with success");
1246 /* Now it's time to flush the differences accumulated by the parent */
1247 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1248 fd = open(tmpfile,O_WRONLY|O_APPEND);
1249 if (fd == -1) {
1250 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1251 goto cleanup;
1252 }
1253 /* Flush our data... */
1254 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1255 (signed) sdslen(server.bgrewritebuf)) {
1256 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1257 close(fd);
1258 goto cleanup;
1259 }
b32627cd 1260 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1261 /* Now our work is to rename the temp file into the stable file. And
1262 * switch the file descriptor used by the server for append only. */
1263 if (rename(tmpfile,server.appendfilename) == -1) {
1264 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1265 close(fd);
1266 goto cleanup;
1267 }
1268 /* Mission completed... almost */
1269 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1270 if (server.appendfd != -1) {
1271 /* If append only is actually enabled... */
1272 close(server.appendfd);
1273 server.appendfd = fd;
1274 fsync(fd);
85a83172 1275 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1276 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1277 } else {
1278 /* If append only is disabled we just generate a dump in this
1279 * format. Why not? */
1280 close(fd);
1281 }
1282 } else if (!bysignal && exitcode != 0) {
1283 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1284 } else {
1285 redisLog(REDIS_WARNING,
454eea7c 1286 "Background append only file rewriting terminated by signal %d",
1287 WTERMSIG(statloc));
9d65a1bb 1288 }
1289cleanup:
1290 sdsfree(server.bgrewritebuf);
1291 server.bgrewritebuf = sdsempty();
1292 aofRemoveTempFile(server.bgrewritechildpid);
1293 server.bgrewritechildpid = -1;
1294}
1295
884d4b39 1296/* This function is called once a background process of some kind terminates,
1297 * as we want to avoid resizing the hash tables when there is a child in order
1298 * to play well with copy-on-write (otherwise when a resize happens lots of
1299 * memory pages are copied). The goal of this function is to update the ability
1300 * for dict.c to resize the hash tables accordingly to the fact we have o not
1301 * running childs. */
1302static void updateDictResizePolicy(void) {
1303 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1304 dictEnableResize();
1305 else
1306 dictDisableResize();
1307}
1308
56906eef 1309static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1310 int j, loops = server.cronloops++;
ed9b544e 1311 REDIS_NOTUSED(eventLoop);
1312 REDIS_NOTUSED(id);
1313 REDIS_NOTUSED(clientData);
1314
3a66edc7 1315 /* We take a cached value of the unix time in the global state because
1316 * with virtual memory and aging there is to store the current time
1317 * in objects at every object access, and accuracy is not needed.
1318 * To access a global var is faster than calling time(NULL) */
1319 server.unixtime = time(NULL);
1320
0bc03378 1321 /* Show some info about non-empty databases */
ed9b544e 1322 for (j = 0; j < server.dbnum; j++) {
dec423d9 1323 long long size, used, vkeys;
94754ccc 1324
3305306f 1325 size = dictSlots(server.db[j].dict);
1326 used = dictSize(server.db[j].dict);
94754ccc 1327 vkeys = dictSize(server.db[j].expires);
1763929f 1328 if (!(loops % 50) && (used || vkeys)) {
f870935d 1329 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1330 /* dictPrintStats(server.dict); */
ed9b544e 1331 }
ed9b544e 1332 }
1333
0bc03378 1334 /* We don't want to resize the hash tables while a bacground saving
1335 * is in progress: the saving child is created using fork() that is
1336 * implemented with a copy-on-write semantic in most modern systems, so
1337 * if we resize the HT while there is the saving child at work actually
1338 * a lot of memory movements in the parent will cause a lot of pages
1339 * copied. */
884d4b39 1340 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1 &&
1341 !(loops % 10))
1342 {
1343 tryResizeHashTables();
1344 }
0bc03378 1345
ed9b544e 1346 /* Show information about connected clients */
1763929f 1347 if (!(loops % 50)) {
bdcb92f2 1348 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1349 listLength(server.clients)-listLength(server.slaves),
1350 listLength(server.slaves),
bdcb92f2 1351 zmalloc_used_memory());
ed9b544e 1352 }
1353
1354 /* Close connections of timedout clients */
1763929f 1355 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1356 closeTimedoutClients();
1357
9d65a1bb 1358 /* Check if a background saving or AOF rewrite in progress terminated */
1359 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1360 int statloc;
9d65a1bb 1361 pid_t pid;
1362
1363 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1364 if (pid == server.bgsavechildpid) {
1365 backgroundSaveDoneHandler(statloc);
ed9b544e 1366 } else {
9d65a1bb 1367 backgroundRewriteDoneHandler(statloc);
ed9b544e 1368 }
884d4b39 1369 updateDictResizePolicy();
ed9b544e 1370 }
1371 } else {
1372 /* If there is not a background saving in progress check if
1373 * we have to save now */
1374 time_t now = time(NULL);
1375 for (j = 0; j < server.saveparamslen; j++) {
1376 struct saveparam *sp = server.saveparams+j;
1377
1378 if (server.dirty >= sp->changes &&
1379 now-server.lastsave > sp->seconds) {
1380 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1381 sp->changes, sp->seconds);
f78fd11b 1382 rdbSaveBackground(server.dbfilename);
ed9b544e 1383 break;
1384 }
1385 }
1386 }
94754ccc 1387
f2324293 1388 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1389 * will use few CPU cycles if there are few expiring keys, otherwise
1390 * it will get more aggressive to avoid that too much memory is used by
1391 * keys that can be removed from the keyspace. */
94754ccc 1392 for (j = 0; j < server.dbnum; j++) {
f2324293 1393 int expired;
94754ccc 1394 redisDb *db = server.db+j;
94754ccc 1395
f2324293 1396 /* Continue to expire if at the end of the cycle more than 25%
1397 * of the keys were expired. */
1398 do {
4ef8de8a 1399 long num = dictSize(db->expires);
94754ccc 1400 time_t now = time(NULL);
1401
f2324293 1402 expired = 0;
94754ccc 1403 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1404 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1405 while (num--) {
1406 dictEntry *de;
1407 time_t t;
1408
1409 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1410 t = (time_t) dictGetEntryVal(de);
1411 if (now > t) {
1412 deleteKey(db,dictGetEntryKey(de));
f2324293 1413 expired++;
2a6a2ed1 1414 server.stat_expiredkeys++;
94754ccc 1415 }
1416 }
f2324293 1417 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1418 }
1419
4ef8de8a 1420 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1421 * is enbled. Try to free objects from the free list first. */
7e69548d 1422 if (vmCanSwapOut()) {
1423 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1424 server.vm_max_memory)
1425 {
72e9fd40 1426 int retval;
1427
a5819310 1428 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1429 retval = (server.vm_max_threads == 0) ?
1430 vmSwapOneObjectBlocking() :
1431 vmSwapOneObjectThreaded();
1763929f 1432 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1433 zmalloc_used_memory() >
1434 (server.vm_max_memory+server.vm_max_memory/10))
1435 {
1436 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1437 }
72e9fd40 1438 /* Note that when using threade I/O we free just one object,
1439 * because anyway when the I/O thread in charge to swap this
1440 * object out will finish, the handler of completed jobs
1441 * will try to swap more objects if we are still out of memory. */
1442 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1443 }
1444 }
1445
ed9b544e 1446 /* Check if we should connect to a MASTER */
1763929f 1447 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1448 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1449 if (syncWithMaster() == REDIS_OK) {
1450 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1451 }
1452 }
1763929f 1453 return 100;
ed9b544e 1454}
1455
d5d55fc3 1456/* This function gets called every time Redis is entering the
1457 * main loop of the event driven library, that is, before to sleep
1458 * for ready file descriptors. */
1459static void beforeSleep(struct aeEventLoop *eventLoop) {
1460 REDIS_NOTUSED(eventLoop);
1461
1462 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1463 listIter li;
1464 listNode *ln;
1465
1466 listRewind(server.io_ready_clients,&li);
1467 while((ln = listNext(&li))) {
1468 redisClient *c = ln->value;
1469 struct redisCommand *cmd;
1470
1471 /* Resume the client. */
1472 listDelNode(server.io_ready_clients,ln);
1473 c->flags &= (~REDIS_IO_WAIT);
1474 server.vm_blocked_clients--;
1475 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1476 readQueryFromClient, c);
1477 cmd = lookupCommand(c->argv[0]->ptr);
1478 assert(cmd != NULL);
1479 call(c,cmd);
1480 resetClient(c);
1481 /* There may be more data to process in the input buffer. */
1482 if (c->querybuf && sdslen(c->querybuf) > 0)
1483 processInputBuffer(c);
1484 }
1485 }
1486}
1487
ed9b544e 1488static void createSharedObjects(void) {
05df7621 1489 int j;
1490
ed9b544e 1491 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1492 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1493 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1494 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1495 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1496 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1497 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1498 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1499 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1500 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1501 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1502 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1503 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1504 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1505 "-ERR no such key\r\n"));
ed9b544e 1506 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1507 "-ERR syntax error\r\n"));
c937aa89 1508 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1509 "-ERR source and destination objects are the same\r\n"));
1510 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1511 "-ERR index out of range\r\n"));
ed9b544e 1512 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1513 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1514 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1515 shared.select0 = createStringObject("select 0\r\n",10);
1516 shared.select1 = createStringObject("select 1\r\n",10);
1517 shared.select2 = createStringObject("select 2\r\n",10);
1518 shared.select3 = createStringObject("select 3\r\n",10);
1519 shared.select4 = createStringObject("select 4\r\n",10);
1520 shared.select5 = createStringObject("select 5\r\n",10);
1521 shared.select6 = createStringObject("select 6\r\n",10);
1522 shared.select7 = createStringObject("select 7\r\n",10);
1523 shared.select8 = createStringObject("select 8\r\n",10);
1524 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1525 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1526 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1527 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1528 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1529 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1530 shared.mbulk3 = createStringObject("*3\r\n",4);
05df7621 1531 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1532 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1533 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1534 }
ed9b544e 1535}
1536
1537static void appendServerSaveParams(time_t seconds, int changes) {
1538 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1539 server.saveparams[server.saveparamslen].seconds = seconds;
1540 server.saveparams[server.saveparamslen].changes = changes;
1541 server.saveparamslen++;
1542}
1543
bcfc686d 1544static void resetServerSaveParams() {
ed9b544e 1545 zfree(server.saveparams);
1546 server.saveparams = NULL;
1547 server.saveparamslen = 0;
1548}
1549
1550static void initServerConfig() {
1551 server.dbnum = REDIS_DEFAULT_DBNUM;
1552 server.port = REDIS_SERVERPORT;
f870935d 1553 server.verbosity = REDIS_VERBOSE;
ed9b544e 1554 server.maxidletime = REDIS_MAXIDLETIME;
1555 server.saveparams = NULL;
1556 server.logfile = NULL; /* NULL = log on standard output */
1557 server.bindaddr = NULL;
1558 server.glueoutputbuf = 1;
1559 server.daemonize = 0;
44b38ef4 1560 server.appendonly = 0;
4e141d5a 1561 server.appendfsync = APPENDFSYNC_ALWAYS;
48f0308a 1562 server.lastfsync = time(NULL);
44b38ef4 1563 server.appendfd = -1;
1564 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1565 server.pidfile = zstrdup("/var/run/redis.pid");
1566 server.dbfilename = zstrdup("dump.rdb");
1567 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1568 server.requirepass = NULL;
10c43610 1569 server.shareobjects = 0;
b0553789 1570 server.rdbcompression = 1;
285add55 1571 server.maxclients = 0;
d5d55fc3 1572 server.blpop_blocked_clients = 0;
3fd78bcd 1573 server.maxmemory = 0;
75680a3c 1574 server.vm_enabled = 0;
054e426d 1575 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1576 server.vm_page_size = 256; /* 256 bytes per page */
1577 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1578 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1579 server.vm_max_threads = 4;
d5d55fc3 1580 server.vm_blocked_clients = 0;
cbba7dd7 1581 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1582 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
75680a3c 1583
bcfc686d 1584 resetServerSaveParams();
ed9b544e 1585
1586 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1587 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1588 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1589 /* Replication related */
1590 server.isslave = 0;
d0ccebcf 1591 server.masterauth = NULL;
ed9b544e 1592 server.masterhost = NULL;
1593 server.masterport = 6379;
1594 server.master = NULL;
1595 server.replstate = REDIS_REPL_NONE;
a7866db6 1596
1597 /* Double constants initialization */
1598 R_Zero = 0.0;
1599 R_PosInf = 1.0/R_Zero;
1600 R_NegInf = -1.0/R_Zero;
1601 R_Nan = R_Zero/R_Zero;
ed9b544e 1602}
1603
1604static void initServer() {
1605 int j;
1606
1607 signal(SIGHUP, SIG_IGN);
1608 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1609 setupSigSegvAction();
ed9b544e 1610
b9bc0eef 1611 server.devnull = fopen("/dev/null","w");
1612 if (server.devnull == NULL) {
1613 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1614 exit(1);
1615 }
ed9b544e 1616 server.clients = listCreate();
1617 server.slaves = listCreate();
87eca727 1618 server.monitors = listCreate();
ed9b544e 1619 server.objfreelist = listCreate();
1620 createSharedObjects();
1621 server.el = aeCreateEventLoop();
3305306f 1622 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1623 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1624 if (server.fd == -1) {
1625 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1626 exit(1);
1627 }
3305306f 1628 for (j = 0; j < server.dbnum; j++) {
5234952b 1629 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1630 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
4409877e 1631 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1632 if (server.vm_enabled)
1633 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1634 server.db[j].id = j;
1635 }
ffc6b7f8 1636 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1637 server.pubsub_patterns = listCreate();
1638 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1639 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1640 server.cronloops = 0;
9f3c422c 1641 server.bgsavechildpid = -1;
9d65a1bb 1642 server.bgrewritechildpid = -1;
1643 server.bgrewritebuf = sdsempty();
ed9b544e 1644 server.lastsave = time(NULL);
1645 server.dirty = 0;
ed9b544e 1646 server.stat_numcommands = 0;
1647 server.stat_numconnections = 0;
2a6a2ed1 1648 server.stat_expiredkeys = 0;
ed9b544e 1649 server.stat_starttime = time(NULL);
3a66edc7 1650 server.unixtime = time(NULL);
d8f8b666 1651 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1652 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1653 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1654
1655 if (server.appendonly) {
71eba477 1656 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1657 if (server.appendfd == -1) {
1658 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1659 strerror(errno));
1660 exit(1);
1661 }
1662 }
75680a3c 1663
1664 if (server.vm_enabled) vmInit();
ed9b544e 1665}
1666
1667/* Empty the whole database */
ca37e9cd 1668static long long emptyDb() {
ed9b544e 1669 int j;
ca37e9cd 1670 long long removed = 0;
ed9b544e 1671
3305306f 1672 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1673 removed += dictSize(server.db[j].dict);
3305306f 1674 dictEmpty(server.db[j].dict);
1675 dictEmpty(server.db[j].expires);
1676 }
ca37e9cd 1677 return removed;
ed9b544e 1678}
1679
/* Convert a "yes" / "no" string (compared case insensitively) into
 * 1 / 0. Any other string yields -1 so that callers can report an
 * invalid argument. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1685
ed9b544e 1686/* I agree, this is a very rudimental way to load a configuration...
1687 will improve later if the config gets more complex */
1688static void loadServerConfig(char *filename) {
c9a111ac 1689 FILE *fp;
ed9b544e 1690 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1691 int linenum = 0;
1692 sds line = NULL;
c9a111ac 1693
1694 if (filename[0] == '-' && filename[1] == '\0')
1695 fp = stdin;
1696 else {
1697 if ((fp = fopen(filename,"r")) == NULL) {
9a22de82 1698 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
c9a111ac 1699 exit(1);
1700 }
ed9b544e 1701 }
c9a111ac 1702
ed9b544e 1703 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1704 sds *argv;
1705 int argc, j;
1706
1707 linenum++;
1708 line = sdsnew(buf);
1709 line = sdstrim(line," \t\r\n");
1710
1711 /* Skip comments and blank lines*/
1712 if (line[0] == '#' || line[0] == '\0') {
1713 sdsfree(line);
1714 continue;
1715 }
1716
1717 /* Split into arguments */
1718 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1719 sdstolower(argv[0]);
1720
1721 /* Execute config directives */
bb0b03a3 1722 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1723 server.maxidletime = atoi(argv[1]);
0150db36 1724 if (server.maxidletime < 0) {
ed9b544e 1725 err = "Invalid timeout value"; goto loaderr;
1726 }
bb0b03a3 1727 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1728 server.port = atoi(argv[1]);
1729 if (server.port < 1 || server.port > 65535) {
1730 err = "Invalid port"; goto loaderr;
1731 }
bb0b03a3 1732 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1733 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1734 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1735 int seconds = atoi(argv[1]);
1736 int changes = atoi(argv[2]);
1737 if (seconds < 1 || changes < 0) {
1738 err = "Invalid save parameters"; goto loaderr;
1739 }
1740 appendServerSaveParams(seconds,changes);
bb0b03a3 1741 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1742 if (chdir(argv[1]) == -1) {
1743 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1744 argv[1], strerror(errno));
1745 exit(1);
1746 }
bb0b03a3 1747 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1748 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1749 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1750 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1751 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1752 else {
1753 err = "Invalid log level. Must be one of debug, notice, warning";
1754 goto loaderr;
1755 }
bb0b03a3 1756 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1757 FILE *logfp;
ed9b544e 1758
1759 server.logfile = zstrdup(argv[1]);
bb0b03a3 1760 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1761 zfree(server.logfile);
1762 server.logfile = NULL;
1763 }
1764 if (server.logfile) {
1765 /* Test if we are able to open the file. The server will not
1766 * be able to abort just for this problem later... */
c9a111ac 1767 logfp = fopen(server.logfile,"a");
1768 if (logfp == NULL) {
ed9b544e 1769 err = sdscatprintf(sdsempty(),
1770 "Can't open the log file: %s", strerror(errno));
1771 goto loaderr;
1772 }
c9a111ac 1773 fclose(logfp);
ed9b544e 1774 }
bb0b03a3 1775 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1776 server.dbnum = atoi(argv[1]);
1777 if (server.dbnum < 1) {
1778 err = "Invalid number of databases"; goto loaderr;
1779 }
b3f83f12
JZ
1780 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1781 loadServerConfig(argv[1]);
285add55 1782 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1783 server.maxclients = atoi(argv[1]);
3fd78bcd 1784 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
d4465900 1785 server.maxmemory = strtoll(argv[1], NULL, 10);
bb0b03a3 1786 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1787 server.masterhost = sdsnew(argv[1]);
1788 server.masterport = atoi(argv[2]);
1789 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1790 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1791 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1792 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1793 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1794 err = "argument must be 'yes' or 'no'"; goto loaderr;
1795 }
bb0b03a3 1796 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
85dd2f3a 1797 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
10c43610 1798 err = "argument must be 'yes' or 'no'"; goto loaderr;
1799 }
121f70cf 1800 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1801 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1802 err = "argument must be 'yes' or 'no'"; goto loaderr;
1803 }
bb0b03a3 1804 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1805 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1806 err = "argument must be 'yes' or 'no'"; goto loaderr;
1807 }
44b38ef4 1808 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1809 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1810 err = "argument must be 'yes' or 'no'"; goto loaderr;
1811 }
48f0308a 1812 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1813 if (!strcasecmp(argv[1],"no")) {
48f0308a 1814 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1815 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1816 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1817 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1818 server.appendfsync = APPENDFSYNC_EVERYSEC;
1819 } else {
1820 err = "argument must be 'no', 'always' or 'everysec'";
1821 goto loaderr;
1822 }
bb0b03a3 1823 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1824 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1825 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 1826 zfree(server.pidfile);
054e426d 1827 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1828 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 1829 zfree(server.dbfilename);
054e426d 1830 server.dbfilename = zstrdup(argv[1]);
75680a3c 1831 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1832 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1833 err = "argument must be 'yes' or 'no'"; goto loaderr;
1834 }
054e426d 1835 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1836 zfree(server.vm_swap_file);
054e426d 1837 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1838 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1839 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1840 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1841 server.vm_page_size = strtoll(argv[1], NULL, 10);
1842 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1843 server.vm_pages = strtoll(argv[1], NULL, 10);
92f8e882 1844 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1845 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 1846 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1847 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1848 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1849 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1850 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1851 server.vm_max_threads = strtoll(argv[1], NULL, 10);
ed9b544e 1852 } else {
1853 err = "Bad directive or wrong number of arguments"; goto loaderr;
1854 }
1855 for (j = 0; j < argc; j++)
1856 sdsfree(argv[j]);
1857 zfree(argv);
1858 sdsfree(line);
1859 }
c9a111ac 1860 if (fp != stdin) fclose(fp);
ed9b544e 1861 return;
1862
1863loaderr:
1864 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1865 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1866 fprintf(stderr, ">>> '%s'\n", line);
1867 fprintf(stderr, "%s\n", err);
1868 exit(1);
1869}
1870
1871static void freeClientArgv(redisClient *c) {
1872 int j;
1873
1874 for (j = 0; j < c->argc; j++)
1875 decrRefCount(c->argv[j]);
e8a74421 1876 for (j = 0; j < c->mbargc; j++)
1877 decrRefCount(c->mbargv[j]);
ed9b544e 1878 c->argc = 0;
e8a74421 1879 c->mbargc = 0;
ed9b544e 1880}
1881
1882static void freeClient(redisClient *c) {
1883 listNode *ln;
1884
4409877e 1885 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 1886 * call, we have to set querybuf to NULL *before* to call
1887 * unblockClientWaitingData() to avoid processInputBuffer() will get
1888 * called. Also it is important to remove the file events after
1889 * this, because this call adds the READABLE event. */
4409877e 1890 sdsfree(c->querybuf);
1891 c->querybuf = NULL;
1892 if (c->flags & REDIS_BLOCKED)
b0d8747d 1893 unblockClientWaitingData(c);
4409877e 1894
ffc6b7f8 1895 /* Unsubscribe from all the pubsub channels */
1896 pubsubUnsubscribeAllChannels(c,0);
1897 pubsubUnsubscribeAllPatterns(c,0);
1898 dictRelease(c->pubsub_channels);
1899 listRelease(c->pubsub_patterns);
befec3cd 1900 /* Obvious cleanup */
ed9b544e 1901 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1902 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 1903 listRelease(c->reply);
1904 freeClientArgv(c);
1905 close(c->fd);
92f8e882 1906 /* Remove from the list of clients */
ed9b544e 1907 ln = listSearchKey(server.clients,c);
dfc5e96c 1908 redisAssert(ln != NULL);
ed9b544e 1909 listDelNode(server.clients,ln);
d5d55fc3 1910 /* Remove from the list of clients waiting for swapped keys */
1911 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1912 ln = listSearchKey(server.io_ready_clients,c);
1913 if (ln) {
1914 listDelNode(server.io_ready_clients,ln);
1915 server.vm_blocked_clients--;
1916 }
1917 }
1918 while (server.vm_enabled && listLength(c->io_keys)) {
1919 ln = listFirst(c->io_keys);
1920 dontWaitForSwappedKey(c,ln->value);
92f8e882 1921 }
b3e3d0d7 1922 listRelease(c->io_keys);
befec3cd 1923 /* Master/slave cleanup */
ed9b544e 1924 if (c->flags & REDIS_SLAVE) {
6208b3a7 1925 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1926 close(c->repldbfd);
87eca727 1927 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1928 ln = listSearchKey(l,c);
dfc5e96c 1929 redisAssert(ln != NULL);
87eca727 1930 listDelNode(l,ln);
ed9b544e 1931 }
1932 if (c->flags & REDIS_MASTER) {
1933 server.master = NULL;
1934 server.replstate = REDIS_REPL_CONNECT;
1935 }
befec3cd 1936 /* Release memory */
93ea3759 1937 zfree(c->argv);
e8a74421 1938 zfree(c->mbargv);
6e469882 1939 freeClientMultiState(c);
ed9b544e 1940 zfree(c);
1941}
1942
cc30e368 1943#define GLUEREPLY_UP_TO (1024)
ed9b544e 1944static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 1945 int copylen = 0;
1946 char buf[GLUEREPLY_UP_TO];
6208b3a7 1947 listNode *ln;
c7df85a4 1948 listIter li;
ed9b544e 1949 robj *o;
1950
c7df85a4 1951 listRewind(c->reply,&li);
1952 while((ln = listNext(&li))) {
c28b42ac 1953 int objlen;
1954
ed9b544e 1955 o = ln->value;
c28b42ac 1956 objlen = sdslen(o->ptr);
1957 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1958 memcpy(buf+copylen,o->ptr,objlen);
1959 copylen += objlen;
ed9b544e 1960 listDelNode(c->reply,ln);
c28b42ac 1961 } else {
1962 if (copylen == 0) return;
1963 break;
ed9b544e 1964 }
ed9b544e 1965 }
c28b42ac 1966 /* Now the output buffer is empty, add the new single element */
1967 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1968 listAddNodeHead(c->reply,o);
ed9b544e 1969}
1970
1971static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1972 redisClient *c = privdata;
1973 int nwritten = 0, totwritten = 0, objlen;
1974 robj *o;
1975 REDIS_NOTUSED(el);
1976 REDIS_NOTUSED(mask);
1977
2895e862 1978 /* Use writev() if we have enough buffers to send */
7ea870c0 1979 if (!server.glueoutputbuf &&
e0a62c7f 1980 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 1981 !(c->flags & REDIS_MASTER))
2895e862 1982 {
1983 sendReplyToClientWritev(el, fd, privdata, mask);
1984 return;
1985 }
2895e862 1986
ed9b544e 1987 while(listLength(c->reply)) {
c28b42ac 1988 if (server.glueoutputbuf && listLength(c->reply) > 1)
1989 glueReplyBuffersIfNeeded(c);
1990
ed9b544e 1991 o = listNodeValue(listFirst(c->reply));
1992 objlen = sdslen(o->ptr);
1993
1994 if (objlen == 0) {
1995 listDelNode(c->reply,listFirst(c->reply));
1996 continue;
1997 }
1998
1999 if (c->flags & REDIS_MASTER) {
6f376729 2000 /* Don't reply to a master */
ed9b544e 2001 nwritten = objlen - c->sentlen;
2002 } else {
a4d1ba9a 2003 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2004 if (nwritten <= 0) break;
2005 }
2006 c->sentlen += nwritten;
2007 totwritten += nwritten;
2008 /* If we fully sent the object on head go to the next one */
2009 if (c->sentlen == objlen) {
2010 listDelNode(c->reply,listFirst(c->reply));
2011 c->sentlen = 0;
2012 }
6f376729 2013 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2014 * bytes, in a single threaded server it's a good idea to serve
6f376729 2015 * other clients as well, even if a very large request comes from
2016 * super fast link that is always able to accept data (in real world
12f9d551 2017 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2018 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2019 }
2020 if (nwritten == -1) {
2021 if (errno == EAGAIN) {
2022 nwritten = 0;
2023 } else {
f870935d 2024 redisLog(REDIS_VERBOSE,
ed9b544e 2025 "Error writing to client: %s", strerror(errno));
2026 freeClient(c);
2027 return;
2028 }
2029 }
2030 if (totwritten > 0) c->lastinteraction = time(NULL);
2031 if (listLength(c->reply) == 0) {
2032 c->sentlen = 0;
2033 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2034 }
2035}
2036
2895e862 2037static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2038{
2039 redisClient *c = privdata;
2040 int nwritten = 0, totwritten = 0, objlen, willwrite;
2041 robj *o;
2042 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2043 int offset, ion = 0;
2044 REDIS_NOTUSED(el);
2045 REDIS_NOTUSED(mask);
2046
2047 listNode *node;
2048 while (listLength(c->reply)) {
2049 offset = c->sentlen;
2050 ion = 0;
2051 willwrite = 0;
2052
2053 /* fill-in the iov[] array */
2054 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2055 o = listNodeValue(node);
2056 objlen = sdslen(o->ptr);
2057
e0a62c7f 2058 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2059 break;
2060
2061 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2062 break; /* no more iovecs */
2063
2064 iov[ion].iov_base = ((char*)o->ptr) + offset;
2065 iov[ion].iov_len = objlen - offset;
2066 willwrite += objlen - offset;
2067 offset = 0; /* just for the first item */
2068 ion++;
2069 }
2070
2071 if(willwrite == 0)
2072 break;
2073
2074 /* write all collected blocks at once */
2075 if((nwritten = writev(fd, iov, ion)) < 0) {
2076 if (errno != EAGAIN) {
f870935d 2077 redisLog(REDIS_VERBOSE,
2895e862 2078 "Error writing to client: %s", strerror(errno));
2079 freeClient(c);
2080 return;
2081 }
2082 break;
2083 }
2084
2085 totwritten += nwritten;
2086 offset = c->sentlen;
2087
2088 /* remove written robjs from c->reply */
2089 while (nwritten && listLength(c->reply)) {
2090 o = listNodeValue(listFirst(c->reply));
2091 objlen = sdslen(o->ptr);
2092
2093 if(nwritten >= objlen - offset) {
2094 listDelNode(c->reply, listFirst(c->reply));
2095 nwritten -= objlen - offset;
2096 c->sentlen = 0;
2097 } else {
2098 /* partial write */
2099 c->sentlen += nwritten;
2100 break;
2101 }
2102 offset = 0;
2103 }
2104 }
2105
e0a62c7f 2106 if (totwritten > 0)
2895e862 2107 c->lastinteraction = time(NULL);
2108
2109 if (listLength(c->reply) == 0) {
2110 c->sentlen = 0;
2111 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2112 }
2113}
2114
ed9b544e 2115static struct redisCommand *lookupCommand(char *name) {
2116 int j = 0;
2117 while(cmdTable[j].name != NULL) {
bb0b03a3 2118 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 2119 j++;
2120 }
2121 return NULL;
2122}
2123
2124/* resetClient prepare the client to process the next command */
2125static void resetClient(redisClient *c) {
2126 freeClientArgv(c);
2127 c->bulklen = -1;
e8a74421 2128 c->multibulk = 0;
ed9b544e 2129}
2130
6e469882 2131/* Call() is the core of Redis execution of a command */
2132static void call(redisClient *c, struct redisCommand *cmd) {
2133 long long dirty;
2134
2135 dirty = server.dirty;
2136 cmd->proc(c);
4005fef1 2137 dirty = server.dirty-dirty;
2138
2139 if (server.appendonly && dirty)
6e469882 2140 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2141 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2142 listLength(server.slaves))
248ea310 2143 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2144 if (listLength(server.monitors))
248ea310 2145 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2146 server.stat_numcommands++;
2147}
2148
ed9b544e 2149/* If this function gets called we already read a whole
2150 * command, argments are in the client argv/argc fields.
2151 * processCommand() execute the command or prepare the
2152 * server for a bulk read from the client.
2153 *
2154 * If 1 is returned the client is still alive and valid and
2155 * and other operations can be performed by the caller. Otherwise
2156 * if 0 is returned the client was destroied (i.e. after QUIT). */
2157static int processCommand(redisClient *c) {
2158 struct redisCommand *cmd;
ed9b544e 2159
3fd78bcd 2160 /* Free some memory if needed (maxmemory setting) */
2161 if (server.maxmemory) freeMemoryIfNeeded();
2162
e8a74421 2163 /* Handle the multi bulk command type. This is an alternative protocol
2164 * supported by Redis in order to receive commands that are composed of
2165 * multiple binary-safe "bulk" arguments. The latency of processing is
2166 * a bit higher but this allows things like multi-sets, so if this
2167 * protocol is used only for MSET and similar commands this is a big win. */
2168 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2169 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2170 if (c->multibulk <= 0) {
2171 resetClient(c);
2172 return 1;
2173 } else {
2174 decrRefCount(c->argv[c->argc-1]);
2175 c->argc--;
2176 return 1;
2177 }
2178 } else if (c->multibulk) {
2179 if (c->bulklen == -1) {
2180 if (((char*)c->argv[0]->ptr)[0] != '$') {
2181 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2182 resetClient(c);
2183 return 1;
2184 } else {
2185 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2186 decrRefCount(c->argv[0]);
2187 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2188 c->argc--;
2189 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2190 resetClient(c);
2191 return 1;
2192 }
2193 c->argc--;
2194 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2195 return 1;
2196 }
2197 } else {
2198 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2199 c->mbargv[c->mbargc] = c->argv[0];
2200 c->mbargc++;
2201 c->argc--;
2202 c->multibulk--;
2203 if (c->multibulk == 0) {
2204 robj **auxargv;
2205 int auxargc;
2206
2207 /* Here we need to swap the multi-bulk argc/argv with the
2208 * normal argc/argv of the client structure. */
2209 auxargv = c->argv;
2210 c->argv = c->mbargv;
2211 c->mbargv = auxargv;
2212
2213 auxargc = c->argc;
2214 c->argc = c->mbargc;
2215 c->mbargc = auxargc;
2216
2217 /* We need to set bulklen to something different than -1
2218 * in order for the code below to process the command without
2219 * to try to read the last argument of a bulk command as
2220 * a special argument. */
2221 c->bulklen = 0;
2222 /* continue below and process the command */
2223 } else {
2224 c->bulklen = -1;
2225 return 1;
2226 }
2227 }
2228 }
2229 /* -- end of multi bulk commands processing -- */
2230
ed9b544e 2231 /* The QUIT command is handled as a special case. Normal command
2232 * procs are unable to close the client connection safely */
bb0b03a3 2233 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2234 freeClient(c);
2235 return 0;
2236 }
d5d55fc3 2237
2238 /* Now lookup the command and check ASAP about trivial error conditions
2239 * such wrong arity, bad command name and so forth. */
ed9b544e 2240 cmd = lookupCommand(c->argv[0]->ptr);
2241 if (!cmd) {
2c14807b 2242 addReplySds(c,
2243 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2244 (char*)c->argv[0]->ptr));
ed9b544e 2245 resetClient(c);
2246 return 1;
2247 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2248 (c->argc < -cmd->arity)) {
454d4e43 2249 addReplySds(c,
2250 sdscatprintf(sdsempty(),
2251 "-ERR wrong number of arguments for '%s' command\r\n",
2252 cmd->name));
ed9b544e 2253 resetClient(c);
2254 return 1;
2255 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2256 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2257 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2258
2259 decrRefCount(c->argv[c->argc-1]);
2260 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2261 c->argc--;
2262 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2263 resetClient(c);
2264 return 1;
2265 }
2266 c->argc--;
2267 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2268 /* It is possible that the bulk read is already in the
8d0490e7 2269 * buffer. Check this condition and handle it accordingly.
2270 * This is just a fast path, alternative to call processInputBuffer().
2271 * It's a good idea since the code is small and this condition
2272 * happens most of the times. */
ed9b544e 2273 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2274 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2275 c->argc++;
2276 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2277 } else {
d5d55fc3 2278 /* Otherwise return... there is to read the last argument
2279 * from the socket. */
ed9b544e 2280 return 1;
2281 }
2282 }
942a3961 2283 /* Let's try to encode the bulk object to save space. */
2284 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2285 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2286
e63943a4 2287 /* Check if the user is authenticated */
2288 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2289 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2290 resetClient(c);
2291 return 1;
2292 }
2293
b61a28fe 2294 /* Handle the maxmemory directive */
2295 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2296 zmalloc_used_memory() > server.maxmemory)
2297 {
2298 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2299 resetClient(c);
2300 return 1;
2301 }
2302
d6cc8867 2303 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
e6cca5db 2304 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2305 &&
ffc6b7f8 2306 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2307 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2308 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2309 resetClient(c);
2310 return 1;
2311 }
2312
ed9b544e 2313 /* Exec the command */
18b6cb76 2314 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
6e469882 2315 queueMultiCommand(c,cmd);
2316 addReply(c,shared.queued);
2317 } else {
d5d55fc3 2318 if (server.vm_enabled && server.vm_max_threads > 0 &&
2319 blockClientOnSwappedKeys(cmd,c)) return 1;
6e469882 2320 call(c,cmd);
2321 }
ed9b544e 2322
2323 /* Prepare the client for the next command */
ed9b544e 2324 resetClient(c);
2325 return 1;
2326}
2327
248ea310 2328static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2329 listNode *ln;
c7df85a4 2330 listIter li;
ed9b544e 2331 int outc = 0, j;
93ea3759 2332 robj **outv;
248ea310 2333 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2334 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2335 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2336 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2337 robj *lenobj;
93ea3759 2338
2339 if (argc <= REDIS_STATIC_ARGS) {
2340 outv = static_outv;
2341 } else {
248ea310 2342 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2343 }
248ea310 2344
2345 lenobj = createObject(REDIS_STRING,
2346 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2347 lenobj->refcount = 0;
2348 outv[outc++] = lenobj;
ed9b544e 2349 for (j = 0; j < argc; j++) {
248ea310 2350 lenobj = createObject(REDIS_STRING,
2351 sdscatprintf(sdsempty(),"$%lu\r\n",
2352 (unsigned long) stringObjectLen(argv[j])));
2353 lenobj->refcount = 0;
2354 outv[outc++] = lenobj;
ed9b544e 2355 outv[outc++] = argv[j];
248ea310 2356 outv[outc++] = shared.crlf;
ed9b544e 2357 }
ed9b544e 2358
40d224a9 2359 /* Increment all the refcounts at start and decrement at end in order to
2360 * be sure to free objects if there is no slave in a replication state
2361 * able to be feed with commands */
2362 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2363 listRewind(slaves,&li);
2364 while((ln = listNext(&li))) {
ed9b544e 2365 redisClient *slave = ln->value;
40d224a9 2366
2367 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2368 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2369
2370 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2371 if (slave->slaveseldb != dictid) {
2372 robj *selectcmd;
2373
2374 switch(dictid) {
2375 case 0: selectcmd = shared.select0; break;
2376 case 1: selectcmd = shared.select1; break;
2377 case 2: selectcmd = shared.select2; break;
2378 case 3: selectcmd = shared.select3; break;
2379 case 4: selectcmd = shared.select4; break;
2380 case 5: selectcmd = shared.select5; break;
2381 case 6: selectcmd = shared.select6; break;
2382 case 7: selectcmd = shared.select7; break;
2383 case 8: selectcmd = shared.select8; break;
2384 case 9: selectcmd = shared.select9; break;
2385 default:
2386 selectcmd = createObject(REDIS_STRING,
2387 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2388 selectcmd->refcount = 0;
2389 break;
2390 }
2391 addReply(slave,selectcmd);
2392 slave->slaveseldb = dictid;
2393 }
2394 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2395 }
40d224a9 2396 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2397 if (outv != static_outv) zfree(outv);
ed9b544e 2398}
2399
638e42ac 2400static void processInputBuffer(redisClient *c) {
ed9b544e 2401again:
4409877e 2402 /* Before to process the input buffer, make sure the client is not
2403 * waitig for a blocking operation such as BLPOP. Note that the first
2404 * iteration the client is never blocked, otherwise the processInputBuffer
2405 * would not be called at all, but after the execution of the first commands
2406 * in the input buffer the client may be blocked, and the "goto again"
2407 * will try to reiterate. The following line will make it return asap. */
92f8e882 2408 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2409 if (c->bulklen == -1) {
2410 /* Read the first line of the query */
2411 char *p = strchr(c->querybuf,'\n');
2412 size_t querylen;
644fafa3 2413
ed9b544e 2414 if (p) {
2415 sds query, *argv;
2416 int argc, j;
e0a62c7f 2417
ed9b544e 2418 query = c->querybuf;
2419 c->querybuf = sdsempty();
2420 querylen = 1+(p-(query));
2421 if (sdslen(query) > querylen) {
2422 /* leave data after the first line of the query in the buffer */
2423 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2424 }
2425 *p = '\0'; /* remove "\n" */
2426 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2427 sdsupdatelen(query);
2428
2429 /* Now we can split the query in arguments */
ed9b544e 2430 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2431 sdsfree(query);
2432
2433 if (c->argv) zfree(c->argv);
2434 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2435
2436 for (j = 0; j < argc; j++) {
ed9b544e 2437 if (sdslen(argv[j])) {
2438 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2439 c->argc++;
2440 } else {
2441 sdsfree(argv[j]);
2442 }
2443 }
2444 zfree(argv);
7c49733c 2445 if (c->argc) {
2446 /* Execute the command. If the client is still valid
2447 * after processCommand() return and there is something
2448 * on the query buffer try to process the next command. */
2449 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2450 } else {
2451 /* Nothing to process, argc == 0. Just process the query
2452 * buffer if it's not empty or return to the caller */
2453 if (sdslen(c->querybuf)) goto again;
2454 }
ed9b544e 2455 return;
644fafa3 2456 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2457 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2458 freeClient(c);
2459 return;
2460 }
2461 } else {
2462 /* Bulk read handling. Note that if we are at this point
2463 the client already sent a command terminated with a newline,
2464 we are reading the bulk data that is actually the last
2465 argument of the command. */
2466 int qbl = sdslen(c->querybuf);
2467
2468 if (c->bulklen <= qbl) {
2469 /* Copy everything but the final CRLF as final argument */
2470 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2471 c->argc++;
2472 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2473 /* Process the command. If the client is still valid after
2474 * the processing and there is more data in the buffer
2475 * try to parse it. */
2476 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2477 return;
2478 }
2479 }
2480}
2481
638e42ac 2482static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2483 redisClient *c = (redisClient*) privdata;
2484 char buf[REDIS_IOBUF_LEN];
2485 int nread;
2486 REDIS_NOTUSED(el);
2487 REDIS_NOTUSED(mask);
2488
2489 nread = read(fd, buf, REDIS_IOBUF_LEN);
2490 if (nread == -1) {
2491 if (errno == EAGAIN) {
2492 nread = 0;
2493 } else {
f870935d 2494 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2495 freeClient(c);
2496 return;
2497 }
2498 } else if (nread == 0) {
f870935d 2499 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2500 freeClient(c);
2501 return;
2502 }
2503 if (nread) {
2504 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2505 c->lastinteraction = time(NULL);
2506 } else {
2507 return;
2508 }
168ac5c6 2509 processInputBuffer(c);
638e42ac 2510}
2511
ed9b544e 2512static int selectDb(redisClient *c, int id) {
2513 if (id < 0 || id >= server.dbnum)
2514 return REDIS_ERR;
3305306f 2515 c->db = &server.db[id];
ed9b544e 2516 return REDIS_OK;
2517}
2518
40d224a9 2519static void *dupClientReplyValue(void *o) {
2520 incrRefCount((robj*)o);
12d090d2 2521 return o;
40d224a9 2522}
2523
/* List match method: returns 1 when the two string objects compare equal,
 * 0 otherwise. */
static int listMatchObjects(void *a, void *b) {
    int cmp = compareStringObjects(a,b);

    return cmp == 0;
}
2527
ed9b544e 2528static redisClient *createClient(int fd) {
2529 redisClient *c = zmalloc(sizeof(*c));
2530
2531 anetNonBlock(NULL,fd);
2532 anetTcpNoDelay(NULL,fd);
2533 if (!c) return NULL;
2534 selectDb(c,0);
2535 c->fd = fd;
2536 c->querybuf = sdsempty();
2537 c->argc = 0;
93ea3759 2538 c->argv = NULL;
ed9b544e 2539 c->bulklen = -1;
e8a74421 2540 c->multibulk = 0;
2541 c->mbargc = 0;
2542 c->mbargv = NULL;
ed9b544e 2543 c->sentlen = 0;
2544 c->flags = 0;
2545 c->lastinteraction = time(NULL);
abcb223e 2546 c->authenticated = 0;
40d224a9 2547 c->replstate = REDIS_REPL_NONE;
6b47e12e 2548 c->reply = listCreate();
ed9b544e 2549 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2550 listSetDupMethod(c->reply,dupClientReplyValue);
92f8e882 2551 c->blockingkeys = NULL;
2552 c->blockingkeysnum = 0;
2553 c->io_keys = listCreate();
2554 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2555 c->pubsub_channels = dictCreate(&setDictType,NULL);
2556 c->pubsub_patterns = listCreate();
2557 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2558 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2559 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2560 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2561 freeClient(c);
2562 return NULL;
2563 }
6b47e12e 2564 listAddNodeTail(server.clients,c);
6e469882 2565 initClientMultiState(c);
ed9b544e 2566 return c;
2567}
2568
2569static void addReply(redisClient *c, robj *obj) {
2570 if (listLength(c->reply) == 0 &&
6208b3a7 2571 (c->replstate == REDIS_REPL_NONE ||
2572 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2573 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2574 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2575
2576 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2577 obj = dupStringObject(obj);
2578 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2579 }
9d65a1bb 2580 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2581}
2582
2583static void addReplySds(redisClient *c, sds s) {
2584 robj *o = createObject(REDIS_STRING,s);
2585 addReply(c,o);
2586 decrRefCount(o);
2587}
2588
e2665397 2589static void addReplyDouble(redisClient *c, double d) {
2590 char buf[128];
2591
2592 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2593 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2594 (unsigned long) strlen(buf),buf));
e2665397 2595}
2596
f44dd428 2597static void addReplyLong(redisClient *c, long l) {
2598 char buf[128];
2599 size_t len;
2600
dd88747b 2601 if (l == 0) {
2602 addReply(c,shared.czero);
2603 return;
2604 } else if (l == 1) {
2605 addReply(c,shared.cone);
2606 return;
2607 }
f44dd428 2608 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2609 addReplySds(c,sdsnewlen(buf,len));
2610}
2611
aa7c2934
PN
2612static void addReplyLongLong(redisClient *c, long long ll) {
2613 char buf[128];
2614 size_t len;
2615
2616 if (ll == 0) {
2617 addReply(c,shared.czero);
2618 return;
2619 } else if (ll == 1) {
2620 addReply(c,shared.cone);
2621 return;
2622 }
2623 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2624 addReplySds(c,sdsnewlen(buf,len));
2625}
2626
92b27fe9 2627static void addReplyUlong(redisClient *c, unsigned long ul) {
2628 char buf[128];
2629 size_t len;
2630
dd88747b 2631 if (ul == 0) {
2632 addReply(c,shared.czero);
2633 return;
2634 } else if (ul == 1) {
2635 addReply(c,shared.cone);
2636 return;
2637 }
92b27fe9 2638 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2639 addReplySds(c,sdsnewlen(buf,len));
2640}
2641
942a3961 2642static void addReplyBulkLen(redisClient *c, robj *obj) {
2643 size_t len;
2644
2645 if (obj->encoding == REDIS_ENCODING_RAW) {
2646 len = sdslen(obj->ptr);
2647 } else {
2648 long n = (long)obj->ptr;
2649
e054afda 2650 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2651 len = 1;
2652 if (n < 0) {
2653 len++;
2654 n = -n;
2655 }
2656 while((n = n/10) != 0) {
2657 len++;
2658 }
2659 }
83c6a618 2660 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
942a3961 2661}
2662
dd88747b 2663static void addReplyBulk(redisClient *c, robj *obj) {
2664 addReplyBulkLen(c,obj);
2665 addReply(c,obj);
2666 addReply(c,shared.crlf);
2667}
2668
500ece7c 2669/* In the CONFIG command we need to add vanilla C string as bulk replies */
2670static void addReplyBulkCString(redisClient *c, char *s) {
2671 if (s == NULL) {
2672 addReply(c,shared.nullbulk);
2673 } else {
2674 robj *o = createStringObject(s,strlen(s));
2675 addReplyBulk(c,o);
2676 decrRefCount(o);
2677 }
2678}
2679
ed9b544e 2680static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2681 int cport, cfd;
2682 char cip[128];
285add55 2683 redisClient *c;
ed9b544e 2684 REDIS_NOTUSED(el);
2685 REDIS_NOTUSED(mask);
2686 REDIS_NOTUSED(privdata);
2687
2688 cfd = anetAccept(server.neterr, fd, cip, &cport);
2689 if (cfd == AE_ERR) {
f870935d 2690 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2691 return;
2692 }
f870935d 2693 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2694 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2695 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2696 close(cfd); /* May be already closed, just ingore errors */
2697 return;
2698 }
285add55 2699 /* If maxclient directive is set and this is one client more... close the
2700 * connection. Note that we create the client instead to check before
2701 * for this condition, since now the socket is already set in nonblocking
2702 * mode and we can send an error for free using the Kernel I/O */
2703 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2704 char *err = "-ERR max number of clients reached\r\n";
2705
2706 /* That's a best effort error message, don't check write errors */
fee803ba 2707 if (write(c->fd,err,strlen(err)) == -1) {
2708 /* Nothing to do, Just to avoid the warning... */
2709 }
285add55 2710 freeClient(c);
2711 return;
2712 }
ed9b544e 2713 server.stat_numconnections++;
2714}
2715
2716/* ======================= Redis objects implementation ===================== */
2717
2718static robj *createObject(int type, void *ptr) {
2719 robj *o;
2720
a5819310 2721 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2722 if (listLength(server.objfreelist)) {
2723 listNode *head = listFirst(server.objfreelist);
2724 o = listNodeValue(head);
2725 listDelNode(server.objfreelist,head);
a5819310 2726 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2727 } else {
75680a3c 2728 if (server.vm_enabled) {
a5819310 2729 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2730 o = zmalloc(sizeof(*o));
2731 } else {
2732 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2733 }
ed9b544e 2734 }
ed9b544e 2735 o->type = type;
942a3961 2736 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2737 o->ptr = ptr;
2738 o->refcount = 1;
3a66edc7 2739 if (server.vm_enabled) {
1064ef87 2740 /* Note that this code may run in the context of an I/O thread
2741 * and accessing to server.unixtime in theory is an error
2742 * (no locks). But in practice this is safe, and even if we read
2743 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2744 o->vm.atime = server.unixtime;
2745 o->storage = REDIS_VM_MEMORY;
2746 }
ed9b544e 2747 return o;
2748}
2749
2750static robj *createStringObject(char *ptr, size_t len) {
2751 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2752}
2753
4ef8de8a 2754static robj *dupStringObject(robj *o) {
b9bc0eef 2755 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2756 return createStringObject(o->ptr,sdslen(o->ptr));
2757}
2758
ed9b544e 2759static robj *createListObject(void) {
2760 list *l = listCreate();
2761
ed9b544e 2762 listSetFreeMethod(l,decrRefCount);
2763 return createObject(REDIS_LIST,l);
2764}
2765
2766static robj *createSetObject(void) {
2767 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2768 return createObject(REDIS_SET,d);
2769}
2770
5234952b 2771static robj *createHashObject(void) {
2772 /* All the Hashes start as zipmaps. Will be automatically converted
2773 * into hash tables if there are enough elements or big elements
2774 * inside. */
2775 unsigned char *zm = zipmapNew();
2776 robj *o = createObject(REDIS_HASH,zm);
2777 o->encoding = REDIS_ENCODING_ZIPMAP;
2778 return o;
2779}
2780
1812e024 2781static robj *createZsetObject(void) {
6b47e12e 2782 zset *zs = zmalloc(sizeof(*zs));
2783
2784 zs->dict = dictCreate(&zsetDictType,NULL);
2785 zs->zsl = zslCreate();
2786 return createObject(REDIS_ZSET,zs);
1812e024 2787}
2788
ed9b544e 2789static void freeStringObject(robj *o) {
942a3961 2790 if (o->encoding == REDIS_ENCODING_RAW) {
2791 sdsfree(o->ptr);
2792 }
ed9b544e 2793}
2794
2795static void freeListObject(robj *o) {
2796 listRelease((list*) o->ptr);
2797}
2798
2799static void freeSetObject(robj *o) {
2800 dictRelease((dict*) o->ptr);
2801}
2802
fd8ccf44 2803static void freeZsetObject(robj *o) {
2804 zset *zs = o->ptr;
2805
2806 dictRelease(zs->dict);
2807 zslFree(zs->zsl);
2808 zfree(zs);
2809}
2810
ed9b544e 2811static void freeHashObject(robj *o) {
cbba7dd7 2812 switch (o->encoding) {
2813 case REDIS_ENCODING_HT:
2814 dictRelease((dict*) o->ptr);
2815 break;
2816 case REDIS_ENCODING_ZIPMAP:
2817 zfree(o->ptr);
2818 break;
2819 default:
2820 redisAssert(0);
2821 break;
2822 }
ed9b544e 2823}
2824
2825static void incrRefCount(robj *o) {
2826 o->refcount++;
2827}
2828
2829static void decrRefCount(void *obj) {
2830 robj *o = obj;
94754ccc 2831
970e10bb 2832 /* Object is a key of a swapped out value, or in the process of being
2833 * loaded. */
996cb5f7 2834 if (server.vm_enabled &&
2835 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2836 {
996cb5f7 2837 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 2838 redisAssert(o->type == REDIS_STRING);
a35ddf12 2839 freeStringObject(o);
2840 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 2841 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 2842 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2843 !listAddNodeHead(server.objfreelist,o))
2844 zfree(o);
a5819310 2845 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 2846 server.vm_stats_swapped_objects--;
a35ddf12 2847 return;
2848 }
996cb5f7 2849 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 2850 if (--(o->refcount) == 0) {
996cb5f7 2851 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2852 vmCancelThreadedIOJob(obj);
ed9b544e 2853 switch(o->type) {
2854 case REDIS_STRING: freeStringObject(o); break;
2855 case REDIS_LIST: freeListObject(o); break;
2856 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 2857 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 2858 case REDIS_HASH: freeHashObject(o); break;
78409a0f 2859 default: redisAssert(0); break;
ed9b544e 2860 }
a5819310 2861 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2862 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2863 !listAddNodeHead(server.objfreelist,o))
2864 zfree(o);
a5819310 2865 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2866 }
2867}
2868
942a3961 2869static robj *lookupKey(redisDb *db, robj *key) {
2870 dictEntry *de = dictFind(db->dict,key);
3a66edc7 2871 if (de) {
55cf8433 2872 robj *key = dictGetEntryKey(de);
2873 robj *val = dictGetEntryVal(de);
3a66edc7 2874
55cf8433 2875 if (server.vm_enabled) {
996cb5f7 2876 if (key->storage == REDIS_VM_MEMORY ||
2877 key->storage == REDIS_VM_SWAPPING)
2878 {
2879 /* If we were swapping the object out, stop it, this key
2880 * was requested. */
2881 if (key->storage == REDIS_VM_SWAPPING)
2882 vmCancelThreadedIOJob(key);
55cf8433 2883 /* Update the access time of the key for the aging algorithm. */
2884 key->vm.atime = server.unixtime;
2885 } else {
d5d55fc3 2886 int notify = (key->storage == REDIS_VM_LOADING);
2887
55cf8433 2888 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 2889 redisAssert(val == NULL);
55cf8433 2890 val = vmLoadObject(key);
2891 dictGetEntryVal(de) = val;
d5d55fc3 2892
2893 /* Clients blocked by the VM subsystem may be waiting for
2894 * this key... */
2895 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 2896 }
2897 }
2898 return val;
3a66edc7 2899 } else {
2900 return NULL;
2901 }
942a3961 2902}
2903
2904static robj *lookupKeyRead(redisDb *db, robj *key) {
2905 expireIfNeeded(db,key);
2906 return lookupKey(db,key);
2907}
2908
2909static robj *lookupKeyWrite(redisDb *db, robj *key) {
2910 deleteIfVolatile(db,key);
2911 return lookupKey(db,key);
2912}
2913
92b27fe9 2914static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2915 robj *o = lookupKeyRead(c->db, key);
2916 if (!o) addReply(c,reply);
2917 return o;
2918}
2919
2920static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2921 robj *o = lookupKeyWrite(c->db, key);
2922 if (!o) addReply(c,reply);
2923 return o;
2924}
2925
2926static int checkType(redisClient *c, robj *o, int type) {
2927 if (o->type != type) {
2928 addReply(c,shared.wrongtypeerr);
2929 return 1;
2930 }
2931 return 0;
2932}
2933
942a3961 2934static int deleteKey(redisDb *db, robj *key) {
2935 int retval;
2936
2937 /* We need to protect key from destruction: after the first dictDelete()
2938 * it may happen that 'key' is no longer valid if we don't increment
2939 * it's count. This may happen when we get the object reference directly
2940 * from the hash table with dictRandomKey() or dict iterators */
2941 incrRefCount(key);
2942 if (dictSize(db->expires)) dictDelete(db->expires,key);
2943 retval = dictDelete(db->dict,key);
2944 decrRefCount(key);
2945
2946 return retval == DICT_OK;
2947}
2948
724a51b1 2949/* Check if the nul-terminated string 's' can be represented by a long
2950 * (that is, is a number that fits into long without any other space or
2951 * character before or after the digits).
2952 *
2953 * If so, the function returns REDIS_OK and *longval is set to the value
2954 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 2955static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 2956 char buf[32], *endptr;
2957 long value;
2958 int slen;
e0a62c7f 2959
724a51b1 2960 value = strtol(s, &endptr, 10);
2961 if (endptr[0] != '\0') return REDIS_ERR;
2962 slen = snprintf(buf,32,"%ld",value);
2963
2964 /* If the number converted back into a string is not identical
2965 * then it's not possible to encode the string as integer */
f69f2cba 2966 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 2967 if (longval) *longval = value;
2968 return REDIS_OK;
2969}
2970
942a3961 2971/* Try to encode a string object in order to save space */
05df7621 2972static robj *tryObjectEncoding(robj *o) {
942a3961 2973 long value;
942a3961 2974 sds s = o->ptr;
3305306f 2975
942a3961 2976 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 2977 return o; /* Already encoded */
3305306f 2978
05df7621 2979 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 2980 * everywhere in the "object space" of Redis. Encoded objects can only
2981 * appear as "values" (and not, for instance, as keys) */
05df7621 2982 if (o->refcount > 1) return o;
3305306f 2983
942a3961 2984 /* Currently we try to encode only strings */
dfc5e96c 2985 redisAssert(o->type == REDIS_STRING);
94754ccc 2986
724a51b1 2987 /* Check if we can represent this string as a long integer */
05df7621 2988 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 2989
2990 /* Ok, this object can be encoded */
05df7621 2991 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2992 decrRefCount(o);
2993 incrRefCount(shared.integers[value]);
2994 return shared.integers[value];
2995 } else {
2996 o->encoding = REDIS_ENCODING_INT;
2997 sdsfree(o->ptr);
2998 o->ptr = (void*) value;
2999 return o;
3000 }
942a3961 3001}
3002
9d65a1bb 3003/* Get a decoded version of an encoded object (returned as a new object).
3004 * If the object is already raw-encoded just increment the ref count. */
3005static robj *getDecodedObject(robj *o) {
942a3961 3006 robj *dec;
e0a62c7f 3007
9d65a1bb 3008 if (o->encoding == REDIS_ENCODING_RAW) {
3009 incrRefCount(o);
3010 return o;
3011 }
942a3961 3012 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3013 char buf[32];
3014
3015 snprintf(buf,32,"%ld",(long)o->ptr);
3016 dec = createStringObject(buf,strlen(buf));
3017 return dec;
3018 } else {
dfc5e96c 3019 redisAssert(1 != 1);
942a3961 3020 }
3305306f 3021}
3022
d7f43c08 3023/* Compare two string objects via strcmp() or alike.
3024 * Note that the objects may be integer-encoded. In such a case we
3025 * use snprintf() to get a string representation of the numbers on the stack
1fd9bc8a 3026 * and compare the strings, it's much faster than calling getDecodedObject().
3027 *
3028 * Important note: if objects are not integer encoded, but binary-safe strings,
3029 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3030 * binary safe. */
724a51b1 3031static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3032 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3033 char bufa[128], bufb[128], *astr, *bstr;
3034 int bothsds = 1;
724a51b1 3035
e197b441 3036 if (a == b) return 0;
d7f43c08 3037 if (a->encoding != REDIS_ENCODING_RAW) {
3038 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3039 astr = bufa;
3040 bothsds = 0;
724a51b1 3041 } else {
d7f43c08 3042 astr = a->ptr;
724a51b1 3043 }
d7f43c08 3044 if (b->encoding != REDIS_ENCODING_RAW) {
3045 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3046 bstr = bufb;
3047 bothsds = 0;
3048 } else {
3049 bstr = b->ptr;
3050 }
3051 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3052}
3053
0ea663ea 3054static size_t stringObjectLen(robj *o) {
dfc5e96c 3055 redisAssert(o->type == REDIS_STRING);
0ea663ea 3056 if (o->encoding == REDIS_ENCODING_RAW) {
3057 return sdslen(o->ptr);
3058 } else {
3059 char buf[32];
3060
3061 return snprintf(buf,32,"%ld",(long)o->ptr);
3062 }
3063}
3064
bbe025e0
AM
3065static int getDoubleFromObject(redisClient *c, robj *o, double *value) {
3066 double parsedValue;
3067 char *eptr = NULL;
3068
3069 if (o && o->type != REDIS_STRING) {
3070 addReplySds(c,sdsnew("-ERR value is not a double\r\n"));
3071 return REDIS_ERR;
3072 }
3073
3074 if (o == NULL)
3075 parsedValue = 0;
3076 else if (o->encoding == REDIS_ENCODING_RAW)
3077 parsedValue = strtod(o->ptr, &eptr);
3078 else if (o->encoding == REDIS_ENCODING_INT)
3079 parsedValue = (long)o->ptr;
3080 else
3081 redisAssert(1 != 1);
3082
3083 if (eptr != NULL && *eptr != '\0') {
3084 addReplySds(c,sdsnew("-ERR value is not a double\r\n"));
3085 return REDIS_ERR;
3086 }
3087
3088 *value = parsedValue;
3089
3090 return REDIS_OK;
3091}
3092
3093static int getLongLongFromObject(redisClient *c, robj *o, long long *value) {
3094 long long parsedValue;
3095 char *eptr = NULL;
3096
3097 if (o && o->type != REDIS_STRING) {
3098 addReplySds(c,sdsnew("-ERR value is not an integer\r\n"));
3099 return REDIS_ERR;
3100 }
3101
3102 if (o == NULL)
3103 parsedValue = 0;
3104 else if (o->encoding == REDIS_ENCODING_RAW)
3105 parsedValue = strtoll(o->ptr, &eptr, 10);
3106 else if (o->encoding == REDIS_ENCODING_INT)
3107 parsedValue = (long)o->ptr;
3108 else
3109 redisAssert(1 != 1);
3110
3111 if (eptr != NULL && *eptr != '\0') {
3112 addReplySds(c,sdsnew("-ERR value is not an integer\r\n"));
3113 return REDIS_ERR;
3114 }
3115
3116 *value = parsedValue;
3117
3118 return REDIS_OK;
3119}
3120
3121static int getLongFromObject(redisClient *c, robj *o, long *value) {
3122 long long actualValue;
3123
3124 if (getLongLongFromObject(c, o, &actualValue) != REDIS_OK) return REDIS_ERR;
3125
3126 if (actualValue < LONG_MIN || actualValue > LONG_MAX) {
3127 addReplySds(c,sdsnew("-ERR value is out of range\r\n"));
3128 return REDIS_ERR;
3129 }
3130
3131 *value = actualValue;
3132
3133 return REDIS_OK;
3134}
3135
06233c45 3136/*============================ RDB saving/loading =========================== */
ed9b544e 3137
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 on error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3142
/* Write 't' to 'fp' as a 32 bit signed integer, in the byte order of the
 * host. Returns 0 on success, -1 on error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t secs = (int32_t) t;

    return (fwrite(&secs,4,1,fp) == 0) ? -1 : 0;
}
3148
e3566d4b 3149/* check rdbLoadLen() comments for more info */
f78fd11b 3150static int rdbSaveLen(FILE *fp, uint32_t len) {
3151 unsigned char buf[2];
3152
3153 if (len < (1<<6)) {
3154 /* Save a 6 bit len */
10c43610 3155 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3156 if (fwrite(buf,1,1,fp) == 0) return -1;
3157 } else if (len < (1<<14)) {
3158 /* Save a 14 bit len */
10c43610 3159 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3160 buf[1] = len&0xFF;
17be1a4a 3161 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3162 } else {
3163 /* Save a 32 bit len */
10c43610 3164 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3165 if (fwrite(buf,1,1,fp) == 0) return -1;
3166 len = htonl(len);
3167 if (fwrite(&len,4,1,fp) == 0) return -1;
3168 }
3169 return 0;
3170}
3171
e3566d4b 3172/* String objects in the form "2391" "-100" without any space and with a
3173 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3174 * encoded as integers to save space */
b1befe6a 3175static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
e3566d4b 3176 long long value;
3177 char *endptr, buf[32];
3178
3179 /* Check if it's possible to encode this value as a number */
3180 value = strtoll(s, &endptr, 10);
3181 if (endptr[0] != '\0') return 0;
3182 snprintf(buf,32,"%lld",value);
3183
3184 /* If the number converted back into a string is not identical
3185 * then it's not possible to encode the string as integer */
b1befe6a 3186 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
e3566d4b 3187
3188 /* Finally check if it fits in our ranges */
3189 if (value >= -(1<<7) && value <= (1<<7)-1) {
3190 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3191 enc[1] = value&0xFF;
3192 return 2;
3193 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3194 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3195 enc[1] = value&0xFF;
3196 enc[2] = (value>>8)&0xFF;
3197 return 3;
3198 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3199 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3200 enc[1] = value&0xFF;
3201 enc[2] = (value>>8)&0xFF;
3202 enc[3] = (value>>16)&0xFF;
3203 enc[4] = (value>>24)&0xFF;
3204 return 5;
3205 } else {
3206 return 0;
3207 }
3208}
3209
b1befe6a 3210static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3211 size_t comprlen, outlen;
774e3047 3212 unsigned char byte;
3213 void *out;
3214
3215 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3216 if (len <= 4) return 0;
3217 outlen = len-4;
3a2694c4 3218 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3219 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3220 if (comprlen == 0) {
88e85998 3221 zfree(out);
774e3047 3222 return 0;
3223 }
3224 /* Data compressed! Let's save it on disk */
3225 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3226 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3227 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3228 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3229 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3230 zfree(out);
774e3047 3231 return comprlen;
3232
3233writeerr:
88e85998 3234 zfree(out);
774e3047 3235 return -1;
3236}
3237
e3566d4b 3238/* Save a string objet as [len][data] on disk. If the object is a string
3239 * representation of an integer value we try to safe it in a special form */
b1befe6a 3240static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3241 int enclen;
10c43610 3242
774e3047 3243 /* Try integer encoding */
e3566d4b 3244 if (len <= 11) {
3245 unsigned char buf[5];
b1befe6a 3246 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3247 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3248 return 0;
3249 }
3250 }
774e3047 3251
3252 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3253 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3254 if (server.rdbcompression && len > 20) {
774e3047 3255 int retval;
3256
b1befe6a 3257 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3258 if (retval == -1) return -1;
3259 if (retval > 0) return 0;
3260 /* retval == 0 means data can't be compressed, save the old way */
3261 }
3262
3263 /* Store verbatim */
10c43610 3264 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3265 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3266 return 0;
3267}
3268
942a3961 3269/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3270static int rdbSaveStringObject(FILE *fp, robj *obj) {
3271 int retval;
942a3961 3272
f2d9f50f 3273 /* Avoid incr/decr ref count business when possible.
3274 * This plays well with copy-on-write given that we are probably
3275 * in a child process (BGSAVE). Also this makes sure key objects
3276 * of swapped objects are not incRefCount-ed (an assert does not allow
3277 * this in order to avoid bugs) */
3278 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3279 obj = getDecodedObject(obj);
b1befe6a 3280 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3281 decrRefCount(obj);
3282 } else {
b1befe6a 3283 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3284 }
9d65a1bb 3285 return retval;
942a3961 3286}
3287
/* Write a double to 'fp'. The value is stored as its "%.17g" textual form
 * prefixed by one unsigned byte holding the length of that text. Three
 * prefix values are reserved and are written alone, with no text:
 *   253: not a number
 *   254: + inf
 *   255: - inf
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    int len = 1; /* special values take just the prefix byte */

    if (isnan(val)) {
        buf[0] = 253;
    } else if (!isfinite(val)) {
        buf[0] = (val < 0) ? 255 : 254;
    } else {
        char *repr = (char*)buf+1;

        snprintf(repr,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(repr);
        len = buf[0]+1;
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
3314
06233c45 3315/* Save a Redis object. */
3316static int rdbSaveObject(FILE *fp, robj *o) {
3317 if (o->type == REDIS_STRING) {
3318 /* Save a string value */
3319 if (rdbSaveStringObject(fp,o) == -1) return -1;
3320 } else if (o->type == REDIS_LIST) {
3321 /* Save a list value */
3322 list *list = o->ptr;
c7df85a4 3323 listIter li;
06233c45 3324 listNode *ln;
3325
06233c45 3326 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3327 listRewind(list,&li);
3328 while((ln = listNext(&li))) {
06233c45 3329 robj *eleobj = listNodeValue(ln);
3330
3331 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3332 }
3333 } else if (o->type == REDIS_SET) {
3334 /* Save a set value */
3335 dict *set = o->ptr;
3336 dictIterator *di = dictGetIterator(set);
3337 dictEntry *de;
3338
3339 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3340 while((de = dictNext(di)) != NULL) {
3341 robj *eleobj = dictGetEntryKey(de);
3342
3343 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3344 }
3345 dictReleaseIterator(di);
3346 } else if (o->type == REDIS_ZSET) {
3347 /* Save a set value */
3348 zset *zs = o->ptr;
3349 dictIterator *di = dictGetIterator(zs->dict);
3350 dictEntry *de;
3351
3352 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3353 while((de = dictNext(di)) != NULL) {
3354 robj *eleobj = dictGetEntryKey(de);
3355 double *score = dictGetEntryVal(de);
3356
3357 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3358 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3359 }
3360 dictReleaseIterator(di);
b1befe6a 3361 } else if (o->type == REDIS_HASH) {
3362 /* Save a hash value */
3363 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3364 unsigned char *p = zipmapRewind(o->ptr);
3365 unsigned int count = zipmapLen(o->ptr);
3366 unsigned char *key, *val;
3367 unsigned int klen, vlen;
3368
3369 if (rdbSaveLen(fp,count) == -1) return -1;
3370 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3371 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3372 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3373 }
3374 } else {
3375 dictIterator *di = dictGetIterator(o->ptr);
3376 dictEntry *de;
3377
3378 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3379 while((de = dictNext(di)) != NULL) {
3380 robj *key = dictGetEntryKey(de);
3381 robj *val = dictGetEntryVal(de);
3382
3383 if (rdbSaveStringObject(fp,key) == -1) return -1;
3384 if (rdbSaveStringObject(fp,val) == -1) return -1;
3385 }
3386 dictReleaseIterator(di);
3387 }
06233c45 3388 } else {
78409a0f 3389 redisAssert(0);
06233c45 3390 }
3391 return 0;
3392}
3393
3394/* Return the length the object will have on disk if saved with
3395 * the rdbSaveObject() function. Currently we use a trick to get
3396 * this length with very little changes to the code. In the future
3397 * we could switch to a faster solution. */
b9bc0eef 3398static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3399 if (fp == NULL) fp = server.devnull;
06233c45 3400 rewind(fp);
3401 assert(rdbSaveObject(fp,o) != 1);
3402 return ftello(fp);
3403}
3404
06224fec 3405/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3406static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3407 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3408
06224fec 3409 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3410}
3411
ed9b544e 3412/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3413static int rdbSave(char *filename) {
ed9b544e 3414 dictIterator *di = NULL;
3415 dictEntry *de;
ed9b544e 3416 FILE *fp;
3417 char tmpfile[256];
3418 int j;
bb32ede5 3419 time_t now = time(NULL);
ed9b544e 3420
2316bb3b 3421 /* Wait for I/O therads to terminate, just in case this is a
3422 * foreground-saving, to avoid seeking the swap file descriptor at the
3423 * same time. */
3424 if (server.vm_enabled)
3425 waitEmptyIOJobsQueue();
3426
a3b21203 3427 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3428 fp = fopen(tmpfile,"w");
3429 if (!fp) {
3430 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3431 return REDIS_ERR;
3432 }
f78fd11b 3433 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3434 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3435 redisDb *db = server.db+j;
3436 dict *d = db->dict;
3305306f 3437 if (dictSize(d) == 0) continue;
ed9b544e 3438 di = dictGetIterator(d);
3439 if (!di) {
3440 fclose(fp);
3441 return REDIS_ERR;
3442 }
3443
3444 /* Write the SELECT DB opcode */
f78fd11b 3445 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3446 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3447
3448 /* Iterate this DB writing every entry */
3449 while((de = dictNext(di)) != NULL) {
3450 robj *key = dictGetEntryKey(de);
3451 robj *o = dictGetEntryVal(de);
bb32ede5 3452 time_t expiretime = getExpire(db,key);
3453
3454 /* Save the expire time */
3455 if (expiretime != -1) {
3456 /* If this key is already expired skip it */
3457 if (expiretime < now) continue;
3458 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3459 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3460 }
7e69548d 3461 /* Save the key and associated value. This requires special
3462 * handling if the value is swapped out. */
996cb5f7 3463 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3464 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3465 /* Save type, key, value */
3466 if (rdbSaveType(fp,o->type) == -1) goto werr;
3467 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3468 if (rdbSaveObject(fp,o) == -1) goto werr;
3469 } else {
996cb5f7 3470 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3471 robj *po;
7e69548d 3472 /* Get a preview of the object in memory */
3473 po = vmPreviewObject(key);
7e69548d 3474 /* Save type, key, value */
3475 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3476 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3477 if (rdbSaveObject(fp,po) == -1) goto werr;
3478 /* Remove the loaded object from memory */
3479 decrRefCount(po);
7e69548d 3480 }
ed9b544e 3481 }
3482 dictReleaseIterator(di);
3483 }
3484 /* EOF opcode */
f78fd11b 3485 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3486
3487 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3488 fflush(fp);
3489 fsync(fileno(fp));
3490 fclose(fp);
e0a62c7f 3491
ed9b544e 3492 /* Use RENAME to make sure the DB file is changed atomically only
3493 * if the generate DB file is ok. */
3494 if (rename(tmpfile,filename) == -1) {
325d1eb4 3495 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3496 unlink(tmpfile);
3497 return REDIS_ERR;
3498 }
3499 redisLog(REDIS_NOTICE,"DB saved on disk");
3500 server.dirty = 0;
3501 server.lastsave = time(NULL);
3502 return REDIS_OK;
3503
3504werr:
3505 fclose(fp);
3506 unlink(tmpfile);
3507 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3508 if (di) dictReleaseIterator(di);
3509 return REDIS_ERR;
3510}
3511
f78fd11b 3512static int rdbSaveBackground(char *filename) {
ed9b544e 3513 pid_t childpid;
3514
9d65a1bb 3515 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3516 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3517 if ((childpid = fork()) == 0) {
3518 /* Child */
054e426d 3519 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3520 close(server.fd);
f78fd11b 3521 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3522 _exit(0);
ed9b544e 3523 } else {
478c2c6f 3524 _exit(1);
ed9b544e 3525 }
3526 } else {
3527 /* Parent */
5a7c647e 3528 if (childpid == -1) {
3529 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3530 strerror(errno));
3531 return REDIS_ERR;
3532 }
ed9b544e 3533 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3534 server.bgsavechildpid = childpid;
884d4b39 3535 updateDictResizePolicy();
ed9b544e 3536 return REDIS_OK;
3537 }
3538 return REDIS_OK; /* unreached */
3539}
3540
/* Remove the "temp-<pid>.rdb" file associated with the given child pid. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3547
/* Read a one byte type from 'fp'. Returns the byte value, or -1 if
 * nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3553
/* Read a time stored as a 32 bit signed integer from 'fp'.
 * Returns -1 if the four bytes could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t raw;

    if (fread(&raw,sizeof(raw),1,fp) == 0) return -1;
    return (time_t) raw;
}
3559
e3566d4b 3560/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3561 * of this file for a description of how this are stored on disk.
3562 *
3563 * isencoded is set to 1 if the readed length is not actually a length but
3564 * an "encoding type", check the above comments for more info */
c78a8ccc 3565static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3566 unsigned char buf[2];
3567 uint32_t len;
c78a8ccc 3568 int type;
f78fd11b 3569
e3566d4b 3570 if (isencoded) *isencoded = 0;
c78a8ccc 3571 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3572 type = (buf[0]&0xC0)>>6;
3573 if (type == REDIS_RDB_6BITLEN) {
3574 /* Read a 6 bit len */
3575 return buf[0]&0x3F;
3576 } else if (type == REDIS_RDB_ENCVAL) {
3577 /* Read a 6 bit len encoding type */
3578 if (isencoded) *isencoded = 1;
3579 return buf[0]&0x3F;
3580 } else if (type == REDIS_RDB_14BITLEN) {
3581 /* Read a 14 bit len */
3582 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3583 return ((buf[0]&0x3F)<<8)|buf[1];
3584 } else {
3585 /* Read a 32 bit len */
f78fd11b 3586 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3587 return ntohl(len);
f78fd11b 3588 }
f78fd11b 3589}
3590
e3566d4b 3591static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3592 unsigned char enc[4];
3593 long long val;
3594
3595 if (enctype == REDIS_RDB_ENC_INT8) {
3596 if (fread(enc,1,1,fp) == 0) return NULL;
3597 val = (signed char)enc[0];
3598 } else if (enctype == REDIS_RDB_ENC_INT16) {
3599 uint16_t v;
3600 if (fread(enc,2,1,fp) == 0) return NULL;
3601 v = enc[0]|(enc[1]<<8);
3602 val = (int16_t)v;
3603 } else if (enctype == REDIS_RDB_ENC_INT32) {
3604 uint32_t v;
3605 if (fread(enc,4,1,fp) == 0) return NULL;
3606 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3607 val = (int32_t)v;
3608 } else {
3609 val = 0; /* anti-warning */
78409a0f 3610 redisAssert(0);
e3566d4b 3611 }
3612 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3613}
3614
c78a8ccc 3615static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3616 unsigned int len, clen;
3617 unsigned char *c = NULL;
3618 sds val = NULL;
3619
c78a8ccc 3620 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3621 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3622 if ((c = zmalloc(clen)) == NULL) goto err;
3623 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3624 if (fread(c,clen,1,fp) == 0) goto err;
3625 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3626 zfree(c);
88e85998 3627 return createObject(REDIS_STRING,val);
3628err:
3629 zfree(c);
3630 sdsfree(val);
3631 return NULL;
3632}
3633
c78a8ccc 3634static robj *rdbLoadStringObject(FILE*fp) {
e3566d4b 3635 int isencoded;
3636 uint32_t len;
f78fd11b 3637 sds val;
3638
c78a8ccc 3639 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3640 if (isencoded) {
3641 switch(len) {
3642 case REDIS_RDB_ENC_INT8:
3643 case REDIS_RDB_ENC_INT16:
3644 case REDIS_RDB_ENC_INT32:
bdcb92f2 3645 return rdbLoadIntegerObject(fp,len);
88e85998 3646 case REDIS_RDB_ENC_LZF:
bdcb92f2 3647 return rdbLoadLzfStringObject(fp);
e3566d4b 3648 default:
78409a0f 3649 redisAssert(0);
e3566d4b 3650 }
3651 }
3652
f78fd11b 3653 if (len == REDIS_RDB_LENERR) return NULL;
3654 val = sdsnewlen(NULL,len);
3655 if (len && fread(val,len,1,fp) == 0) {
3656 sdsfree(val);
3657 return NULL;
3658 }
bdcb92f2 3659 return createObject(REDIS_STRING,val);
f78fd11b 3660}
3661
a7866db6 3662/* For information about double serialization check rdbSaveDoubleValue() */
3663static int rdbLoadDoubleValue(FILE *fp, double *val) {
3664 char buf[128];
3665 unsigned char len;
3666
3667 if (fread(&len,1,1,fp) == 0) return -1;
3668 switch(len) {
3669 case 255: *val = R_NegInf; return 0;
3670 case 254: *val = R_PosInf; return 0;
3671 case 253: *val = R_Nan; return 0;
3672 default:
3673 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3674 buf[len] = '\0';
a7866db6 3675 sscanf(buf, "%lg", val);
3676 return 0;
3677 }
3678}
3679
c78a8ccc 3680/* Load a Redis object of the specified type from the specified file.
3681 * On success a newly allocated object is returned, otherwise NULL. */
3682static robj *rdbLoadObject(int type, FILE *fp) {
3683 robj *o;
3684
bcd11906 3685 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 3686 if (type == REDIS_STRING) {
3687 /* Read string value */
3688 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3689 o = tryObjectEncoding(o);
c78a8ccc 3690 } else if (type == REDIS_LIST || type == REDIS_SET) {
3691 /* Read list/set value */
3692 uint32_t listlen;
3693
3694 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3695 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 3696 /* It's faster to expand the dict to the right size asap in order
3697 * to avoid rehashing */
3698 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3699 dictExpand(o->ptr,listlen);
c78a8ccc 3700 /* Load every single element of the list/set */
3701 while(listlen--) {
3702 robj *ele;
3703
3704 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3705 ele = tryObjectEncoding(ele);
c78a8ccc 3706 if (type == REDIS_LIST) {
3707 listAddNodeTail((list*)o->ptr,ele);
3708 } else {
3709 dictAdd((dict*)o->ptr,ele,NULL);
3710 }
3711 }
3712 } else if (type == REDIS_ZSET) {
3713 /* Read list/set value */
ada386b2 3714 size_t zsetlen;
c78a8ccc 3715 zset *zs;
3716
3717 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3718 o = createZsetObject();
3719 zs = o->ptr;
3720 /* Load every single element of the list/set */
3721 while(zsetlen--) {
3722 robj *ele;
3723 double *score = zmalloc(sizeof(double));
3724
3725 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3726 ele = tryObjectEncoding(ele);
c78a8ccc 3727 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3728 dictAdd(zs->dict,ele,score);
3729 zslInsert(zs->zsl,*score,ele);
3730 incrRefCount(ele); /* added to skiplist */
3731 }
ada386b2 3732 } else if (type == REDIS_HASH) {
3733 size_t hashlen;
3734
3735 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3736 o = createHashObject();
3737 /* Too many entries? Use an hash table. */
3738 if (hashlen > server.hash_max_zipmap_entries)
3739 convertToRealHash(o);
3740 /* Load every key/value, then set it into the zipmap or hash
3741 * table, as needed. */
3742 while(hashlen--) {
3743 robj *key, *val;
3744
3745 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3746 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3747 /* If we are using a zipmap and there are too big values
3748 * the object is converted to real hash table encoding. */
3749 if (o->encoding != REDIS_ENCODING_HT &&
3750 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3751 sdslen(val->ptr) > server.hash_max_zipmap_value))
3752 {
3753 convertToRealHash(o);
3754 }
3755
3756 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3757 unsigned char *zm = o->ptr;
3758
3759 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3760 val->ptr,sdslen(val->ptr),NULL);
3761 o->ptr = zm;
3762 decrRefCount(key);
3763 decrRefCount(val);
3764 } else {
05df7621 3765 key = tryObjectEncoding(key);
3766 val = tryObjectEncoding(val);
ada386b2 3767 dictAdd((dict*)o->ptr,key,val);
ada386b2 3768 }
3769 }
c78a8ccc 3770 } else {
78409a0f 3771 redisAssert(0);
c78a8ccc 3772 }
3773 return o;
3774}
3775
f78fd11b 3776static int rdbLoad(char *filename) {
ed9b544e 3777 FILE *fp;
f78fd11b 3778 robj *keyobj = NULL;
3779 uint32_t dbid;
bb32ede5 3780 int type, retval, rdbver;
3305306f 3781 dict *d = server.db[0].dict;
bb32ede5 3782 redisDb *db = server.db+0;
f78fd11b 3783 char buf[1024];
bb32ede5 3784 time_t expiretime = -1, now = time(NULL);
b492cf00 3785 long long loadedkeys = 0;
bb32ede5 3786
ed9b544e 3787 fp = fopen(filename,"r");
3788 if (!fp) return REDIS_ERR;
3789 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 3790 buf[9] = '\0';
3791 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 3792 fclose(fp);
3793 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3794 return REDIS_ERR;
3795 }
f78fd11b 3796 rdbver = atoi(buf+5);
c78a8ccc 3797 if (rdbver != 1) {
f78fd11b 3798 fclose(fp);
3799 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3800 return REDIS_ERR;
3801 }
ed9b544e 3802 while(1) {
3803 robj *o;
3804
3805 /* Read type. */
f78fd11b 3806 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 3807 if (type == REDIS_EXPIRETIME) {
3808 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3809 /* We read the time so we need to read the object type again */
3810 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3811 }
ed9b544e 3812 if (type == REDIS_EOF) break;
3813 /* Handle SELECT DB opcode as a special case */
3814 if (type == REDIS_SELECTDB) {
c78a8ccc 3815 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 3816 goto eoferr;
ed9b544e 3817 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 3818 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 3819 exit(1);
3820 }
bb32ede5 3821 db = server.db+dbid;
3822 d = db->dict;
ed9b544e 3823 continue;
3824 }
3825 /* Read key */
c78a8ccc 3826 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3827 /* Read value */
3828 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
ed9b544e 3829 /* Add the new object in the hash table */
f78fd11b 3830 retval = dictAdd(d,keyobj,o);
ed9b544e 3831 if (retval == DICT_ERR) {
f78fd11b 3832 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
ed9b544e 3833 exit(1);
3834 }
bb32ede5 3835 /* Set the expire time if needed */
3836 if (expiretime != -1) {
3837 setExpire(db,keyobj,expiretime);
3838 /* Delete this key if already expired */
3839 if (expiretime < now) deleteKey(db,keyobj);
3840 expiretime = -1;
3841 }
f78fd11b 3842 keyobj = o = NULL;
b492cf00 3843 /* Handle swapping while loading big datasets when VM is on */
3844 loadedkeys++;
3845 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3846 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 3847 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 3848 }
3849 }
ed9b544e 3850 }
3851 fclose(fp);
3852 return REDIS_OK;
3853
3854eoferr: /* unexpected end of file is handled here with a fatal exit */
e3566d4b 3855 if (keyobj) decrRefCount(keyobj);
f80dff62 3856 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 3857 exit(1);
3858 return REDIS_ERR; /* Just to avoid warning */
3859}
3860
3861/*================================== Commands =============================== */
3862
abcb223e 3863static void authCommand(redisClient *c) {
2e77c2ee 3864 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
3865 c->authenticated = 1;
3866 addReply(c,shared.ok);
3867 } else {
3868 c->authenticated = 0;
fa4c0aba 3869 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
3870 }
3871}
3872
ed9b544e 3873static void pingCommand(redisClient *c) {
3874 addReply(c,shared.pong);
3875}
3876
3877static void echoCommand(redisClient *c) {
dd88747b 3878 addReplyBulk(c,c->argv[1]);
ed9b544e 3879}
3880
3881/*=================================== Strings =============================== */
3882
3883static void setGenericCommand(redisClient *c, int nx) {
3884 int retval;
3885
333fd216 3886 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3305306f 3887 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3888 if (retval == DICT_ERR) {
3889 if (!nx) {
1b03836c 3890 /* If the key is about a swapped value, we want a new key object
3891 * to overwrite the old. So we delete the old key in the database.
3892 * This will also make sure that swap pages about the old object
3893 * will be marked as free. */
ddfaca9d 3894 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
1b03836c 3895 incrRefCount(c->argv[1]);
3305306f 3896 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3897 incrRefCount(c->argv[2]);
3898 } else {
c937aa89 3899 addReply(c,shared.czero);
ed9b544e 3900 return;
3901 }
3902 } else {
3903 incrRefCount(c->argv[1]);
3904 incrRefCount(c->argv[2]);
3905 }
3906 server.dirty++;
3305306f 3907 removeExpire(c->db,c->argv[1]);
c937aa89 3908 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 3909}
3910
3911static void setCommand(redisClient *c) {
a4d1ba9a 3912 setGenericCommand(c,0);
ed9b544e 3913}
3914
3915static void setnxCommand(redisClient *c) {
a4d1ba9a 3916 setGenericCommand(c,1);
ed9b544e 3917}
3918
322fc7d8 3919static int getGenericCommand(redisClient *c) {
dd88747b 3920 robj *o;
e0a62c7f 3921
dd88747b 3922 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 3923 return REDIS_OK;
dd88747b 3924
3925 if (o->type != REDIS_STRING) {
3926 addReply(c,shared.wrongtypeerr);
3927 return REDIS_ERR;
ed9b544e 3928 } else {
dd88747b 3929 addReplyBulk(c,o);
3930 return REDIS_OK;
ed9b544e 3931 }
3932}
3933
322fc7d8 3934static void getCommand(redisClient *c) {
3935 getGenericCommand(c);
3936}
3937
f6b141c5 3938static void getsetCommand(redisClient *c) {
322fc7d8 3939 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 3940 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3941 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3942 } else {
3943 incrRefCount(c->argv[1]);
3944 }
3945 incrRefCount(c->argv[2]);
3946 server.dirty++;
3947 removeExpire(c->db,c->argv[1]);
3948}
3949
70003d28 3950static void mgetCommand(redisClient *c) {
70003d28 3951 int j;
e0a62c7f 3952
c937aa89 3953 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 3954 for (j = 1; j < c->argc; j++) {
3305306f 3955 robj *o = lookupKeyRead(c->db,c->argv[j]);
3956 if (o == NULL) {
c937aa89 3957 addReply(c,shared.nullbulk);
70003d28 3958 } else {
70003d28 3959 if (o->type != REDIS_STRING) {
c937aa89 3960 addReply(c,shared.nullbulk);
70003d28 3961 } else {
dd88747b 3962 addReplyBulk(c,o);
70003d28 3963 }
3964 }
3965 }
3966}
3967
6c446631 3968static void msetGenericCommand(redisClient *c, int nx) {
906573e7 3969 int j, busykeys = 0;
6c446631 3970
3971 if ((c->argc % 2) == 0) {
454d4e43 3972 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 3973 return;
3974 }
3975 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3976 * set nothing at all if at least one already key exists. */
3977 if (nx) {
3978 for (j = 1; j < c->argc; j += 2) {
906573e7 3979 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3980 busykeys++;
6c446631 3981 }
3982 }
3983 }
906573e7 3984 if (busykeys) {
3985 addReply(c, shared.czero);
3986 return;
3987 }
6c446631 3988
3989 for (j = 1; j < c->argc; j += 2) {
3990 int retval;
3991
05df7621 3992 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
6c446631 3993 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3994 if (retval == DICT_ERR) {
3995 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
3996 incrRefCount(c->argv[j+1]);
3997 } else {
3998 incrRefCount(c->argv[j]);
3999 incrRefCount(c->argv[j+1]);
4000 }
4001 removeExpire(c->db,c->argv[j]);
4002 }
4003 server.dirty += (c->argc-1)/2;
4004 addReply(c, nx ? shared.cone : shared.ok);
4005}
4006
4007static void msetCommand(redisClient *c) {
4008 msetGenericCommand(c,0);
4009}
4010
4011static void msetnxCommand(redisClient *c) {
4012 msetGenericCommand(c,1);
4013}
4014
d68ed120 4015static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4016 long long value;
4017 int retval;
4018 robj *o;
e0a62c7f 4019
3305306f 4020 o = lookupKeyWrite(c->db,c->argv[1]);
ed9b544e 4021
bbe025e0 4022 if (getLongLongFromObject(c, o, &value) != REDIS_OK) return;
ed9b544e 4023
4024 value += incr;
4025 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
05df7621 4026 o = tryObjectEncoding(o);
3305306f 4027 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 4028 if (retval == DICT_ERR) {
3305306f 4029 dictReplace(c->db->dict,c->argv[1],o);
4030 removeExpire(c->db,c->argv[1]);
ed9b544e 4031 } else {
4032 incrRefCount(c->argv[1]);
4033 }
4034 server.dirty++;
c937aa89 4035 addReply(c,shared.colon);
ed9b544e 4036 addReply(c,o);
4037 addReply(c,shared.crlf);
4038}
4039
4040static void incrCommand(redisClient *c) {
a4d1ba9a 4041 incrDecrCommand(c,1);
ed9b544e 4042}
4043
4044static void decrCommand(redisClient *c) {
a4d1ba9a 4045 incrDecrCommand(c,-1);
ed9b544e 4046}
4047
4048static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4049 long long incr;
4050
4051 if (getLongLongFromObject(c, c->argv[2], &incr) != REDIS_OK) return;
4052
a4d1ba9a 4053 incrDecrCommand(c,incr);
ed9b544e 4054}
4055
4056static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4057 long long incr;
4058
4059 if (getLongLongFromObject(c, c->argv[2], &incr) != REDIS_OK) return;
4060
a4d1ba9a 4061 incrDecrCommand(c,-incr);
ed9b544e 4062}
4063
4b00bebd 4064static void appendCommand(redisClient *c) {
4065 int retval;
4066 size_t totlen;
4067 robj *o;
4068
4069 o = lookupKeyWrite(c->db,c->argv[1]);
4070 if (o == NULL) {
4071 /* Create the key */
4072 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4073 incrRefCount(c->argv[1]);
4074 incrRefCount(c->argv[2]);
4075 totlen = stringObjectLen(c->argv[2]);
4076 } else {
4077 dictEntry *de;
e0a62c7f 4078
4b00bebd 4079 de = dictFind(c->db->dict,c->argv[1]);
4080 assert(de != NULL);
4081
4082 o = dictGetEntryVal(de);
4083 if (o->type != REDIS_STRING) {
4084 addReply(c,shared.wrongtypeerr);
4085 return;
4086 }
4087 /* If the object is specially encoded or shared we have to make
4088 * a copy */
4089 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4090 robj *decoded = getDecodedObject(o);
4091
4092 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4093 decrRefCount(decoded);
4094 dictReplace(c->db->dict,c->argv[1],o);
4095 }
4096 /* APPEND! */
4097 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4098 o->ptr = sdscatlen(o->ptr,
4099 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4100 } else {
4101 o->ptr = sdscatprintf(o->ptr, "%ld",
4102 (unsigned long) c->argv[2]->ptr);
4103 }
4104 totlen = sdslen(o->ptr);
4105 }
4106 server.dirty++;
4107 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4108}
4109
39191553 4110static void substrCommand(redisClient *c) {
4111 robj *o;
4112 long start = atoi(c->argv[2]->ptr);
4113 long end = atoi(c->argv[3]->ptr);
dd88747b 4114 size_t rangelen, strlen;
4115 sds range;
39191553 4116
dd88747b 4117 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4118 checkType(c,o,REDIS_STRING)) return;
39191553 4119
dd88747b 4120 o = getDecodedObject(o);
4121 strlen = sdslen(o->ptr);
8fe7fad7 4122
dd88747b 4123 /* convert negative indexes */
4124 if (start < 0) start = strlen+start;
4125 if (end < 0) end = strlen+end;
4126 if (start < 0) start = 0;
4127 if (end < 0) end = 0;
39191553 4128
dd88747b 4129 /* indexes sanity checks */
4130 if (start > end || (size_t)start >= strlen) {
4131 /* Out of range start or start > end result in null reply */
4132 addReply(c,shared.nullbulk);
4133 decrRefCount(o);
4134 return;
39191553 4135 }
dd88747b 4136 if ((size_t)end >= strlen) end = strlen-1;
4137 rangelen = (end-start)+1;
4138
4139 /* Return the result */
4140 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4141 range = sdsnewlen((char*)o->ptr+start,rangelen);
4142 addReplySds(c,range);
4143 addReply(c,shared.crlf);
4144 decrRefCount(o);
39191553 4145}
4146
ed9b544e 4147/* ========================= Type agnostic commands ========================= */
4148
4149static void delCommand(redisClient *c) {
5109cdff 4150 int deleted = 0, j;
4151
4152 for (j = 1; j < c->argc; j++) {
4153 if (deleteKey(c->db,c->argv[j])) {
4154 server.dirty++;
4155 deleted++;
4156 }
4157 }
dd88747b 4158 addReplyLong(c,deleted);
ed9b544e 4159}
4160
4161static void existsCommand(redisClient *c) {
3305306f 4162 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
ed9b544e 4163}
4164
4165static void selectCommand(redisClient *c) {
4166 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4167
ed9b544e 4168 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4169 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4170 } else {
4171 addReply(c,shared.ok);
4172 }
4173}
4174
4175static void randomkeyCommand(redisClient *c) {
4176 dictEntry *de;
e0a62c7f 4177
3305306f 4178 while(1) {
4179 de = dictGetRandomKey(c->db->dict);
ce7bef07 4180 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4181 }
ed9b544e 4182 if (de == NULL) {
ce7bef07 4183 addReply(c,shared.plus);
ed9b544e 4184 addReply(c,shared.crlf);
4185 } else {
c937aa89 4186 addReply(c,shared.plus);
ed9b544e 4187 addReply(c,dictGetEntryKey(de));
4188 addReply(c,shared.crlf);
4189 }
4190}
4191
4192static void keysCommand(redisClient *c) {
4193 dictIterator *di;
4194 dictEntry *de;
4195 sds pattern = c->argv[1]->ptr;
4196 int plen = sdslen(pattern);
a3f9eec2 4197 unsigned long numkeys = 0;
ed9b544e 4198 robj *lenobj = createObject(REDIS_STRING,NULL);
4199
3305306f 4200 di = dictGetIterator(c->db->dict);
ed9b544e 4201 addReply(c,lenobj);
4202 decrRefCount(lenobj);
4203 while((de = dictNext(di)) != NULL) {
4204 robj *keyobj = dictGetEntryKey(de);
3305306f 4205
ed9b544e 4206 sds key = keyobj->ptr;
4207 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4208 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4209 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4210 addReplyBulk(c,keyobj);
3305306f 4211 numkeys++;
3305306f 4212 }
ed9b544e 4213 }
4214 }
4215 dictReleaseIterator(di);
a3f9eec2 4216 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4217}
4218
4219static void dbsizeCommand(redisClient *c) {
4220 addReplySds(c,
3305306f 4221 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4222}
4223
4224static void lastsaveCommand(redisClient *c) {
4225 addReplySds(c,
c937aa89 4226 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4227}
4228
4229static void typeCommand(redisClient *c) {
3305306f 4230 robj *o;
ed9b544e 4231 char *type;
3305306f 4232
4233 o = lookupKeyRead(c->db,c->argv[1]);
4234 if (o == NULL) {
c937aa89 4235 type = "+none";
ed9b544e 4236 } else {
ed9b544e 4237 switch(o->type) {
c937aa89 4238 case REDIS_STRING: type = "+string"; break;
4239 case REDIS_LIST: type = "+list"; break;
4240 case REDIS_SET: type = "+set"; break;
412a8bce 4241 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4242 case REDIS_HASH: type = "+hash"; break;
4243 default: type = "+unknown"; break;
ed9b544e 4244 }
4245 }
4246 addReplySds(c,sdsnew(type));
4247 addReply(c,shared.crlf);
4248}
4249
4250static void saveCommand(redisClient *c) {
9d65a1bb 4251 if (server.bgsavechildpid != -1) {
05557f6d 4252 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4253 return;
4254 }
f78fd11b 4255 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4256 addReply(c,shared.ok);
4257 } else {
4258 addReply(c,shared.err);
4259 }
4260}
4261
4262static void bgsaveCommand(redisClient *c) {
9d65a1bb 4263 if (server.bgsavechildpid != -1) {
ed9b544e 4264 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4265 return;
4266 }
f78fd11b 4267 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4268 char *status = "+Background saving started\r\n";
4269 addReplySds(c,sdsnew(status));
ed9b544e 4270 } else {
4271 addReply(c,shared.err);
4272 }
4273}
4274
4275static void shutdownCommand(redisClient *c) {
4276 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
a3b21203 4277 /* Kill the saving child if there is a background saving in progress.
4278 We want to avoid race conditions, for instance our saving child may
4279 overwrite the synchronous saving did by SHUTDOWN. */
9d65a1bb 4280 if (server.bgsavechildpid != -1) {
9f3c422c 4281 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4282 kill(server.bgsavechildpid,SIGKILL);
a3b21203 4283 rdbRemoveTempFile(server.bgsavechildpid);
9f3c422c 4284 }
ac945e2d 4285 if (server.appendonly) {
4286 /* Append only file: fsync() the AOF and exit */
4287 fsync(server.appendfd);
054e426d 4288 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4289 exit(0);
ed9b544e 4290 } else {
ac945e2d 4291 /* Snapshotting. Perform a SYNC SAVE and exit */
4292 if (rdbSave(server.dbfilename) == REDIS_OK) {
4293 if (server.daemonize)
4294 unlink(server.pidfile);
4295 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4296 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
054e426d 4297 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4298 exit(0);
4299 } else {
dd88747b 4300 /* Ooops.. error saving! The best we can do is to continue
4301 * operating. Note that if there was a background saving process,
4302 * in the next cron() Redis will be notified that the background
4303 * saving aborted, handling special stuff like slaves pending for
4304 * synchronization... */
e0a62c7f 4305 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
dd88747b 4306 addReplySds(c,
4307 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
ac945e2d 4308 }
ed9b544e 4309 }
4310}
4311
4312static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4313 robj *o;
4314
4315 /* To use the same key as src and dst is probably an error */
4316 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4317 addReply(c,shared.sameobjecterr);
ed9b544e 4318 return;
4319 }
4320
dd88747b 4321 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4322 return;
dd88747b 4323
ed9b544e 4324 incrRefCount(o);
3305306f 4325 deleteIfVolatile(c->db,c->argv[2]);
4326 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4327 if (nx) {
4328 decrRefCount(o);
c937aa89 4329 addReply(c,shared.czero);
ed9b544e 4330 return;
4331 }
3305306f 4332 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4333 } else {
4334 incrRefCount(c->argv[2]);
4335 }
3305306f 4336 deleteKey(c->db,c->argv[1]);
ed9b544e 4337 server.dirty++;
c937aa89 4338 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4339}
4340
4341static void renameCommand(redisClient *c) {
4342 renameGenericCommand(c,0);
4343}
4344
4345static void renamenxCommand(redisClient *c) {
4346 renameGenericCommand(c,1);
4347}
4348
4349static void moveCommand(redisClient *c) {
3305306f 4350 robj *o;
4351 redisDb *src, *dst;
ed9b544e 4352 int srcid;
4353
4354 /* Obtain source and target DB pointers */
3305306f 4355 src = c->db;
4356 srcid = c->db->id;
ed9b544e 4357 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4358 addReply(c,shared.outofrangeerr);
ed9b544e 4359 return;
4360 }
3305306f 4361 dst = c->db;
4362 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4363
4364 /* If the user is moving using as target the same
4365 * DB as the source DB it is probably an error. */
4366 if (src == dst) {
c937aa89 4367 addReply(c,shared.sameobjecterr);
ed9b544e 4368 return;
4369 }
4370
4371 /* Check if the element exists and get a reference */
3305306f 4372 o = lookupKeyWrite(c->db,c->argv[1]);
4373 if (!o) {
c937aa89 4374 addReply(c,shared.czero);
ed9b544e 4375 return;
4376 }
4377
4378 /* Try to add the element to the target DB */
3305306f 4379 deleteIfVolatile(dst,c->argv[1]);
4380 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4381 addReply(c,shared.czero);
ed9b544e 4382 return;
4383 }
3305306f 4384 incrRefCount(c->argv[1]);
ed9b544e 4385 incrRefCount(o);
4386
4387 /* OK! key moved, free the entry in the source DB */
3305306f 4388 deleteKey(src,c->argv[1]);
ed9b544e 4389 server.dirty++;
c937aa89 4390 addReply(c,shared.cone);
ed9b544e 4391}
4392
4393/* =================================== Lists ================================ */
4394static void pushGenericCommand(redisClient *c, int where) {
4395 robj *lobj;
ed9b544e 4396 list *list;
3305306f 4397
4398 lobj = lookupKeyWrite(c->db,c->argv[1]);
4399 if (lobj == NULL) {
95242ab5 4400 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4401 addReply(c,shared.cone);
95242ab5 4402 return;
4403 }
ed9b544e 4404 lobj = createListObject();
4405 list = lobj->ptr;
4406 if (where == REDIS_HEAD) {
6b47e12e 4407 listAddNodeHead(list,c->argv[2]);
ed9b544e 4408 } else {
6b47e12e 4409 listAddNodeTail(list,c->argv[2]);
ed9b544e 4410 }
3305306f 4411 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4412 incrRefCount(c->argv[1]);
4413 incrRefCount(c->argv[2]);
4414 } else {
ed9b544e 4415 if (lobj->type != REDIS_LIST) {
4416 addReply(c,shared.wrongtypeerr);
4417 return;
4418 }
95242ab5 4419 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4420 addReply(c,shared.cone);
95242ab5 4421 return;
4422 }
ed9b544e 4423 list = lobj->ptr;
4424 if (where == REDIS_HEAD) {
6b47e12e 4425 listAddNodeHead(list,c->argv[2]);
ed9b544e 4426 } else {
6b47e12e 4427 listAddNodeTail(list,c->argv[2]);
ed9b544e 4428 }
4429 incrRefCount(c->argv[2]);
4430 }
4431 server.dirty++;
520b5a33 4432 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
ed9b544e 4433}
4434
4435static void lpushCommand(redisClient *c) {
4436 pushGenericCommand(c,REDIS_HEAD);
4437}
4438
4439static void rpushCommand(redisClient *c) {
4440 pushGenericCommand(c,REDIS_TAIL);
4441}
4442
4443static void llenCommand(redisClient *c) {
3305306f 4444 robj *o;
ed9b544e 4445 list *l;
dd88747b 4446
4447 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4448 checkType(c,o,REDIS_LIST)) return;
e0a62c7f 4449
dd88747b 4450 l = o->ptr;
4451 addReplyUlong(c,listLength(l));
ed9b544e 4452}
4453
4454static void lindexCommand(redisClient *c) {
3305306f 4455 robj *o;
ed9b544e 4456 int index = atoi(c->argv[2]->ptr);
dd88747b 4457 list *list;
4458 listNode *ln;
4459
4460 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4461 checkType(c,o,REDIS_LIST)) return;
4462 list = o->ptr;
4463
4464 ln = listIndex(list, index);
4465 if (ln == NULL) {
c937aa89 4466 addReply(c,shared.nullbulk);
ed9b544e 4467 } else {
dd88747b 4468 robj *ele = listNodeValue(ln);
4469 addReplyBulk(c,ele);
ed9b544e 4470 }
4471}
4472
4473static void lsetCommand(redisClient *c) {
3305306f 4474 robj *o;
ed9b544e 4475 int index = atoi(c->argv[2]->ptr);
dd88747b 4476 list *list;
4477 listNode *ln;
4478
4479 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4480 checkType(c,o,REDIS_LIST)) return;
4481 list = o->ptr;
4482
4483 ln = listIndex(list, index);
4484 if (ln == NULL) {
4485 addReply(c,shared.outofrangeerr);
ed9b544e 4486 } else {
dd88747b 4487 robj *ele = listNodeValue(ln);
ed9b544e 4488
dd88747b 4489 decrRefCount(ele);
4490 listNodeValue(ln) = c->argv[3];
4491 incrRefCount(c->argv[3]);
4492 addReply(c,shared.ok);
4493 server.dirty++;
ed9b544e 4494 }
4495}
4496
4497static void popGenericCommand(redisClient *c, int where) {
3305306f 4498 robj *o;
dd88747b 4499 list *list;
4500 listNode *ln;
3305306f 4501
dd88747b 4502 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4503 checkType(c,o,REDIS_LIST)) return;
4504 list = o->ptr;
ed9b544e 4505
dd88747b 4506 if (where == REDIS_HEAD)
4507 ln = listFirst(list);
4508 else
4509 ln = listLast(list);
ed9b544e 4510
dd88747b 4511 if (ln == NULL) {
4512 addReply(c,shared.nullbulk);
4513 } else {
4514 robj *ele = listNodeValue(ln);
4515 addReplyBulk(c,ele);
4516 listDelNode(list,ln);
3ea27d37 4517 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4518 server.dirty++;
ed9b544e 4519 }
4520}
4521
4522static void lpopCommand(redisClient *c) {
4523 popGenericCommand(c,REDIS_HEAD);
4524}
4525
4526static void rpopCommand(redisClient *c) {
4527 popGenericCommand(c,REDIS_TAIL);
4528}
4529
4530static void lrangeCommand(redisClient *c) {
3305306f 4531 robj *o;
ed9b544e 4532 int start = atoi(c->argv[2]->ptr);
4533 int end = atoi(c->argv[3]->ptr);
dd88747b 4534 int llen;
4535 int rangelen, j;
4536 list *list;
4537 listNode *ln;
4538 robj *ele;
4539
4e27f268 4540 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4541 || checkType(c,o,REDIS_LIST)) return;
dd88747b 4542 list = o->ptr;
4543 llen = listLength(list);
4544
4545 /* convert negative indexes */
4546 if (start < 0) start = llen+start;
4547 if (end < 0) end = llen+end;
4548 if (start < 0) start = 0;
4549 if (end < 0) end = 0;
4550
4551 /* indexes sanity checks */
4552 if (start > end || start >= llen) {
4553 /* Out of range start or start > end result in empty list */
4554 addReply(c,shared.emptymultibulk);
4555 return;
4556 }
4557 if (end >= llen) end = llen-1;
4558 rangelen = (end-start)+1;
3305306f 4559
dd88747b 4560 /* Return the result in form of a multi-bulk reply */
4561 ln = listIndex(list, start);
4562 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4563 for (j = 0; j < rangelen; j++) {
4564 ele = listNodeValue(ln);
4565 addReplyBulk(c,ele);
4566 ln = ln->next;
ed9b544e 4567 }
4568}
4569
4570static void ltrimCommand(redisClient *c) {
3305306f 4571 robj *o;
ed9b544e 4572 int start = atoi(c->argv[2]->ptr);
4573 int end = atoi(c->argv[3]->ptr);
dd88747b 4574 int llen;
4575 int j, ltrim, rtrim;
4576 list *list;
4577 listNode *ln;
4578
4579 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4580 checkType(c,o,REDIS_LIST)) return;
4581 list = o->ptr;
4582 llen = listLength(list);
4583
4584 /* convert negative indexes */
4585 if (start < 0) start = llen+start;
4586 if (end < 0) end = llen+end;
4587 if (start < 0) start = 0;
4588 if (end < 0) end = 0;
4589
4590 /* indexes sanity checks */
4591 if (start > end || start >= llen) {
4592 /* Out of range start or start > end result in empty list */
4593 ltrim = llen;
4594 rtrim = 0;
ed9b544e 4595 } else {
dd88747b 4596 if (end >= llen) end = llen-1;
4597 ltrim = start;
4598 rtrim = llen-end-1;
4599 }
ed9b544e 4600
dd88747b 4601 /* Remove list elements to perform the trim */
4602 for (j = 0; j < ltrim; j++) {
4603 ln = listFirst(list);
4604 listDelNode(list,ln);
4605 }
4606 for (j = 0; j < rtrim; j++) {
4607 ln = listLast(list);
4608 listDelNode(list,ln);
ed9b544e 4609 }
3ea27d37 4610 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4611 server.dirty++;
4612 addReply(c,shared.ok);
ed9b544e 4613}
4614
4615static void lremCommand(redisClient *c) {
3305306f 4616 robj *o;
dd88747b 4617 list *list;
4618 listNode *ln, *next;
4619 int toremove = atoi(c->argv[2]->ptr);
4620 int removed = 0;
4621 int fromtail = 0;
a4d1ba9a 4622
dd88747b 4623 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4624 checkType(c,o,REDIS_LIST)) return;
4625 list = o->ptr;
4626
4627 if (toremove < 0) {
4628 toremove = -toremove;
4629 fromtail = 1;
4630 }
4631 ln = fromtail ? list->tail : list->head;
4632 while (ln) {
4633 robj *ele = listNodeValue(ln);
4634
4635 next = fromtail ? ln->prev : ln->next;
4636 if (compareStringObjects(ele,c->argv[3]) == 0) {
4637 listDelNode(list,ln);
4638 server.dirty++;
4639 removed++;
4640 if (toremove && removed == toremove) break;
ed9b544e 4641 }
dd88747b 4642 ln = next;
ed9b544e 4643 }
3ea27d37 4644 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4645 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 4646}
4647
12f9d551 4648/* This is the semantic of this command:
0f5f7e9a 4649 * RPOPLPUSH srclist dstlist:
12f9d551 4650 * IF LLEN(srclist) > 0
4651 * element = RPOP srclist
4652 * LPUSH dstlist element
4653 * RETURN element
4654 * ELSE
4655 * RETURN nil
4656 * END
4657 * END
4658 *
4659 * The idea is to be able to get an element from a list in a reliable way
4660 * since the element is not just returned but pushed against another list
4661 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4662 */
0f5f7e9a 4663static void rpoplpushcommand(redisClient *c) {
12f9d551 4664 robj *sobj;
dd88747b 4665 list *srclist;
4666 listNode *ln;
4667
4668 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4669 checkType(c,sobj,REDIS_LIST)) return;
4670 srclist = sobj->ptr;
4671 ln = listLast(srclist);
12f9d551 4672
dd88747b 4673 if (ln == NULL) {
12f9d551 4674 addReply(c,shared.nullbulk);
4675 } else {
dd88747b 4676 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4677 robj *ele = listNodeValue(ln);
4678 list *dstlist;
e20fb74f 4679
dd88747b 4680 if (dobj && dobj->type != REDIS_LIST) {
4681 addReply(c,shared.wrongtypeerr);
4682 return;
4683 }
12f9d551 4684
dd88747b 4685 /* Add the element to the target list (unless it's directly
4686 * passed to some BLPOP-ing client */
4687 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4688 if (dobj == NULL) {
4689 /* Create the list if the key does not exist */
4690 dobj = createListObject();
4691 dictAdd(c->db->dict,c->argv[2],dobj);
4692 incrRefCount(c->argv[2]);
12f9d551 4693 }
dd88747b 4694 dstlist = dobj->ptr;
4695 listAddNodeHead(dstlist,ele);
4696 incrRefCount(ele);
12f9d551 4697 }
dd88747b 4698
4699 /* Send the element to the client as reply as well */
4700 addReplyBulk(c,ele);
4701
4702 /* Finally remove the element from the source list */
4703 listDelNode(srclist,ln);
3ea27d37 4704 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4705 server.dirty++;
12f9d551 4706 }
4707}
4708
ed9b544e 4709/* ==================================== Sets ================================ */
4710
4711static void saddCommand(redisClient *c) {
ed9b544e 4712 robj *set;
4713
3305306f 4714 set = lookupKeyWrite(c->db,c->argv[1]);
4715 if (set == NULL) {
ed9b544e 4716 set = createSetObject();
3305306f 4717 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 4718 incrRefCount(c->argv[1]);
4719 } else {
ed9b544e 4720 if (set->type != REDIS_SET) {
c937aa89 4721 addReply(c,shared.wrongtypeerr);
ed9b544e 4722 return;
4723 }
4724 }
4725 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4726 incrRefCount(c->argv[2]);
4727 server.dirty++;
c937aa89 4728 addReply(c,shared.cone);
ed9b544e 4729 } else {
c937aa89 4730 addReply(c,shared.czero);
ed9b544e 4731 }
4732}
4733
4734static void sremCommand(redisClient *c) {
3305306f 4735 robj *set;
ed9b544e 4736
dd88747b 4737 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4738 checkType(c,set,REDIS_SET)) return;
4739
4740 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4741 server.dirty++;
4742 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 4743 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4744 addReply(c,shared.cone);
ed9b544e 4745 } else {
dd88747b 4746 addReply(c,shared.czero);
ed9b544e 4747 }
4748}
4749
a4460ef4 4750static void smoveCommand(redisClient *c) {
4751 robj *srcset, *dstset;
4752
4753 srcset = lookupKeyWrite(c->db,c->argv[1]);
4754 dstset = lookupKeyWrite(c->db,c->argv[2]);
4755
4756 /* If the source key does not exist return 0, if it's of the wrong type
4757 * raise an error */
4758 if (srcset == NULL || srcset->type != REDIS_SET) {
4759 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4760 return;
4761 }
4762 /* Error if the destination key is not a set as well */
4763 if (dstset && dstset->type != REDIS_SET) {
4764 addReply(c,shared.wrongtypeerr);
4765 return;
4766 }
4767 /* Remove the element from the source set */
4768 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4769 /* Key not found in the src set! return zero */
4770 addReply(c,shared.czero);
4771 return;
4772 }
3ea27d37 4773 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4774 deleteKey(c->db,c->argv[1]);
a4460ef4 4775 server.dirty++;
4776 /* Add the element to the destination set */
4777 if (!dstset) {
4778 dstset = createSetObject();
4779 dictAdd(c->db->dict,c->argv[2],dstset);
4780 incrRefCount(c->argv[2]);
4781 }
4782 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4783 incrRefCount(c->argv[3]);
4784 addReply(c,shared.cone);
4785}
4786
ed9b544e 4787static void sismemberCommand(redisClient *c) {
3305306f 4788 robj *set;
ed9b544e 4789
dd88747b 4790 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4791 checkType(c,set,REDIS_SET)) return;
4792
4793 if (dictFind(set->ptr,c->argv[2]))
4794 addReply(c,shared.cone);
4795 else
c937aa89 4796 addReply(c,shared.czero);
ed9b544e 4797}
4798
4799static void scardCommand(redisClient *c) {
3305306f 4800 robj *o;
ed9b544e 4801 dict *s;
dd88747b 4802
4803 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4804 checkType(c,o,REDIS_SET)) return;
e0a62c7f 4805
dd88747b 4806 s = o->ptr;
4807 addReplyUlong(c,dictSize(s));
ed9b544e 4808}
4809
12fea928 4810static void spopCommand(redisClient *c) {
4811 robj *set;
4812 dictEntry *de;
4813
dd88747b 4814 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4815 checkType(c,set,REDIS_SET)) return;
4816
4817 de = dictGetRandomKey(set->ptr);
4818 if (de == NULL) {
12fea928 4819 addReply(c,shared.nullbulk);
4820 } else {
dd88747b 4821 robj *ele = dictGetEntryKey(de);
12fea928 4822
dd88747b 4823 addReplyBulk(c,ele);
4824 dictDelete(set->ptr,ele);
4825 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 4826 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4827 server.dirty++;
12fea928 4828 }
4829}
4830
2abb95a9 4831static void srandmemberCommand(redisClient *c) {
4832 robj *set;
4833 dictEntry *de;
4834
dd88747b 4835 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4836 checkType(c,set,REDIS_SET)) return;
4837
4838 de = dictGetRandomKey(set->ptr);
4839 if (de == NULL) {
2abb95a9 4840 addReply(c,shared.nullbulk);
4841 } else {
dd88747b 4842 robj *ele = dictGetEntryKey(de);
2abb95a9 4843
dd88747b 4844 addReplyBulk(c,ele);
2abb95a9 4845 }
4846}
4847
ed9b544e 4848static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4849 dict **d1 = (void*) s1, **d2 = (void*) s2;
4850
3305306f 4851 return dictSize(*d1)-dictSize(*d2);
ed9b544e 4852}
4853
682ac724 4854static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 4855 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4856 dictIterator *di;
4857 dictEntry *de;
4858 robj *lenobj = NULL, *dstset = NULL;
682ac724 4859 unsigned long j, cardinality = 0;
ed9b544e 4860
ed9b544e 4861 for (j = 0; j < setsnum; j++) {
4862 robj *setobj;
3305306f 4863
4864 setobj = dstkey ?
4865 lookupKeyWrite(c->db,setskeys[j]) :
4866 lookupKeyRead(c->db,setskeys[j]);
4867 if (!setobj) {
ed9b544e 4868 zfree(dv);
5faa6025 4869 if (dstkey) {
fdcaae84 4870 if (deleteKey(c->db,dstkey))
4871 server.dirty++;
0d36ded0 4872 addReply(c,shared.czero);
5faa6025 4873 } else {
4e27f268 4874 addReply(c,shared.emptymultibulk);
5faa6025 4875 }
ed9b544e 4876 return;
4877 }
ed9b544e 4878 if (setobj->type != REDIS_SET) {
4879 zfree(dv);
c937aa89 4880 addReply(c,shared.wrongtypeerr);
ed9b544e 4881 return;
4882 }
4883 dv[j] = setobj->ptr;
4884 }
4885 /* Sort sets from the smallest to largest, this will improve our
4886 * algorithm's performace */
4887 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4888
4889 /* The first thing we should output is the total number of elements...
4890 * since this is a multi-bulk write, but at this stage we don't know
4891 * the intersection set size, so we use a trick, append an empty object
4892 * to the output list and save the pointer to later modify it with the
4893 * right length */
4894 if (!dstkey) {
4895 lenobj = createObject(REDIS_STRING,NULL);
4896 addReply(c,lenobj);
4897 decrRefCount(lenobj);
4898 } else {
4899 /* If we have a target key where to store the resulting set
4900 * create this key with an empty set inside */
4901 dstset = createSetObject();
ed9b544e 4902 }
4903
4904 /* Iterate all the elements of the first (smallest) set, and test
4905 * the element against all the other sets, if at least one set does
4906 * not include the element it is discarded */
4907 di = dictGetIterator(dv[0]);
ed9b544e 4908
4909 while((de = dictNext(di)) != NULL) {
4910 robj *ele;
4911
4912 for (j = 1; j < setsnum; j++)
4913 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4914 if (j != setsnum)
4915 continue; /* at least one set does not contain the member */
4916 ele = dictGetEntryKey(de);
4917 if (!dstkey) {
dd88747b 4918 addReplyBulk(c,ele);
ed9b544e 4919 cardinality++;
4920 } else {
4921 dictAdd(dstset->ptr,ele,NULL);
4922 incrRefCount(ele);
4923 }
4924 }
4925 dictReleaseIterator(di);
4926
83cdfe18 4927 if (dstkey) {
3ea27d37 4928 /* Store the resulting set into the target, if the intersection
4929 * is not an empty set. */
83cdfe18 4930 deleteKey(c->db,dstkey);
3ea27d37 4931 if (dictSize((dict*)dstset->ptr) > 0) {
4932 dictAdd(c->db->dict,dstkey,dstset);
4933 incrRefCount(dstkey);
d36c4e97 4934 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 4935 } else {
4936 decrRefCount(dstset);
d36c4e97 4937 addReply(c,shared.czero);
3ea27d37 4938 }
40d224a9 4939 server.dirty++;
d36c4e97 4940 } else {
4941 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 4942 }
ed9b544e 4943 zfree(dv);
4944}
4945
4946static void sinterCommand(redisClient *c) {
4947 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4948}
4949
4950static void sinterstoreCommand(redisClient *c) {
4951 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4952}
4953
/* Operation selectors for the generic set-algebra commands. */
#define REDIS_OP_UNION  0
#define REDIS_OP_DIFF   1
#define REDIS_OP_INTER  2
f4f56e1d 4957
4958static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 4959 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4960 dictIterator *di;
4961 dictEntry *de;
f4f56e1d 4962 robj *dstset = NULL;
40d224a9 4963 int j, cardinality = 0;
4964
40d224a9 4965 for (j = 0; j < setsnum; j++) {
4966 robj *setobj;
4967
4968 setobj = dstkey ?
4969 lookupKeyWrite(c->db,setskeys[j]) :
4970 lookupKeyRead(c->db,setskeys[j]);
4971 if (!setobj) {
4972 dv[j] = NULL;
4973 continue;
4974 }
4975 if (setobj->type != REDIS_SET) {
4976 zfree(dv);
4977 addReply(c,shared.wrongtypeerr);
4978 return;
4979 }
4980 dv[j] = setobj->ptr;
4981 }
4982
4983 /* We need a temp set object to store our union. If the dstkey
4984 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4985 * this set object will be the resulting object to set into the target key*/
4986 dstset = createSetObject();
4987
40d224a9 4988 /* Iterate all the elements of all the sets, add every element a single
4989 * time to the result set */
4990 for (j = 0; j < setsnum; j++) {
51829ed3 4991 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 4992 if (!dv[j]) continue; /* non existing keys are like empty sets */
4993
4994 di = dictGetIterator(dv[j]);
40d224a9 4995
4996 while((de = dictNext(di)) != NULL) {
4997 robj *ele;
4998
4999 /* dictAdd will not add the same element multiple times */
5000 ele = dictGetEntryKey(de);
f4f56e1d 5001 if (op == REDIS_OP_UNION || j == 0) {
5002 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5003 incrRefCount(ele);
40d224a9 5004 cardinality++;
5005 }
f4f56e1d 5006 } else if (op == REDIS_OP_DIFF) {
5007 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5008 cardinality--;
5009 }
40d224a9 5010 }
5011 }
5012 dictReleaseIterator(di);
51829ed3 5013
d36c4e97 5014 /* result set is empty? Exit asap. */
5015 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5016 }
5017
f4f56e1d 5018 /* Output the content of the resulting set, if not in STORE mode */
5019 if (!dstkey) {
5020 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5021 di = dictGetIterator(dstset->ptr);
f4f56e1d 5022 while((de = dictNext(di)) != NULL) {
5023 robj *ele;
5024
5025 ele = dictGetEntryKey(de);
dd88747b 5026 addReplyBulk(c,ele);
f4f56e1d 5027 }
5028 dictReleaseIterator(di);
d36c4e97 5029 decrRefCount(dstset);
83cdfe18
AG
5030 } else {
5031 /* If we have a target key where to store the resulting set
5032 * create this key with the result set inside */
5033 deleteKey(c->db,dstkey);
3ea27d37 5034 if (dictSize((dict*)dstset->ptr) > 0) {
5035 dictAdd(c->db->dict,dstkey,dstset);
5036 incrRefCount(dstkey);
d36c4e97 5037 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5038 } else {
5039 decrRefCount(dstset);
d36c4e97 5040 addReply(c,shared.czero);
3ea27d37 5041 }
40d224a9 5042 server.dirty++;
5043 }
5044 zfree(dv);
5045}
5046
5047static void sunionCommand(redisClient *c) {
f4f56e1d 5048 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 5049}
5050
5051static void sunionstoreCommand(redisClient *c) {
f4f56e1d 5052 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5053}
5054
5055static void sdiffCommand(redisClient *c) {
5056 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5057}
5058
5059static void sdiffstoreCommand(redisClient *c) {
5060 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 5061}
5062
6b47e12e 5063/* ==================================== ZSets =============================== */
5064
5065/* ZSETs are ordered sets using two data structures to hold the same elements
5066 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5067 * data structure.
5068 *
5069 * The elements are added to a hash table mapping Redis objects to scores.
5070 * At the same time the elements are added to a skip list mapping scores
5071 * to Redis objects (so objects are sorted by scores in this "view"). */
5072
5073/* This skiplist implementation is almost a C translation of the original
5074 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5075 * Alternative to Balanced Trees", modified in three ways:
5076 * a) this implementation allows for repeated values.
5077 * b) the comparison is not just by key (our 'score') but by satellite data.
5078 * c) there is a back pointer, so it's a doubly linked list with the back
5079 * pointers being only at "level 1". This allows to traverse the list
5080 * from tail to head, useful for ZREVRANGE. */
5081
5082static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5083 zskiplistNode *zn = zmalloc(sizeof(*zn));
5084
5085 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2b37892e
PN
5086 if (level > 0)
5087 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
6b47e12e 5088 zn->score = score;
5089 zn->obj = obj;
5090 return zn;
5091}
5092
5093static zskiplist *zslCreate(void) {
5094 int j;
5095 zskiplist *zsl;
e0a62c7f 5096
6b47e12e 5097 zsl = zmalloc(sizeof(*zsl));
5098 zsl->level = 1;
cc812361 5099 zsl->length = 0;
6b47e12e 5100 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5101 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5102 zsl->header->forward[j] = NULL;
94e543b5 5103
5104 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5105 if (j < ZSKIPLIST_MAXLEVEL-1)
5106 zsl->header->span[j] = 0;
69d95c3e 5107 }
e3870fab 5108 zsl->header->backward = NULL;
5109 zsl->tail = NULL;
6b47e12e 5110 return zsl;
5111}
5112
fd8ccf44 5113static void zslFreeNode(zskiplistNode *node) {
5114 decrRefCount(node->obj);
ad807e6f 5115 zfree(node->forward);
69d95c3e 5116 zfree(node->span);
fd8ccf44 5117 zfree(node);
5118}
5119
5120static void zslFree(zskiplist *zsl) {
ad807e6f 5121 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5122
ad807e6f 5123 zfree(zsl->header->forward);
69d95c3e 5124 zfree(zsl->header->span);
ad807e6f 5125 zfree(zsl->header);
fd8ccf44 5126 while(node) {
599379dd 5127 next = node->forward[0];
fd8ccf44 5128 zslFreeNode(node);
5129 node = next;
5130 }
ad807e6f 5131 zfree(zsl);
fd8ccf44 5132}
5133
6b47e12e 5134static int zslRandomLevel(void) {
5135 int level = 1;
5136 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5137 level += 1;
10c2baa5 5138 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5139}
5140
5141static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5142 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5143 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5144 int i, level;
5145
5146 x = zsl->header;
5147 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5148 /* store rank that is crossed to reach the insert position */
5149 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5150
9d60e6e4 5151 while (x->forward[i] &&
5152 (x->forward[i]->score < score ||
5153 (x->forward[i]->score == score &&
69d95c3e 5154 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5155 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5156 x = x->forward[i];
69d95c3e 5157 }
6b47e12e 5158 update[i] = x;
5159 }
6b47e12e 5160 /* we assume the key is not already inside, since we allow duplicated
5161 * scores, and the re-insertion of score and redis object should never
5162 * happpen since the caller of zslInsert() should test in the hash table
5163 * if the element is already inside or not. */
5164 level = zslRandomLevel();
5165 if (level > zsl->level) {
69d95c3e 5166 for (i = zsl->level; i < level; i++) {
2b37892e 5167 rank[i] = 0;
6b47e12e 5168 update[i] = zsl->header;
2b37892e 5169 update[i]->span[i-1] = zsl->length;
69d95c3e 5170 }
6b47e12e 5171 zsl->level = level;
5172 }
5173 x = zslCreateNode(level,score,obj);
5174 for (i = 0; i < level; i++) {
5175 x->forward[i] = update[i]->forward[i];
5176 update[i]->forward[i] = x;
69d95c3e
PN
5177
5178 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5179 if (i > 0) {
5180 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5181 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5182 }
6b47e12e 5183 }
69d95c3e
PN
5184
5185 /* increment span for untouched levels */
5186 for (i = level; i < zsl->level; i++) {
2b37892e 5187 update[i]->span[i-1]++;
69d95c3e
PN
5188 }
5189
bb975144 5190 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5191 if (x->forward[0])
5192 x->forward[0]->backward = x;
5193 else
5194 zsl->tail = x;
cc812361 5195 zsl->length++;
6b47e12e 5196}
5197
84105336
PN
5198/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5199void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5200 int i;
5201 for (i = 0; i < zsl->level; i++) {
5202 if (update[i]->forward[i] == x) {
5203 if (i > 0) {
5204 update[i]->span[i-1] += x->span[i-1] - 1;
5205 }
5206 update[i]->forward[i] = x->forward[i];
5207 } else {
5208 /* invariant: i > 0, because update[0]->forward[0]
5209 * is always equal to x */
5210 update[i]->span[i-1] -= 1;
5211 }
5212 }
5213 if (x->forward[0]) {
5214 x->forward[0]->backward = x->backward;
5215 } else {
5216 zsl->tail = x->backward;
5217 }
5218 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5219 zsl->level--;
5220 zsl->length--;
5221}
5222
50c55df5 5223/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5224static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5225 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5226 int i;
5227
5228 x = zsl->header;
5229 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5230 while (x->forward[i] &&
5231 (x->forward[i]->score < score ||
5232 (x->forward[i]->score == score &&
5233 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5234 x = x->forward[i];
5235 update[i] = x;
5236 }
5237 /* We may have multiple elements with the same score, what we need
5238 * is to find the element with both the right score and object. */
5239 x = x->forward[0];
50c55df5 5240 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
84105336 5241 zslDeleteNode(zsl, x, update);
9d60e6e4 5242 zslFreeNode(x);
9d60e6e4 5243 return 1;
5244 } else {
5245 return 0; /* not found */
e197b441 5246 }
5247 return 0; /* not found */
fd8ccf44 5248}
5249
1807985b 5250/* Delete all the elements with score between min and max from the skiplist.
5251 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5252 * Note that this function takes the reference to the hash table view of the
5253 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5254static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5255 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5256 unsigned long removed = 0;
5257 int i;
5258
5259 x = zsl->header;
5260 for (i = zsl->level-1; i >= 0; i--) {
5261 while (x->forward[i] && x->forward[i]->score < min)
5262 x = x->forward[i];
5263 update[i] = x;
5264 }
5265 /* We may have multiple elements with the same score, what we need
5266 * is to find the element with both the right score and object. */
5267 x = x->forward[0];
5268 while (x && x->score <= max) {
84105336
PN
5269 zskiplistNode *next = x->forward[0];
5270 zslDeleteNode(zsl, x, update);
1807985b 5271 dictDelete(dict,x->obj);
5272 zslFreeNode(x);
1807985b 5273 removed++;
5274 x = next;
5275 }
5276 return removed; /* not found */
5277}
1807985b 5278
9212eafd 5279/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5280 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5281static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5282 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5283 unsigned long traversed = 0, removed = 0;
5284 int i;
5285
9212eafd
PN
5286 x = zsl->header;
5287 for (i = zsl->level-1; i >= 0; i--) {
5288 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5289 traversed += i > 0 ? x->span[i-1] : 1;
5290 x = x->forward[i];
1807985b 5291 }
9212eafd
PN
5292 update[i] = x;
5293 }
5294
5295 traversed++;
5296 x = x->forward[0];
5297 while (x && traversed <= end) {
84105336
PN
5298 zskiplistNode *next = x->forward[0];
5299 zslDeleteNode(zsl, x, update);
1807985b 5300 dictDelete(dict,x->obj);
5301 zslFreeNode(x);
1807985b 5302 removed++;
9212eafd 5303 traversed++;
1807985b 5304 x = next;
5305 }
9212eafd 5306 return removed;
1807985b 5307}
5308
50c55df5 5309/* Find the first node having a score equal or greater than the specified one.
5310 * Returns NULL if there is no match. */
5311static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5312 zskiplistNode *x;
5313 int i;
5314
5315 x = zsl->header;
5316 for (i = zsl->level-1; i >= 0; i--) {
5317 while (x->forward[i] && x->forward[i]->score < score)
5318 x = x->forward[i];
5319 }
5320 /* We may have multiple elements with the same score, what we need
5321 * is to find the element with both the right score and object. */
5322 return x->forward[0];
5323}
5324
27b0ccca
PN
5325/* Find the rank for an element by both score and key.
5326 * Returns 0 when the element cannot be found, rank otherwise.
5327 * Note that the rank is 1-based due to the span of zsl->header to the
5328 * first element. */
5329static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5330 zskiplistNode *x;
5331 unsigned long rank = 0;
5332 int i;
5333
5334 x = zsl->header;
5335 for (i = zsl->level-1; i >= 0; i--) {
5336 while (x->forward[i] &&
5337 (x->forward[i]->score < score ||
5338 (x->forward[i]->score == score &&
5339 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5340 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5341 x = x->forward[i];
5342 }
5343
5344 /* x might be equal to zsl->header, so test if obj is non-NULL */
5345 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5346 return rank;
5347 }
5348 }
5349 return 0;
5350}
5351
e74825c2
PN
5352/* Finds an element by its rank. The rank argument needs to be 1-based. */
5353zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5354 zskiplistNode *x;
5355 unsigned long traversed = 0;
5356 int i;
5357
5358 x = zsl->header;
5359 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 5360 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5361 {
a50ea45c 5362 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5363 x = x->forward[i];
5364 }
e74825c2
PN
5365 if (traversed == rank) {
5366 return x;
5367 }
5368 }
5369 return NULL;
5370}
5371
fd8ccf44 5372/* The actual Z-commands implementations */
5373
7db723ad 5374/* This generic command implements both ZADD and ZINCRBY.
e2665397 5375 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5376 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5377static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5378 robj *zsetobj;
5379 zset *zs;
5380 double *score;
5381
e2665397 5382 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5383 if (zsetobj == NULL) {
5384 zsetobj = createZsetObject();
e2665397 5385 dictAdd(c->db->dict,key,zsetobj);
5386 incrRefCount(key);
fd8ccf44 5387 } else {
5388 if (zsetobj->type != REDIS_ZSET) {
5389 addReply(c,shared.wrongtypeerr);
5390 return;
5391 }
5392 }
fd8ccf44 5393 zs = zsetobj->ptr;
e2665397 5394
7db723ad 5395 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5396 * needs to handle the two different conditions. It's all about setting
5397 * '*score', that is, the new score to set, to the right value. */
5398 score = zmalloc(sizeof(double));
5399 if (doincrement) {
5400 dictEntry *de;
5401
5402 /* Read the old score. If the element was not present starts from 0 */
5403 de = dictFind(zs->dict,ele);
5404 if (de) {
5405 double *oldscore = dictGetEntryVal(de);
5406 *score = *oldscore + scoreval;
5407 } else {
5408 *score = scoreval;
5409 }
5410 } else {
5411 *score = scoreval;
5412 }
5413
5414 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5415 * to both ZADD and ZINCRBY... */
e2665397 5416 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5417 /* case 1: New element */
e2665397 5418 incrRefCount(ele); /* added to hash */
5419 zslInsert(zs->zsl,*score,ele);
5420 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5421 server.dirty++;
e2665397 5422 if (doincrement)
e2665397 5423 addReplyDouble(c,*score);
91d71bfc 5424 else
5425 addReply(c,shared.cone);
fd8ccf44 5426 } else {
5427 dictEntry *de;
5428 double *oldscore;
e0a62c7f 5429
fd8ccf44 5430 /* case 2: Score update operation */
e2665397 5431 de = dictFind(zs->dict,ele);
dfc5e96c 5432 redisAssert(de != NULL);
fd8ccf44 5433 oldscore = dictGetEntryVal(de);
5434 if (*score != *oldscore) {
5435 int deleted;
5436
e2665397 5437 /* Remove and insert the element in the skip list with new score */
5438 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5439 redisAssert(deleted != 0);
e2665397 5440 zslInsert(zs->zsl,*score,ele);
5441 incrRefCount(ele);
5442 /* Update the score in the hash table */
5443 dictReplace(zs->dict,ele,score);
fd8ccf44 5444 server.dirty++;
2161a965 5445 } else {
5446 zfree(score);
fd8ccf44 5447 }
e2665397 5448 if (doincrement)
5449 addReplyDouble(c,*score);
5450 else
5451 addReply(c,shared.czero);
fd8ccf44 5452 }
5453}
5454
e2665397 5455static void zaddCommand(redisClient *c) {
5456 double scoreval;
5457
bbe025e0
AM
5458 if (getDoubleFromObject(c, c->argv[2], &scoreval) != REDIS_OK) return;
5459
e2665397 5460 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5461}
5462
7db723ad 5463static void zincrbyCommand(redisClient *c) {
e2665397 5464 double scoreval;
5465
bbe025e0
AM
5466 if (getDoubleFromObject(c, c->argv[2], &scoreval) != REDIS_OK) return;
5467
e2665397 5468 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5469}
5470
1b7106e7 5471static void zremCommand(redisClient *c) {
5472 robj *zsetobj;
5473 zset *zs;
dd88747b 5474 dictEntry *de;
5475 double *oldscore;
5476 int deleted;
1b7106e7 5477
dd88747b 5478 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5479 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 5480
dd88747b 5481 zs = zsetobj->ptr;
5482 de = dictFind(zs->dict,c->argv[2]);
5483 if (de == NULL) {
5484 addReply(c,shared.czero);
5485 return;
1b7106e7 5486 }
dd88747b 5487 /* Delete from the skiplist */
5488 oldscore = dictGetEntryVal(de);
5489 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5490 redisAssert(deleted != 0);
5491
5492 /* Delete from the hash table */
5493 dictDelete(zs->dict,c->argv[2]);
5494 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5495 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5496 server.dirty++;
5497 addReply(c,shared.cone);
1b7106e7 5498}
5499
1807985b 5500static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
5501 double min;
5502 double max;
dd88747b 5503 long deleted;
1807985b 5504 robj *zsetobj;
5505 zset *zs;
5506
bbe025e0
AM
5507 if ((getDoubleFromObject(c, c->argv[2], &min) != REDIS_OK) ||
5508 (getDoubleFromObject(c, c->argv[3], &max) != REDIS_OK)) return;
5509
dd88747b 5510 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5511 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 5512
dd88747b 5513 zs = zsetobj->ptr;
5514 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5515 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5516 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5517 server.dirty += deleted;
5518 addReplyLong(c,deleted);
1807985b 5519}
5520
9212eafd 5521static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
5522 long start;
5523 long end;
dd88747b 5524 int llen;
5525 long deleted;
9212eafd
PN
5526 robj *zsetobj;
5527 zset *zs;
5528
bbe025e0
AM
5529 if ((getLongFromObject(c, c->argv[2], &start) != REDIS_OK) ||
5530 (getLongFromObject(c, c->argv[3], &end) != REDIS_OK)) return;
5531
dd88747b 5532 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5533 checkType(c,zsetobj,REDIS_ZSET)) return;
5534 zs = zsetobj->ptr;
5535 llen = zs->zsl->length;
9212eafd 5536
dd88747b 5537 /* convert negative indexes */
5538 if (start < 0) start = llen+start;
5539 if (end < 0) end = llen+end;
5540 if (start < 0) start = 0;
5541 if (end < 0) end = 0;
9212eafd 5542
dd88747b 5543 /* indexes sanity checks */
5544 if (start > end || start >= llen) {
5545 addReply(c,shared.czero);
5546 return;
9212eafd 5547 }
dd88747b 5548 if (end >= llen) end = llen-1;
5549
5550 /* increment start and end because zsl*Rank functions
5551 * use 1-based rank */
5552 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5553 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5554 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5555 server.dirty += deleted;
5556 addReplyLong(c, deleted);
9212eafd
PN
5557}
5558
8f92e768
PN
5559typedef struct {
5560 dict *dict;
5561 double weight;
5562} zsetopsrc;
5563
5564static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5565 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5566 unsigned long size1, size2;
5567 size1 = d1->dict ? dictSize(d1->dict) : 0;
5568 size2 = d2->dict ? dictSize(d2->dict) : 0;
5569 return size1 - size2;
5570}
5571
d2764cd6
PN
/* How the scores of an element found in several input sets are combined. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Fold 'val' into '*target' according to 'aggregate' (one of REDIS_AGGR_*):
 * add it, or keep the smaller / the larger of the two values. Any other
 * aggregate value trips an assertion. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisAssert(0 != 0);
        break;
    }
}
5588
2830ca53 5589static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
8f92e768 5590 int i, j, zsetnum;
d2764cd6 5591 int aggregate = REDIS_AGGR_SUM;
8f92e768 5592 zsetopsrc *src;
2830ca53
PN
5593 robj *dstobj;
5594 zset *dstzset;
b287c9bb
PN
5595 dictIterator *di;
5596 dictEntry *de;
5597
2830ca53
PN
5598 /* expect zsetnum input keys to be given */
5599 zsetnum = atoi(c->argv[2]->ptr);
5600 if (zsetnum < 1) {
5601 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5602 return;
b287c9bb 5603 }
2830ca53
PN
5604
5605 /* test if the expected number of keys would overflow */
5606 if (3+zsetnum > c->argc) {
b287c9bb
PN
5607 addReply(c,shared.syntaxerr);
5608 return;
5609 }
5610
2830ca53 5611 /* read keys to be used for input */
b9eed483 5612 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
2830ca53 5613 for (i = 0, j = 3; i < zsetnum; i++, j++) {
b287c9bb
PN
5614 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5615 if (!zsetobj) {
8f92e768 5616 src[i].dict = NULL;
b287c9bb
PN
5617 } else {
5618 if (zsetobj->type != REDIS_ZSET) {
8f92e768 5619 zfree(src);
b287c9bb
PN
5620 addReply(c,shared.wrongtypeerr);
5621 return;
5622 }
8f92e768 5623 src[i].dict = ((zset*)zsetobj->ptr)->dict;
b287c9bb 5624 }
2830ca53
PN
5625
5626 /* default all weights to 1 */
8f92e768 5627 src[i].weight = 1.0;
b287c9bb
PN
5628 }
5629
2830ca53
PN
5630 /* parse optional extra arguments */
5631 if (j < c->argc) {
d2764cd6 5632 int remaining = c->argc - j;
b287c9bb 5633
2830ca53 5634 while (remaining) {
d2764cd6 5635 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 5636 j++; remaining--;
2830ca53 5637 for (i = 0; i < zsetnum; i++, j++, remaining--) {
bbe025e0
AM
5638 if (getDoubleFromObject(c, c->argv[j], &src[i].weight) != REDIS_OK)
5639 return;
2830ca53 5640 }
d2764cd6
PN
5641 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5642 j++; remaining--;
5643 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5644 aggregate = REDIS_AGGR_SUM;
5645 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5646 aggregate = REDIS_AGGR_MIN;
5647 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5648 aggregate = REDIS_AGGR_MAX;
5649 } else {
5650 zfree(src);
5651 addReply(c,shared.syntaxerr);
5652 return;
5653 }
5654 j++; remaining--;
2830ca53 5655 } else {
8f92e768 5656 zfree(src);
2830ca53
PN
5657 addReply(c,shared.syntaxerr);
5658 return;
5659 }
5660 }
5661 }
b287c9bb 5662
d2764cd6
PN
5663 /* sort sets from the smallest to largest, this will improve our
5664 * algorithm's performance */
5665 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5666
2830ca53
PN
5667 dstobj = createZsetObject();
5668 dstzset = dstobj->ptr;
5669
5670 if (op == REDIS_OP_INTER) {
8f92e768
PN
5671 /* skip going over all entries if the smallest zset is NULL or empty */
5672 if (src[0].dict && dictSize(src[0].dict) > 0) {
5673 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5674 * from small to large, all src[i > 0].dict are non-empty too */
5675 di = dictGetIterator(src[0].dict);
2830ca53 5676 while((de = dictNext(di)) != NULL) {
d2764cd6
PN
5677 double *score = zmalloc(sizeof(double)), value;
5678 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5679
d2764cd6
PN
5680 for (j = 1; j < zsetnum; j++) {
5681 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5682 if (other) {
d2764cd6
PN
5683 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5684 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5685 } else {
5686 break;
5687 }
5688 }
b287c9bb 5689
2830ca53 5690 /* skip entry when not present in every source dict */
8f92e768 5691 if (j != zsetnum) {
2830ca53
PN
5692 zfree(score);
5693 } else {
5694 robj *o = dictGetEntryKey(de);
5695 dictAdd(dstzset->dict,o,score);
5696 incrRefCount(o); /* added to dictionary */
5697 zslInsert(dstzset->zsl,*score,o);
5698 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
5699 }
5700 }
2830ca53
PN
5701 dictReleaseIterator(di);
5702 }
5703 } else if (op == REDIS_OP_UNION) {
5704 for (i = 0; i < zsetnum; i++) {
8f92e768 5705 if (!src[i].dict) continue;
2830ca53 5706
8f92e768 5707 di = dictGetIterator(src[i].dict);
2830ca53
PN
5708 while((de = dictNext(di)) != NULL) {
5709 /* skip key when already processed */
5710 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5711
d2764cd6
PN
5712 double *score = zmalloc(sizeof(double)), value;
5713 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5714
d2764cd6
PN
5715 /* because the zsets are sorted by size, its only possible
5716 * for sets at larger indices to hold this entry */
5717 for (j = (i+1); j < zsetnum; j++) {
5718 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5719 if (other) {
d2764cd6
PN
5720 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5721 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5722 }
5723 }
b287c9bb 5724
2830ca53
PN
5725 robj *o = dictGetEntryKey(de);
5726 dictAdd(dstzset->dict,o,score);
5727 incrRefCount(o); /* added to dictionary */
5728 zslInsert(dstzset->zsl,*score,o);
5729 incrRefCount(o); /* added to skiplist */
5730 }
5731 dictReleaseIterator(di);
b287c9bb 5732 }
2830ca53
PN
5733 } else {
5734 /* unknown operator */
5735 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
5736 }
5737
5738 deleteKey(c->db,dstkey);
3ea27d37 5739 if (dstzset->zsl->length) {
5740 dictAdd(c->db->dict,dstkey,dstobj);
5741 incrRefCount(dstkey);
5742 addReplyLong(c, dstzset->zsl->length);
5743 server.dirty++;
5744 } else {
8bca8773 5745 decrRefCount(dstobj);
3ea27d37 5746 addReply(c, shared.czero);
5747 }
8f92e768 5748 zfree(src);
b287c9bb
PN
5749}
5750
2830ca53
PN
5751static void zunionCommand(redisClient *c) {
5752 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
5753}
5754
2830ca53
PN
5755static void zinterCommand(redisClient *c) {
5756 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
5757}
5758
e3870fab 5759static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 5760 robj *o;
bbe025e0
AM
5761 long start;
5762 long end;
752da584 5763 int withscores = 0;
dd88747b 5764 int llen;
5765 int rangelen, j;
5766 zset *zsetobj;
5767 zskiplist *zsl;
5768 zskiplistNode *ln;
5769 robj *ele;
752da584 5770
bbe025e0
AM
5771 if ((getLongFromObject(c, c->argv[2], &start) != REDIS_OK) ||
5772 (getLongFromObject(c, c->argv[3], &end) != REDIS_OK)) return;
5773
752da584 5774 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5775 withscores = 1;
5776 } else if (c->argc >= 5) {
5777 addReply(c,shared.syntaxerr);
5778 return;
5779 }
cc812361 5780
4e27f268 5781 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5782 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 5783 zsetobj = o->ptr;
5784 zsl = zsetobj->zsl;
5785 llen = zsl->length;
cc812361 5786
dd88747b 5787 /* convert negative indexes */
5788 if (start < 0) start = llen+start;
5789 if (end < 0) end = llen+end;
5790 if (start < 0) start = 0;
5791 if (end < 0) end = 0;
cc812361 5792
dd88747b 5793 /* indexes sanity checks */
5794 if (start > end || start >= llen) {
5795 /* Out of range start or start > end result in empty list */
5796 addReply(c,shared.emptymultibulk);
5797 return;
5798 }
5799 if (end >= llen) end = llen-1;
5800 rangelen = (end-start)+1;
cc812361 5801
dd88747b 5802 /* check if starting point is trivial, before searching
5803 * the element in log(N) time */
5804 if (reverse) {
5805 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5806 } else {
5807 ln = start == 0 ?
5808 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5809 }
cc812361 5810
dd88747b 5811 /* Return the result in form of a multi-bulk reply */
5812 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5813 withscores ? (rangelen*2) : rangelen));
5814 for (j = 0; j < rangelen; j++) {
5815 ele = ln->obj;
5816 addReplyBulk(c,ele);
5817 if (withscores)
5818 addReplyDouble(c,ln->score);
5819 ln = reverse ? ln->backward : ln->forward[0];
cc812361 5820 }
5821}
5822
e3870fab 5823static void zrangeCommand(redisClient *c) {
5824 zrangeGenericCommand(c,0);
5825}
5826
5827static void zrevrangeCommand(redisClient *c) {
5828 zrangeGenericCommand(c,1);
5829}
5830
f44dd428 5831/* This command implements both ZRANGEBYSCORE and ZCOUNT.
5832 * If justcount is non-zero, just the count is returned. */
5833static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 5834 robj *o;
f44dd428 5835 double min, max;
5836 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 5837 int offset = 0, limit = -1;
0500ef27
SH
5838 int withscores = 0;
5839 int badsyntax = 0;
5840
f44dd428 5841 /* Parse the min-max interval. If one of the values is prefixed
5842 * by the "(" character, it's considered "open". For instance
5843 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5844 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5845 if (((char*)c->argv[2]->ptr)[0] == '(') {
5846 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5847 minex = 1;
5848 } else {
5849 min = strtod(c->argv[2]->ptr,NULL);
5850 }
5851 if (((char*)c->argv[3]->ptr)[0] == '(') {
5852 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5853 maxex = 1;
5854 } else {
5855 max = strtod(c->argv[3]->ptr,NULL);
5856 }
5857
5858 /* Parse "WITHSCORES": note that if the command was called with
5859 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5860 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 5861 if (c->argc == 5 || c->argc == 8) {
3a3978b1 5862 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5863 withscores = 1;
5864 else
5865 badsyntax = 1;
0500ef27 5866 }
3a3978b1 5867 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 5868 badsyntax = 1;
0500ef27 5869 if (badsyntax) {
454d4e43 5870 addReplySds(c,
5871 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 5872 return;
0500ef27
SH
5873 }
5874
f44dd428 5875 /* Parse "LIMIT" */
0500ef27 5876 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 5877 addReply(c,shared.syntaxerr);
5878 return;
0500ef27 5879 } else if (c->argc == (7 + withscores)) {
80181f78 5880 offset = atoi(c->argv[5]->ptr);
5881 limit = atoi(c->argv[6]->ptr);
0b13687c 5882 if (offset < 0) offset = 0;
80181f78 5883 }
50c55df5 5884
f44dd428 5885 /* Ok, lookup the key and get the range */
50c55df5 5886 o = lookupKeyRead(c->db,c->argv[1]);
5887 if (o == NULL) {
4e27f268 5888 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 5889 } else {
5890 if (o->type != REDIS_ZSET) {
5891 addReply(c,shared.wrongtypeerr);
5892 } else {
5893 zset *zsetobj = o->ptr;
5894 zskiplist *zsl = zsetobj->zsl;
5895 zskiplistNode *ln;
f44dd428 5896 robj *ele, *lenobj = NULL;
5897 unsigned long rangelen = 0;
50c55df5 5898
f44dd428 5899 /* Get the first node with the score >= min, or with
5900 * score > min if 'minex' is true. */
50c55df5 5901 ln = zslFirstWithScore(zsl,min);
f44dd428 5902 while (minex && ln && ln->score == min) ln = ln->forward[0];
5903
50c55df5 5904 if (ln == NULL) {
5905 /* No element matching the speciifed interval */
f44dd428 5906 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 5907 return;
5908 }
5909
5910 /* We don't know in advance how many matching elements there
5911 * are in the list, so we push this object that will represent
5912 * the multi-bulk length in the output buffer, and will "fix"
5913 * it later */
f44dd428 5914 if (!justcount) {
5915 lenobj = createObject(REDIS_STRING,NULL);
5916 addReply(c,lenobj);
5917 decrRefCount(lenobj);
5918 }
50c55df5 5919
f44dd428 5920 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 5921 if (offset) {
5922 offset--;
5923 ln = ln->forward[0];
5924 continue;
5925 }
5926 if (limit == 0) break;
f44dd428 5927 if (!justcount) {
5928 ele = ln->obj;
dd88747b 5929 addReplyBulk(c,ele);
f44dd428 5930 if (withscores)
5931 addReplyDouble(c,ln->score);
5932 }
50c55df5 5933 ln = ln->forward[0];
5934 rangelen++;
80181f78 5935 if (limit > 0) limit--;
50c55df5 5936 }
f44dd428 5937 if (justcount) {
5938 addReplyLong(c,(long)rangelen);
5939 } else {
5940 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5941 withscores ? (rangelen*2) : rangelen);
5942 }
50c55df5 5943 }
5944 }
5945}
5946
f44dd428 5947static void zrangebyscoreCommand(redisClient *c) {
5948 genericZrangebyscoreCommand(c,0);
5949}
5950
5951static void zcountCommand(redisClient *c) {
5952 genericZrangebyscoreCommand(c,1);
5953}
5954
3c41331e 5955static void zcardCommand(redisClient *c) {
e197b441 5956 robj *o;
5957 zset *zs;
dd88747b 5958
5959 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5960 checkType(c,o,REDIS_ZSET)) return;
5961
5962 zs = o->ptr;
5963 addReplyUlong(c,zs->zsl->length);
e197b441 5964}
5965
6e333bbe 5966static void zscoreCommand(redisClient *c) {
5967 robj *o;
5968 zset *zs;
dd88747b 5969 dictEntry *de;
5970
5971 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5972 checkType(c,o,REDIS_ZSET)) return;
5973
5974 zs = o->ptr;
5975 de = dictFind(zs->dict,c->argv[2]);
5976 if (!de) {
96d8b4ee 5977 addReply(c,shared.nullbulk);
6e333bbe 5978 } else {
dd88747b 5979 double *score = dictGetEntryVal(de);
6e333bbe 5980
dd88747b 5981 addReplyDouble(c,*score);
6e333bbe 5982 }
5983}
5984
798d9e55 5985static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 5986 robj *o;
dd88747b 5987 zset *zs;
5988 zskiplist *zsl;
5989 dictEntry *de;
5990 unsigned long rank;
5991 double *score;
5992
5993 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5994 checkType(c,o,REDIS_ZSET)) return;
5995
5996 zs = o->ptr;
5997 zsl = zs->zsl;
5998 de = dictFind(zs->dict,c->argv[2]);
5999 if (!de) {
69d95c3e
PN
6000 addReply(c,shared.nullbulk);
6001 return;
6002 }
69d95c3e 6003
dd88747b 6004 score = dictGetEntryVal(de);
6005 rank = zslGetRank(zsl, *score, c->argv[2]);
6006 if (rank) {
6007 if (reverse) {
6008 addReplyLong(c, zsl->length - rank);
27b0ccca 6009 } else {
dd88747b 6010 addReplyLong(c, rank-1);
69d95c3e 6011 }
dd88747b 6012 } else {
6013 addReply(c,shared.nullbulk);
978c2c94 6014 }
6015}
6016
798d9e55
PN
6017static void zrankCommand(redisClient *c) {
6018 zrankGenericCommand(c, 0);
6019}
6020
6021static void zrevrankCommand(redisClient *c) {
6022 zrankGenericCommand(c, 1);
6023}
6024
cbba7dd7 6025/* =================================== Hashes =============================== */
978c2c94 6026static void hsetCommand(redisClient *c) {
6027 int update = 0;
6028 robj *o = lookupKeyWrite(c->db,c->argv[1]);
6029
6030 if (o == NULL) {
6031 o = createHashObject();
6032 dictAdd(c->db->dict,c->argv[1],o);
6033 incrRefCount(c->argv[1]);
6034 } else {
6035 if (o->type != REDIS_HASH) {
6036 addReply(c,shared.wrongtypeerr);
6037 return;
6038 }
6039 }
bae2c7ec 6040 /* We want to convert the zipmap into an hash table right now if the
6041 * entry to be added is too big. Note that we check if the object
6042 * is integer encoded before to try fetching the length in the test below.
6043 * This is because integers are small, but currently stringObjectLen()
6044 * performs a slow conversion: not worth it. */
6045 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
6046 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
6047 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
6048 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
6049 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
6050 {
6051 convertToRealHash(o);
6052 }
6053
978c2c94 6054 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6055 unsigned char *zm = o->ptr;
b1befe6a 6056 robj *valobj = getDecodedObject(c->argv[3]);
978c2c94 6057
6058 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
b1befe6a 6059 valobj->ptr,sdslen(valobj->ptr),&update);
6060 decrRefCount(valobj);
cbba7dd7 6061 o->ptr = zm;
bae2c7ec 6062
e9484a85
PN
6063 /* And here there is the second check for hash conversion. */
6064 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
bae2c7ec 6065 convertToRealHash(o);
978c2c94 6066 } else {
05df7621 6067 c->argv[2] = tryObjectEncoding(c->argv[2]);
bae2c7ec 6068 /* note that c->argv[3] is already encoded, as the latest arg
6069 * of a bulk command is always integer encoded if possible. */
2069d06a 6070 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
978c2c94 6071 incrRefCount(c->argv[2]);
6072 } else {
6073 update = 1;
6074 }
6075 incrRefCount(c->argv[3]);
6076 }
6077 server.dirty++;
6078 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
6079}
6080
d33278d1
PN
6081static void hmsetCommand(redisClient *c) {
6082 int i;
6083 robj *o, *key, *val;
6084
6085 if ((c->argc % 2) == 1) {
6086 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6087 return;
6088 }
6089
6090 if ((o = lookupKeyWrite(c->db,c->argv[1])) == NULL) {
6091 o = createHashObject();
6092 dictAdd(c->db->dict,c->argv[1],o);
6093 incrRefCount(c->argv[1]);
6094 } else {
6095 if (o->type != REDIS_HASH) {
6096 addReply(c,shared.wrongtypeerr);
6097 return;
6098 }
6099 }
6100
6101 /* We want to convert the zipmap into an hash table right now if the
6102 * entry to be added is too big. */
6103 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6104 for (i = 2; i < c->argc; i+=2) {
6105 if ((c->argv[i]->encoding == REDIS_ENCODING_RAW &&
6106 sdslen(c->argv[i]->ptr) > server.hash_max_zipmap_value) ||
6107 (c->argv[i+1]->encoding == REDIS_ENCODING_RAW &&
6108 sdslen(c->argv[i+1]->ptr) > server.hash_max_zipmap_value)) {
6109 convertToRealHash(o);
6110 break;
6111 }
6112 }
6113 }
6114
6115 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6116 unsigned char *zm = o->ptr;
6117
6118 for (i = 2; i < c->argc; i+=2) {
6119 key = getDecodedObject(c->argv[i]);
6120 val = getDecodedObject(c->argv[i+1]);
6121 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
6122 val->ptr,sdslen(val->ptr),NULL);
6123 decrRefCount(key);
6124 decrRefCount(val);
6125 o->ptr = zm;
6126 }
6127
6128 /* And here there is the second check for hash conversion. */
6129 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
6130 convertToRealHash(o);
6131 } else {
6132 for (i = 2; i < c->argc; i+=2) {
6133 key = tryObjectEncoding(c->argv[i]);
6134 val = tryObjectEncoding(c->argv[i+1]);
6135 if (dictReplace(o->ptr,key,val)) {
6136 incrRefCount(key);
6137 }
6138 incrRefCount(val);
6139 }
6140 }
6141
6142 addReply(c, shared.ok);
6143}
6144
01426b05 6145static void hincrbyCommand(redisClient *c) {
01426b05
PN
6146 long long value = 0, incr = 0;
6147 robj *o = lookupKeyWrite(c->db,c->argv[1]);
6148
6149 if (o == NULL) {
6150 o = createHashObject();
6151 dictAdd(c->db->dict,c->argv[1],o);
6152 incrRefCount(c->argv[1]);
6153 } else {
6154 if (o->type != REDIS_HASH) {
6155 addReply(c,shared.wrongtypeerr);
6156 return;
6157 }
6158 }
6159
bbe025e0
AM
6160 if (getLongLongFromObject(c, c->argv[3], &incr) != REDIS_OK) return;
6161
01426b05
PN
6162 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6163 unsigned char *zm = o->ptr;
6164 unsigned char *zval;
6165 unsigned int zvlen;
6166
6167 /* Find value if already present in hash */
6168 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6169 &zval,&zvlen)) {
6170 /* strtoll needs the char* to have a trailing \0, but
6171 * the zipmap doesn't include them. */
6172 sds szval = sdsnewlen(zval, zvlen);
6173 value = strtoll(szval,NULL,10);
6174 sdsfree(szval);
6175 }
6176
6177 value += incr;
6178 sds svalue = sdscatprintf(sdsempty(),"%lld",value);
6179 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
e9484a85 6180 (unsigned char*)svalue,sdslen(svalue),NULL);
01426b05
PN
6181 sdsfree(svalue);
6182 o->ptr = zm;
6183
e9484a85
PN
6184 /* Check if the zipmap needs to be converted. */
6185 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
01426b05
PN
6186 convertToRealHash(o);
6187 } else {
6188 robj *hval;
6189 dictEntry *de;
6190
6191 /* Find value if already present in hash */
6192 de = dictFind(o->ptr,c->argv[2]);
6193 if (de != NULL) {
6194 hval = dictGetEntryVal(de);
6195 if (hval->encoding == REDIS_ENCODING_RAW)
6196 value = strtoll(hval->ptr,NULL,10);
6197 else if (hval->encoding == REDIS_ENCODING_INT)
6198 value = (long)hval->ptr;
6199 else
6200 redisAssert(1 != 1);
6201 }
6202
6203 value += incr;
6204 hval = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
05df7621 6205 hval = tryObjectEncoding(hval);
01426b05
PN
6206 if (dictReplace(o->ptr,c->argv[2],hval)) {
6207 incrRefCount(c->argv[2]);
6208 }
6209 }
6210
6211 server.dirty++;
aa7c2934 6212 addReplyLongLong(c, value);
01426b05
PN
6213}
6214
978c2c94 6215static void hgetCommand(redisClient *c) {
dd88747b 6216 robj *o;
978c2c94 6217
dd88747b 6218 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6219 checkType(c,o,REDIS_HASH)) return;
6220
6221 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6222 unsigned char *zm = o->ptr;
6223 unsigned char *val;
6224 unsigned int vlen;
164ee595 6225 robj *field;
dd88747b 6226
164ee595 6227 field = getDecodedObject(c->argv[2]);
6228 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
dd88747b 6229 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6230 addReplySds(c,sdsnewlen(val,vlen));
6231 addReply(c,shared.crlf);
164ee595 6232 decrRefCount(field);
dd88747b 6233 return;
6234 } else {
6235 addReply(c,shared.nullbulk);
164ee595 6236 decrRefCount(field);
bcd11906 6237 return;
6238 }
dd88747b 6239 } else {
6240 struct dictEntry *de;
bcd11906 6241
dd88747b 6242 de = dictFind(o->ptr,c->argv[2]);
6243 if (de == NULL) {
6244 addReply(c,shared.nullbulk);
978c2c94 6245 } else {
dd88747b 6246 robj *e = dictGetEntryVal(de);
978c2c94 6247
dd88747b 6248 addReplyBulk(c,e);
978c2c94 6249 }
69d95c3e 6250 }
69d95c3e
PN
6251}
6252
09aeb579
PN
6253static void hmgetCommand(redisClient *c) {
6254 int i;
6255
6256 robj *o = lookupKeyRead(c->db, c->argv[1]);
6257 if (o == NULL) {
6258 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6259 for (i = 2; i < c->argc; i++) {
6260 addReply(c,shared.nullbulk);
6261 }
6262 return;
6263 } else {
6264 if (o->type != REDIS_HASH) {
6265 addReply(c,shared.wrongtypeerr);
6266 return;
6267 }
6268 }
6269
6270 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6271 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6272 unsigned char *zm = o->ptr;
6273 unsigned char *v;
6274 unsigned int vlen;
6275 robj *field;
6276
6277 for (i = 2; i < c->argc; i++) {
6278 field = getDecodedObject(c->argv[i]);
6279 if (zipmapGet(zm,field->ptr,sdslen(field->ptr),&v,&vlen)) {
6280 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6281 addReplySds(c,sdsnewlen(v,vlen));
6282 addReply(c,shared.crlf);
6283 } else {
6284 addReply(c,shared.nullbulk);
6285 }
6286 decrRefCount(field);
6287 }
6288 } else {
6289 dictEntry *de;
6290
6291 for (i = 2; i < c->argc; i++) {
6292 de = dictFind(o->ptr,c->argv[i]);
6293 if (de != NULL) {
6294 addReplyBulk(c,(robj*)dictGetEntryVal(de));
6295 } else {
6296 addReply(c,shared.nullbulk);
6297 }
6298 }
6299 }
6300}
6301
07efaf74 6302static void hdelCommand(redisClient *c) {
dd88747b 6303 robj *o;
6304 int deleted = 0;
07efaf74 6305
dd88747b 6306 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6307 checkType(c,o,REDIS_HASH)) return;
07efaf74 6308
dd88747b 6309 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
2a1198b4 6310 robj *field = getDecodedObject(c->argv[2]);
6311
dd88747b 6312 o->ptr = zipmapDel((unsigned char*) o->ptr,
2a1198b4 6313 (unsigned char*) field->ptr,
6314 sdslen(field->ptr), &deleted);
6315 decrRefCount(field);
3ea27d37 6316 if (zipmapLen((unsigned char*) o->ptr) == 0)
6317 deleteKey(c->db,c->argv[1]);
dd88747b 6318 } else {
6319 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
3ea27d37 6320 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6321 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
07efaf74 6322 }
c77169b7 6323 if (deleted) server.dirty++;
dd88747b 6324 addReply(c,deleted ? shared.cone : shared.czero);
07efaf74 6325}
6326
92b27fe9 6327static void hlenCommand(redisClient *c) {
6328 robj *o;
6329 unsigned long len;
6330
dd88747b 6331 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 6332 checkType(c,o,REDIS_HASH)) return;
6333
6334 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6335 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6336 addReplyUlong(c,len);
6337}
6338
78409a0f 6339#define REDIS_GETALL_KEYS 1
6340#define REDIS_GETALL_VALS 2
6341static void genericHgetallCommand(redisClient *c, int flags) {
6342 robj *o, *lenobj;
6343 unsigned long count = 0;
6344
4e27f268 6345 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 6346 || checkType(c,o,REDIS_HASH)) return;
6347
6348 lenobj = createObject(REDIS_STRING,NULL);
6349 addReply(c,lenobj);
6350 decrRefCount(lenobj);
6351
6352 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6353 unsigned char *p = zipmapRewind(o->ptr);
6354 unsigned char *field, *val;
6355 unsigned int flen, vlen;
6356
6357 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6358 robj *aux;
6359
6360 if (flags & REDIS_GETALL_KEYS) {
6361 aux = createStringObject((char*)field,flen);
6362 addReplyBulk(c,aux);
6363 decrRefCount(aux);
6364 count++;
6365 }
6366 if (flags & REDIS_GETALL_VALS) {
6367 aux = createStringObject((char*)val,vlen);
6368 addReplyBulk(c,aux);
6369 decrRefCount(aux);
6370 count++;
6371 }
6372 }
6373 } else {
6374 dictIterator *di = dictGetIterator(o->ptr);
6375 dictEntry *de;
6376
6377 while((de = dictNext(di)) != NULL) {
6378 robj *fieldobj = dictGetEntryKey(de);
6379 robj *valobj = dictGetEntryVal(de);
6380
6381 if (flags & REDIS_GETALL_KEYS) {
6382 addReplyBulk(c,fieldobj);
6383 count++;
6384 }
6385 if (flags & REDIS_GETALL_VALS) {
6386 addReplyBulk(c,valobj);
6387 count++;
6388 }
6389 }
6390 dictReleaseIterator(di);
6391 }
6392 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6393}
6394
6395static void hkeysCommand(redisClient *c) {
6396 genericHgetallCommand(c,REDIS_GETALL_KEYS);
6397}
6398
6399static void hvalsCommand(redisClient *c) {
6400 genericHgetallCommand(c,REDIS_GETALL_VALS);
6401}
6402
6403static void hgetallCommand(redisClient *c) {
6404 genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
6405}
6406
a86f14b1 6407static void hexistsCommand(redisClient *c) {
6408 robj *o;
6409 int exists = 0;
6410
6411 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6412 checkType(c,o,REDIS_HASH)) return;
6413
6414 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6415 robj *field;
6416 unsigned char *zm = o->ptr;
6417
6418 field = getDecodedObject(c->argv[2]);
6419 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6420 decrRefCount(field);
6421 } else {
6422 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6423 }
6424 addReply(c,exists ? shared.cone : shared.czero);
6425}
6426
ada386b2 6427static void convertToRealHash(robj *o) {
6428 unsigned char *key, *val, *p, *zm = o->ptr;
6429 unsigned int klen, vlen;
6430 dict *dict = dictCreate(&hashDictType,NULL);
6431
6432 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6433 p = zipmapRewind(zm);
6434 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6435 robj *keyobj, *valobj;
6436
6437 keyobj = createStringObject((char*)key,klen);
6438 valobj = createStringObject((char*)val,vlen);
05df7621 6439 keyobj = tryObjectEncoding(keyobj);
6440 valobj = tryObjectEncoding(valobj);
ada386b2 6441 dictAdd(dict,keyobj,valobj);
6442 }
6443 o->encoding = REDIS_ENCODING_HT;
6444 o->ptr = dict;
6445 zfree(zm);
6446}
6447
6b47e12e 6448/* ========================= Non type-specific commands ==================== */
6449
ed9b544e 6450static void flushdbCommand(redisClient *c) {
ca37e9cd 6451 server.dirty += dictSize(c->db->dict);
3305306f 6452 dictEmpty(c->db->dict);
6453 dictEmpty(c->db->expires);
ed9b544e 6454 addReply(c,shared.ok);
ed9b544e 6455}
6456
6457static void flushallCommand(redisClient *c) {
ca37e9cd 6458 server.dirty += emptyDb();
ed9b544e 6459 addReply(c,shared.ok);
500ece7c 6460 if (server.bgsavechildpid != -1) {
6461 kill(server.bgsavechildpid,SIGKILL);
6462 rdbRemoveTempFile(server.bgsavechildpid);
6463 }
f78fd11b 6464 rdbSave(server.dbfilename);
ca37e9cd 6465 server.dirty++;
ed9b544e 6466}
6467
56906eef 6468static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 6469 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 6470 so->type = type;
6471 so->pattern = pattern;
6472 return so;
6473}
6474
6475/* Return the value associated to the key with a name obtained
6476 * substituting the first occurence of '*' in 'pattern' with 'subst' */
56906eef 6477static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
ed9b544e 6478 char *p;
6479 sds spat, ssub;
6480 robj keyobj;
6481 int prefixlen, sublen, postfixlen;
ed9b544e 6482 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6483 struct {
f1017b3f 6484 long len;
6485 long free;
ed9b544e 6486 char buf[REDIS_SORTKEY_MAX+1];
6487 } keyname;
6488
28173a49 6489 /* If the pattern is "#" return the substitution object itself in order
6490 * to implement the "SORT ... GET #" feature. */
6491 spat = pattern->ptr;
6492 if (spat[0] == '#' && spat[1] == '\0') {
6493 return subst;
6494 }
6495
6496 /* The substitution object may be specially encoded. If so we create
9d65a1bb 6497 * a decoded object on the fly. Otherwise getDecodedObject will just
6498 * increment the ref count, that we'll decrement later. */
6499 subst = getDecodedObject(subst);
942a3961 6500
ed9b544e 6501 ssub = subst->ptr;
6502 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6503 p = strchr(spat,'*');
ed5a857a 6504 if (!p) {
6505 decrRefCount(subst);
6506 return NULL;
6507 }
ed9b544e 6508
6509 prefixlen = p-spat;
6510 sublen = sdslen(ssub);
6511 postfixlen = sdslen(spat)-(prefixlen+1);
6512 memcpy(keyname.buf,spat,prefixlen);
6513 memcpy(keyname.buf+prefixlen,ssub,sublen);
6514 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6515 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6516 keyname.len = prefixlen+sublen+postfixlen;
6517
dfc5e96c 6518 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
942a3961 6519 decrRefCount(subst);
6520
a4d1ba9a 6521 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
3305306f 6522 return lookupKeyRead(db,&keyobj);
ed9b544e 6523}
6524
6525/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6526 * the additional parameter is not standard but a BSD-specific we have to
6527 * pass sorting parameters via the global 'server' structure */
6528static int sortCompare(const void *s1, const void *s2) {
6529 const redisSortObject *so1 = s1, *so2 = s2;
6530 int cmp;
6531
6532 if (!server.sort_alpha) {
6533 /* Numeric sorting. Here it's trivial as we precomputed scores */
6534 if (so1->u.score > so2->u.score) {
6535 cmp = 1;
6536 } else if (so1->u.score < so2->u.score) {
6537 cmp = -1;
6538 } else {
6539 cmp = 0;
6540 }
6541 } else {
6542 /* Alphanumeric sorting */
6543 if (server.sort_bypattern) {
6544 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6545 /* At least one compare object is NULL */
6546 if (so1->u.cmpobj == so2->u.cmpobj)
6547 cmp = 0;
6548 else if (so1->u.cmpobj == NULL)
6549 cmp = -1;
6550 else
6551 cmp = 1;
6552 } else {
6553 /* We have both the objects, use strcoll */
6554 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6555 }
6556 } else {
6557 /* Compare elements directly */
9d65a1bb 6558 robj *dec1, *dec2;
6559
6560 dec1 = getDecodedObject(so1->obj);
6561 dec2 = getDecodedObject(so2->obj);
6562 cmp = strcoll(dec1->ptr,dec2->ptr);
6563 decrRefCount(dec1);
6564 decrRefCount(dec2);
ed9b544e 6565 }
6566 }
6567 return server.sort_desc ? -cmp : cmp;
6568}
6569
6570/* The SORT command is the most complex command in Redis. Warning: this code
 * is optimized for speed and a bit less for readability.
 *
 * Options parsed below (case insensitive, any order): ASC, DESC, ALPHA,
 * LIMIT <start> <count>, STORE <key>, BY <pattern>, GET <pattern>.
 * Anything else, or an option missing its arguments, yields a syntax error.
 *
 * The key to sort (argv[1]) must hold a list, a set or a sorted set; a
 * missing key gets the empty multi bulk reply.
 *
 * Without STORE the result is sent to the client as a multi bulk reply.
 * With STORE the result is saved as a list at the given key and the number
 * of stored elements is returned as an integer reply. */
6572static void sortCommand(redisClient *c) {
ed9b544e 6573 list *operations;
6574 int outputlen = 0;
6575 int desc = 0, alpha = 0;
6576 int limit_start = 0, limit_count = -1, start, end;
6577 int j, dontsort = 0, vectorlen;
6578 int getop = 0; /* GET operation counter */
443c6409 6579 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 6580 redisSortObject *vector; /* Resulting vector to sort */
6581
6582 /* Lookup the key to sort. It must be of the right types */
3305306f 6583 sortval = lookupKeyRead(c->db,c->argv[1]);
6584 if (sortval == NULL) {
4e27f268 6585 addReply(c,shared.emptymultibulk);
ed9b544e 6586 return;
6587 }
a5eb649b 6588 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6589 sortval->type != REDIS_ZSET)
6590 {
c937aa89 6591 addReply(c,shared.wrongtypeerr);
ed9b544e 6592 return;
6593 }
6594
6595 /* Create a list of operations to perform for every sorted element.
 * Operations can be GET/DEL/INCR/DECR (only GET is parsed by the loop
 * below). */
6597 operations = listCreate();
092dac2a 6598 listSetFreeMethod(operations,zfree);
ed9b544e 6599 j = 2;
6600
6601 /* Now we need to protect sortval incrementing its count, in the future
 * SORT may have options able to overwrite/delete keys during the sorting
 * and the sorted key itself may get destroyed */
6604 incrRefCount(sortval);
6605
6606 /* The SORT command has an SQL-alike syntax, parse it */
6607 while(j < c->argc) {
6608 int leftargs = c->argc-j-1;
6609 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6610 desc = 0;
6611 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6612 desc = 1;
6613 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6614 alpha = 1;
6615 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6616 limit_start = atoi(c->argv[j+1]->ptr);
6617 limit_count = atoi(c->argv[j+2]->ptr);
6618 j+=2;
443c6409 6619 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6620 storekey = c->argv[j+1];
6621 j++;
ed9b544e 6622 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6623 sortby = c->argv[j+1];
6624 /* If the BY pattern does not contain '*', i.e. it is constant,
 * we don't need to sort nor to lookup the weight keys. */
6626 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6627 j++;
6628 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6629 listAddNodeTail(operations,createSortOperation(
6630 REDIS_SORT_GET,c->argv[j+1]));
6631 getop++;
6632 j++;
ed9b544e 6633 } else {
/* Syntax error: undo the reference taken on sortval and release the
 * operations list before replying. */
6634 decrRefCount(sortval);
6635 listRelease(operations);
c937aa89 6636 addReply(c,shared.syntaxerr);
ed9b544e 6637 return;
6638 }
6639 j++;
6640 }
6641
6642 /* Load the sorting vector with all the objects to sort */
a5eb649b 6643 switch(sortval->type) {
6644 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6645 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6646 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
dfc5e96c 6647 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
a5eb649b 6648 }
ed9b544e 6649 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 6650 j = 0;
a5eb649b 6651
ed9b544e 6652 if (sortval->type == REDIS_LIST) {
6653 list *list = sortval->ptr;
6208b3a7 6654 listNode *ln;
c7df85a4 6655 listIter li;
6208b3a7 6656
c7df85a4 6657 listRewind(list,&li);
6658 while((ln = listNext(&li))) {
ed9b544e 6659 robj *ele = ln->value;
6660 vector[j].obj = ele;
6661 vector[j].u.score = 0;
6662 vector[j].u.cmpobj = NULL;
ed9b544e 6663 j++;
6664 }
6665 } else {
a5eb649b 6666 dict *set;
ed9b544e 6667 dictIterator *di;
6668 dictEntry *setele;
6669
/* Sets are iterated through their dict, sorted sets through the dict
 * member of the zset structure. */
a5eb649b 6670 if (sortval->type == REDIS_SET) {
6671 set = sortval->ptr;
6672 } else {
6673 zset *zs = sortval->ptr;
6674 set = zs->dict;
6675 }
6676
ed9b544e 6677 di = dictGetIterator(set);
ed9b544e 6678 while((setele = dictNext(di)) != NULL) {
6679 vector[j].obj = dictGetEntryKey(setele);
6680 vector[j].u.score = 0;
6681 vector[j].u.cmpobj = NULL;
6682 j++;
6683 }
6684 dictReleaseIterator(di);
6685 }
dfc5e96c 6686 redisAssert(j == vectorlen);
ed9b544e 6687
6688 /* Now it's time to load the right scores in the sorting vector */
6689 if (dontsort == 0) {
6690 for (j = 0; j < vectorlen; j++) {
6691 if (sortby) {
6692 robj *byval;
6693
3305306f 6694 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
/* Elements whose weight key is missing or is not a string keep the
 * zero/NULL weight assigned when the vector was loaded. */
ed9b544e 6695 if (!byval || byval->type != REDIS_STRING) continue;
6696 if (alpha) {
9d65a1bb 6697 vector[j].u.cmpobj = getDecodedObject(byval);
ed9b544e 6698 } else {
942a3961 6699 if (byval->encoding == REDIS_ENCODING_RAW) {
6700 vector[j].u.score = strtod(byval->ptr,NULL);
6701 } else {
9d65a1bb 6702 /* Don't need to decode the object if it's
 * integer-encoded (the only encoding supported) so
 * far. We can just cast it */
f1017b3f 6705 if (byval->encoding == REDIS_ENCODING_INT) {
942a3961 6706 vector[j].u.score = (long)byval->ptr;
f1017b3f 6707 } else
dfc5e96c 6708 redisAssert(1 != 1);
942a3961 6709 }
ed9b544e 6710 }
6711 } else {
942a3961 6712 if (!alpha) {
6713 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6714 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6715 else {
6716 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6717 vector[j].u.score = (long) vector[j].obj->ptr;
6718 else
dfc5e96c 6719 redisAssert(1 != 1);
942a3961 6720 }
6721 }
ed9b544e 6722 }
6723 }
6724 }
6725
6726 /* We are ready to sort the vector... perform a bit of sanity check
 * on the LIMIT option too. We'll use a partial version of quicksort.
 * After the clamping below [start,end] is an inclusive window inside the
 * vector; an out of range start gives end < start, i.e. an empty window. */
6728 start = (limit_start < 0) ? 0 : limit_start;
6729 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6730 if (start >= vectorlen) {
6731 start = vectorlen-1;
6732 end = vectorlen-2;
6733 }
6734 if (end >= vectorlen) end = vectorlen-1;
6735
6736 if (dontsort == 0) {
/* sortCompare() reads its parameters from these globals. */
6737 server.sort_desc = desc;
6738 server.sort_alpha = alpha;
6739 server.sort_bypattern = sortby ? 1 : 0;
/* pqsort() is used only with a BY pattern and a window narrower than
 * the whole vector; every other case uses plain qsort(). */
5f5b9840 6740 if (sortby && (start != 0 || end != vectorlen-1))
6741 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6742 else
6743 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 6744 }
6745
6746 /* Send command output to the output buffer, performing the specified
 * GET/DEL/INCR/DECR operations if any. With GET operations every element
 * of the window produces one output item per GET. */
6748 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 6749 if (storekey == NULL) {
6750 /* STORE option not specified, send the sorting result to client */
6751 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6752 for (j = start; j <= end; j++) {
6753 listNode *ln;
c7df85a4 6754 listIter li;
6755
dd88747b 6756 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 6757 listRewind(operations,&li);
6758 while((ln = listNext(&li))) {
443c6409 6759 redisSortOperation *sop = ln->value;
6760 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6761 vector[j].obj);
6762
6763 if (sop->type == REDIS_SORT_GET) {
6764 if (!val || val->type != REDIS_STRING) {
6765 addReply(c,shared.nullbulk);
6766 } else {
dd88747b 6767 addReplyBulk(c,val);
443c6409 6768 }
6769 } else {
dfc5e96c 6770 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 6771 }
6772 }
ed9b544e 6773 }
443c6409 6774 } else {
6775 robj *listObject = createListObject();
6776 list *listPtr = (list*) listObject->ptr;
6777
6778 /* STORE option specified, set the sorting result as a List object */
6779 for (j = start; j <= end; j++) {
6780 listNode *ln;
c7df85a4 6781 listIter li;
6782
443c6409 6783 if (!getop) {
6784 listAddNodeTail(listPtr,vector[j].obj);
6785 incrRefCount(vector[j].obj);
6786 }
c7df85a4 6787 listRewind(operations,&li);
6788 while((ln = listNext(&li))) {
443c6409 6789 redisSortOperation *sop = ln->value;
6790 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6791 vector[j].obj);
6792
6793 if (sop->type == REDIS_SORT_GET) {
/* A missing or non-string GET target is stored as an empty string. */
6794 if (!val || val->type != REDIS_STRING) {
6795 listAddNodeTail(listPtr,createStringObject("",0));
6796 } else {
6797 listAddNodeTail(listPtr,val);
6798 incrRefCount(val);
6799 }
ed9b544e 6800 } else {
dfc5e96c 6801 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 6802 }
ed9b544e 6803 }
ed9b544e 6804 }
121796f7 6805 if (dictReplace(c->db->dict,storekey,listObject)) {
6806 incrRefCount(storekey);
6807 }
443c6409 6808 /* Note: we add 1 because the DB is dirty anyway since even if the
 * SORT result is empty a new key is set and maybe the old content
 * replaced. */
6811 server.dirty += 1+outputlen;
6812 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 6813 }
6814
6815 /* Cleanup */
6816 decrRefCount(sortval);
6817 listRelease(operations);
6818 for (j = 0; j < vectorlen; j++) {
/* u.cmpobj holds a reference only in the BY + ALPHA case (taken with
 * getDecodedObject() while loading the weights). */
6819 if (sortby && alpha && vector[j].u.cmpobj)
6820 decrRefCount(vector[j].u.cmpobj);
6821 }
6822 zfree(vector);
6823}
6824
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer and always receives a nul terminated
 * string; no size is passed, so the caller must provide enough room (the
 * longest possible output is the 20 digit byte count plus 'B' and the
 * terminator). Values below 1024 are printed as an exact byte count, larger
 * ones with two decimals and a K/M/G/T/P suffix (powers of 1024). Anything
 * from 1024^6 upward falls back to an exact byte count, so the buffer is
 * never left unwritten. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else if (n < (1024LL*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024*1024*1024);
        sprintf(s,"%.2fP",d);
    } else {
        /* Too large for any suffix: report the raw byte count. */
        sprintf(s,"%lluB",n);
    }
}
6845
1c85b79f 6846/* Create the string returned by the INFO command. This is decoupled
 * from the INFO command itself as we need to report the same information
 * on memory corruption problems.
 *
 * Returns a newly built sds string made of "name:value\r\n" lines:
 * a general section, then a replication section when server.masterhost is
 * set, then a virtual memory section when server.vm_enabled is set, and
 * finally one "dbN:keys=..,expires=.." line for every database that holds
 * at least one key or one expire. */
6849static sds genRedisInfoString(void) {
ed9b544e 6850 sds info;
6851 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 6852 int j;
ec6c7a1d 6853 char hmem[64];
55a8298f 6854
/* Human readable rendering of the used memory, for used_memory_human. */
b72f6a4b 6855 bytesToHuman(hmem,zmalloc_used_memory());
/* NOTE(review): each conversion specifier below must match the type of
 * the corresponding argument; several field types are declared outside
 * this view -- confirm before changing either list. */
ed9b544e 6856 info = sdscatprintf(sdsempty(),
6857 "redis_version:%s\r\n"
f1017b3f 6858 "arch_bits:%s\r\n"
7a932b74 6859 "multiplexing_api:%s\r\n"
0d7170a4 6860 "process_id:%ld\r\n"
682ac724 6861 "uptime_in_seconds:%ld\r\n"
6862 "uptime_in_days:%ld\r\n"
ed9b544e 6863 "connected_clients:%d\r\n"
6864 "connected_slaves:%d\r\n"
f86a74e9 6865 "blocked_clients:%d\r\n"
5fba9f71 6866 "used_memory:%zu\r\n"
ec6c7a1d 6867 "used_memory_human:%s\r\n"
ed9b544e 6868 "changes_since_last_save:%lld\r\n"
be2bb6b0 6869 "bgsave_in_progress:%d\r\n"
682ac724 6870 "last_save_time:%ld\r\n"
b3fad521 6871 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 6872 "total_connections_received:%lld\r\n"
6873 "total_commands_processed:%lld\r\n"
2a6a2ed1 6874 "expired_keys:%lld\r\n"
55a8298f 6875 "hash_max_zipmap_entries:%ld\r\n"
6876 "hash_max_zipmap_value:%ld\r\n"
ffc6b7f8 6877 "pubsub_channels:%ld\r\n"
6878 "pubsub_patterns:%u\r\n"
7d98e08c 6879 "vm_enabled:%d\r\n"
a0f643ea 6880 "role:%s\r\n"
ed9b544e 6881 ,REDIS_VERSION,
f1017b3f 6882 (sizeof(long) == 8) ? "64" : "32",
7a932b74 6883 aeGetApiName(),
0d7170a4 6884 (long) getpid(),
a0f643ea 6885 uptime,
6886 uptime/(3600*24),
ed9b544e 6887 listLength(server.clients)-listLength(server.slaves),
6888 listLength(server.slaves),
d5d55fc3 6889 server.blpop_blocked_clients,
b72f6a4b 6890 zmalloc_used_memory(),
ec6c7a1d 6891 hmem,
ed9b544e 6892 server.dirty,
9d65a1bb 6893 server.bgsavechildpid != -1,
ed9b544e 6894 server.lastsave,
b3fad521 6895 server.bgrewritechildpid != -1,
ed9b544e 6896 server.stat_numconnections,
6897 server.stat_numcommands,
2a6a2ed1 6898 server.stat_expiredkeys,
55a8298f 6899 server.hash_max_zipmap_entries,
6900 server.hash_max_zipmap_value,
ffc6b7f8 6901 dictSize(server.pubsub_channels),
6902 listLength(server.pubsub_patterns),
7d98e08c 6903 server.vm_enabled != 0,
a0f643ea 6904 server.masterhost == NULL ? "master" : "slave"
ed9b544e 6905 );
/* Replication details, only when a master host is configured. */
a0f643ea 6906 if (server.masterhost) {
6907 info = sdscatprintf(info,
6908 "master_host:%s\r\n"
6909 "master_port:%d\r\n"
6910 "master_link_status:%s\r\n"
6911 "master_last_io_seconds_ago:%d\r\n"
6912 ,server.masterhost,
6913 server.masterport,
6914 (server.replstate == REDIS_REPL_CONNECTED) ?
6915 "up" : "down",
f72b934d 6916 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 6917 );
6918 }
/* Virtual memory statistics, read between lockThreadedIO() and
 * unlockThreadedIO(). */
7d98e08c 6919 if (server.vm_enabled) {
1064ef87 6920 lockThreadedIO();
7d98e08c 6921 info = sdscatprintf(info,
6922 "vm_conf_max_memory:%llu\r\n"
6923 "vm_conf_page_size:%llu\r\n"
6924 "vm_conf_pages:%llu\r\n"
6925 "vm_stats_used_pages:%llu\r\n"
6926 "vm_stats_swapped_objects:%llu\r\n"
6927 "vm_stats_swappin_count:%llu\r\n"
6928 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 6929 "vm_stats_io_newjobs_len:%lu\r\n"
6930 "vm_stats_io_processing_len:%lu\r\n"
6931 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 6932 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 6933 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 6934 ,(unsigned long long) server.vm_max_memory,
6935 (unsigned long long) server.vm_page_size,
6936 (unsigned long long) server.vm_pages,
6937 (unsigned long long) server.vm_stats_used_pages,
6938 (unsigned long long) server.vm_stats_swapped_objects,
6939 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 6940 (unsigned long long) server.vm_stats_swapouts,
6941 (unsigned long) listLength(server.io_newjobs),
6942 (unsigned long) listLength(server.io_processing),
6943 (unsigned long) listLength(server.io_processed),
d5d55fc3 6944 (unsigned long) server.io_active_threads,
6945 (unsigned long) server.vm_blocked_clients
7d98e08c 6946 );
1064ef87 6947 unlockThreadedIO();
7d98e08c 6948 }
/* Per database key counts; databases with no keys and no expires are
 * omitted. */
c3cb078d 6949 for (j = 0; j < server.dbnum; j++) {
6950 long long keys, vkeys;
6951
6952 keys = dictSize(server.db[j].dict);
6953 vkeys = dictSize(server.db[j].expires);
6954 if (keys || vkeys) {
9d65a1bb 6955 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 6956 j, keys, vkeys);
6957 }
6958 }
1c85b79f 6959 return info;
6960}
6961
6962static void infoCommand(redisClient *c) {
6963 sds info = genRedisInfoString();
83c6a618 6964 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6965 (unsigned long)sdslen(info)));
ed9b544e 6966 addReplySds(c,info);
70003d28 6967 addReply(c,shared.crlf);
ed9b544e 6968}
6969
3305306f 6970static void monitorCommand(redisClient *c) {
6971 /* ignore MONITOR if aleady slave or in monitor mode */
6972 if (c->flags & REDIS_SLAVE) return;
6973
6974 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6975 c->slaveseldb = 0;
6b47e12e 6976 listAddNodeTail(server.monitors,c);
3305306f 6977 addReply(c,shared.ok);
6978}
6979
6980/* ================================= Expire ================================= */
6981static int removeExpire(redisDb *db, robj *key) {
6982 if (dictDelete(db->expires,key) == DICT_OK) {
6983 return 1;
6984 } else {
6985 return 0;
6986 }
6987}
6988
6989static int setExpire(redisDb *db, robj *key, time_t when) {
6990 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6991 return 0;
6992 } else {
6993 incrRefCount(key);
6994 return 1;
6995 }
6996}
6997
bb32ede5 6998/* Return the expire time of the specified key, or -1 if no expire
6999 * is associated with this key (i.e. the key is non volatile) */
7000static time_t getExpire(redisDb *db, robj *key) {
7001 dictEntry *de;
7002
7003 /* No expire? return ASAP */
7004 if (dictSize(db->expires) == 0 ||
7005 (de = dictFind(db->expires,key)) == NULL) return -1;
7006
7007 return (time_t) dictGetEntryVal(de);
7008}
7009
3305306f 7010static int expireIfNeeded(redisDb *db, robj *key) {
7011 time_t when;
7012 dictEntry *de;
7013
7014 /* No expire? return ASAP */
7015 if (dictSize(db->expires) == 0 ||
7016 (de = dictFind(db->expires,key)) == NULL) return 0;
7017
7018 /* Lookup the expire */
7019 when = (time_t) dictGetEntryVal(de);
7020 if (time(NULL) <= when) return 0;
7021
7022 /* Delete the key */
7023 dictDelete(db->expires,key);
2a6a2ed1 7024 server.stat_expiredkeys++;
3305306f 7025 return dictDelete(db->dict,key) == DICT_OK;
7026}
7027
7028static int deleteIfVolatile(redisDb *db, robj *key) {
7029 dictEntry *de;
7030
7031 /* No expire? return ASAP */
7032 if (dictSize(db->expires) == 0 ||
7033 (de = dictFind(db->expires,key)) == NULL) return 0;
7034
7035 /* Delete the key */
0c66a471 7036 server.dirty++;
2a6a2ed1 7037 server.stat_expiredkeys++;
3305306f 7038 dictDelete(db->expires,key);
7039 return dictDelete(db->dict,key) == DICT_OK;
7040}
7041
bbe025e0 7042static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 7043 dictEntry *de;
bbe025e0
AM
7044 time_t seconds;
7045
7046 if (getLongFromObject(c, param, &seconds) != REDIS_OK) return;
7047
7048 seconds -= offset;
3305306f 7049
802e8373 7050 de = dictFind(c->db->dict,key);
3305306f 7051 if (de == NULL) {
7052 addReply(c,shared.czero);
7053 return;
7054 }
43e5ccdf 7055 if (seconds < 0) {
7056 if (deleteKey(c->db,key)) server.dirty++;
7057 addReply(c, shared.cone);
3305306f 7058 return;
7059 } else {
7060 time_t when = time(NULL)+seconds;
802e8373 7061 if (setExpire(c->db,key,when)) {
3305306f 7062 addReply(c,shared.cone);
77423026 7063 server.dirty++;
7064 } else {
3305306f 7065 addReply(c,shared.czero);
77423026 7066 }
3305306f 7067 return;
7068 }
7069}
7070
802e8373 7071static void expireCommand(redisClient *c) {
bbe025e0 7072 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 7073}
7074
7075static void expireatCommand(redisClient *c) {
bbe025e0 7076 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 7077}
7078
fd88489a 7079static void ttlCommand(redisClient *c) {
7080 time_t expire;
7081 int ttl = -1;
7082
7083 expire = getExpire(c->db,c->argv[1]);
7084 if (expire != -1) {
7085 ttl = (int) (expire-time(NULL));
7086 if (ttl < 0) ttl = -1;
7087 }
7088 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7089}
7090
6e469882 7091/* ================================ MULTI/EXEC ============================== */
7092
7093/* Client state initialization for MULTI/EXEC */
7094static void initClientMultiState(redisClient *c) {
7095 c->mstate.commands = NULL;
7096 c->mstate.count = 0;
7097}
7098
7099/* Release all the resources associated with MULTI/EXEC state */
7100static void freeClientMultiState(redisClient *c) {
7101 int j;
7102
7103 for (j = 0; j < c->mstate.count; j++) {
7104 int i;
7105 multiCmd *mc = c->mstate.commands+j;
7106
7107 for (i = 0; i < mc->argc; i++)
7108 decrRefCount(mc->argv[i]);
7109 zfree(mc->argv);
7110 }
7111 zfree(c->mstate.commands);
7112}
7113
7114/* Add a new command into the MULTI commands queue */
7115static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7116 multiCmd *mc;
7117 int j;
7118
7119 c->mstate.commands = zrealloc(c->mstate.commands,
7120 sizeof(multiCmd)*(c->mstate.count+1));
7121 mc = c->mstate.commands+c->mstate.count;
7122 mc->cmd = cmd;
7123 mc->argc = c->argc;
7124 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7125 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7126 for (j = 0; j < c->argc; j++)
7127 incrRefCount(mc->argv[j]);
7128 c->mstate.count++;
7129}
7130
7131static void multiCommand(redisClient *c) {
7132 c->flags |= REDIS_MULTI;
36c548f0 7133 addReply(c,shared.ok);
6e469882 7134}
7135
18b6cb76
DJ
7136static void discardCommand(redisClient *c) {
7137 if (!(c->flags & REDIS_MULTI)) {
7138 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7139 return;
7140 }
7141
7142 freeClientMultiState(c);
7143 initClientMultiState(c);
7144 c->flags &= (~REDIS_MULTI);
7145 addReply(c,shared.ok);
7146}
7147
6e469882 7148static void execCommand(redisClient *c) {
7149 int j;
7150 robj **orig_argv;
7151 int orig_argc;
7152
7153 if (!(c->flags & REDIS_MULTI)) {
7154 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7155 return;
7156 }
7157
7158 orig_argv = c->argv;
7159 orig_argc = c->argc;
7160 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7161 for (j = 0; j < c->mstate.count; j++) {
7162 c->argc = c->mstate.commands[j].argc;
7163 c->argv = c->mstate.commands[j].argv;
7164 call(c,c->mstate.commands[j].cmd);
7165 }
7166 c->argv = orig_argv;
7167 c->argc = orig_argc;
7168 freeClientMultiState(c);
7169 initClientMultiState(c);
7170 c->flags &= (~REDIS_MULTI);
7171}
7172
4409877e 7173/* =========================== Blocking Operations ========================= */
7174
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
7203
7204/* Set a client in blocking mode for the specified key, with the specified
7205 * timeout */
b177fd30 7206static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 7207 dictEntry *de;
7208 list *l;
b177fd30 7209 int j;
4409877e 7210
b177fd30 7211 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7212 c->blockingkeysnum = numkeys;
4409877e 7213 c->blockingto = timeout;
b177fd30 7214 for (j = 0; j < numkeys; j++) {
7215 /* Add the key in the client structure, to map clients -> keys */
7216 c->blockingkeys[j] = keys[j];
7217 incrRefCount(keys[j]);
4409877e 7218
b177fd30 7219 /* And in the other "side", to map keys -> clients */
7220 de = dictFind(c->db->blockingkeys,keys[j]);
7221 if (de == NULL) {
7222 int retval;
7223
7224 /* For every key we take a list of clients blocked for it */
7225 l = listCreate();
7226 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7227 incrRefCount(keys[j]);
7228 assert(retval == DICT_OK);
7229 } else {
7230 l = dictGetEntryVal(de);
7231 }
7232 listAddNodeTail(l,c);
4409877e 7233 }
b177fd30 7234 /* Mark the client as a blocked client */
4409877e 7235 c->flags |= REDIS_BLOCKED;
d5d55fc3 7236 server.blpop_blocked_clients++;
4409877e 7237}
7238
7239/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 7240static void unblockClientWaitingData(redisClient *c) {
4409877e 7241 dictEntry *de;
7242 list *l;
b177fd30 7243 int j;
4409877e 7244
b177fd30 7245 assert(c->blockingkeys != NULL);
7246 /* The client may wait for multiple keys, so unblock it for every key. */
7247 for (j = 0; j < c->blockingkeysnum; j++) {
7248 /* Remove this client from the list of clients waiting for this key. */
7249 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7250 assert(de != NULL);
7251 l = dictGetEntryVal(de);
7252 listDelNode(l,listSearchKey(l,c));
7253 /* If the list is empty we need to remove it to avoid wasting memory */
7254 if (listLength(l) == 0)
7255 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7256 decrRefCount(c->blockingkeys[j]);
7257 }
7258 /* Cleanup the client structure */
7259 zfree(c->blockingkeys);
7260 c->blockingkeys = NULL;
4409877e 7261 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 7262 server.blpop_blocked_clients--;
5921aa36 7263 /* We want to process data if there is some command waiting
b0d8747d 7264 * in the input buffer. Note that this is safe even if
7265 * unblockClientWaitingData() gets called from freeClient() because
7266 * freeClient() will be smart enough to call this function
7267 * *after* c->querybuf was set to NULL. */
4409877e 7268 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7269}
7270
7271/* This should be called from any function PUSHing into lists.
7272 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7273 * 'ele' is the element pushed.
7274 *
7275 * If the function returns 0 there was no client waiting for a list push
7276 * against this key.
7277 *
7278 * If the function returns 1 there was a client waiting for a list push
7279 * against this key, the element was passed to this client thus it's not
7280 * needed to actually add it to the list and the caller should return asap. */
7281static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7282 struct dictEntry *de;
7283 redisClient *receiver;
7284 list *l;
7285 listNode *ln;
7286
7287 de = dictFind(c->db->blockingkeys,key);
7288 if (de == NULL) return 0;
7289 l = dictGetEntryVal(de);
7290 ln = listFirst(l);
7291 assert(ln != NULL);
7292 receiver = ln->value;
4409877e 7293
b177fd30 7294 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 7295 addReplyBulk(receiver,key);
7296 addReplyBulk(receiver,ele);
b0d8747d 7297 unblockClientWaitingData(receiver);
4409877e 7298 return 1;
7299}
7300
7301/* Blocking RPOP/LPOP */
7302static void blockingPopGenericCommand(redisClient *c, int where) {
7303 robj *o;
7304 time_t timeout;
b177fd30 7305 int j;
4409877e 7306
b177fd30 7307 for (j = 1; j < c->argc-1; j++) {
7308 o = lookupKeyWrite(c->db,c->argv[j]);
7309 if (o != NULL) {
7310 if (o->type != REDIS_LIST) {
7311 addReply(c,shared.wrongtypeerr);
4409877e 7312 return;
b177fd30 7313 } else {
7314 list *list = o->ptr;
7315 if (listLength(list) != 0) {
7316 /* If the list contains elements fall back to the usual
7317 * non-blocking POP operation */
7318 robj *argv[2], **orig_argv;
7319 int orig_argc;
e0a62c7f 7320
b177fd30 7321 /* We need to alter the command arguments before to call
7322 * popGenericCommand() as the command takes a single key. */
7323 orig_argv = c->argv;
7324 orig_argc = c->argc;
7325 argv[1] = c->argv[j];
7326 c->argv = argv;
7327 c->argc = 2;
7328
7329 /* Also the return value is different, we need to output
7330 * the multi bulk reply header and the key name. The
7331 * "real" command will add the last element (the value)
7332 * for us. If this souds like an hack to you it's just
7333 * because it is... */
7334 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 7335 addReplyBulk(c,argv[1]);
b177fd30 7336 popGenericCommand(c,where);
7337
7338 /* Fix the client structure with the original stuff */
7339 c->argv = orig_argv;
7340 c->argc = orig_argc;
7341 return;
7342 }
4409877e 7343 }
7344 }
7345 }
7346 /* If the list is empty or the key does not exists we must block */
b177fd30 7347 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 7348 if (timeout > 0) timeout += time(NULL);
b177fd30 7349 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 7350}
7351
7352static void blpopCommand(redisClient *c) {
7353 blockingPopGenericCommand(c,REDIS_HEAD);
7354}
7355
7356static void brpopCommand(redisClient *c) {
7357 blockingPopGenericCommand(c,REDIS_TAIL);
7358}
7359
ed9b544e 7360/* =============================== Replication ============================= */
7361
a4d1ba9a 7362static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7363 ssize_t nwritten, ret = size;
7364 time_t start = time(NULL);
7365
7366 timeout++;
7367 while(size) {
7368 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7369 nwritten = write(fd,ptr,size);
7370 if (nwritten == -1) return -1;
7371 ptr += nwritten;
7372 size -= nwritten;
7373 }
7374 if ((time(NULL)-start) > timeout) {
7375 errno = ETIMEDOUT;
7376 return -1;
7377 }
7378 }
7379 return ret;
7380}
7381
a4d1ba9a 7382static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7383 ssize_t nread, totread = 0;
7384 time_t start = time(NULL);
7385
7386 timeout++;
7387 while(size) {
7388 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7389 nread = read(fd,ptr,size);
7390 if (nread == -1) return -1;
7391 ptr += nread;
7392 size -= nread;
7393 totread += nread;
7394 }
7395 if ((time(NULL)-start) > timeout) {
7396 errno = ETIMEDOUT;
7397 return -1;
7398 }
7399 }
7400 return totread;
7401}
7402
/* Read a line from 'fd' one byte at a time, storing at most size-1 bytes
 * plus a nul terminator into 'ptr'. 'timeout' is passed to syncRead() for
 * every single byte.
 *
 * Reading stops at the first '\n', which is not stored; a '\r' immediately
 * preceding it is stripped as well. Reading also stops when the buffer is
 * full, in which case the (truncated) content read so far is returned.
 *
 * Returns the number of bytes stored before '\n' was seen (a stripped '\r'
 * is still counted), or -1 if syncRead() failed. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    /* Always hand back a terminated string when there is room for it. */
    if (size > 0) *ptr = '\0';
    size--; /* reserve one byte for the nul terminator */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* Account for the byte just stored, otherwise a line longer
             * than the buffer would be written past its end. */
            size--;
        }
    }
    return nread;
}
7423
7424static void syncCommand(redisClient *c) {
40d224a9 7425 /* ignore SYNC if aleady slave or in monitor mode */
7426 if (c->flags & REDIS_SLAVE) return;
7427
7428 /* SYNC can't be issued when the server has pending data to send to
7429 * the client about already issued commands. We need a fresh reply
7430 * buffer registering the differences between the BGSAVE and the current
7431 * dataset, so that we can copy to other slaves if needed. */
7432 if (listLength(c->reply) != 0) {
7433 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7434 return;
7435 }
7436
7437 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7438 /* Here we need to check if there is a background saving operation
7439 * in progress, or if it is required to start one */
9d65a1bb 7440 if (server.bgsavechildpid != -1) {
40d224a9 7441 /* Ok a background save is in progress. Let's check if it is a good
7442 * one for replication, i.e. if there is another slave that is
7443 * registering differences since the server forked to save */
7444 redisClient *slave;
7445 listNode *ln;
c7df85a4 7446 listIter li;
40d224a9 7447
c7df85a4 7448 listRewind(server.slaves,&li);
7449 while((ln = listNext(&li))) {
40d224a9 7450 slave = ln->value;
7451 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 7452 }
7453 if (ln) {
7454 /* Perfect, the server is already registering differences for
7455 * another slave. Set the right state, and copy the buffer. */
7456 listRelease(c->reply);
7457 c->reply = listDup(slave->reply);
40d224a9 7458 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7459 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7460 } else {
7461 /* No way, we need to wait for the next BGSAVE in order to
7462 * register differences */
7463 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7464 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7465 }
7466 } else {
7467 /* Ok we don't have a BGSAVE in progress, let's start one */
7468 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7469 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7470 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7471 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7472 return;
7473 }
7474 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7475 }
6208b3a7 7476 c->repldbfd = -1;
40d224a9 7477 c->flags |= REDIS_SLAVE;
7478 c->slaveseldb = 0;
6b47e12e 7479 listAddNodeTail(server.slaves,c);
40d224a9 7480 return;
7481}
7482
6208b3a7 7483static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7484 redisClient *slave = privdata;
7485 REDIS_NOTUSED(el);
7486 REDIS_NOTUSED(mask);
7487 char buf[REDIS_IOBUF_LEN];
7488 ssize_t nwritten, buflen;
7489
7490 if (slave->repldboff == 0) {
7491 /* Write the bulk write count before to transfer the DB. In theory here
7492 * we don't know how much room there is in the output buffer of the
7493 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7494 * operations) will never be smaller than the few bytes we need. */
7495 sds bulkcount;
7496
7497 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7498 slave->repldbsize);
7499 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7500 {
7501 sdsfree(bulkcount);
7502 freeClient(slave);
7503 return;
7504 }
7505 sdsfree(bulkcount);
7506 }
7507 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7508 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7509 if (buflen <= 0) {
7510 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7511 (buflen == 0) ? "premature EOF" : strerror(errno));
7512 freeClient(slave);
7513 return;
7514 }
7515 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 7516 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 7517 strerror(errno));
7518 freeClient(slave);
7519 return;
7520 }
7521 slave->repldboff += nwritten;
7522 if (slave->repldboff == slave->repldbsize) {
7523 close(slave->repldbfd);
7524 slave->repldbfd = -1;
7525 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7526 slave->replstate = REDIS_REPL_ONLINE;
7527 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 7528 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 7529 freeClient(slave);
7530 return;
7531 }
7532 addReplySds(slave,sdsempty());
7533 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7534 }
7535}
ed9b544e 7536
a3b21203 7537/* This function is called at the end of every backgrond saving.
7538 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7539 * otherwise REDIS_ERR is passed to the function.
7540 *
7541 * The goal of this function is to handle slaves waiting for a successful
7542 * background saving in order to perform non-blocking synchronization. */
7543static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 7544 listNode *ln;
7545 int startbgsave = 0;
c7df85a4 7546 listIter li;
ed9b544e 7547
c7df85a4 7548 listRewind(server.slaves,&li);
7549 while((ln = listNext(&li))) {
6208b3a7 7550 redisClient *slave = ln->value;
ed9b544e 7551
6208b3a7 7552 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7553 startbgsave = 1;
7554 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7555 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 7556 struct redis_stat buf;
e0a62c7f 7557
6208b3a7 7558 if (bgsaveerr != REDIS_OK) {
7559 freeClient(slave);
7560 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7561 continue;
7562 }
7563 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 7564 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 7565 freeClient(slave);
7566 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7567 continue;
7568 }
7569 slave->repldboff = 0;
7570 slave->repldbsize = buf.st_size;
7571 slave->replstate = REDIS_REPL_SEND_BULK;
7572 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 7573 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 7574 freeClient(slave);
7575 continue;
7576 }
7577 }
ed9b544e 7578 }
6208b3a7 7579 if (startbgsave) {
7580 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 7581 listIter li;
7582
7583 listRewind(server.slaves,&li);
6208b3a7 7584 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 7585 while((ln = listNext(&li))) {
6208b3a7 7586 redisClient *slave = ln->value;
ed9b544e 7587
6208b3a7 7588 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7589 freeClient(slave);
7590 }
7591 }
7592 }
ed9b544e 7593}
7594
7595static int syncWithMaster(void) {
d0ccebcf 7596 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 7597 long dumpsize;
ed9b544e 7598 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 7599 int dfd, maxtries = 5;
ed9b544e 7600
7601 if (fd == -1) {
7602 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7603 strerror(errno));
7604 return REDIS_ERR;
7605 }
d0ccebcf 7606
7607 /* AUTH with the master if required. */
7608 if(server.masterauth) {
7609 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7610 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7611 close(fd);
7612 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7613 strerror(errno));
7614 return REDIS_ERR;
7615 }
7616 /* Read the AUTH result. */
7617 if (syncReadLine(fd,buf,1024,3600) == -1) {
7618 close(fd);
7619 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7620 strerror(errno));
7621 return REDIS_ERR;
7622 }
7623 if (buf[0] != '+') {
7624 close(fd);
7625 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7626 return REDIS_ERR;
7627 }
7628 }
7629
ed9b544e 7630 /* Issue the SYNC command */
7631 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7632 close(fd);
7633 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7634 strerror(errno));
7635 return REDIS_ERR;
7636 }
7637 /* Read the bulk write count */
8c4d91fc 7638 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 7639 close(fd);
7640 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7641 strerror(errno));
7642 return REDIS_ERR;
7643 }
4aa701c1 7644 if (buf[0] != '$') {
7645 close(fd);
7646 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7647 return REDIS_ERR;
7648 }
18e61fa2 7649 dumpsize = strtol(buf+1,NULL,10);
7650 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 7651 /* Read the bulk write data on a temp file */
8c5abee8 7652 while(maxtries--) {
7653 snprintf(tmpfile,256,
7654 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7655 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7656 if (dfd != -1) break;
5de9ad7c 7657 sleep(1);
8c5abee8 7658 }
ed9b544e 7659 if (dfd == -1) {
7660 close(fd);
7661 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7662 return REDIS_ERR;
7663 }
7664 while(dumpsize) {
7665 int nread, nwritten;
7666
7667 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7668 if (nread == -1) {
7669 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7670 strerror(errno));
7671 close(fd);
7672 close(dfd);
7673 return REDIS_ERR;
7674 }
7675 nwritten = write(dfd,buf,nread);
7676 if (nwritten == -1) {
7677 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7678 close(fd);
7679 close(dfd);
7680 return REDIS_ERR;
7681 }
7682 dumpsize -= nread;
7683 }
7684 close(dfd);
7685 if (rename(tmpfile,server.dbfilename) == -1) {
7686 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7687 unlink(tmpfile);
7688 close(fd);
7689 return REDIS_ERR;
7690 }
7691 emptyDb();
f78fd11b 7692 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 7693 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7694 close(fd);
7695 return REDIS_ERR;
7696 }
7697 server.master = createClient(fd);
7698 server.master->flags |= REDIS_MASTER;
179b3952 7699 server.master->authenticated = 1;
ed9b544e 7700 server.replstate = REDIS_REPL_CONNECTED;
7701 return REDIS_OK;
7702}
7703
321b0e13 7704static void slaveofCommand(redisClient *c) {
7705 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7706 !strcasecmp(c->argv[2]->ptr,"one")) {
7707 if (server.masterhost) {
7708 sdsfree(server.masterhost);
7709 server.masterhost = NULL;
7710 if (server.master) freeClient(server.master);
7711 server.replstate = REDIS_REPL_NONE;
7712 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7713 }
7714 } else {
7715 sdsfree(server.masterhost);
7716 server.masterhost = sdsdup(c->argv[1]->ptr);
7717 server.masterport = atoi(c->argv[2]->ptr);
7718 if (server.master) freeClient(server.master);
7719 server.replstate = REDIS_REPL_CONNECT;
7720 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7721 server.masterhost, server.masterport);
7722 }
7723 addReply(c,shared.ok);
7724}
7725
3fd78bcd 7726/* ============================ Maxmemory directive ======================== */
7727
a5819310 7728/* Try to free one object form the pre-allocated objects free list.
7729 * This is useful under low mem conditions as by default we take 1 million
7730 * free objects allocated. On success REDIS_OK is returned, otherwise
7731 * REDIS_ERR. */
7732static int tryFreeOneObjectFromFreelist(void) {
f870935d 7733 robj *o;
7734
a5819310 7735 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7736 if (listLength(server.objfreelist)) {
7737 listNode *head = listFirst(server.objfreelist);
7738 o = listNodeValue(head);
7739 listDelNode(server.objfreelist,head);
7740 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7741 zfree(o);
7742 return REDIS_OK;
7743 } else {
7744 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7745 return REDIS_ERR;
7746 }
f870935d 7747}
7748
3fd78bcd 7749/* This function gets called when 'maxmemory' is set on the config file to limit
7750 * the max memory used by the server, and we are out of memory.
7751 * This function will try to, in order:
7752 *
7753 * - Free objects from the free list
7754 * - Try to remove keys with an EXPIRE set
7755 *
7756 * It is not possible to free enough memory to reach used-memory < maxmemory
7757 * the server will start refusing commands that will enlarge even more the
7758 * memory usage.
7759 */
7760static void freeMemoryIfNeeded(void) {
7761 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 7762 int j, k, freed = 0;
7763
7764 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7765 for (j = 0; j < server.dbnum; j++) {
7766 int minttl = -1;
7767 robj *minkey = NULL;
7768 struct dictEntry *de;
7769
7770 if (dictSize(server.db[j].expires)) {
7771 freed = 1;
7772 /* From a sample of three keys drop the one nearest to
7773 * the natural expire */
7774 for (k = 0; k < 3; k++) {
7775 time_t t;
7776
7777 de = dictGetRandomKey(server.db[j].expires);
7778 t = (time_t) dictGetEntryVal(de);
7779 if (minttl == -1 || t < minttl) {
7780 minkey = dictGetEntryKey(de);
7781 minttl = t;
3fd78bcd 7782 }
3fd78bcd 7783 }
a5819310 7784 deleteKey(server.db+j,minkey);
3fd78bcd 7785 }
3fd78bcd 7786 }
a5819310 7787 if (!freed) return; /* nothing to free... */
3fd78bcd 7788 }
7789}
7790
f80dff62 7791/* ============================== Append Only file ========================== */
7792
7793static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7794 sds buf = sdsempty();
7795 int j;
7796 ssize_t nwritten;
7797 time_t now;
7798 robj *tmpargv[3];
7799
7800 /* The DB this command was targetting is not the same as the last command
7801 * we appendend. To issue a SELECT command is needed. */
7802 if (dictid != server.appendseldb) {
7803 char seldb[64];
7804
7805 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 7806 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 7807 (unsigned long)strlen(seldb),seldb);
f80dff62 7808 server.appendseldb = dictid;
7809 }
7810
7811 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7812 * EXPIREs into EXPIREATs calls */
7813 if (cmd->proc == expireCommand) {
7814 long when;
7815
7816 tmpargv[0] = createStringObject("EXPIREAT",8);
7817 tmpargv[1] = argv[1];
7818 incrRefCount(argv[1]);
7819 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7820 tmpargv[2] = createObject(REDIS_STRING,
7821 sdscatprintf(sdsempty(),"%ld",when));
7822 argv = tmpargv;
7823 }
7824
7825 /* Append the actual command */
7826 buf = sdscatprintf(buf,"*%d\r\n",argc);
7827 for (j = 0; j < argc; j++) {
7828 robj *o = argv[j];
7829
9d65a1bb 7830 o = getDecodedObject(o);
83c6a618 7831 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
f80dff62 7832 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7833 buf = sdscatlen(buf,"\r\n",2);
9d65a1bb 7834 decrRefCount(o);
f80dff62 7835 }
7836
7837 /* Free the objects from the modified argv for EXPIREAT */
7838 if (cmd->proc == expireCommand) {
7839 for (j = 0; j < 3; j++)
7840 decrRefCount(argv[j]);
7841 }
7842
7843 /* We want to perform a single write. This should be guaranteed atomic
7844 * at least if the filesystem we are writing is a real physical one.
7845 * While this will save us against the server being killed I don't think
7846 * there is much to do about the whole server stopping for power problems
7847 * or alike */
7848 nwritten = write(server.appendfd,buf,sdslen(buf));
7849 if (nwritten != (signed)sdslen(buf)) {
7850 /* Ooops, we are in troubles. The best thing to do for now is
7851 * to simply exit instead to give the illusion that everything is
7852 * working as expected. */
7853 if (nwritten == -1) {
7854 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7855 } else {
7856 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7857 }
7858 exit(1);
7859 }
85a83172 7860 /* If a background append only file rewriting is in progress we want to
7861 * accumulate the differences between the child DB and the current one
7862 * in a buffer, so that when the child process will do its work we
7863 * can append the differences to the new append only file. */
7864 if (server.bgrewritechildpid != -1)
7865 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7866
7867 sdsfree(buf);
f80dff62 7868 now = time(NULL);
7869 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7870 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7871 now-server.lastfsync > 1))
7872 {
7873 fsync(server.appendfd); /* Let's try to get this data on the disk */
7874 server.lastfsync = now;
7875 }
7876}
7877
7878/* In Redis commands are always executed in the context of a client, so in
7879 * order to load the append only file we need to create a fake client. */
7880static struct redisClient *createFakeClient(void) {
7881 struct redisClient *c = zmalloc(sizeof(*c));
7882
7883 selectDb(c,0);
7884 c->fd = -1;
7885 c->querybuf = sdsempty();
7886 c->argc = 0;
7887 c->argv = NULL;
7888 c->flags = 0;
9387d17d 7889 /* We set the fake client as a slave waiting for the synchronization
7890 * so that Redis will not try to send replies to this client. */
7891 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 7892 c->reply = listCreate();
7893 listSetFreeMethod(c->reply,decrRefCount);
7894 listSetDupMethod(c->reply,dupClientReplyValue);
7895 return c;
7896}
7897
7898static void freeFakeClient(struct redisClient *c) {
7899 sdsfree(c->querybuf);
7900 listRelease(c->reply);
7901 zfree(c);
7902}
7903
7904/* Replay the append log file. On error REDIS_OK is returned. On non fatal
7905 * error (the append only file is zero-length) REDIS_ERR is returned. On
7906 * fatal error an error message is logged and the program exists. */
7907int loadAppendOnlyFile(char *filename) {
7908 struct redisClient *fakeClient;
7909 FILE *fp = fopen(filename,"r");
7910 struct redis_stat sb;
b492cf00 7911 unsigned long long loadedkeys = 0;
f80dff62 7912
7913 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7914 return REDIS_ERR;
7915
7916 if (fp == NULL) {
7917 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7918 exit(1);
7919 }
7920
7921 fakeClient = createFakeClient();
7922 while(1) {
7923 int argc, j;
7924 unsigned long len;
7925 robj **argv;
7926 char buf[128];
7927 sds argsds;
7928 struct redisCommand *cmd;
7929
7930 if (fgets(buf,sizeof(buf),fp) == NULL) {
7931 if (feof(fp))
7932 break;
7933 else
7934 goto readerr;
7935 }
7936 if (buf[0] != '*') goto fmterr;
7937 argc = atoi(buf+1);
7938 argv = zmalloc(sizeof(robj*)*argc);
7939 for (j = 0; j < argc; j++) {
7940 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7941 if (buf[0] != '$') goto fmterr;
7942 len = strtol(buf+1,NULL,10);
7943 argsds = sdsnewlen(NULL,len);
0f151ef1 7944 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 7945 argv[j] = createObject(REDIS_STRING,argsds);
7946 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7947 }
7948
7949 /* Command lookup */
7950 cmd = lookupCommand(argv[0]->ptr);
7951 if (!cmd) {
7952 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7953 exit(1);
7954 }
bdcb92f2 7955 /* Try object encoding */
f80dff62 7956 if (cmd->flags & REDIS_CMD_BULK)
05df7621 7957 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 7958 /* Run the command in the context of a fake client */
7959 fakeClient->argc = argc;
7960 fakeClient->argv = argv;
7961 cmd->proc(fakeClient);
7962 /* Discard the reply objects list from the fake client */
7963 while(listLength(fakeClient->reply))
7964 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7965 /* Clean up, ready for the next command */
7966 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7967 zfree(argv);
b492cf00 7968 /* Handle swapping while loading big datasets when VM is on */
7969 loadedkeys++;
7970 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7971 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 7972 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 7973 }
7974 }
f80dff62 7975 }
7976 fclose(fp);
7977 freeFakeClient(fakeClient);
7978 return REDIS_OK;
7979
7980readerr:
7981 if (feof(fp)) {
7982 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7983 } else {
7984 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7985 }
7986 exit(1);
7987fmterr:
7988 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7989 exit(1);
7990}
7991
9d65a1bb 7992/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
9c8e3cee 7993static int fwriteBulkObject(FILE *fp, robj *obj) {
9d65a1bb 7994 char buf[128];
b9bc0eef 7995 int decrrc = 0;
7996
f2d9f50f 7997 /* Avoid the incr/decr ref count business if possible to help
7998 * copy-on-write (we are often in a child process when this function
7999 * is called).
8000 * Also makes sure that key objects don't get incrRefCount-ed when VM
8001 * is enabled */
8002 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 8003 obj = getDecodedObject(obj);
8004 decrrc = 1;
8005 }
9d65a1bb 8006 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8007 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 8008 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8009 goto err;
9d65a1bb 8010 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 8011 if (decrrc) decrRefCount(obj);
9d65a1bb 8012 return 1;
8013err:
b9bc0eef 8014 if (decrrc) decrRefCount(obj);
9d65a1bb 8015 return 0;
8016}
8017
/* Write a binary-safe string into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to 'len' bytes of payload. The payload is written with a
 * single fwrite() of 'len' bytes, so it may contain any byte value.
 * An empty payload (len == 0) produces "$0\r\n\r\n".
 *
 * Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned: use %lu so the conversion specifier matches the
     * argument type and huge lengths are never printed as negative. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8029
/* Write a double into 'fp' using the bulk format
 * $<count>\r\n<payload>\r\n, where the payload is the value printed
 * with "%.17g".
 *
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t plen;

    /* The payload buffer already carries the trailing CRLF, so the count
     * announced in the header is its length minus those two bytes. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,plen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8040
/* Write a long into 'fp' using the bulk format
 * $<count>\r\n<payload>\r\n, where the payload is the decimal
 * representation of the value.
 *
 * Returns 1 on success, 0 if one of the writes failed. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t plen;

    /* The payload buffer already carries the trailing CRLF, so the count
     * announced in the header is its length minus those two bytes. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    plen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,plen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8051
8052/* Write a sequence of commands able to fully rebuild the dataset into
8053 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8054static int rewriteAppendOnlyFile(char *filename) {
8055 dictIterator *di = NULL;
8056 dictEntry *de;
8057 FILE *fp;
8058 char tmpfile[256];
8059 int j;
8060 time_t now = time(NULL);
8061
8062 /* Note that we have to use a different temp name here compared to the
8063 * one used by rewriteAppendOnlyFileBackground() function. */
8064 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8065 fp = fopen(tmpfile,"w");
8066 if (!fp) {
8067 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8068 return REDIS_ERR;
8069 }
8070 for (j = 0; j < server.dbnum; j++) {
8071 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8072 redisDb *db = server.db+j;
8073 dict *d = db->dict;
8074 if (dictSize(d) == 0) continue;
8075 di = dictGetIterator(d);
8076 if (!di) {
8077 fclose(fp);
8078 return REDIS_ERR;
8079 }
8080
8081 /* SELECT the new DB */
8082 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 8083 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 8084
8085 /* Iterate this DB writing every entry */
8086 while((de = dictNext(di)) != NULL) {
e7546c63 8087 robj *key, *o;
8088 time_t expiretime;
8089 int swapped;
8090
8091 key = dictGetEntryKey(de);
b9bc0eef 8092 /* If the value for this key is swapped, load a preview in memory.
8093 * We use a "swapped" flag to remember if we need to free the
8094 * value object instead to just increment the ref count anyway
8095 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 8096 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8097 key->storage == REDIS_VM_SWAPPING) {
e7546c63 8098 o = dictGetEntryVal(de);
8099 swapped = 0;
8100 } else {
8101 o = vmPreviewObject(key);
e7546c63 8102 swapped = 1;
8103 }
8104 expiretime = getExpire(db,key);
9d65a1bb 8105
8106 /* Save the key and associated value */
9d65a1bb 8107 if (o->type == REDIS_STRING) {
8108 /* Emit a SET command */
8109 char cmd[]="*3\r\n$3\r\nSET\r\n";
8110 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8111 /* Key and value */
9c8e3cee 8112 if (fwriteBulkObject(fp,key) == 0) goto werr;
8113 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 8114 } else if (o->type == REDIS_LIST) {
8115 /* Emit the RPUSHes needed to rebuild the list */
8116 list *list = o->ptr;
8117 listNode *ln;
c7df85a4 8118 listIter li;
9d65a1bb 8119
c7df85a4 8120 listRewind(list,&li);
8121 while((ln = listNext(&li))) {
9d65a1bb 8122 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8123 robj *eleobj = listNodeValue(ln);
8124
8125 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8126 if (fwriteBulkObject(fp,key) == 0) goto werr;
8127 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8128 }
8129 } else if (o->type == REDIS_SET) {
8130 /* Emit the SADDs needed to rebuild the set */
8131 dict *set = o->ptr;
8132 dictIterator *di = dictGetIterator(set);
8133 dictEntry *de;
8134
8135 while((de = dictNext(di)) != NULL) {
8136 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8137 robj *eleobj = dictGetEntryKey(de);
8138
8139 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8140 if (fwriteBulkObject(fp,key) == 0) goto werr;
8141 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8142 }
8143 dictReleaseIterator(di);
8144 } else if (o->type == REDIS_ZSET) {
8145 /* Emit the ZADDs needed to rebuild the sorted set */
8146 zset *zs = o->ptr;
8147 dictIterator *di = dictGetIterator(zs->dict);
8148 dictEntry *de;
8149
8150 while((de = dictNext(di)) != NULL) {
8151 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8152 robj *eleobj = dictGetEntryKey(de);
8153 double *score = dictGetEntryVal(de);
8154
8155 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8156 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8157 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 8158 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8159 }
8160 dictReleaseIterator(di);
9c8e3cee 8161 } else if (o->type == REDIS_HASH) {
8162 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8163
8164 /* Emit the HSETs needed to rebuild the hash */
8165 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8166 unsigned char *p = zipmapRewind(o->ptr);
8167 unsigned char *field, *val;
8168 unsigned int flen, vlen;
8169
8170 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8171 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8172 if (fwriteBulkObject(fp,key) == 0) goto werr;
8173 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8174 return -1;
8175 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8176 return -1;
8177 }
8178 } else {
8179 dictIterator *di = dictGetIterator(o->ptr);
8180 dictEntry *de;
8181
8182 while((de = dictNext(di)) != NULL) {
8183 robj *field = dictGetEntryKey(de);
8184 robj *val = dictGetEntryVal(de);
8185
8186 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8187 if (fwriteBulkObject(fp,key) == 0) goto werr;
8188 if (fwriteBulkObject(fp,field) == -1) return -1;
8189 if (fwriteBulkObject(fp,val) == -1) return -1;
8190 }
8191 dictReleaseIterator(di);
8192 }
9d65a1bb 8193 } else {
78409a0f 8194 redisAssert(0);
9d65a1bb 8195 }
8196 /* Save the expire time */
8197 if (expiretime != -1) {
e96e4fbf 8198 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 8199 /* If this key is already expired skip it */
8200 if (expiretime < now) continue;
8201 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8202 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8203 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8204 }
b9bc0eef 8205 if (swapped) decrRefCount(o);
9d65a1bb 8206 }
8207 dictReleaseIterator(di);
8208 }
8209
8210 /* Make sure data will not remain on the OS's output buffers */
8211 fflush(fp);
8212 fsync(fileno(fp));
8213 fclose(fp);
e0a62c7f 8214
9d65a1bb 8215 /* Use RENAME to make sure the DB file is changed atomically only
8216 * if the generate DB file is ok. */
8217 if (rename(tmpfile,filename) == -1) {
8218 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8219 unlink(tmpfile);
8220 return REDIS_ERR;
8221 }
8222 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8223 return REDIS_OK;
8224
8225werr:
8226 fclose(fp);
8227 unlink(tmpfile);
e96e4fbf 8228 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 8229 if (di) dictReleaseIterator(di);
8230 return REDIS_ERR;
8231}
8232
8233/* This is how rewriting of the append only file in background works:
8234 *
8235 * 1) The user calls BGREWRITEAOF
8236 * 2) Redis calls this function, that forks():
8237 * 2a) the child rewrite the append only file in a temp file.
8238 * 2b) the parent accumulates differences in server.bgrewritebuf.
8239 * 3) When the child finished '2a' exists.
8240 * 4) The parent will trap the exit code, if it's OK, will append the
8241 * data accumulated into server.bgrewritebuf into the temp file, and
8242 * finally will rename(2) the temp file in the actual file name.
8243 * The the new file is reopened as the new append only file. Profit!
8244 */
8245static int rewriteAppendOnlyFileBackground(void) {
8246 pid_t childpid;
8247
8248 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 8249 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 8250 if ((childpid = fork()) == 0) {
8251 /* Child */
8252 char tmpfile[256];
9d65a1bb 8253
054e426d 8254 if (server.vm_enabled) vmReopenSwapFile();
8255 close(server.fd);
9d65a1bb 8256 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8257 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 8258 _exit(0);
9d65a1bb 8259 } else {
478c2c6f 8260 _exit(1);
9d65a1bb 8261 }
8262 } else {
8263 /* Parent */
8264 if (childpid == -1) {
8265 redisLog(REDIS_WARNING,
8266 "Can't rewrite append only file in background: fork: %s",
8267 strerror(errno));
8268 return REDIS_ERR;
8269 }
8270 redisLog(REDIS_NOTICE,
8271 "Background append only file rewriting started by pid %d",childpid);
8272 server.bgrewritechildpid = childpid;
884d4b39 8273 updateDictResizePolicy();
85a83172 8274 /* We set appendseldb to -1 in order to force the next call to the
8275 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8276 * accumulated by the parent into server.bgrewritebuf will start
8277 * with a SELECT statement and it will be safe to merge. */
8278 server.appendseldb = -1;
9d65a1bb 8279 return REDIS_OK;
8280 }
8281 return REDIS_OK; /* unreached */
8282}
8283
8284static void bgrewriteaofCommand(redisClient *c) {
8285 if (server.bgrewritechildpid != -1) {
8286 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8287 return;
8288 }
8289 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 8290 char *status = "+Background append only file rewriting started\r\n";
8291 addReplySds(c,sdsnew(status));
9d65a1bb 8292 } else {
8293 addReply(c,shared.err);
8294 }
8295}
8296
/* Remove the temp file that the background rewriting child with pid
 * 'childpid' writes to (same naming scheme used by
 * rewriteAppendOnlyFileBackground()). The unlink(2) result is ignored. */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8303
996cb5f7 8304/* Virtual Memory is composed mainly of two subsystems:
 * - Blocking Virtual Memory
8306 * - Threaded Virtual Memory I/O
8307 * The two parts are not fully decoupled, but functions are split among two
8308 * different sections of the source code (delimited by comments) in order to
8309 * make more clear what functionality is about the blocking VM and what about
8310 * the threaded (not blocking) VM.
8311 *
8312 * Redis VM design:
8313 *
8314 * Redis VM is a blocking VM (one that blocks reading swapped values from
8315 * disk into memory when a value swapped out is needed in memory) that is made
8316 * unblocking by trying to examine the command argument vector in order to
8317 * load in background values that will likely be needed in order to exec
8318 * the command. The command is executed only once all the relevant keys
8319 * are loaded into memory.
8320 *
 * This basically is almost as simple as a blocking VM, but almost as parallel
8322 * as a fully non-blocking VM.
8323 */
8324
8325/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 8326
8327/* substitute the first occurrence of '%p' with the process pid in the
8328 * swap file name. */
8329static void expandVmSwapFilename(void) {
8330 char *p = strstr(server.vm_swap_file,"%p");
8331 sds new;
e0a62c7f 8332
054e426d 8333 if (!p) return;
8334 new = sdsempty();
8335 *p = '\0';
8336 new = sdscat(new,server.vm_swap_file);
8337 new = sdscatprintf(new,"%ld",(long) getpid());
8338 new = sdscat(new,p+2);
8339 zfree(server.vm_swap_file);
8340 server.vm_swap_file = new;
8341}
8342
75680a3c 8343static void vmInit(void) {
8344 off_t totsize;
996cb5f7 8345 int pipefds[2];
bcaa7a4f 8346 size_t stacksize;
75680a3c 8347
4ad37480 8348 if (server.vm_max_threads != 0)
8349 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8350
054e426d 8351 expandVmSwapFilename();
8352 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
6fa987e3 8353 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8354 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8355 }
75680a3c 8356 if (server.vm_fp == NULL) {
6fa987e3 8357 redisLog(REDIS_WARNING,
8358 "Impossible to open the swap file: %s. Exiting.",
8359 strerror(errno));
75680a3c 8360 exit(1);
8361 }
8362 server.vm_fd = fileno(server.vm_fp);
8363 server.vm_next_page = 0;
8364 server.vm_near_pages = 0;
7d98e08c 8365 server.vm_stats_used_pages = 0;
8366 server.vm_stats_swapped_objects = 0;
8367 server.vm_stats_swapouts = 0;
8368 server.vm_stats_swapins = 0;
75680a3c 8369 totsize = server.vm_pages*server.vm_page_size;
8370 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8371 if (ftruncate(server.vm_fd,totsize) == -1) {
8372 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8373 strerror(errno));
8374 exit(1);
8375 } else {
8376 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8377 }
7d30035d 8378 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 8379 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 8380 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 8381 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 8382
996cb5f7 8383 /* Initialize threaded I/O (used by Virtual Memory) */
8384 server.io_newjobs = listCreate();
8385 server.io_processing = listCreate();
8386 server.io_processed = listCreate();
d5d55fc3 8387 server.io_ready_clients = listCreate();
92f8e882 8388 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 8389 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8390 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 8391 server.io_active_threads = 0;
996cb5f7 8392 if (pipe(pipefds) == -1) {
8393 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8394 ,strerror(errno));
8395 exit(1);
8396 }
8397 server.io_ready_pipe_read = pipefds[0];
8398 server.io_ready_pipe_write = pipefds[1];
8399 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 8400 /* LZF requires a lot of stack */
8401 pthread_attr_init(&server.io_threads_attr);
8402 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8403 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8404 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 8405 /* Listen for events in the threaded I/O pipe */
8406 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8407 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8408 oom("creating file event");
75680a3c 8409}
8410
06224fec 8411/* Mark the page as used */
8412static void vmMarkPageUsed(off_t page) {
8413 off_t byte = page/8;
8414 int bit = page&7;
970e10bb 8415 redisAssert(vmFreePage(page) == 1);
06224fec 8416 server.vm_bitmap[byte] |= 1<<bit;
8417}
8418
8419/* Mark N contiguous pages as used, with 'page' being the first. */
8420static void vmMarkPagesUsed(off_t page, off_t count) {
8421 off_t j;
8422
8423 for (j = 0; j < count; j++)
7d30035d 8424 vmMarkPageUsed(page+j);
7d98e08c 8425 server.vm_stats_used_pages += count;
7c775e09 8426 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8427 (long long)count, (long long)page);
06224fec 8428}
8429
8430/* Mark the page as free */
8431static void vmMarkPageFree(off_t page) {
8432 off_t byte = page/8;
8433 int bit = page&7;
970e10bb 8434 redisAssert(vmFreePage(page) == 0);
06224fec 8435 server.vm_bitmap[byte] &= ~(1<<bit);
8436}
8437
8438/* Mark N contiguous pages as free, with 'page' being the first. */
8439static void vmMarkPagesFree(off_t page, off_t count) {
8440 off_t j;
8441
8442 for (j = 0; j < count; j++)
7d30035d 8443 vmMarkPageFree(page+j);
7d98e08c 8444 server.vm_stats_used_pages -= count;
7c775e09 8445 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8446 (long long)count, (long long)page);
06224fec 8447}
8448
8449/* Test if the page is free */
8450static int vmFreePage(off_t page) {
8451 off_t byte = page/8;
8452 int bit = page&7;
7d30035d 8453 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 8454}
8455
8456/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 8457 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 8458 * REDIS_ERR is returned.
06224fec 8459 *
8460 * This function uses a simple algorithm: we try to allocate
8461 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8462 * again from the start of the swap file searching for free spaces.
8463 *
8464 * If it looks pretty clear that there are no free pages near our offset
8465 * we try to find less populated places doing a forward jump of
8466 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8467 * without hurry, and then we jump again and so forth...
e0a62c7f 8468 *
06224fec 8469 * This function can be improved using a free list to avoid to guess
8470 * too much, since we could collect data about freed pages.
8471 *
8472 * note: I implemented this function just after watching an episode of
8473 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8474 */
c7df85a4 8475static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 8476 off_t base, offset = 0, since_jump = 0, numfree = 0;
8477
8478 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8479 server.vm_near_pages = 0;
8480 server.vm_next_page = 0;
8481 }
8482 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8483 base = server.vm_next_page;
8484
8485 while(offset < server.vm_pages) {
8486 off_t this = base+offset;
8487
8488 /* If we overflow, restart from page zero */
8489 if (this >= server.vm_pages) {
8490 this -= server.vm_pages;
8491 if (this == 0) {
8492 /* Just overflowed, what we found on tail is no longer
8493 * interesting, as it's no longer contiguous. */
8494 numfree = 0;
8495 }
8496 }
8497 if (vmFreePage(this)) {
8498 /* This is a free page */
8499 numfree++;
8500 /* Already got N free pages? Return to the caller, with success */
8501 if (numfree == n) {
7d30035d 8502 *first = this-(n-1);
8503 server.vm_next_page = this+1;
7c775e09 8504 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 8505 return REDIS_OK;
06224fec 8506 }
8507 } else {
8508 /* The current one is not a free page */
8509 numfree = 0;
8510 }
8511
8512 /* Fast-forward if the current page is not free and we already
8513 * searched enough near this place. */
8514 since_jump++;
8515 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8516 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8517 since_jump = 0;
8518 /* Note that even if we rewind after the jump, we are don't need
8519 * to make sure numfree is set to zero as we only jump *if* it
8520 * is set to zero. */
8521 } else {
8522 /* Otherwise just check the next page */
8523 offset++;
8524 }
8525 }
3a66edc7 8526 return REDIS_ERR;
8527}
8528
a5819310 8529/* Write the specified object at the specified page of the swap file */
8530static int vmWriteObjectOnSwap(robj *o, off_t page) {
8531 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8532 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8533 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8534 redisLog(REDIS_WARNING,
9ebed7cf 8535 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 8536 strerror(errno));
8537 return REDIS_ERR;
8538 }
8539 rdbSaveObject(server.vm_fp,o);
ba76a8f9 8540 fflush(server.vm_fp);
a5819310 8541 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8542 return REDIS_OK;
8543}
8544
3a66edc7 8545/* Swap the 'val' object relative to 'key' into disk. Store all the information
8546 * needed to later retrieve the object into the key object.
8547 * If we can't find enough contiguous empty pages to swap the object on disk
8548 * REDIS_ERR is returned. */
a69a0c9c 8549static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 8550 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 8551 off_t page;
8552
8553 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 8554 assert(key->refcount == 1);
3a66edc7 8555 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 8556 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 8557 key->vm.page = page;
8558 key->vm.usedpages = pages;
8559 key->storage = REDIS_VM_SWAPPED;
d894161b 8560 key->vtype = val->type;
3a66edc7 8561 decrRefCount(val); /* Deallocate the object from memory. */
8562 vmMarkPagesUsed(page,pages);
7d30035d 8563 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8564 (unsigned char*) key->ptr,
8565 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 8566 server.vm_stats_swapped_objects++;
8567 server.vm_stats_swapouts++;
3a66edc7 8568 return REDIS_OK;
8569}
8570
a5819310 8571static robj *vmReadObjectFromSwap(off_t page, int type) {
8572 robj *o;
3a66edc7 8573
a5819310 8574 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8575 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 8576 redisLog(REDIS_WARNING,
d5d55fc3 8577 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 8578 strerror(errno));
478c2c6f 8579 _exit(1);
3a66edc7 8580 }
a5819310 8581 o = rdbLoadObject(type,server.vm_fp);
8582 if (o == NULL) {
d5d55fc3 8583 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 8584 _exit(1);
3a66edc7 8585 }
a5819310 8586 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8587 return o;
8588}
8589
8590/* Load the value object relative to the 'key' object from swap to memory.
8591 * The newly allocated object is returned.
8592 *
8593 * If preview is true the unserialized object is returned to the caller but
8594 * no changes are made to the key object, nor the pages are marked as freed */
8595static robj *vmGenericLoadObject(robj *key, int preview) {
8596 robj *val;
8597
d5d55fc3 8598 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 8599 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 8600 if (!preview) {
8601 key->storage = REDIS_VM_MEMORY;
8602 key->vm.atime = server.unixtime;
8603 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8604 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8605 (unsigned char*) key->ptr);
7d98e08c 8606 server.vm_stats_swapped_objects--;
38aba9a1 8607 } else {
8608 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8609 (unsigned char*) key->ptr);
7e69548d 8610 }
7d98e08c 8611 server.vm_stats_swapins++;
3a66edc7 8612 return val;
06224fec 8613}
8614
7e69548d 8615/* Plain object loading, from swap to memory */
8616static robj *vmLoadObject(robj *key) {
996cb5f7 8617 /* If we are loading the object in background, stop it, we
8618 * need to load this object synchronously ASAP. */
8619 if (key->storage == REDIS_VM_LOADING)
8620 vmCancelThreadedIOJob(key);
7e69548d 8621 return vmGenericLoadObject(key,0);
8622}
8623
8624/* Just load the value on disk, without to modify the key.
8625 * This is useful when we want to perform some operation on the value
8626 * without to really bring it from swap to memory, like while saving the
8627 * dataset or rewriting the append only log. */
8628static robj *vmPreviewObject(robj *key) {
8629 return vmGenericLoadObject(key,1);
8630}
8631
4ef8de8a 8632/* How a good candidate is this object for swapping?
8633 * The better candidate it is, the greater the returned value.
8634 *
8635 * Currently we try to perform a fast estimation of the object size in
8636 * memory, and combine it with aging informations.
8637 *
8638 * Basically swappability = idle-time * log(estimated size)
8639 *
8640 * Bigger objects are preferred over smaller objects, but not
8641 * proportionally, this is why we use the logarithm. This algorithm is
8642 * just a first try and will probably be tuned later. */
8643static double computeObjectSwappability(robj *o) {
8644 time_t age = server.unixtime - o->vm.atime;
8645 long asize = 0;
8646 list *l;
8647 dict *d;
8648 struct dictEntry *de;
8649 int z;
8650
8651 if (age <= 0) return 0;
8652 switch(o->type) {
8653 case REDIS_STRING:
8654 if (o->encoding != REDIS_ENCODING_RAW) {
8655 asize = sizeof(*o);
8656 } else {
8657 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8658 }
8659 break;
8660 case REDIS_LIST:
8661 l = o->ptr;
8662 listNode *ln = listFirst(l);
8663
8664 asize = sizeof(list);
8665 if (ln) {
8666 robj *ele = ln->value;
8667 long elesize;
8668
8669 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8670 (sizeof(*o)+sdslen(ele->ptr)) :
8671 sizeof(*o);
8672 asize += (sizeof(listNode)+elesize)*listLength(l);
8673 }
8674 break;
8675 case REDIS_SET:
8676 case REDIS_ZSET:
8677 z = (o->type == REDIS_ZSET);
8678 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8679
8680 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8681 if (z) asize += sizeof(zset)-sizeof(dict);
8682 if (dictSize(d)) {
8683 long elesize;
8684 robj *ele;
8685
8686 de = dictGetRandomKey(d);
8687 ele = dictGetEntryKey(de);
8688 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8689 (sizeof(*o)+sdslen(ele->ptr)) :
8690 sizeof(*o);
8691 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8692 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8693 }
8694 break;
a97b9060 8695 case REDIS_HASH:
8696 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8697 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8698 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8699 unsigned int klen, vlen;
8700 unsigned char *key, *val;
8701
8702 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8703 klen = 0;
8704 vlen = 0;
8705 }
8706 asize = len*(klen+vlen+3);
8707 } else if (o->encoding == REDIS_ENCODING_HT) {
8708 d = o->ptr;
8709 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8710 if (dictSize(d)) {
8711 long elesize;
8712 robj *ele;
8713
8714 de = dictGetRandomKey(d);
8715 ele = dictGetEntryKey(de);
8716 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8717 (sizeof(*o)+sdslen(ele->ptr)) :
8718 sizeof(*o);
8719 ele = dictGetEntryVal(de);
8720 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8721 (sizeof(*o)+sdslen(ele->ptr)) :
8722 sizeof(*o);
8723 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8724 }
8725 }
8726 break;
4ef8de8a 8727 }
c8c72447 8728 return (double)age*log(1+asize);
4ef8de8a 8729}
8730
8731/* Try to swap an object that's a good candidate for swapping.
8732 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 8733 * to swap any object at all.
8734 *
8735 * If 'usethreaded' is true, Redis will try to swap the object in background
8736 * using I/O threads. */
8737static int vmSwapOneObject(int usethreads) {
4ef8de8a 8738 int j, i;
8739 struct dictEntry *best = NULL;
8740 double best_swappability = 0;
b9bc0eef 8741 redisDb *best_db = NULL;
4ef8de8a 8742 robj *key, *val;
8743
8744 for (j = 0; j < server.dbnum; j++) {
8745 redisDb *db = server.db+j;
b72f6a4b 8746 /* Why maxtries is set to 100?
8747 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8748 * are swappable objects */
b0d8747d 8749 int maxtries = 100;
4ef8de8a 8750
8751 if (dictSize(db->dict) == 0) continue;
8752 for (i = 0; i < 5; i++) {
8753 dictEntry *de;
8754 double swappability;
8755
e3cadb8a 8756 if (maxtries) maxtries--;
4ef8de8a 8757 de = dictGetRandomKey(db->dict);
8758 key = dictGetEntryKey(de);
8759 val = dictGetEntryVal(de);
1064ef87 8760 /* Only swap objects that are currently in memory.
8761 *
8762 * Also don't swap shared objects if threaded VM is on, as we
8763 * try to ensure that the main thread does not touch the
8764 * object while the I/O thread is using it, but we can't
8765 * control other keys without adding additional mutex. */
8766 if (key->storage != REDIS_VM_MEMORY ||
8767 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 8768 if (maxtries) i--; /* don't count this try */
8769 continue;
8770 }
4ef8de8a 8771 swappability = computeObjectSwappability(val);
8772 if (!best || swappability > best_swappability) {
8773 best = de;
8774 best_swappability = swappability;
b9bc0eef 8775 best_db = db;
4ef8de8a 8776 }
8777 }
8778 }
7c775e09 8779 if (best == NULL) return REDIS_ERR;
4ef8de8a 8780 key = dictGetEntryKey(best);
8781 val = dictGetEntryVal(best);
8782
e3cadb8a 8783 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 8784 key->ptr, best_swappability);
8785
8786 /* Unshare the key if needed */
8787 if (key->refcount > 1) {
8788 robj *newkey = dupStringObject(key);
8789 decrRefCount(key);
8790 key = dictGetEntryKey(best) = newkey;
8791 }
8792 /* Swap it */
a69a0c9c 8793 if (usethreads) {
b9bc0eef 8794 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 8795 return REDIS_OK;
8796 } else {
a69a0c9c 8797 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8798 dictGetEntryVal(best) = NULL;
8799 return REDIS_OK;
8800 } else {
8801 return REDIS_ERR;
8802 }
4ef8de8a 8803 }
8804}
8805
/* Swap one object out synchronously (no I/O threads involved).
 * Same return value as vmSwapOneObject(). */
static int vmSwapOneObjectBlocking() {
    const int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
8809
/* Swap one object out using the I/O threads.
 * Same return value as vmSwapOneObject(). */
static int vmSwapOneObjectThreaded() {
    const int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
8813
7e69548d 8814/* Return true if it's safe to swap out objects in a given moment.
8815 * Basically we don't want to swap objects out while there is a BGSAVE
8816 * or a BGAEOREWRITE running in backgroud. */
8817static int vmCanSwapOut(void) {
8818 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8819}
8820
1b03836c 8821/* Delete a key if swapped. Returns 1 if the key was found, was swapped
8822 * and was deleted. Otherwise 0 is returned. */
8823static int deleteIfSwapped(redisDb *db, robj *key) {
8824 dictEntry *de;
8825 robj *foundkey;
8826
8827 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8828 foundkey = dictGetEntryKey(de);
8829 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8830 deleteKey(db,key);
8831 return 1;
8832}
8833
996cb5f7 8834/* =================== Virtual Memory - Threaded I/O ======================= */
8835
b9bc0eef 8836static void freeIOJob(iojob *j) {
d5d55fc3 8837 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8838 j->type == REDIS_IOJOB_DO_SWAP ||
8839 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 8840 decrRefCount(j->val);
78ebe4c8 8841 /* We don't decrRefCount the j->key field as we did't incremented
8842 * the count creating IO Jobs. This is because the key field here is
8843 * just used as an indentifier and if a key is removed the Job should
8844 * never be touched again. */
b9bc0eef 8845 zfree(j);
8846}
8847
996cb5f7 8848/* Every time a thread finished a Job, it writes a byte into the write side
8849 * of an unix pipe in order to "awake" the main thread, and this function
8850 * is called. */
8851static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8852 int mask)
8853{
8854 char buf[1];
b0d8747d 8855 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 8856 REDIS_NOTUSED(el);
8857 REDIS_NOTUSED(mask);
8858 REDIS_NOTUSED(privdata);
8859
8860 /* For every byte we read in the read side of the pipe, there is one
8861 * I/O job completed to process. */
8862 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 8863 iojob *j;
8864 listNode *ln;
8865 robj *key;
8866 struct dictEntry *de;
8867
996cb5f7 8868 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 8869
8870 /* Get the processed element (the oldest one) */
8871 lockThreadedIO();
1064ef87 8872 assert(listLength(server.io_processed) != 0);
f6c0bba8 8873 if (toprocess == -1) {
8874 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8875 if (toprocess <= 0) toprocess = 1;
8876 }
b9bc0eef 8877 ln = listFirst(server.io_processed);
8878 j = ln->value;
8879 listDelNode(server.io_processed,ln);
8880 unlockThreadedIO();
8881 /* If this job is marked as canceled, just ignore it */
8882 if (j->canceled) {
8883 freeIOJob(j);
8884 continue;
8885 }
8886 /* Post process it in the main thread, as there are things we
8887 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 8888 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 8889 de = dictFind(j->db->dict,j->key);
8890 assert(de != NULL);
8891 key = dictGetEntryKey(de);
8892 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 8893 redisDb *db;
8894
b9bc0eef 8895 /* Key loaded, bring it at home */
8896 key->storage = REDIS_VM_MEMORY;
8897 key->vm.atime = server.unixtime;
8898 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8899 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8900 (unsigned char*) key->ptr);
8901 server.vm_stats_swapped_objects--;
8902 server.vm_stats_swapins++;
d5d55fc3 8903 dictGetEntryVal(de) = j->val;
8904 incrRefCount(j->val);
8905 db = j->db;
b9bc0eef 8906 freeIOJob(j);
d5d55fc3 8907 /* Handle clients waiting for this key to be loaded. */
8908 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 8909 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8910 /* Now we know the amount of pages required to swap this object.
8911 * Let's find some space for it, and queue this task again
8912 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 8913 if (!vmCanSwapOut() ||
8914 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8915 {
8916 /* Ooops... no space or we can't swap as there is
8917 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 8918 freeIOJob(j);
054e426d 8919 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 8920 } else {
c7df85a4 8921 /* Note that we need to mark this pages as used now,
8922 * if the job will be canceled, we'll mark them as freed
8923 * again. */
8924 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 8925 j->type = REDIS_IOJOB_DO_SWAP;
8926 lockThreadedIO();
8927 queueIOJob(j);
8928 unlockThreadedIO();
8929 }
8930 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8931 robj *val;
8932
8933 /* Key swapped. We can finally free some memory. */
6c96ba7d 8934 if (key->storage != REDIS_VM_SWAPPING) {
8935 printf("key->storage: %d\n",key->storage);
8936 printf("key->name: %s\n",(char*)key->ptr);
8937 printf("key->refcount: %d\n",key->refcount);
8938 printf("val: %p\n",(void*)j->val);
8939 printf("val->type: %d\n",j->val->type);
8940 printf("val->ptr: %s\n",(char*)j->val->ptr);
8941 }
8942 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 8943 val = dictGetEntryVal(de);
8944 key->vm.page = j->page;
8945 key->vm.usedpages = j->pages;
8946 key->storage = REDIS_VM_SWAPPED;
8947 key->vtype = j->val->type;
8948 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 8949 dictGetEntryVal(de) = NULL;
b9bc0eef 8950 redisLog(REDIS_DEBUG,
8951 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8952 (unsigned char*) key->ptr,
8953 (unsigned long long) j->page, (unsigned long long) j->pages);
8954 server.vm_stats_swapped_objects++;
8955 server.vm_stats_swapouts++;
8956 freeIOJob(j);
f11b8647 8957 /* Put a few more swap requests in queue if we are still
8958 * out of memory */
b0d8747d 8959 if (trytoswap && vmCanSwapOut() &&
8960 zmalloc_used_memory() > server.vm_max_memory)
8961 {
f11b8647 8962 int more = 1;
8963 while(more) {
8964 lockThreadedIO();
8965 more = listLength(server.io_newjobs) <
8966 (unsigned) server.vm_max_threads;
8967 unlockThreadedIO();
8968 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 8969 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8970 trytoswap = 0;
8971 break;
8972 }
f11b8647 8973 }
8974 }
b9bc0eef 8975 }
c953f24b 8976 processed++;
f6c0bba8 8977 if (processed == toprocess) return;
996cb5f7 8978 }
8979 if (retval < 0 && errno != EAGAIN) {
8980 redisLog(REDIS_WARNING,
8981 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8982 strerror(errno));
8983 }
8984}
8985
8986static void lockThreadedIO(void) {
8987 pthread_mutex_lock(&server.io_mutex);
8988}
8989
8990static void unlockThreadedIO(void) {
8991 pthread_mutex_unlock(&server.io_mutex);
8992}
8993
8994/* Remove the specified object from the threaded I/O queue if still not
8995 * processed, otherwise make sure to flag it as canceled. */
8996static void vmCancelThreadedIOJob(robj *o) {
8997 list *lists[3] = {
6c96ba7d 8998 server.io_newjobs, /* 0 */
8999 server.io_processing, /* 1 */
9000 server.io_processed /* 2 */
996cb5f7 9001 };
9002 int i;
9003
9004 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 9005again:
996cb5f7 9006 lockThreadedIO();
9007 /* Search for a matching key in one of the queues */
9008 for (i = 0; i < 3; i++) {
9009 listNode *ln;
c7df85a4 9010 listIter li;
996cb5f7 9011
c7df85a4 9012 listRewind(lists[i],&li);
9013 while ((ln = listNext(&li)) != NULL) {
996cb5f7 9014 iojob *job = ln->value;
9015
6c96ba7d 9016 if (job->canceled) continue; /* Skip this, already canceled. */
78ebe4c8 9017 if (job->key == o) {
970e10bb 9018 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9019 (void*)job, (char*)o->ptr, job->type, i);
427a2153 9020 /* Mark the pages as free since the swap didn't happened
9021 * or happened but is now discarded. */
970e10bb 9022 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 9023 vmMarkPagesFree(job->page,job->pages);
9024 /* Cancel the job. It depends on the list the job is
9025 * living in. */
996cb5f7 9026 switch(i) {
9027 case 0: /* io_newjobs */
6c96ba7d 9028 /* If the job was yet not processed the best thing to do
996cb5f7 9029 * is to remove it from the queue at all */
6c96ba7d 9030 freeIOJob(job);
996cb5f7 9031 listDelNode(lists[i],ln);
9032 break;
9033 case 1: /* io_processing */
d5d55fc3 9034 /* Oh Shi- the thread is messing with the Job:
9035 *
9036 * Probably it's accessing the object if this is a
9037 * PREPARE_SWAP or DO_SWAP job.
9038 * If it's a LOAD job it may be reading from disk and
9039 * if we don't wait for the job to terminate before to
9040 * cancel it, maybe in a few microseconds data can be
9041 * corrupted in this pages. So the short story is:
9042 *
9043 * Better to wait for the job to move into the
9044 * next queue (processed)... */
9045
9046 /* We try again and again until the job is completed. */
9047 unlockThreadedIO();
9048 /* But let's wait some time for the I/O thread
9049 * to finish with this job. After all this condition
9050 * should be very rare. */
9051 usleep(1);
9052 goto again;
996cb5f7 9053 case 2: /* io_processed */
2e111efe 9054 /* The job was already processed, that's easy...
9055 * just mark it as canceled so that we'll ignore it
9056 * when processing completed jobs. */
996cb5f7 9057 job->canceled = 1;
9058 break;
9059 }
c7df85a4 9060 /* Finally we have to adjust the storage type of the object
9061 * in order to "UNDO" the operaiton. */
996cb5f7 9062 if (o->storage == REDIS_VM_LOADING)
9063 o->storage = REDIS_VM_SWAPPED;
9064 else if (o->storage == REDIS_VM_SWAPPING)
9065 o->storage = REDIS_VM_MEMORY;
9066 unlockThreadedIO();
9067 return;
9068 }
9069 }
9070 }
9071 unlockThreadedIO();
9072 assert(1 != 1); /* We should never reach this */
9073}
9074
b9bc0eef 9075static void *IOThreadEntryPoint(void *arg) {
9076 iojob *j;
9077 listNode *ln;
9078 REDIS_NOTUSED(arg);
9079
9080 pthread_detach(pthread_self());
9081 while(1) {
9082 /* Get a new job to process */
9083 lockThreadedIO();
9084 if (listLength(server.io_newjobs) == 0) {
9085 /* No new jobs in queue, exit. */
9ebed7cf 9086 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9087 (long) pthread_self());
b9bc0eef 9088 server.io_active_threads--;
9089 unlockThreadedIO();
9090 return NULL;
9091 }
9092 ln = listFirst(server.io_newjobs);
9093 j = ln->value;
9094 listDelNode(server.io_newjobs,ln);
9095 /* Add the job in the processing queue */
9096 j->thread = pthread_self();
9097 listAddNodeTail(server.io_processing,j);
9098 ln = listLast(server.io_processing); /* We use ln later to remove it */
9099 unlockThreadedIO();
9ebed7cf 9100 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9101 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 9102
9103 /* Process the Job */
9104 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9105 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 9106 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9107 FILE *fp = fopen("/dev/null","w+");
9108 j->pages = rdbSavedObjectPages(j->val,fp);
9109 fclose(fp);
9110 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 9111 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9112 j->canceled = 1;
b9bc0eef 9113 }
9114
9115 /* Done: insert the job into the processed queue */
9ebed7cf 9116 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9117 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 9118 lockThreadedIO();
9119 listDelNode(server.io_processing,ln);
9120 listAddNodeTail(server.io_processed,j);
9121 unlockThreadedIO();
e0a62c7f 9122
b9bc0eef 9123 /* Signal the main thread there is new stuff to process */
9124 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9125 }
9126 return NULL; /* never reached */
9127}
9128
9129static void spawnIOThread(void) {
9130 pthread_t thread;
478c2c6f 9131 sigset_t mask, omask;
a97b9060 9132 int err;
b9bc0eef 9133
478c2c6f 9134 sigemptyset(&mask);
9135 sigaddset(&mask,SIGCHLD);
9136 sigaddset(&mask,SIGHUP);
9137 sigaddset(&mask,SIGPIPE);
9138 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 9139 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9140 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9141 strerror(err));
9142 usleep(1000000);
9143 }
478c2c6f 9144 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 9145 server.io_active_threads++;
9146}
9147
4ee9488d 9148/* We need to wait for the last thread to exit before we are able to
9149 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 9150static void waitEmptyIOJobsQueue(void) {
4ee9488d 9151 while(1) {
76b7233a 9152 int io_processed_len;
9153
4ee9488d 9154 lockThreadedIO();
054e426d 9155 if (listLength(server.io_newjobs) == 0 &&
9156 listLength(server.io_processing) == 0 &&
9157 server.io_active_threads == 0)
9158 {
4ee9488d 9159 unlockThreadedIO();
9160 return;
9161 }
76b7233a 9162 /* While waiting for empty jobs queue condition we post-process some
9163 * finshed job, as I/O threads may be hanging trying to write against
9164 * the io_ready_pipe_write FD but there are so much pending jobs that
9165 * it's blocking. */
9166 io_processed_len = listLength(server.io_processed);
4ee9488d 9167 unlockThreadedIO();
76b7233a 9168 if (io_processed_len) {
9169 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9170 usleep(1000); /* 1 millisecond */
9171 } else {
9172 usleep(10000); /* 10 milliseconds */
9173 }
4ee9488d 9174 }
9175}
9176
054e426d 9177static void vmReopenSwapFile(void) {
478c2c6f 9178 /* Note: we don't close the old one as we are in the child process
9179 * and don't want to mess at all with the original file object. */
054e426d 9180 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9181 if (server.vm_fp == NULL) {
9182 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9183 server.vm_swap_file);
478c2c6f 9184 _exit(1);
054e426d 9185 }
9186 server.vm_fd = fileno(server.vm_fp);
9187}
9188
b9bc0eef 9189/* This function must be called while with threaded IO locked */
9190static void queueIOJob(iojob *j) {
6c96ba7d 9191 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9192 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 9193 listAddNodeTail(server.io_newjobs,j);
9194 if (server.io_active_threads < server.vm_max_threads)
9195 spawnIOThread();
9196}
9197
9198static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9199 iojob *j;
e0a62c7f 9200
b9bc0eef 9201 assert(key->storage == REDIS_VM_MEMORY);
9202 assert(key->refcount == 1);
9203
9204 j = zmalloc(sizeof(*j));
9205 j->type = REDIS_IOJOB_PREPARE_SWAP;
9206 j->db = db;
78ebe4c8 9207 j->key = key;
b9bc0eef 9208 j->val = val;
9209 incrRefCount(val);
9210 j->canceled = 0;
9211 j->thread = (pthread_t) -1;
f11b8647 9212 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 9213
9214 lockThreadedIO();
9215 queueIOJob(j);
9216 unlockThreadedIO();
9217 return REDIS_OK;
9218}
9219
b0d8747d 9220/* ============ Virtual Memory - Blocking clients on missing keys =========== */
9221
d5d55fc3 9222/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9223 * If there is not already a job loading the key, it is craeted.
9224 * The key is added to the io_keys list in the client structure, and also
9225 * in the hash table mapping swapped keys to waiting clients, that is,
9226 * server.io_waited_keys. */
9227static int waitForSwappedKey(redisClient *c, robj *key) {
9228 struct dictEntry *de;
9229 robj *o;
9230 list *l;
9231
9232 /* If the key does not exist or is already in RAM we don't need to
9233 * block the client at all. */
9234 de = dictFind(c->db->dict,key);
9235 if (de == NULL) return 0;
9236 o = dictGetEntryKey(de);
9237 if (o->storage == REDIS_VM_MEMORY) {
9238 return 0;
9239 } else if (o->storage == REDIS_VM_SWAPPING) {
9240 /* We were swapping the key, undo it! */
9241 vmCancelThreadedIOJob(o);
9242 return 0;
9243 }
e0a62c7f 9244
d5d55fc3 9245 /* OK: the key is either swapped, or being loaded just now. */
9246
9247 /* Add the key to the list of keys this client is waiting for.
9248 * This maps clients to keys they are waiting for. */
9249 listAddNodeTail(c->io_keys,key);
9250 incrRefCount(key);
9251
9252 /* Add the client to the swapped keys => clients waiting map. */
9253 de = dictFind(c->db->io_keys,key);
9254 if (de == NULL) {
9255 int retval;
9256
9257 /* For every key we take a list of clients blocked for it */
9258 l = listCreate();
9259 retval = dictAdd(c->db->io_keys,key,l);
9260 incrRefCount(key);
9261 assert(retval == DICT_OK);
9262 } else {
9263 l = dictGetEntryVal(de);
9264 }
9265 listAddNodeTail(l,c);
9266
9267 /* Are we already loading the key from disk? If not create a job */
9268 if (o->storage == REDIS_VM_SWAPPED) {
9269 iojob *j;
9270
9271 o->storage = REDIS_VM_LOADING;
9272 j = zmalloc(sizeof(*j));
9273 j->type = REDIS_IOJOB_LOAD;
9274 j->db = c->db;
78ebe4c8 9275 j->key = o;
d5d55fc3 9276 j->key->vtype = o->vtype;
9277 j->page = o->vm.page;
9278 j->val = NULL;
9279 j->canceled = 0;
9280 j->thread = (pthread_t) -1;
9281 lockThreadedIO();
9282 queueIOJob(j);
9283 unlockThreadedIO();
9284 }
9285 return 1;
9286}
9287
76583ea4
PN
9288/* Preload keys needed for the ZUNION and ZINTER commands. */
9289static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9290 int i, num;
9291 num = atoi(c->argv[2]->ptr);
9292 for (i = 0; i < num; i++) {
9293 waitForSwappedKey(c,c->argv[3+i]);
9294 }
9295}
9296
b0d8747d 9297/* Is this client attempting to run a command against swapped keys?
d5d55fc3 9298 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 9299 *
d5d55fc3 9300 * The important idea about this function is that it can fail! If keys will
9301 * still be swapped when the client is resumed, this key lookups will
9302 * just block loading keys from disk. In practical terms this should only
9303 * happen with SORT BY command or if there is a bug in this function.
9304 *
9305 * Return 1 if the client is marked as blocked, 0 if the client can
9306 * continue as the keys it is going to access appear to be in memory. */
9307static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
7c775e09 9308 int j, last;
9309
76583ea4
PN
9310 if (cmd->vm_preload_proc != NULL) {
9311 cmd->vm_preload_proc(c);
9312 } else {
9313 if (cmd->vm_firstkey == 0) return 0;
9314 last = cmd->vm_lastkey;
9315 if (last < 0) last = c->argc+last;
9316 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9317 waitForSwappedKey(c,c->argv[j]);
9318 }
9319
d5d55fc3 9320 /* If the client was blocked for at least one key, mark it as blocked. */
9321 if (listLength(c->io_keys)) {
9322 c->flags |= REDIS_IO_WAIT;
9323 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9324 server.vm_blocked_clients++;
9325 return 1;
9326 } else {
9327 return 0;
9328 }
9329}
9330
9331/* Remove the 'key' from the list of blocked keys for a given client.
9332 *
9333 * The function returns 1 when there are no longer blocking keys after
9334 * the current one was removed (and the client can be unblocked). */
9335static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9336 list *l;
9337 listNode *ln;
9338 listIter li;
9339 struct dictEntry *de;
9340
9341 /* Remove the key from the list of keys this client is waiting for. */
9342 listRewind(c->io_keys,&li);
9343 while ((ln = listNext(&li)) != NULL) {
9344 if (compareStringObjects(ln->value,key) == 0) {
9345 listDelNode(c->io_keys,ln);
9346 break;
9347 }
9348 }
9349 assert(ln != NULL);
9350
9351 /* Remove the client form the key => waiting clients map. */
9352 de = dictFind(c->db->io_keys,key);
9353 assert(de != NULL);
9354 l = dictGetEntryVal(de);
9355 ln = listSearchKey(l,c);
9356 assert(ln != NULL);
9357 listDelNode(l,ln);
9358 if (listLength(l) == 0)
9359 dictDelete(c->db->io_keys,key);
9360
9361 return listLength(c->io_keys) == 0;
9362}
9363
9364static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9365 struct dictEntry *de;
9366 list *l;
9367 listNode *ln;
9368 int len;
9369
9370 de = dictFind(db->io_keys,key);
9371 if (!de) return;
9372
9373 l = dictGetEntryVal(de);
9374 len = listLength(l);
9375 /* Note: we can't use something like while(listLength(l)) as the list
9376 * can be freed by the calling function when we remove the last element. */
9377 while (len--) {
9378 ln = listFirst(l);
9379 redisClient *c = ln->value;
9380
9381 if (dontWaitForSwappedKey(c,key)) {
9382 /* Put the client in the list of clients ready to go as we
9383 * loaded all the keys about it. */
9384 listAddNodeTail(server.io_ready_clients,c);
9385 }
9386 }
b0d8747d 9387}
b0d8747d 9388
500ece7c 9389/* =========================== Remote Configuration ========================= */
9390
9391static void configSetCommand(redisClient *c) {
9392 robj *o = getDecodedObject(c->argv[3]);
9393 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9394 zfree(server.dbfilename);
9395 server.dbfilename = zstrdup(o->ptr);
9396 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9397 zfree(server.requirepass);
9398 server.requirepass = zstrdup(o->ptr);
9399 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9400 zfree(server.masterauth);
9401 server.masterauth = zstrdup(o->ptr);
9402 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9403 server.maxmemory = strtoll(o->ptr, NULL, 10);
9404 } else {
9405 addReplySds(c,sdscatprintf(sdsempty(),
9406 "-ERR not supported CONFIG parameter %s\r\n",
9407 (char*)c->argv[2]->ptr));
9408 decrRefCount(o);
9409 return;
9410 }
9411 decrRefCount(o);
9412 addReply(c,shared.ok);
9413}
9414
9415static void configGetCommand(redisClient *c) {
9416 robj *o = getDecodedObject(c->argv[2]);
9417 robj *lenobj = createObject(REDIS_STRING,NULL);
9418 char *pattern = o->ptr;
9419 int matches = 0;
9420
9421 addReply(c,lenobj);
9422 decrRefCount(lenobj);
9423
9424 if (stringmatch(pattern,"dbfilename",0)) {
9425 addReplyBulkCString(c,"dbfilename");
9426 addReplyBulkCString(c,server.dbfilename);
9427 matches++;
9428 }
9429 if (stringmatch(pattern,"requirepass",0)) {
9430 addReplyBulkCString(c,"requirepass");
9431 addReplyBulkCString(c,server.requirepass);
9432 matches++;
9433 }
9434 if (stringmatch(pattern,"masterauth",0)) {
9435 addReplyBulkCString(c,"masterauth");
9436 addReplyBulkCString(c,server.masterauth);
9437 matches++;
9438 }
9439 if (stringmatch(pattern,"maxmemory",0)) {
9440 char buf[128];
9441
9442 snprintf(buf,128,"%llu\n",server.maxmemory);
9443 addReplyBulkCString(c,"maxmemory");
9444 addReplyBulkCString(c,buf);
9445 matches++;
9446 }
9447 decrRefCount(o);
9448 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9449}
9450
9451static void configCommand(redisClient *c) {
9452 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9453 if (c->argc != 4) goto badarity;
9454 configSetCommand(c);
9455 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9456 if (c->argc != 3) goto badarity;
9457 configGetCommand(c);
9458 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9459 if (c->argc != 2) goto badarity;
9460 server.stat_numcommands = 0;
9461 server.stat_numconnections = 0;
9462 server.stat_expiredkeys = 0;
9463 server.stat_starttime = time(NULL);
9464 addReply(c,shared.ok);
9465 } else {
9466 addReplySds(c,sdscatprintf(sdsempty(),
9467 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9468 }
9469 return;
9470
9471badarity:
9472 addReplySds(c,sdscatprintf(sdsempty(),
9473 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9474 (char*) c->argv[1]->ptr));
9475}
9476
befec3cd 9477/* =========================== Pubsub implementation ======================== */
9478
ffc6b7f8 9479static void freePubsubPattern(void *p) {
9480 pubsubPattern *pat = p;
9481
9482 decrRefCount(pat->pattern);
9483 zfree(pat);
9484}
9485
9486static int listMatchPubsubPattern(void *a, void *b) {
9487 pubsubPattern *pa = a, *pb = b;
9488
9489 return (pa->client == pb->client) &&
9490 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9491}
9492
9493/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9494 * 0 if the client was already subscribed to that channel. */
9495static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 9496 struct dictEntry *de;
9497 list *clients = NULL;
9498 int retval = 0;
9499
ffc6b7f8 9500 /* Add the channel to the client -> channels hash table */
9501 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 9502 retval = 1;
ffc6b7f8 9503 incrRefCount(channel);
9504 /* Add the client to the channel -> list of clients hash table */
9505 de = dictFind(server.pubsub_channels,channel);
befec3cd 9506 if (de == NULL) {
9507 clients = listCreate();
ffc6b7f8 9508 dictAdd(server.pubsub_channels,channel,clients);
9509 incrRefCount(channel);
befec3cd 9510 } else {
9511 clients = dictGetEntryVal(de);
9512 }
9513 listAddNodeTail(clients,c);
9514 }
9515 /* Notify the client */
9516 addReply(c,shared.mbulk3);
9517 addReply(c,shared.subscribebulk);
ffc6b7f8 9518 addReplyBulk(c,channel);
9519 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 9520 return retval;
9521}
9522
ffc6b7f8 9523/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9524 * 0 if the client was not subscribed to the specified channel. */
9525static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 9526 struct dictEntry *de;
9527 list *clients;
9528 listNode *ln;
9529 int retval = 0;
9530
ffc6b7f8 9531 /* Remove the channel from the client -> channels hash table */
9532 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 9533 we have in the hash tables. Protect it... */
ffc6b7f8 9534 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 9535 retval = 1;
ffc6b7f8 9536 /* Remove the client from the channel -> clients list hash table */
9537 de = dictFind(server.pubsub_channels,channel);
befec3cd 9538 assert(de != NULL);
9539 clients = dictGetEntryVal(de);
9540 ln = listSearchKey(clients,c);
9541 assert(ln != NULL);
9542 listDelNode(clients,ln);
ff767a75 9543 if (listLength(clients) == 0) {
9544 /* Free the list and associated hash entry at all if this was
9545 * the latest client, so that it will be possible to abuse
ffc6b7f8 9546 * Redis PUBSUB creating millions of channels. */
9547 dictDelete(server.pubsub_channels,channel);
ff767a75 9548 }
befec3cd 9549 }
9550 /* Notify the client */
9551 if (notify) {
9552 addReply(c,shared.mbulk3);
9553 addReply(c,shared.unsubscribebulk);
ffc6b7f8 9554 addReplyBulk(c,channel);
9555 addReplyLong(c,dictSize(c->pubsub_channels)+
9556 listLength(c->pubsub_patterns));
9557
9558 }
9559 decrRefCount(channel); /* it is finally safe to release it */
9560 return retval;
9561}
9562
9563/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9564static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9565 int retval = 0;
9566
9567 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9568 retval = 1;
9569 pubsubPattern *pat;
9570 listAddNodeTail(c->pubsub_patterns,pattern);
9571 incrRefCount(pattern);
9572 pat = zmalloc(sizeof(*pat));
9573 pat->pattern = getDecodedObject(pattern);
9574 pat->client = c;
9575 listAddNodeTail(server.pubsub_patterns,pat);
9576 }
9577 /* Notify the client */
9578 addReply(c,shared.mbulk3);
9579 addReply(c,shared.psubscribebulk);
9580 addReplyBulk(c,pattern);
9581 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9582 return retval;
9583}
9584
9585/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9586 * 0 if the client was not subscribed to the specified channel. */
9587static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9588 listNode *ln;
9589 pubsubPattern pat;
9590 int retval = 0;
9591
9592 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9593 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9594 retval = 1;
9595 listDelNode(c->pubsub_patterns,ln);
9596 pat.client = c;
9597 pat.pattern = pattern;
9598 ln = listSearchKey(server.pubsub_patterns,&pat);
9599 listDelNode(server.pubsub_patterns,ln);
9600 }
9601 /* Notify the client */
9602 if (notify) {
9603 addReply(c,shared.mbulk3);
9604 addReply(c,shared.punsubscribebulk);
9605 addReplyBulk(c,pattern);
9606 addReplyLong(c,dictSize(c->pubsub_channels)+
9607 listLength(c->pubsub_patterns));
befec3cd 9608 }
ffc6b7f8 9609 decrRefCount(pattern);
befec3cd 9610 return retval;
9611}
9612
ffc6b7f8 9613/* Unsubscribe from all the channels. Return the number of channels the
9614 * client was subscribed from. */
9615static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9616 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 9617 dictEntry *de;
9618 int count = 0;
9619
9620 while((de = dictNext(di)) != NULL) {
ffc6b7f8 9621 robj *channel = dictGetEntryKey(de);
befec3cd 9622
ffc6b7f8 9623 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 9624 }
9625 dictReleaseIterator(di);
9626 return count;
9627}
9628
ffc6b7f8 9629/* Unsubscribe from all the patterns. Return the number of patterns the
9630 * client was subscribed from. */
9631static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9632 listNode *ln;
9633 listIter li;
9634 int count = 0;
9635
9636 listRewind(c->pubsub_patterns,&li);
9637 while ((ln = listNext(&li)) != NULL) {
9638 robj *pattern = ln->value;
9639
9640 count += pubsubUnsubscribePattern(c,pattern,notify);
9641 }
9642 return count;
9643}
9644
befec3cd 9645/* Publish a message */
ffc6b7f8 9646static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 9647 int receivers = 0;
9648 struct dictEntry *de;
ffc6b7f8 9649 listNode *ln;
9650 listIter li;
befec3cd 9651
ffc6b7f8 9652 /* Send to clients listening for that channel */
9653 de = dictFind(server.pubsub_channels,channel);
befec3cd 9654 if (de) {
9655 list *list = dictGetEntryVal(de);
9656 listNode *ln;
9657 listIter li;
9658
9659 listRewind(list,&li);
9660 while ((ln = listNext(&li)) != NULL) {
9661 redisClient *c = ln->value;
9662
9663 addReply(c,shared.mbulk3);
9664 addReply(c,shared.messagebulk);
ffc6b7f8 9665 addReplyBulk(c,channel);
befec3cd 9666 addReplyBulk(c,message);
9667 receivers++;
9668 }
9669 }
ffc6b7f8 9670 /* Send to clients listening to matching channels */
9671 if (listLength(server.pubsub_patterns)) {
9672 listRewind(server.pubsub_patterns,&li);
9673 channel = getDecodedObject(channel);
9674 while ((ln = listNext(&li)) != NULL) {
9675 pubsubPattern *pat = ln->value;
9676
9677 if (stringmatchlen((char*)pat->pattern->ptr,
9678 sdslen(pat->pattern->ptr),
9679 (char*)channel->ptr,
9680 sdslen(channel->ptr),0)) {
9681 addReply(pat->client,shared.mbulk3);
9682 addReply(pat->client,shared.messagebulk);
9683 addReplyBulk(pat->client,channel);
9684 addReplyBulk(pat->client,message);
9685 receivers++;
9686 }
9687 }
9688 decrRefCount(channel);
9689 }
befec3cd 9690 return receivers;
9691}
9692
9693static void subscribeCommand(redisClient *c) {
9694 int j;
9695
9696 for (j = 1; j < c->argc; j++)
ffc6b7f8 9697 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 9698}
9699
9700static void unsubscribeCommand(redisClient *c) {
9701 if (c->argc == 1) {
ffc6b7f8 9702 pubsubUnsubscribeAllChannels(c,1);
9703 return;
9704 } else {
9705 int j;
9706
9707 for (j = 1; j < c->argc; j++)
9708 pubsubUnsubscribeChannel(c,c->argv[j],1);
9709 }
9710}
9711
9712static void psubscribeCommand(redisClient *c) {
9713 int j;
9714
9715 for (j = 1; j < c->argc; j++)
9716 pubsubSubscribePattern(c,c->argv[j]);
9717}
9718
9719static void punsubscribeCommand(redisClient *c) {
9720 if (c->argc == 1) {
9721 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 9722 return;
9723 } else {
9724 int j;
9725
9726 for (j = 1; j < c->argc; j++)
ffc6b7f8 9727 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 9728 }
9729}
9730
9731static void publishCommand(redisClient *c) {
9732 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9733 addReplyLong(c,receivers);
9734}
9735
7f957c92 9736/* ================================= Debugging ============================== */
9737
9738static void debugCommand(redisClient *c) {
9739 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9740 *((char*)-1) = 'x';
210e29f7 9741 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9742 if (rdbSave(server.dbfilename) != REDIS_OK) {
9743 addReply(c,shared.err);
9744 return;
9745 }
9746 emptyDb();
9747 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9748 addReply(c,shared.err);
9749 return;
9750 }
9751 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9752 addReply(c,shared.ok);
71c2b467 9753 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9754 emptyDb();
9755 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9756 addReply(c,shared.err);
9757 return;
9758 }
9759 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9760 addReply(c,shared.ok);
333298da 9761 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9762 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9763 robj *key, *val;
9764
9765 if (!de) {
9766 addReply(c,shared.nokeyerr);
9767 return;
9768 }
9769 key = dictGetEntryKey(de);
9770 val = dictGetEntryVal(de);
59146ef3 9771 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9772 key->storage == REDIS_VM_SWAPPING)) {
07efaf74 9773 char *strenc;
9774 char buf[128];
9775
9776 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9777 strenc = strencoding[val->encoding];
9778 } else {
9779 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9780 strenc = buf;
9781 }
ace06542 9782 addReplySds(c,sdscatprintf(sdsempty(),
9783 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 9784 "encoding:%s serializedlength:%lld\r\n",
682ac724 9785 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 9786 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 9787 } else {
9788 addReplySds(c,sdscatprintf(sdsempty(),
9789 "+Key at:%p refcount:%d, value swapped at: page %llu "
9790 "using %llu pages\r\n",
9791 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9792 (unsigned long long) key->vm.usedpages));
9793 }
78ebe4c8 9794 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
9795 lookupKeyRead(c->db,c->argv[2]);
9796 addReply(c,shared.ok);
7d30035d 9797 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9798 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9799 robj *key, *val;
9800
9801 if (!server.vm_enabled) {
9802 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9803 return;
9804 }
9805 if (!de) {
9806 addReply(c,shared.nokeyerr);
9807 return;
9808 }
9809 key = dictGetEntryKey(de);
9810 val = dictGetEntryVal(de);
4ef8de8a 9811 /* If the key is shared we want to create a copy */
9812 if (key->refcount > 1) {
9813 robj *newkey = dupStringObject(key);
9814 decrRefCount(key);
9815 key = dictGetEntryKey(de) = newkey;
9816 }
9817 /* Swap it */
7d30035d 9818 if (key->storage != REDIS_VM_MEMORY) {
9819 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 9820 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 9821 dictGetEntryVal(de) = NULL;
9822 addReply(c,shared.ok);
9823 } else {
9824 addReply(c,shared.err);
9825 }
7f957c92 9826 } else {
333298da 9827 addReplySds(c,sdsnew(
bdcb92f2 9828 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 9829 }
9830}
56906eef 9831
6c96ba7d 9832static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 9833 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
6c96ba7d 9834 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
dfc5e96c 9835#ifdef HAVE_BACKTRACE
9836 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9837 *((char*)-1) = 'x';
9838#endif
9839}
9840
bcfc686d 9841/* =================================== Main! ================================ */
56906eef 9842
bcfc686d 9843#ifdef __linux__
/* Read the integer stored in /proc/sys/vm/overcommit_memory.
 *
 * Returns the parsed value, or -1 if the file cannot be opened, cannot be
 * read, or does not start with a number. Returning -1 for unparsable content
 * (instead of the 0 that atoi() would produce) avoids confusing garbage with
 * a real setting of 0. */
int linuxOvercommitMemoryValue(void) {
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
    char buf[64];
    char *end;
    long value;

    if (!fp) return -1;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return -1;
    }
    fclose(fp);

    value = strtol(buf,&end,10);
    if (end == buf) return -1; /* no digits were consumed */
    return (int)value;
}
9857
9858void linuxOvercommitMemoryWarning(void) {
9859 if (linuxOvercommitMemoryValue() == 0) {
9860 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9861 }
9862}
9863#endif /* __linux__ */
9864
9865static void daemonize(void) {
9866 int fd;
9867 FILE *fp;
9868
9869 if (fork() != 0) exit(0); /* parent exits */
9870 setsid(); /* create a new session */
9871
9872 /* Every output goes to /dev/null. If Redis is daemonized but
9873 * the 'logfile' is set to 'stdout' in the configuration file
9874 * it will not log at all. */
9875 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9876 dup2(fd, STDIN_FILENO);
9877 dup2(fd, STDOUT_FILENO);
9878 dup2(fd, STDERR_FILENO);
9879 if (fd > STDERR_FILENO) close(fd);
9880 }
9881 /* Try to write the pid file */
9882 fp = fopen(server.pidfile,"w");
9883 if (fp) {
9884 fprintf(fp,"%d\n",getpid());
9885 fclose(fp);
56906eef 9886 }
56906eef 9887}
9888
42ab0172
AO
9889static void version() {
9890 printf("Redis server version %s\n", REDIS_VERSION);
9891 exit(0);
9892}
9893
723fb69b
AO
/* Print the command line synopsis on standard error and terminate with
 * status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
9899
bcfc686d 9900int main(int argc, char **argv) {
9651a787 9901 time_t start;
9902
bcfc686d 9903 initServerConfig();
9904 if (argc == 2) {
44efe66e 9905 if (strcmp(argv[1], "-v") == 0 ||
9906 strcmp(argv[1], "--version") == 0) version();
9907 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 9908 resetServerSaveParams();
9909 loadServerConfig(argv[1]);
723fb69b
AO
9910 } else if ((argc > 2)) {
9911 usage();
bcfc686d 9912 } else {
9913 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9914 }
bcfc686d 9915 if (server.daemonize) daemonize();
71c54b21 9916 initServer();
bcfc686d 9917 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9918#ifdef __linux__
9919 linuxOvercommitMemoryWarning();
9920#endif
9651a787 9921 start = time(NULL);
bcfc686d 9922 if (server.appendonly) {
9923 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 9924 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 9925 } else {
9926 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 9927 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 9928 }
bcfc686d 9929 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 9930 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 9931 aeMain(server.el);
9932 aeDeleteEventLoop(server.el);
9933 return 0;
9934}
9935
9936/* ============================= Backtrace support ========================= */
9937
9938#ifdef HAVE_BACKTRACE
9939static char *findFuncName(void *pointer, unsigned long *offset);
9940
56906eef 9941static void *getMcontextEip(ucontext_t *uc) {
9942#if defined(__FreeBSD__)
9943 return (void*) uc->uc_mcontext.mc_eip;
9944#elif defined(__dietlibc__)
9945 return (void*) uc->uc_mcontext.eip;
06db1f50 9946#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 9947 #if __x86_64__
9948 return (void*) uc->uc_mcontext->__ss.__rip;
9949 #else
56906eef 9950 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 9951 #endif
06db1f50 9952#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 9953 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 9954 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 9955 #else
9956 return (void*) uc->uc_mcontext->__ss.__eip;
e0a62c7f 9957 #endif
54bac49d 9958#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 9959 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 9960#elif defined(__ia64__) /* Linux IA64 */
9961 return (void*) uc->uc_mcontext.sc_ip;
9962#else
9963 return NULL;
56906eef 9964#endif
9965}
9966
9967static void segvHandler(int sig, siginfo_t *info, void *secret) {
9968 void *trace[100];
9969 char **messages = NULL;
9970 int i, trace_size = 0;
9971 unsigned long offset=0;
56906eef 9972 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 9973 sds infostring;
56906eef 9974 REDIS_NOTUSED(info);
9975
9976 redisLog(REDIS_WARNING,
9977 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 9978 infostring = genRedisInfoString();
9979 redisLog(REDIS_WARNING, "%s",infostring);
9980 /* It's not safe to sdsfree() the returned string under memory
9981 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 9982
56906eef 9983 trace_size = backtrace(trace, 100);
de96dbfe 9984 /* overwrite sigaction with caller's address */
b91cf5ef 9985 if (getMcontextEip(uc) != NULL) {
9986 trace[1] = getMcontextEip(uc);
9987 }
56906eef 9988 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 9989
d76412d1 9990 for (i=1; i<trace_size; ++i) {
56906eef 9991 char *fn = findFuncName(trace[i], &offset), *p;
9992
9993 p = strchr(messages[i],'+');
9994 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
9995 redisLog(REDIS_WARNING,"%s", messages[i]);
9996 } else {
9997 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
9998 }
9999 }
b177fd30 10000 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 10001 _exit(0);
fe3bbfbe 10002}
56906eef 10003
10004static void setupSigSegvAction(void) {
10005 struct sigaction act;
10006
10007 sigemptyset (&act.sa_mask);
10008 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10009 * is used. Otherwise, sa_handler is used */
10010 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10011 act.sa_sigaction = segvHandler;
10012 sigaction (SIGSEGV, &act, NULL);
10013 sigaction (SIGBUS, &act, NULL);
12fea928 10014 sigaction (SIGFPE, &act, NULL);
10015 sigaction (SIGILL, &act, NULL);
10016 sigaction (SIGBUS, &act, NULL);
e65fdc78 10017 return;
56906eef 10018}
e65fdc78 10019
bcfc686d 10020#include "staticsymbols.h"
10021/* This function try to convert a pointer into a function name. It's used in
10022 * oreder to provide a backtrace under segmentation fault that's able to
10023 * display functions declared as static (otherwise the backtrace is useless). */
10024static char *findFuncName(void *pointer, unsigned long *offset){
10025 int i, ret = -1;
10026 unsigned long off, minoff = 0;
ed9b544e 10027
bcfc686d 10028 /* Try to match against the Symbol with the smallest offset */
10029 for (i=0; symsTable[i].pointer; i++) {
10030 unsigned long lp = (unsigned long) pointer;
0bc03378 10031
bcfc686d 10032 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10033 off=lp-symsTable[i].pointer;
10034 if (ret < 0 || off < minoff) {
10035 minoff=off;
10036 ret=i;
10037 }
10038 }
0bc03378 10039 }
bcfc686d 10040 if (ret == -1) return NULL;
10041 *offset = minoff;
10042 return symsTable[ret].name;
0bc03378 10043}
bcfc686d 10044#else /* HAVE_BACKTRACE */
/* Without HAVE_BACKTRACE no handler is installed: this is a no-op. */
static void setupSigSegvAction(void) {
    /* nothing to do */
}
bcfc686d 10047#endif /* HAVE_BACKTRACE */
0bc03378 10048
ed9b544e 10049
ed9b544e 10050
bcfc686d 10051/* The End */
10052
10053
ed9b544e 10054