]> git.saurik.com Git - redis.git/blame - redis.c
Merge branch 'hmget' of git://github.com/pietern/redis
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
/* Version string of this Redis source tree. */
#define REDIS_VERSION "1.3.8"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
c9468bcf 40#define __USE_POSIX199309
54bac49d 41#define __USE_UNIX98
ed9b544e 42#include <signal.h>
fbf9bcdb 43
44#ifdef HAVE_BACKTRACE
c9468bcf 45#include <execinfo.h>
46#include <ucontext.h>
fbf9bcdb 47#endif /* HAVE_BACKTRACE */
48
ed9b544e 49#include <sys/wait.h>
50#include <errno.h>
51#include <assert.h>
52#include <ctype.h>
53#include <stdarg.h>
54#include <inttypes.h>
55#include <arpa/inet.h>
56#include <sys/stat.h>
57#include <fcntl.h>
58#include <sys/time.h>
59#include <sys/resource.h>
2895e862 60#include <sys/uio.h>
f78fd11b 61#include <limits.h>
a7866db6 62#include <math.h>
92f8e882 63#include <pthread.h>
0bc1b2f6 64
65#if defined(__sun)
5043dff3 66#include "solarisfixes.h"
67#endif
ed9b544e 68
c9468bcf 69#include "redis.h"
ed9b544e 70#include "ae.h" /* Event driven programming library */
71#include "sds.h" /* Dynamic safe strings */
72#include "anet.h" /* Networking the easy way */
73#include "dict.h" /* Hash tables */
74#include "adlist.h" /* Linked lists */
75#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 76#include "lzf.h" /* LZF compression library */
77#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
5234952b 78#include "zipmap.h"
ed9b544e 79
/* Error codes */
#define REDIS_OK 0
#define REDIS_ERR (-1) /* parenthesized so it is safe inside any expression */
83
/* Static server configuration */
#define REDIS_SERVERPORT 6379 /* TCP port */
#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
#define REDIS_IOBUF_LEN 1024
#define REDIS_LOADBUF_LEN 1024
#define REDIS_STATIC_ARGS 8
#define REDIS_DEFAULT_DBNUM 16
#define REDIS_CONFIGLINE_MAX 1024
#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* try to expire 10 keys/loop */
#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */

/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
#define REDIS_WRITEV_THRESHOLD 3
/* Max number of iovecs used for each writev call */
#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 102
/* Hash table parameters */
#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 105
/* Command flags. Each flag is a distinct bit so they can be OR-ed together. */
#define REDIS_CMD_BULK 1 /* Bulk write command */
#define REDIS_CMD_INLINE 2 /* Inline command */
/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
   this flag will return an error when the 'maxmemory' option is set in the
   config file and the server is using more than maxmemory bytes of memory.
   In short these commands are denied on low memory conditions. */
#define REDIS_CMD_DENYOOM 4
#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 115
/* Object types */
#define REDIS_STRING 0
#define REDIS_LIST 1
#define REDIS_SET 2
#define REDIS_ZSET 3
#define REDIS_HASH 4
f78fd11b 122
/* Objects encoding. Some kinds of objects, like Strings and Hashes, can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of these values. */
#define REDIS_ENCODING_RAW 0 /* Raw representation */
#define REDIS_ENCODING_INT 1 /* Encoded as integer */
#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
#define REDIS_ENCODING_HT 3 /* Encoded as a hash table */

/* Human readable encoding names. The array is laid out so that
 * strencoding[REDIS_ENCODING_x] is the name of encoding x; keep the order
 * in sync with the defines above. */
static char *strencoding[] = {
    "raw", "int", "zipmap", "hashtable"
};
134
/* Object types only used for dumping to disk */
#define REDIS_EXPIRETIME 253
#define REDIS_SELECTDB 254
#define REDIS_EOF 255
139
/* Defines related to the dump file format. Storing 32 bit lengths for short
 * keys requires a lot of space, so we check the most significant 2 bits of
 * the first byte to interpret the length:
 *
 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
 * 11|000000 this means: specially encoded object will follow. The six bits
 *           number specify the kind of object that follows.
 *           See the REDIS_RDB_ENC_* defines.
 *
 * Lengths up to 63 are stored using a single byte, most DB keys, and many
 * values, will fit inside. */
#define REDIS_RDB_6BITLEN 0
#define REDIS_RDB_14BITLEN 1
#define REDIS_RDB_32BITLEN 2
#define REDIS_RDB_ENCVAL 3
#define REDIS_RDB_LENERR UINT_MAX
158
/* When the length of a string object stored on disk has the first two bits
 * set, the remaining bits specify a special encoding for the object
 * according to the following defines: */
#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
a4d1ba9a 166
/* Virtual memory object->where field. */
#define REDIS_VM_MEMORY 0 /* The object is on memory */
#define REDIS_VM_SWAPPED 1 /* The object is on disk */
#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
172
/* Virtual memory static configuration stuff.
 * Check vmFindContiguousPages() to know more about these magic numbers. */
#define REDIS_VM_MAX_NEAR_PAGES 65536
#define REDIS_VM_MAX_RANDOM_JUMP 4096
#define REDIS_VM_MAX_THREADS 32
#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
/* The following is the *percentage* of completed I/O jobs to process when the
 * handler is called. While Virtual Memory I/O operations are performed by
 * threads, these operations must be processed by the main thread when
 * completed in order to take effect. */
#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 184
/* Client flags. Each flag is a distinct bit so they can be OR-ed together. */
#define REDIS_SLAVE 1 /* This client is a slave server */
#define REDIS_MASTER 2 /* This client is a master server */
#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
#define REDIS_MULTI 8 /* This client is in a MULTI context */
#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
ed9b544e 192
/* Slave replication state - slave side */
#define REDIS_REPL_NONE 0 /* No active replication */
#define REDIS_REPL_CONNECT 1 /* Must connect to master */
#define REDIS_REPL_CONNECTED 2 /* Connected to master */

/* Slave replication state - from the point of view of master
 * Note that in SEND_BULK and ONLINE state the slave receives new updates
 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
 * to start the next background saving in order to send updates to it. */
#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
206
/* List related stuff */
#define REDIS_HEAD 0
#define REDIS_TAIL 1

/* Sort operations */
#define REDIS_SORT_GET 0
#define REDIS_SORT_ASC 1
#define REDIS_SORT_DESC 2
#define REDIS_SORTKEY_MAX 1024

/* Log levels */
#define REDIS_DEBUG 0
#define REDIS_VERBOSE 1
#define REDIS_NOTICE 2
#define REDIS_WARNING 3
ed9b544e 222
/* Anti-warning macro: evaluates V and discards the result by casting it to
 * void. The argument is parenthesized so that any expression, not only a
 * plain identifier, can be passed. */
#define REDIS_NOTUSED(V) ((void)(V))
225
#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */

/* Append only defines */
#define APPENDFSYNC_NO 0
#define APPENDFSYNC_ALWAYS 1
#define APPENDFSYNC_EVERYSEC 2

/* Hashes related defaults */
#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
237
/* We can print the stacktrace, so our assert is defined this way: when the
 * expression _e is false, _redisAssert() is called with the stringified
 * expression, the source file and the line number, and then _exit(1) is
 * called. When _e is true the macro evaluates to a void no-op. */
#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
static void _redisAssert(char *estr, char *file, int line);
dfc5e96c 241
ed9b544e 242/*================================= Data types ============================== */
243
244/* A redis object, that is a type able to hold a string / list / set */
75680a3c 245
/* The VM object structure: on-disk location and last access time of an
 * object. It is embedded as the 'vm' member of struct redisObject.
 *
 * Note: this is a pure type declaration. The original text ended with
 * "} vm;", which also defined an external global variable named 'vm';
 * that stray declarator is dropped here. */
struct redisObjectVM {
    off_t page;         /* the page at which the object is stored on disk */
    off_t usedpages;    /* number of pages used on disk */
    time_t atime;       /* Last access time */
};
252
253/* The actual Redis Object */
ed9b544e 254typedef struct redisObject {
ed9b544e 255 void *ptr;
942a3961 256 unsigned char type;
257 unsigned char encoding;
d894161b 258 unsigned char storage; /* If this object is a key, where is the value?
259 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
260 unsigned char vtype; /* If this object is a key, and value is swapped out,
261 * this is the type of the swapped out object. */
ed9b544e 262 int refcount;
75680a3c 263 /* VM fields, this are only allocated if VM is active, otherwise the
264 * object allocation function will just allocate
265 * sizeof(redisObjct) minus sizeof(redisObjectVM), so using
266 * Redis without VM active will not have any overhead. */
267 struct redisObjectVM vm;
ed9b544e 268} robj;
269
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is the object (an lvalue, not a pointer) and _ptr the value stored in
 * its ptr field. The object becomes a raw-encoded string with refcount 1;
 * the storage field is only written when server.vm_enabled is set.
 *
 * The expansion is one statement WITHOUT a trailing semicolon: the caller
 * supplies the ';', so the macro is safe as the unbraced body of an if/else. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
281
3305306f 282typedef struct redisDb {
4409877e 283 dict *dict; /* The keyspace for this DB */
284 dict *expires; /* Timeout of keys with a timeout set */
285 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 286 dict *io_keys; /* Keys with clients waiting for VM I/O */
3305306f 287 int id;
288} redisDb;
289
6e469882 290/* Client MULTI/EXEC state */
291typedef struct multiCmd {
292 robj **argv;
293 int argc;
294 struct redisCommand *cmd;
295} multiCmd;
296
297typedef struct multiState {
298 multiCmd *commands; /* Array of MULTI commands */
299 int count; /* Total number of MULTI commands */
300} multiState;
301
ed9b544e 302/* With multiplexing we need to take per-clinet state.
303 * Clients are taken in a liked list. */
304typedef struct redisClient {
305 int fd;
3305306f 306 redisDb *db;
ed9b544e 307 int dictid;
308 sds querybuf;
e8a74421 309 robj **argv, **mbargv;
310 int argc, mbargc;
40d224a9 311 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 312 int multibulk; /* multi bulk command format active */
ed9b544e 313 list *reply;
314 int sentlen;
315 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 316 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 317 int slaveseldb; /* slave selected db, if this client is a slave */
318 int authenticated; /* when requirepass is non-NULL */
319 int replstate; /* replication state if this is a slave */
320 int repldbfd; /* replication DB file descriptor */
6e469882 321 long repldboff; /* replication DB file offset */
40d224a9 322 off_t repldbsize; /* replication DB file size */
6e469882 323 multiState mstate; /* MULTI/EXEC state */
d5d55fc3 324 robj **blockingkeys; /* The key we are waiting to terminate a blocking
4409877e 325 * operation such as BLPOP. Otherwise NULL. */
b177fd30 326 int blockingkeysnum; /* Number of blocking keys */
4409877e 327 time_t blockingto; /* Blocking operation timeout. If UNIX current time
328 * is >= blockingto then the operation timed out. */
92f8e882 329 list *io_keys; /* Keys this client is waiting to be loaded from the
330 * swap file in order to continue. */
ffc6b7f8 331 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
332 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 333} redisClient;
334
/* One save point: a (seconds, changes) pair. The server keeps an array of
 * these in redisServer.saveparams. */
struct saveparam {
    time_t seconds;
    int changes;
};
339
340/* Global server state structure */
341struct redisServer {
342 int port;
343 int fd;
3305306f 344 redisDb *db;
ed9b544e 345 long long dirty; /* changes to DB from the last save */
346 list *clients;
87eca727 347 list *slaves, *monitors;
ed9b544e 348 char neterr[ANET_ERR_LEN];
349 aeEventLoop *el;
350 int cronloops; /* number of times the cron function run */
351 list *objfreelist; /* A list of freed objects to avoid malloc() */
352 time_t lastsave; /* Unix time of last save succeeede */
ed9b544e 353 /* Fields used only for stats */
354 time_t stat_starttime; /* server start time */
355 long long stat_numcommands; /* number of processed commands */
356 long long stat_numconnections; /* number of connections received */
2a6a2ed1 357 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 358 /* Configuration */
359 int verbosity;
360 int glueoutputbuf;
361 int maxidletime;
362 int dbnum;
363 int daemonize;
44b38ef4 364 int appendonly;
48f0308a 365 int appendfsync;
366 time_t lastfsync;
44b38ef4 367 int appendfd;
368 int appendseldb;
ed329fcf 369 char *pidfile;
9f3c422c 370 pid_t bgsavechildpid;
9d65a1bb 371 pid_t bgrewritechildpid;
372 sds bgrewritebuf; /* buffer taken by parent during oppend only rewrite */
ed9b544e 373 struct saveparam *saveparams;
374 int saveparamslen;
375 char *logfile;
376 char *bindaddr;
377 char *dbfilename;
44b38ef4 378 char *appendfilename;
abcb223e 379 char *requirepass;
10c43610 380 int shareobjects;
121f70cf 381 int rdbcompression;
ed9b544e 382 /* Replication related */
383 int isslave;
d0ccebcf 384 char *masterauth;
ed9b544e 385 char *masterhost;
386 int masterport;
40d224a9 387 redisClient *master; /* client that is master for this slave */
ed9b544e 388 int replstate;
285add55 389 unsigned int maxclients;
4ef8de8a 390 unsigned long long maxmemory;
d5d55fc3 391 unsigned int blpop_blocked_clients;
392 unsigned int vm_blocked_clients;
ed9b544e 393 /* Sort parameters - qsort_r() is only available under BSD so we
394 * have to take this state global, in order to pass it to sortCompare() */
395 int sort_desc;
396 int sort_alpha;
397 int sort_bypattern;
75680a3c 398 /* Virtual memory configuration */
399 int vm_enabled;
054e426d 400 char *vm_swap_file;
75680a3c 401 off_t vm_page_size;
402 off_t vm_pages;
4ef8de8a 403 unsigned long long vm_max_memory;
cbba7dd7 404 /* Hashes config */
405 size_t hash_max_zipmap_entries;
406 size_t hash_max_zipmap_value;
75680a3c 407 /* Virtual memory state */
408 FILE *vm_fp;
409 int vm_fd;
410 off_t vm_next_page; /* Next probably empty page */
411 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 412 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 413 time_t unixtime; /* Unix time sampled every second. */
92f8e882 414 /* Virtual memory I/O threads stuff */
92f8e882 415 /* An I/O thread process an element taken from the io_jobs queue and
996cb5f7 416 * put the result of the operation in the io_done list. While the
417 * job is being processed, it's put on io_processing queue. */
418 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
419 list *io_processing; /* List of VM I/O jobs being processed */
420 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 421 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 422 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 423 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
424 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 425 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 426 int io_active_threads; /* Number of running I/O threads */
427 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 428 /* Our main thread is blocked on the event loop, locking for sockets ready
429 * to be read or written, so when a threaded I/O operation is ready to be
430 * processed by the main thread, the I/O thread will use a unix pipe to
431 * awake the main thread. The followings are the two pipe FDs. */
432 int io_ready_pipe_read;
433 int io_ready_pipe_write;
7d98e08c 434 /* Virtual memory stats */
435 unsigned long long vm_stats_used_pages;
436 unsigned long long vm_stats_swapped_objects;
437 unsigned long long vm_stats_swapouts;
438 unsigned long long vm_stats_swapins;
befec3cd 439 /* Pubsub */
ffc6b7f8 440 dict *pubsub_channels; /* Map channels to list of subscribed clients */
441 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 442 /* Misc */
b9bc0eef 443 FILE *devnull;
ed9b544e 444};
445
ffc6b7f8 446typedef struct pubsubPattern {
447 redisClient *client;
448 robj *pattern;
449} pubsubPattern;
450
ed9b544e 451typedef void redisCommandProc(redisClient *c);
452struct redisCommand {
453 char *name;
454 redisCommandProc *proc;
455 int arity;
456 int flags;
76583ea4
PN
457 /* Use a function to determine which keys need to be loaded
458 * in the background prior to executing this command. Takes precedence
459 * over vm_firstkey and others, ignored when NULL */
460 redisCommandProc *vm_preload_proc;
7c775e09 461 /* What keys should be loaded in background when calling this command? */
462 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
463 int vm_lastkey; /* THe last argument that's a key */
464 int vm_keystep; /* The step between first and last key */
ed9b544e 465};
466
/* Associates a function name with an address stored as an unsigned long. */
struct redisFunctionSym {
    char *name;
    unsigned long pointer;
};
471
ed9b544e 472typedef struct _redisSortObject {
473 robj *obj;
474 union {
475 double score;
476 robj *cmpobj;
477 } u;
478} redisSortObject;
479
480typedef struct _redisSortOperation {
481 int type;
482 robj *pattern;
483} redisSortOperation;
484
6b47e12e 485/* ZSETs use a specialized version of Skiplists */
486
487typedef struct zskiplistNode {
488 struct zskiplistNode **forward;
e3870fab 489 struct zskiplistNode *backward;
912b9165 490 unsigned int *span;
6b47e12e 491 double score;
492 robj *obj;
493} zskiplistNode;
494
/* The skiplist itself: header and tail nodes, element count and level. */
typedef struct zskiplist {
    struct zskiplistNode *header, *tail;
    unsigned long length;
    int level;
} zskiplist;
500
1812e024 501typedef struct zset {
502 dict *dict;
6b47e12e 503 zskiplist *zsl;
1812e024 504} zset;
505
6b47e12e 506/* Our shared "common" objects */
507
05df7621 508#define REDIS_SHARED_INTEGERS 10000
ed9b544e 509struct sharedObjectsStruct {
c937aa89 510 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 511 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 512 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
513 *outofrangeerr, *plus,
ed9b544e 514 *select0, *select1, *select2, *select3, *select4,
befec3cd 515 *select5, *select6, *select7, *select8, *select9,
ffc6b7f8 516 *messagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
05df7621 517 *psubscribebulk, *punsubscribebulk, *integers[REDIS_SHARED_INTEGERS];
ed9b544e 518} shared;
519
/* Global vars that are actually used as constants. The following double
 * values are used for double on-disk serialization, and are initialized
 * at runtime to avoid strange compiler optimizations. */

static double R_Zero, R_PosInf, R_NegInf, R_Nan;
525
92f8e882 526/* VM threaded I/O request message */
b9bc0eef 527#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
528#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
529#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 530typedef struct iojob {
996cb5f7 531 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 532 redisDb *db;/* Redis database */
92f8e882 533 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 534 robj *val; /* the value to swap for REDIS_IOREQ_*_SWAP, otherwise this
92f8e882 535 * field is populated by the I/O thread for REDIS_IOREQ_LOAD. */
536 off_t page; /* Swap page where to read/write the object */
248ea310 537 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 538 int canceled; /* True if this command was canceled by blocking side of VM */
539 pthread_t thread; /* ID of the thread processing this entry */
540} iojob;
92f8e882 541
ed9b544e 542/*================================ Prototypes =============================== */
543
544static void freeStringObject(robj *o);
545static void freeListObject(robj *o);
546static void freeSetObject(robj *o);
547static void decrRefCount(void *o);
548static robj *createObject(int type, void *ptr);
549static void freeClient(redisClient *c);
f78fd11b 550static int rdbLoad(char *filename);
ed9b544e 551static void addReply(redisClient *c, robj *obj);
552static void addReplySds(redisClient *c, sds s);
553static void incrRefCount(robj *o);
f78fd11b 554static int rdbSaveBackground(char *filename);
ed9b544e 555static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 556static robj *dupStringObject(robj *o);
248ea310 557static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
44b38ef4 558static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 559static int syncWithMaster(void);
05df7621 560static robj *tryObjectEncoding(robj *o);
9d65a1bb 561static robj *getDecodedObject(robj *o);
3305306f 562static int removeExpire(redisDb *db, robj *key);
563static int expireIfNeeded(redisDb *db, robj *key);
564static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 565static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 566static int deleteKey(redisDb *db, robj *key);
bb32ede5 567static time_t getExpire(redisDb *db, robj *key);
568static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 569static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 570static void freeMemoryIfNeeded(void);
de96dbfe 571static int processCommand(redisClient *c);
56906eef 572static void setupSigSegvAction(void);
a3b21203 573static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 574static void aofRemoveTempFile(pid_t childpid);
0ea663ea 575static size_t stringObjectLen(robj *o);
638e42ac 576static void processInputBuffer(redisClient *c);
6b47e12e 577static zskiplist *zslCreate(void);
fd8ccf44 578static void zslFree(zskiplist *zsl);
2b59cfdf 579static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 580static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 581static void initClientMultiState(redisClient *c);
582static void freeClientMultiState(redisClient *c);
583static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 584static void unblockClientWaitingData(redisClient *c);
4409877e 585static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 586static void vmInit(void);
a35ddf12 587static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 588static robj *vmLoadObject(robj *key);
7e69548d 589static robj *vmPreviewObject(robj *key);
a69a0c9c 590static int vmSwapOneObjectBlocking(void);
591static int vmSwapOneObjectThreaded(void);
7e69548d 592static int vmCanSwapOut(void);
a5819310 593static int tryFreeOneObjectFromFreelist(void);
996cb5f7 594static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
595static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
596static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 597static void lockThreadedIO(void);
598static void unlockThreadedIO(void);
599static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
600static void freeIOJob(iojob *j);
601static void queueIOJob(iojob *j);
a5819310 602static int vmWriteObjectOnSwap(robj *o, off_t page);
603static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 604static void waitEmptyIOJobsQueue(void);
605static void vmReopenSwapFile(void);
970e10bb 606static int vmFreePage(off_t page);
76583ea4 607static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
d5d55fc3 608static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
609static int dontWaitForSwappedKey(redisClient *c, robj *key);
610static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
611static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
612static struct redisCommand *lookupCommand(char *name);
613static void call(redisClient *c, struct redisCommand *cmd);
614static void resetClient(redisClient *c);
ada386b2 615static void convertToRealHash(robj *o);
ffc6b7f8 616static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
617static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
618static void freePubsubPattern(void *p);
619static int listMatchPubsubPattern(void *a, void *b);
620static int compareStringObjects(robj *a, robj *b);
befec3cd 621static void usage();
ed9b544e 622
abcb223e 623static void authCommand(redisClient *c);
ed9b544e 624static void pingCommand(redisClient *c);
625static void echoCommand(redisClient *c);
626static void setCommand(redisClient *c);
627static void setnxCommand(redisClient *c);
628static void getCommand(redisClient *c);
629static void delCommand(redisClient *c);
630static void existsCommand(redisClient *c);
631static void incrCommand(redisClient *c);
632static void decrCommand(redisClient *c);
633static void incrbyCommand(redisClient *c);
634static void decrbyCommand(redisClient *c);
635static void selectCommand(redisClient *c);
636static void randomkeyCommand(redisClient *c);
637static void keysCommand(redisClient *c);
638static void dbsizeCommand(redisClient *c);
639static void lastsaveCommand(redisClient *c);
640static void saveCommand(redisClient *c);
641static void bgsaveCommand(redisClient *c);
9d65a1bb 642static void bgrewriteaofCommand(redisClient *c);
ed9b544e 643static void shutdownCommand(redisClient *c);
644static void moveCommand(redisClient *c);
645static void renameCommand(redisClient *c);
646static void renamenxCommand(redisClient *c);
647static void lpushCommand(redisClient *c);
648static void rpushCommand(redisClient *c);
649static void lpopCommand(redisClient *c);
650static void rpopCommand(redisClient *c);
651static void llenCommand(redisClient *c);
652static void lindexCommand(redisClient *c);
653static void lrangeCommand(redisClient *c);
654static void ltrimCommand(redisClient *c);
655static void typeCommand(redisClient *c);
656static void lsetCommand(redisClient *c);
657static void saddCommand(redisClient *c);
658static void sremCommand(redisClient *c);
a4460ef4 659static void smoveCommand(redisClient *c);
ed9b544e 660static void sismemberCommand(redisClient *c);
661static void scardCommand(redisClient *c);
12fea928 662static void spopCommand(redisClient *c);
2abb95a9 663static void srandmemberCommand(redisClient *c);
ed9b544e 664static void sinterCommand(redisClient *c);
665static void sinterstoreCommand(redisClient *c);
40d224a9 666static void sunionCommand(redisClient *c);
667static void sunionstoreCommand(redisClient *c);
f4f56e1d 668static void sdiffCommand(redisClient *c);
669static void sdiffstoreCommand(redisClient *c);
ed9b544e 670static void syncCommand(redisClient *c);
671static void flushdbCommand(redisClient *c);
672static void flushallCommand(redisClient *c);
673static void sortCommand(redisClient *c);
674static void lremCommand(redisClient *c);
0f5f7e9a 675static void rpoplpushcommand(redisClient *c);
ed9b544e 676static void infoCommand(redisClient *c);
70003d28 677static void mgetCommand(redisClient *c);
87eca727 678static void monitorCommand(redisClient *c);
3305306f 679static void expireCommand(redisClient *c);
802e8373 680static void expireatCommand(redisClient *c);
f6b141c5 681static void getsetCommand(redisClient *c);
fd88489a 682static void ttlCommand(redisClient *c);
321b0e13 683static void slaveofCommand(redisClient *c);
7f957c92 684static void debugCommand(redisClient *c);
f6b141c5 685static void msetCommand(redisClient *c);
686static void msetnxCommand(redisClient *c);
fd8ccf44 687static void zaddCommand(redisClient *c);
7db723ad 688static void zincrbyCommand(redisClient *c);
cc812361 689static void zrangeCommand(redisClient *c);
50c55df5 690static void zrangebyscoreCommand(redisClient *c);
f44dd428 691static void zcountCommand(redisClient *c);
e3870fab 692static void zrevrangeCommand(redisClient *c);
3c41331e 693static void zcardCommand(redisClient *c);
1b7106e7 694static void zremCommand(redisClient *c);
6e333bbe 695static void zscoreCommand(redisClient *c);
1807985b 696static void zremrangebyscoreCommand(redisClient *c);
6e469882 697static void multiCommand(redisClient *c);
698static void execCommand(redisClient *c);
18b6cb76 699static void discardCommand(redisClient *c);
4409877e 700static void blpopCommand(redisClient *c);
701static void brpopCommand(redisClient *c);
4b00bebd 702static void appendCommand(redisClient *c);
39191553 703static void substrCommand(redisClient *c);
69d95c3e 704static void zrankCommand(redisClient *c);
798d9e55 705static void zrevrankCommand(redisClient *c);
978c2c94 706static void hsetCommand(redisClient *c);
707static void hgetCommand(redisClient *c);
09aeb579
PN
708static void hmsetCommand(redisClient *c);
709static void hmgetCommand(redisClient *c);
07efaf74 710static void hdelCommand(redisClient *c);
92b27fe9 711static void hlenCommand(redisClient *c);
9212eafd 712static void zremrangebyrankCommand(redisClient *c);
2830ca53
PN
713static void zunionCommand(redisClient *c);
714static void zinterCommand(redisClient *c);
78409a0f 715static void hkeysCommand(redisClient *c);
716static void hvalsCommand(redisClient *c);
717static void hgetallCommand(redisClient *c);
a86f14b1 718static void hexistsCommand(redisClient *c);
500ece7c 719static void configCommand(redisClient *c);
01426b05 720static void hincrbyCommand(redisClient *c);
befec3cd 721static void subscribeCommand(redisClient *c);
722static void unsubscribeCommand(redisClient *c);
ffc6b7f8 723static void psubscribeCommand(redisClient *c);
724static void punsubscribeCommand(redisClient *c);
befec3cd 725static void publishCommand(redisClient *c);
f6b141c5 726
ed9b544e 727/*================================= Globals ================================= */
728
729/* Global vars */
730static struct redisServer server; /* server global state */
/* Command table. Each entry lists, in order: the command name, the function
 * implementing it, an arity value, a mask of REDIS_CMD_* flags, an optional
 * function pointer (only "zunion" and "zinter" set it, to
 * zunionInterBlockClientOnSwappedKeys) and three trailing integers.
 * NOTE(review): the arity presumably counts the command name itself, with a
 * negative value meaning "at least that many arguments", and the three
 * trailing integers presumably are first key / last key / key step -- confirm
 * against the struct redisCommand definition.
 * Note that "smembers" is served by sinterCommand. The table is terminated by
 * an all-NULL/zero sentinel entry. */
731static struct redisCommand cmdTable[] = {
76583ea4
PN
732 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
733 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
734 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
735 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
736 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
737 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
738 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
739 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
740 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
741 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
742 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
743 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
744 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
745 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
746 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
747 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
748 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
749 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
750 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
751 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
754 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
755 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
756 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
757 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
758 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
759 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
760 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
761 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
762 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
763 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
764 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
765 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
766 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
767 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
768 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
769 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
770 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
771 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
772 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
773 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
774 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
775 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
776 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
777 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
778 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
781 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
782 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
783 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
784 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 785 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
d33278d1 786 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 787 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
01426b05 788 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
789 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
790 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
791 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
792 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
793 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 794 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
795 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
796 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
797 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
798 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
799 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
800 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
801 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
802 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
803 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
804 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
805 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
806 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
807 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
808 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
809 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
810 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
811 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
812 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
814 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
815 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
816 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
818 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
958cd5f3 819 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
820 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
823 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
824 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
825 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
826 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
827 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
828 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 830 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 831 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
832 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 833 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 835 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
76583ea4 836 {NULL,NULL,0,0,NULL,0,0,0}
ed9b544e 837};
bcfc686d 838
ed9b544e 839/*============================ Utility functions ============================ */
840
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *        any sequence of bytes (including the empty one)
 *   ?        exactly one byte
 *   [abc]    one byte out of a set; [^abc] negates the set; [a-z] is a
 *            range; a backslash inside the set escapes the next byte
 *   \x       the byte x taken literally
 *
 * When 'nocase' is non zero the comparison is case insensitive.
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise.
 *
 * Neither buffer needs to be NUL terminated: the function never reads
 * past 'patternLen' / 'stringLen' bytes. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*', but never look past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int negate, match;

            /* A character class always consumes one byte: an exhausted
             * string can't match (and string[0] must not be read). */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            negate = patternLen && pattern[0] == '^';
            if (negate) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * pattern++ below leaves us exactly at the end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == '\\' && patternLen >= 2) {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (negate)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs one byte of input to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* The string is over: only trailing '*' may remain. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
963
/* Glob-style match of two NUL terminated C strings: measures both with
 * strlen() and delegates to stringmatchlen(). Returns what
 * stringmatchlen() returns. */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
967
56906eef 968static void redisLog(int level, const char *fmt, ...) {
ed9b544e 969 va_list ap;
970 FILE *fp;
971
972 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
973 if (!fp) return;
974
975 va_start(ap, fmt);
976 if (level >= server.verbosity) {
6766f45e 977 char *c = ".-*#";
1904ecc1 978 char buf[64];
979 time_t now;
980
981 now = time(NULL);
6c9385e0 982 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 983 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 984 vfprintf(fp, fmt, ap);
985 fprintf(fp,"\n");
986 fflush(fp);
987 }
988 va_end(ap);
989
990 if (server.logfile) fclose(fp);
991}
992
993/*====================== Hash table type implementation ==================== */
994
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
998
/* Dict destructor callback that releases 'val' with zfree().
 * 'privdata' is not used. */
static void dictVanillaFree(void *privdata, void *val) {
    DICT_NOTUSED(privdata);
    zfree(val);
}
1004
4409877e 1005static void dictListDestructor(void *privdata, void *val)
1006{
1007 DICT_NOTUSED(privdata);
1008 listRelease((list*)val);
1009}
1010
ed9b544e 1011static int sdsDictKeyCompare(void *privdata, const void *key1,
1012 const void *key2)
1013{
1014 int l1,l2;
1015 DICT_NOTUSED(privdata);
1016
1017 l1 = sdslen((sds)key1);
1018 l2 = sdslen((sds)key2);
1019 if (l1 != l2) return 0;
1020 return memcmp(key1, key2, l1) == 0;
1021}
1022
/* Dict destructor callback for redis objects: drops one reference from
 * 'val'. 'privdata' is not used. */
static void dictRedisObjectDestructor(void *privdata, void *val) {
    DICT_NOTUSED(privdata);

    /* Values of swapped out keys are set to NULL: nothing to release. */
    if (val != NULL) decrRefCount(val);
}
1030
942a3961 1031static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1032 const void *key2)
1033{
1034 const robj *o1 = key1, *o2 = key2;
1035 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1036}
1037
942a3961 1038static unsigned int dictObjHash(const void *key) {
ed9b544e 1039 const robj *o = key;
1040 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1041}
1042
942a3961 1043static int dictEncObjKeyCompare(void *privdata, const void *key1,
1044 const void *key2)
1045{
9d65a1bb 1046 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1047 int cmp;
942a3961 1048
2a1198b4 1049 if (o1->encoding == REDIS_ENCODING_INT &&
1050 o2->encoding == REDIS_ENCODING_INT &&
db5946fc 1051 o1->ptr == o2->ptr) return 1;
2a1198b4 1052
9d65a1bb 1053 o1 = getDecodedObject(o1);
1054 o2 = getDecodedObject(o2);
1055 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1056 decrRefCount(o1);
1057 decrRefCount(o2);
1058 return cmp;
942a3961 1059}
1060
1061static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1062 robj *o = (robj*) key;
942a3961 1063
ed9e4966 1064 if (o->encoding == REDIS_ENCODING_RAW) {
1065 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1066 } else {
1067 if (o->encoding == REDIS_ENCODING_INT) {
1068 char buf[32];
1069 int len;
1070
1071 len = snprintf(buf,32,"%ld",(long)o->ptr);
1072 return dictGenHashFunction((unsigned char*)buf, len);
1073 } else {
1074 unsigned int hash;
1075
1076 o = getDecodedObject(o);
1077 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1078 decrRefCount(o);
1079 return hash;
1080 }
1081 }
942a3961 1082}
1083
f2d9f50f 1084/* Sets type and expires: keys are redis objects, hashed and compared with
 * the encoding-aware dictEncObjHash() / dictEncObjKeyCompare() and released
 * through dictRedisObjectDestructor(). Keys and values are not duplicated
 * on insertion and values have no destructor. */
ed9b544e 1085static dictType setDictType = {
942a3961 1086 dictEncObjHash, /* hash function */
ed9b544e 1087 NULL, /* key dup */
1088 NULL, /* val dup */
942a3961 1089 dictEncObjKeyCompare, /* key compare */
ed9b544e 1090 dictRedisObjectDestructor, /* key destructor */
1091 NULL /* val destructor */
1092};
1093
f2d9f50f 1094/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
 * Keys are redis objects handled like in the sets type; values are plain
 * heap blocks (one double each) released with dictVanillaFree(). */
1812e024 1095static dictType zsetDictType = {
1096 dictEncObjHash, /* hash function */
1097 NULL, /* key dup */
1098 NULL, /* val dup */
1099 dictEncObjKeyCompare, /* key compare */
1100 dictRedisObjectDestructor, /* key destructor */
da0a1620 1101 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1102};
1103
f2d9f50f 1104/* Db->dict: the main keyspace. Keys are redis objects hashed and compared
 * on their sds string (dictObjHash() / dictObjKeyCompare()); both keys and
 * values are redis objects released through dictRedisObjectDestructor(). */
5234952b 1105static dictType dbDictType = {
942a3961 1106 dictObjHash, /* hash function */
ed9b544e 1107 NULL, /* key dup */
1108 NULL, /* val dup */
942a3961 1109 dictObjKeyCompare, /* key compare */
ed9b544e 1110 dictRedisObjectDestructor, /* key destructor */
1111 dictRedisObjectDestructor /* val destructor */
1112};
1113
f2d9f50f 1114/* Db->expires: keys are redis objects hashed and compared on their sds
 * string and released through dictRedisObjectDestructor(). Values have no
 * destructor: serverCron() reads them back by casting the entry value
 * directly to time_t. */
1115static dictType keyptrDictType = {
1116 dictObjHash, /* hash function */
1117 NULL, /* key dup */
1118 NULL, /* val dup */
1119 dictObjKeyCompare, /* key compare */
1120 dictRedisObjectDestructor, /* key destructor */
1121 NULL /* val destructor */
1122};
1123
5234952b 1124/* Hash type hash table (note that small hashes are represented with
 * zipmaps). Keys are redis objects hashed and compared with the
 * encoding-aware callbacks; both keys and values are redis objects released
 * through dictRedisObjectDestructor(). */
1125static dictType hashDictType = {
1126 dictEncObjHash, /* hash function */
1127 NULL, /* key dup */
1128 NULL, /* val dup */
1129 dictEncObjKeyCompare, /* key compare */
1130 dictRedisObjectDestructor, /* key destructor */
1131 dictRedisObjectDestructor /* val destructor */
1132};
1133
4409877e 1134/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1135 * lists as values. It's used for blocking operations (BLPOP) and to
1136 * map swapped keys to a list of clients waiting for these keys to be loaded.
 * Keys are released through dictRedisObjectDestructor(), the list values
 * through dictListDestructor(). */
4409877e 1137static dictType keylistDictType = {
1138 dictObjHash, /* hash function */
1139 NULL, /* key dup */
1140 NULL, /* val dup */
1141 dictObjKeyCompare, /* key compare */
1142 dictRedisObjectDestructor, /* key destructor */
1143 dictListDestructor /* val destructor */
1144};
1145
42ab0172
AO
1146static void version();
1147
ed9b544e 1148/* ========================= Random utility functions ======================= */
1149
1150/* Redis generally does not try to recover from out of memory conditions
1151 * when allocating objects or strings, it is not clear if it will be possible
1152 * to report this condition to the client since the networking layer itself
1153 * is based on heap allocation for send buffers, so we simply abort.
1154 * At least the code will be simpler to read... */
1155static void oom(const char *msg) {
71c54b21 1156 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1157 sleep(1);
1158 abort();
1159}
1160
1161/* ====================== Redis server networking stuff ===================== */
56906eef 1162static void closeTimedoutClients(void) {
ed9b544e 1163 redisClient *c;
ed9b544e 1164 listNode *ln;
1165 time_t now = time(NULL);
c7df85a4 1166 listIter li;
ed9b544e 1167
c7df85a4 1168 listRewind(server.clients,&li);
1169 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1170 c = listNodeValue(ln);
f86a74e9 1171 if (server.maxidletime &&
1172 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1173 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1174 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1175 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1176 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1177 {
f870935d 1178 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1179 freeClient(c);
f86a74e9 1180 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1181 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1182 addReply(c,shared.nullmultibulk);
b0d8747d 1183 unblockClientWaitingData(c);
f86a74e9 1184 }
ed9b544e 1185 }
1186 }
ed9b544e 1187}
1188
12fea928 1189static int htNeedsResize(dict *dict) {
1190 long long size, used;
1191
1192 size = dictSlots(dict);
1193 used = dictSize(dict);
1194 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1195 (used*100/size < REDIS_HT_MINFILL));
1196}
1197
0bc03378 1198/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1199 * we resize the hash table to save memory */
56906eef 1200static void tryResizeHashTables(void) {
0bc03378 1201 int j;
1202
1203 for (j = 0; j < server.dbnum; j++) {
12fea928 1204 if (htNeedsResize(server.db[j].dict)) {
f870935d 1205 redisLog(REDIS_VERBOSE,"The hash table %d is too sparse, resize it...",j);
0bc03378 1206 dictResize(server.db[j].dict);
f870935d 1207 redisLog(REDIS_VERBOSE,"Hash table %d resized.",j);
0bc03378 1208 }
12fea928 1209 if (htNeedsResize(server.db[j].expires))
1210 dictResize(server.db[j].expires);
0bc03378 1211 }
1212}
1213
9d65a1bb 1214/* A background saving child (BGSAVE) terminated its work. Handle this. */
1215void backgroundSaveDoneHandler(int statloc) {
1216 int exitcode = WEXITSTATUS(statloc);
1217 int bysignal = WIFSIGNALED(statloc);
1218
1219 if (!bysignal && exitcode == 0) {
1220 redisLog(REDIS_NOTICE,
1221 "Background saving terminated with success");
1222 server.dirty = 0;
1223 server.lastsave = time(NULL);
1224 } else if (!bysignal && exitcode != 0) {
1225 redisLog(REDIS_WARNING, "Background saving error");
1226 } else {
1227 redisLog(REDIS_WARNING,
454eea7c 1228 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1229 rdbRemoveTempFile(server.bgsavechildpid);
1230 }
1231 server.bgsavechildpid = -1;
1232 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1233 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1234 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1235}
1236
1237/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1238 * Handle this. */
1239void backgroundRewriteDoneHandler(int statloc) {
1240 int exitcode = WEXITSTATUS(statloc);
1241 int bysignal = WIFSIGNALED(statloc);
1242
1243 if (!bysignal && exitcode == 0) {
1244 int fd;
1245 char tmpfile[256];
1246
1247 redisLog(REDIS_NOTICE,
1248 "Background append only file rewriting terminated with success");
1249 /* Now it's time to flush the differences accumulated by the parent */
1250 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1251 fd = open(tmpfile,O_WRONLY|O_APPEND);
1252 if (fd == -1) {
1253 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1254 goto cleanup;
1255 }
1256 /* Flush our data... */
1257 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1258 (signed) sdslen(server.bgrewritebuf)) {
1259 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1260 close(fd);
1261 goto cleanup;
1262 }
b32627cd 1263 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1264 /* Now our work is to rename the temp file into the stable file. And
1265 * switch the file descriptor used by the server for append only. */
1266 if (rename(tmpfile,server.appendfilename) == -1) {
1267 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1268 close(fd);
1269 goto cleanup;
1270 }
1271 /* Mission completed... almost */
1272 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1273 if (server.appendfd != -1) {
1274 /* If append only is actually enabled... */
1275 close(server.appendfd);
1276 server.appendfd = fd;
1277 fsync(fd);
85a83172 1278 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1279 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1280 } else {
1281 /* If append only is disabled we just generate a dump in this
1282 * format. Why not? */
1283 close(fd);
1284 }
1285 } else if (!bysignal && exitcode != 0) {
1286 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1287 } else {
1288 redisLog(REDIS_WARNING,
454eea7c 1289 "Background append only file rewriting terminated by signal %d",
1290 WTERMSIG(statloc));
9d65a1bb 1291 }
1292cleanup:
1293 sdsfree(server.bgrewritebuf);
1294 server.bgrewritebuf = sdsempty();
1295 aofRemoveTempFile(server.bgrewritechildpid);
1296 server.bgrewritechildpid = -1;
1297}
1298
884d4b39 1299/* This function is called once a background process of some kind terminates,
1300 * as we want to avoid resizing the hash tables when there is a child in order
1301 * to play well with copy-on-write (otherwise when a resize happens lots of
1302 * memory pages are copied). The goal of this function is to update the ability
1303 * for dict.c to resize the hash tables accordingly to the fact we have o not
1304 * running childs. */
1305static void updateDictResizePolicy(void) {
1306 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1307 dictEnableResize();
1308 else
1309 dictDisableResize();
1310}
1311
56906eef 1312static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1313 int j, loops = server.cronloops++;
ed9b544e 1314 REDIS_NOTUSED(eventLoop);
1315 REDIS_NOTUSED(id);
1316 REDIS_NOTUSED(clientData);
1317
3a66edc7 1318 /* We take a cached value of the unix time in the global state because
1319 * with virtual memory and aging there is to store the current time
1320 * in objects at every object access, and accuracy is not needed.
1321 * To access a global var is faster than calling time(NULL) */
1322 server.unixtime = time(NULL);
1323
0bc03378 1324 /* Show some info about non-empty databases */
ed9b544e 1325 for (j = 0; j < server.dbnum; j++) {
dec423d9 1326 long long size, used, vkeys;
94754ccc 1327
3305306f 1328 size = dictSlots(server.db[j].dict);
1329 used = dictSize(server.db[j].dict);
94754ccc 1330 vkeys = dictSize(server.db[j].expires);
1763929f 1331 if (!(loops % 50) && (used || vkeys)) {
f870935d 1332 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1333 /* dictPrintStats(server.dict); */
ed9b544e 1334 }
ed9b544e 1335 }
1336
0bc03378 1337 /* We don't want to resize the hash tables while a bacground saving
1338 * is in progress: the saving child is created using fork() that is
1339 * implemented with a copy-on-write semantic in most modern systems, so
1340 * if we resize the HT while there is the saving child at work actually
1341 * a lot of memory movements in the parent will cause a lot of pages
1342 * copied. */
884d4b39 1343 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1 &&
1344 !(loops % 10))
1345 {
1346 tryResizeHashTables();
1347 }
0bc03378 1348
ed9b544e 1349 /* Show information about connected clients */
1763929f 1350 if (!(loops % 50)) {
bdcb92f2 1351 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1352 listLength(server.clients)-listLength(server.slaves),
1353 listLength(server.slaves),
bdcb92f2 1354 zmalloc_used_memory());
ed9b544e 1355 }
1356
1357 /* Close connections of timedout clients */
1763929f 1358 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1359 closeTimedoutClients();
1360
9d65a1bb 1361 /* Check if a background saving or AOF rewrite in progress terminated */
1362 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1363 int statloc;
9d65a1bb 1364 pid_t pid;
1365
1366 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1367 if (pid == server.bgsavechildpid) {
1368 backgroundSaveDoneHandler(statloc);
ed9b544e 1369 } else {
9d65a1bb 1370 backgroundRewriteDoneHandler(statloc);
ed9b544e 1371 }
884d4b39 1372 updateDictResizePolicy();
ed9b544e 1373 }
1374 } else {
1375 /* If there is not a background saving in progress check if
1376 * we have to save now */
1377 time_t now = time(NULL);
1378 for (j = 0; j < server.saveparamslen; j++) {
1379 struct saveparam *sp = server.saveparams+j;
1380
1381 if (server.dirty >= sp->changes &&
1382 now-server.lastsave > sp->seconds) {
1383 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1384 sp->changes, sp->seconds);
f78fd11b 1385 rdbSaveBackground(server.dbfilename);
ed9b544e 1386 break;
1387 }
1388 }
1389 }
94754ccc 1390
f2324293 1391 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1392 * will use few CPU cycles if there are few expiring keys, otherwise
1393 * it will get more aggressive to avoid that too much memory is used by
1394 * keys that can be removed from the keyspace. */
94754ccc 1395 for (j = 0; j < server.dbnum; j++) {
f2324293 1396 int expired;
94754ccc 1397 redisDb *db = server.db+j;
94754ccc 1398
f2324293 1399 /* Continue to expire if at the end of the cycle more than 25%
1400 * of the keys were expired. */
1401 do {
4ef8de8a 1402 long num = dictSize(db->expires);
94754ccc 1403 time_t now = time(NULL);
1404
f2324293 1405 expired = 0;
94754ccc 1406 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1407 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1408 while (num--) {
1409 dictEntry *de;
1410 time_t t;
1411
1412 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1413 t = (time_t) dictGetEntryVal(de);
1414 if (now > t) {
1415 deleteKey(db,dictGetEntryKey(de));
f2324293 1416 expired++;
2a6a2ed1 1417 server.stat_expiredkeys++;
94754ccc 1418 }
1419 }
f2324293 1420 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1421 }
1422
4ef8de8a 1423 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1424 * is enbled. Try to free objects from the free list first. */
7e69548d 1425 if (vmCanSwapOut()) {
1426 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1427 server.vm_max_memory)
1428 {
72e9fd40 1429 int retval;
1430
a5819310 1431 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1432 retval = (server.vm_max_threads == 0) ?
1433 vmSwapOneObjectBlocking() :
1434 vmSwapOneObjectThreaded();
1763929f 1435 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1436 zmalloc_used_memory() >
1437 (server.vm_max_memory+server.vm_max_memory/10))
1438 {
1439 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1440 }
72e9fd40 1441 /* Note that when using threade I/O we free just one object,
1442 * because anyway when the I/O thread in charge to swap this
1443 * object out will finish, the handler of completed jobs
1444 * will try to swap more objects if we are still out of memory. */
1445 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1446 }
1447 }
1448
ed9b544e 1449 /* Check if we should connect to a MASTER */
1763929f 1450 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1451 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1452 if (syncWithMaster() == REDIS_OK) {
1453 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1454 }
1455 }
1763929f 1456 return 100;
ed9b544e 1457}
1458
d5d55fc3 1459/* This function gets called every time Redis is entering the
1460 * main loop of the event driven library, that is, before to sleep
1461 * for ready file descriptors. */
1462static void beforeSleep(struct aeEventLoop *eventLoop) {
1463 REDIS_NOTUSED(eventLoop);
1464
1465 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1466 listIter li;
1467 listNode *ln;
1468
1469 listRewind(server.io_ready_clients,&li);
1470 while((ln = listNext(&li))) {
1471 redisClient *c = ln->value;
1472 struct redisCommand *cmd;
1473
1474 /* Resume the client. */
1475 listDelNode(server.io_ready_clients,ln);
1476 c->flags &= (~REDIS_IO_WAIT);
1477 server.vm_blocked_clients--;
1478 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1479 readQueryFromClient, c);
1480 cmd = lookupCommand(c->argv[0]->ptr);
1481 assert(cmd != NULL);
1482 call(c,cmd);
1483 resetClient(c);
1484 /* There may be more data to process in the input buffer. */
1485 if (c->querybuf && sdslen(c->querybuf) > 0)
1486 processInputBuffer(c);
1487 }
1488 }
1489}
1490
ed9b544e 1491static void createSharedObjects(void) {
05df7621 1492 int j;
1493
ed9b544e 1494 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1495 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1496 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1497 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1498 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1499 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1500 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1501 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1502 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1503 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1504 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1505 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1506 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1507 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1508 "-ERR no such key\r\n"));
ed9b544e 1509 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1510 "-ERR syntax error\r\n"));
c937aa89 1511 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1512 "-ERR source and destination objects are the same\r\n"));
1513 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1514 "-ERR index out of range\r\n"));
ed9b544e 1515 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1516 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1517 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1518 shared.select0 = createStringObject("select 0\r\n",10);
1519 shared.select1 = createStringObject("select 1\r\n",10);
1520 shared.select2 = createStringObject("select 2\r\n",10);
1521 shared.select3 = createStringObject("select 3\r\n",10);
1522 shared.select4 = createStringObject("select 4\r\n",10);
1523 shared.select5 = createStringObject("select 5\r\n",10);
1524 shared.select6 = createStringObject("select 6\r\n",10);
1525 shared.select7 = createStringObject("select 7\r\n",10);
1526 shared.select8 = createStringObject("select 8\r\n",10);
1527 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1528 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
1529 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1530 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1531 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1532 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1533 shared.mbulk3 = createStringObject("*3\r\n",4);
05df7621 1534 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1535 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1536 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1537 }
ed9b544e 1538}
1539
1540static void appendServerSaveParams(time_t seconds, int changes) {
1541 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1542 server.saveparams[server.saveparamslen].seconds = seconds;
1543 server.saveparams[server.saveparamslen].changes = changes;
1544 server.saveparamslen++;
1545}
1546
bcfc686d 1547static void resetServerSaveParams() {
ed9b544e 1548 zfree(server.saveparams);
1549 server.saveparams = NULL;
1550 server.saveparamslen = 0;
1551}
1552
1553static void initServerConfig() {
1554 server.dbnum = REDIS_DEFAULT_DBNUM;
1555 server.port = REDIS_SERVERPORT;
f870935d 1556 server.verbosity = REDIS_VERBOSE;
ed9b544e 1557 server.maxidletime = REDIS_MAXIDLETIME;
1558 server.saveparams = NULL;
1559 server.logfile = NULL; /* NULL = log on standard output */
1560 server.bindaddr = NULL;
1561 server.glueoutputbuf = 1;
1562 server.daemonize = 0;
44b38ef4 1563 server.appendonly = 0;
4e141d5a 1564 server.appendfsync = APPENDFSYNC_ALWAYS;
48f0308a 1565 server.lastfsync = time(NULL);
44b38ef4 1566 server.appendfd = -1;
1567 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1568 server.pidfile = zstrdup("/var/run/redis.pid");
1569 server.dbfilename = zstrdup("dump.rdb");
1570 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1571 server.requirepass = NULL;
10c43610 1572 server.shareobjects = 0;
b0553789 1573 server.rdbcompression = 1;
285add55 1574 server.maxclients = 0;
d5d55fc3 1575 server.blpop_blocked_clients = 0;
3fd78bcd 1576 server.maxmemory = 0;
75680a3c 1577 server.vm_enabled = 0;
054e426d 1578 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1579 server.vm_page_size = 256; /* 256 bytes per page */
1580 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1581 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1582 server.vm_max_threads = 4;
d5d55fc3 1583 server.vm_blocked_clients = 0;
cbba7dd7 1584 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1585 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
75680a3c 1586
bcfc686d 1587 resetServerSaveParams();
ed9b544e 1588
1589 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1590 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1591 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1592 /* Replication related */
1593 server.isslave = 0;
d0ccebcf 1594 server.masterauth = NULL;
ed9b544e 1595 server.masterhost = NULL;
1596 server.masterport = 6379;
1597 server.master = NULL;
1598 server.replstate = REDIS_REPL_NONE;
a7866db6 1599
1600 /* Double constants initialization */
1601 R_Zero = 0.0;
1602 R_PosInf = 1.0/R_Zero;
1603 R_NegInf = -1.0/R_Zero;
1604 R_Nan = R_Zero/R_Zero;
ed9b544e 1605}
1606
1607static void initServer() {
1608 int j;
1609
1610 signal(SIGHUP, SIG_IGN);
1611 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1612 setupSigSegvAction();
ed9b544e 1613
b9bc0eef 1614 server.devnull = fopen("/dev/null","w");
1615 if (server.devnull == NULL) {
1616 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1617 exit(1);
1618 }
ed9b544e 1619 server.clients = listCreate();
1620 server.slaves = listCreate();
87eca727 1621 server.monitors = listCreate();
ed9b544e 1622 server.objfreelist = listCreate();
1623 createSharedObjects();
1624 server.el = aeCreateEventLoop();
3305306f 1625 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1626 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1627 if (server.fd == -1) {
1628 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1629 exit(1);
1630 }
3305306f 1631 for (j = 0; j < server.dbnum; j++) {
5234952b 1632 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1633 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
4409877e 1634 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1635 if (server.vm_enabled)
1636 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1637 server.db[j].id = j;
1638 }
ffc6b7f8 1639 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1640 server.pubsub_patterns = listCreate();
1641 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1642 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1643 server.cronloops = 0;
9f3c422c 1644 server.bgsavechildpid = -1;
9d65a1bb 1645 server.bgrewritechildpid = -1;
1646 server.bgrewritebuf = sdsempty();
ed9b544e 1647 server.lastsave = time(NULL);
1648 server.dirty = 0;
ed9b544e 1649 server.stat_numcommands = 0;
1650 server.stat_numconnections = 0;
2a6a2ed1 1651 server.stat_expiredkeys = 0;
ed9b544e 1652 server.stat_starttime = time(NULL);
3a66edc7 1653 server.unixtime = time(NULL);
d8f8b666 1654 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1655 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1656 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1657
1658 if (server.appendonly) {
71eba477 1659 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1660 if (server.appendfd == -1) {
1661 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1662 strerror(errno));
1663 exit(1);
1664 }
1665 }
75680a3c 1666
1667 if (server.vm_enabled) vmInit();
ed9b544e 1668}
1669
1670/* Empty the whole database */
ca37e9cd 1671static long long emptyDb() {
ed9b544e 1672 int j;
ca37e9cd 1673 long long removed = 0;
ed9b544e 1674
3305306f 1675 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1676 removed += dictSize(server.db[j].dict);
3305306f 1677 dictEmpty(server.db[j].dict);
1678 dictEmpty(server.db[j].expires);
1679 }
ca37e9cd 1680 return removed;
ed9b544e 1681}
1682
/* Convert a "yes" / "no" string (compared case-insensitively) into 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1688
ed9b544e 1689/* I agree, this is a very rudimental way to load a configuration...
1690 will improve later if the config gets more complex */
1691static void loadServerConfig(char *filename) {
c9a111ac 1692 FILE *fp;
ed9b544e 1693 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1694 int linenum = 0;
1695 sds line = NULL;
6bccf64a
AO
1696 char *errormsg = "Fatal error, can't open config file '%s'";
1697 char *errorbuf = zmalloc(sizeof(char)*(strlen(errormsg)+strlen(filename)));
1698 sprintf(errorbuf, errormsg, filename);
c9a111ac 1699
1700 if (filename[0] == '-' && filename[1] == '\0')
1701 fp = stdin;
1702 else {
1703 if ((fp = fopen(filename,"r")) == NULL) {
6bccf64a 1704 redisLog(REDIS_WARNING, errorbuf);
c9a111ac 1705 exit(1);
1706 }
ed9b544e 1707 }
c9a111ac 1708
ed9b544e 1709 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1710 sds *argv;
1711 int argc, j;
1712
1713 linenum++;
1714 line = sdsnew(buf);
1715 line = sdstrim(line," \t\r\n");
1716
1717 /* Skip comments and blank lines*/
1718 if (line[0] == '#' || line[0] == '\0') {
1719 sdsfree(line);
1720 continue;
1721 }
1722
1723 /* Split into arguments */
1724 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1725 sdstolower(argv[0]);
1726
1727 /* Execute config directives */
bb0b03a3 1728 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1729 server.maxidletime = atoi(argv[1]);
0150db36 1730 if (server.maxidletime < 0) {
ed9b544e 1731 err = "Invalid timeout value"; goto loaderr;
1732 }
bb0b03a3 1733 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1734 server.port = atoi(argv[1]);
1735 if (server.port < 1 || server.port > 65535) {
1736 err = "Invalid port"; goto loaderr;
1737 }
bb0b03a3 1738 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1739 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1740 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1741 int seconds = atoi(argv[1]);
1742 int changes = atoi(argv[2]);
1743 if (seconds < 1 || changes < 0) {
1744 err = "Invalid save parameters"; goto loaderr;
1745 }
1746 appendServerSaveParams(seconds,changes);
bb0b03a3 1747 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1748 if (chdir(argv[1]) == -1) {
1749 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1750 argv[1], strerror(errno));
1751 exit(1);
1752 }
bb0b03a3 1753 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1754 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1755 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1756 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1757 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1758 else {
1759 err = "Invalid log level. Must be one of debug, notice, warning";
1760 goto loaderr;
1761 }
bb0b03a3 1762 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1763 FILE *logfp;
ed9b544e 1764
1765 server.logfile = zstrdup(argv[1]);
bb0b03a3 1766 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1767 zfree(server.logfile);
1768 server.logfile = NULL;
1769 }
1770 if (server.logfile) {
1771 /* Test if we are able to open the file. The server will not
1772 * be able to abort just for this problem later... */
c9a111ac 1773 logfp = fopen(server.logfile,"a");
1774 if (logfp == NULL) {
ed9b544e 1775 err = sdscatprintf(sdsempty(),
1776 "Can't open the log file: %s", strerror(errno));
1777 goto loaderr;
1778 }
c9a111ac 1779 fclose(logfp);
ed9b544e 1780 }
bb0b03a3 1781 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1782 server.dbnum = atoi(argv[1]);
1783 if (server.dbnum < 1) {
1784 err = "Invalid number of databases"; goto loaderr;
1785 }
b3f83f12
JZ
1786 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1787 loadServerConfig(argv[1]);
285add55 1788 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1789 server.maxclients = atoi(argv[1]);
3fd78bcd 1790 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
d4465900 1791 server.maxmemory = strtoll(argv[1], NULL, 10);
bb0b03a3 1792 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1793 server.masterhost = sdsnew(argv[1]);
1794 server.masterport = atoi(argv[2]);
1795 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1796 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1797 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1798 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1799 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1800 err = "argument must be 'yes' or 'no'"; goto loaderr;
1801 }
bb0b03a3 1802 } else if (!strcasecmp(argv[0],"shareobjects") && argc == 2) {
85dd2f3a 1803 if ((server.shareobjects = yesnotoi(argv[1])) == -1) {
10c43610 1804 err = "argument must be 'yes' or 'no'"; goto loaderr;
1805 }
121f70cf 1806 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1807 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
1808 err = "argument must be 'yes' or 'no'"; goto loaderr;
1809 }
bb0b03a3 1810 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1811 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1812 err = "argument must be 'yes' or 'no'"; goto loaderr;
1813 }
44b38ef4 1814 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1815 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1816 err = "argument must be 'yes' or 'no'"; goto loaderr;
1817 }
48f0308a 1818 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1819 if (!strcasecmp(argv[1],"no")) {
48f0308a 1820 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1821 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1822 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1823 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1824 server.appendfsync = APPENDFSYNC_EVERYSEC;
1825 } else {
1826 err = "argument must be 'no', 'always' or 'everysec'";
1827 goto loaderr;
1828 }
bb0b03a3 1829 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1830 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1831 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 1832 zfree(server.pidfile);
054e426d 1833 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1834 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 1835 zfree(server.dbfilename);
054e426d 1836 server.dbfilename = zstrdup(argv[1]);
75680a3c 1837 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1838 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1839 err = "argument must be 'yes' or 'no'"; goto loaderr;
1840 }
054e426d 1841 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1842 zfree(server.vm_swap_file);
054e426d 1843 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1844 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
1845 server.vm_max_memory = strtoll(argv[1], NULL, 10);
1846 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
1847 server.vm_page_size = strtoll(argv[1], NULL, 10);
1848 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
1849 server.vm_pages = strtoll(argv[1], NULL, 10);
92f8e882 1850 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1851 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 1852 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
1853 server.hash_max_zipmap_entries = strtol(argv[1], NULL, 10);
1854 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
1855 server.hash_max_zipmap_value = strtol(argv[1], NULL, 10);
1856 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1857 server.vm_max_threads = strtoll(argv[1], NULL, 10);
ed9b544e 1858 } else {
1859 err = "Bad directive or wrong number of arguments"; goto loaderr;
1860 }
1861 for (j = 0; j < argc; j++)
1862 sdsfree(argv[j]);
1863 zfree(argv);
1864 sdsfree(line);
1865 }
c9a111ac 1866 if (fp != stdin) fclose(fp);
ed9b544e 1867 return;
1868
1869loaderr:
1870 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1871 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1872 fprintf(stderr, ">>> '%s'\n", line);
1873 fprintf(stderr, "%s\n", err);
1874 exit(1);
1875}
1876
1877static void freeClientArgv(redisClient *c) {
1878 int j;
1879
1880 for (j = 0; j < c->argc; j++)
1881 decrRefCount(c->argv[j]);
e8a74421 1882 for (j = 0; j < c->mbargc; j++)
1883 decrRefCount(c->mbargv[j]);
ed9b544e 1884 c->argc = 0;
e8a74421 1885 c->mbargc = 0;
ed9b544e 1886}
1887
1888static void freeClient(redisClient *c) {
1889 listNode *ln;
1890
4409877e 1891 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 1892 * call, we have to set querybuf to NULL *before* to call
1893 * unblockClientWaitingData() to avoid processInputBuffer() will get
1894 * called. Also it is important to remove the file events after
1895 * this, because this call adds the READABLE event. */
4409877e 1896 sdsfree(c->querybuf);
1897 c->querybuf = NULL;
1898 if (c->flags & REDIS_BLOCKED)
b0d8747d 1899 unblockClientWaitingData(c);
4409877e 1900
ffc6b7f8 1901 /* Unsubscribe from all the pubsub channels */
1902 pubsubUnsubscribeAllChannels(c,0);
1903 pubsubUnsubscribeAllPatterns(c,0);
1904 dictRelease(c->pubsub_channels);
1905 listRelease(c->pubsub_patterns);
befec3cd 1906 /* Obvious cleanup */
ed9b544e 1907 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1908 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 1909 listRelease(c->reply);
1910 freeClientArgv(c);
1911 close(c->fd);
92f8e882 1912 /* Remove from the list of clients */
ed9b544e 1913 ln = listSearchKey(server.clients,c);
dfc5e96c 1914 redisAssert(ln != NULL);
ed9b544e 1915 listDelNode(server.clients,ln);
d5d55fc3 1916 /* Remove from the list of clients waiting for swapped keys */
1917 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1918 ln = listSearchKey(server.io_ready_clients,c);
1919 if (ln) {
1920 listDelNode(server.io_ready_clients,ln);
1921 server.vm_blocked_clients--;
1922 }
1923 }
1924 while (server.vm_enabled && listLength(c->io_keys)) {
1925 ln = listFirst(c->io_keys);
1926 dontWaitForSwappedKey(c,ln->value);
92f8e882 1927 }
b3e3d0d7 1928 listRelease(c->io_keys);
befec3cd 1929 /* Master/slave cleanup */
ed9b544e 1930 if (c->flags & REDIS_SLAVE) {
6208b3a7 1931 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1932 close(c->repldbfd);
87eca727 1933 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1934 ln = listSearchKey(l,c);
dfc5e96c 1935 redisAssert(ln != NULL);
87eca727 1936 listDelNode(l,ln);
ed9b544e 1937 }
1938 if (c->flags & REDIS_MASTER) {
1939 server.master = NULL;
1940 server.replstate = REDIS_REPL_CONNECT;
1941 }
befec3cd 1942 /* Release memory */
93ea3759 1943 zfree(c->argv);
e8a74421 1944 zfree(c->mbargv);
6e469882 1945 freeClientMultiState(c);
ed9b544e 1946 zfree(c);
1947}
1948
cc30e368 1949#define GLUEREPLY_UP_TO (1024)
ed9b544e 1950static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 1951 int copylen = 0;
1952 char buf[GLUEREPLY_UP_TO];
6208b3a7 1953 listNode *ln;
c7df85a4 1954 listIter li;
ed9b544e 1955 robj *o;
1956
c7df85a4 1957 listRewind(c->reply,&li);
1958 while((ln = listNext(&li))) {
c28b42ac 1959 int objlen;
1960
ed9b544e 1961 o = ln->value;
c28b42ac 1962 objlen = sdslen(o->ptr);
1963 if (copylen + objlen <= GLUEREPLY_UP_TO) {
1964 memcpy(buf+copylen,o->ptr,objlen);
1965 copylen += objlen;
ed9b544e 1966 listDelNode(c->reply,ln);
c28b42ac 1967 } else {
1968 if (copylen == 0) return;
1969 break;
ed9b544e 1970 }
ed9b544e 1971 }
c28b42ac 1972 /* Now the output buffer is empty, add the new single element */
1973 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
1974 listAddNodeHead(c->reply,o);
ed9b544e 1975}
1976
1977static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
1978 redisClient *c = privdata;
1979 int nwritten = 0, totwritten = 0, objlen;
1980 robj *o;
1981 REDIS_NOTUSED(el);
1982 REDIS_NOTUSED(mask);
1983
2895e862 1984 /* Use writev() if we have enough buffers to send */
7ea870c0 1985 if (!server.glueoutputbuf &&
e0a62c7f 1986 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 1987 !(c->flags & REDIS_MASTER))
2895e862 1988 {
1989 sendReplyToClientWritev(el, fd, privdata, mask);
1990 return;
1991 }
2895e862 1992
ed9b544e 1993 while(listLength(c->reply)) {
c28b42ac 1994 if (server.glueoutputbuf && listLength(c->reply) > 1)
1995 glueReplyBuffersIfNeeded(c);
1996
ed9b544e 1997 o = listNodeValue(listFirst(c->reply));
1998 objlen = sdslen(o->ptr);
1999
2000 if (objlen == 0) {
2001 listDelNode(c->reply,listFirst(c->reply));
2002 continue;
2003 }
2004
2005 if (c->flags & REDIS_MASTER) {
6f376729 2006 /* Don't reply to a master */
ed9b544e 2007 nwritten = objlen - c->sentlen;
2008 } else {
a4d1ba9a 2009 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2010 if (nwritten <= 0) break;
2011 }
2012 c->sentlen += nwritten;
2013 totwritten += nwritten;
2014 /* If we fully sent the object on head go to the next one */
2015 if (c->sentlen == objlen) {
2016 listDelNode(c->reply,listFirst(c->reply));
2017 c->sentlen = 0;
2018 }
6f376729 2019 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2020 * bytes, in a single threaded server it's a good idea to serve
6f376729 2021 * other clients as well, even if a very large request comes from
2022 * super fast link that is always able to accept data (in real world
12f9d551 2023 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2024 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2025 }
2026 if (nwritten == -1) {
2027 if (errno == EAGAIN) {
2028 nwritten = 0;
2029 } else {
f870935d 2030 redisLog(REDIS_VERBOSE,
ed9b544e 2031 "Error writing to client: %s", strerror(errno));
2032 freeClient(c);
2033 return;
2034 }
2035 }
2036 if (totwritten > 0) c->lastinteraction = time(NULL);
2037 if (listLength(c->reply) == 0) {
2038 c->sentlen = 0;
2039 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2040 }
2041}
2042
2895e862 2043static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2044{
2045 redisClient *c = privdata;
2046 int nwritten = 0, totwritten = 0, objlen, willwrite;
2047 robj *o;
2048 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2049 int offset, ion = 0;
2050 REDIS_NOTUSED(el);
2051 REDIS_NOTUSED(mask);
2052
2053 listNode *node;
2054 while (listLength(c->reply)) {
2055 offset = c->sentlen;
2056 ion = 0;
2057 willwrite = 0;
2058
2059 /* fill-in the iov[] array */
2060 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2061 o = listNodeValue(node);
2062 objlen = sdslen(o->ptr);
2063
e0a62c7f 2064 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2065 break;
2066
2067 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2068 break; /* no more iovecs */
2069
2070 iov[ion].iov_base = ((char*)o->ptr) + offset;
2071 iov[ion].iov_len = objlen - offset;
2072 willwrite += objlen - offset;
2073 offset = 0; /* just for the first item */
2074 ion++;
2075 }
2076
2077 if(willwrite == 0)
2078 break;
2079
2080 /* write all collected blocks at once */
2081 if((nwritten = writev(fd, iov, ion)) < 0) {
2082 if (errno != EAGAIN) {
f870935d 2083 redisLog(REDIS_VERBOSE,
2895e862 2084 "Error writing to client: %s", strerror(errno));
2085 freeClient(c);
2086 return;
2087 }
2088 break;
2089 }
2090
2091 totwritten += nwritten;
2092 offset = c->sentlen;
2093
2094 /* remove written robjs from c->reply */
2095 while (nwritten && listLength(c->reply)) {
2096 o = listNodeValue(listFirst(c->reply));
2097 objlen = sdslen(o->ptr);
2098
2099 if(nwritten >= objlen - offset) {
2100 listDelNode(c->reply, listFirst(c->reply));
2101 nwritten -= objlen - offset;
2102 c->sentlen = 0;
2103 } else {
2104 /* partial write */
2105 c->sentlen += nwritten;
2106 break;
2107 }
2108 offset = 0;
2109 }
2110 }
2111
e0a62c7f 2112 if (totwritten > 0)
2895e862 2113 c->lastinteraction = time(NULL);
2114
2115 if (listLength(c->reply) == 0) {
2116 c->sentlen = 0;
2117 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2118 }
2119}
2120
ed9b544e 2121static struct redisCommand *lookupCommand(char *name) {
2122 int j = 0;
2123 while(cmdTable[j].name != NULL) {
bb0b03a3 2124 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 2125 j++;
2126 }
2127 return NULL;
2128}
2129
2130/* resetClient prepare the client to process the next command */
2131static void resetClient(redisClient *c) {
2132 freeClientArgv(c);
2133 c->bulklen = -1;
e8a74421 2134 c->multibulk = 0;
ed9b544e 2135}
2136
6e469882 2137/* Call() is the core of Redis execution of a command */
2138static void call(redisClient *c, struct redisCommand *cmd) {
2139 long long dirty;
2140
2141 dirty = server.dirty;
2142 cmd->proc(c);
4005fef1 2143 dirty = server.dirty-dirty;
2144
2145 if (server.appendonly && dirty)
6e469882 2146 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2147 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2148 listLength(server.slaves))
248ea310 2149 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2150 if (listLength(server.monitors))
248ea310 2151 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2152 server.stat_numcommands++;
2153}
2154
ed9b544e 2155/* If this function gets called we already read a whole
2156 * command, argments are in the client argv/argc fields.
2157 * processCommand() execute the command or prepare the
2158 * server for a bulk read from the client.
2159 *
2160 * If 1 is returned the client is still alive and valid and
2161 * and other operations can be performed by the caller. Otherwise
2162 * if 0 is returned the client was destroied (i.e. after QUIT). */
2163static int processCommand(redisClient *c) {
2164 struct redisCommand *cmd;
ed9b544e 2165
3fd78bcd 2166 /* Free some memory if needed (maxmemory setting) */
2167 if (server.maxmemory) freeMemoryIfNeeded();
2168
e8a74421 2169 /* Handle the multi bulk command type. This is an alternative protocol
2170 * supported by Redis in order to receive commands that are composed of
2171 * multiple binary-safe "bulk" arguments. The latency of processing is
2172 * a bit higher but this allows things like multi-sets, so if this
2173 * protocol is used only for MSET and similar commands this is a big win. */
2174 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2175 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2176 if (c->multibulk <= 0) {
2177 resetClient(c);
2178 return 1;
2179 } else {
2180 decrRefCount(c->argv[c->argc-1]);
2181 c->argc--;
2182 return 1;
2183 }
2184 } else if (c->multibulk) {
2185 if (c->bulklen == -1) {
2186 if (((char*)c->argv[0]->ptr)[0] != '$') {
2187 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2188 resetClient(c);
2189 return 1;
2190 } else {
2191 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2192 decrRefCount(c->argv[0]);
2193 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2194 c->argc--;
2195 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2196 resetClient(c);
2197 return 1;
2198 }
2199 c->argc--;
2200 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2201 return 1;
2202 }
2203 } else {
2204 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2205 c->mbargv[c->mbargc] = c->argv[0];
2206 c->mbargc++;
2207 c->argc--;
2208 c->multibulk--;
2209 if (c->multibulk == 0) {
2210 robj **auxargv;
2211 int auxargc;
2212
2213 /* Here we need to swap the multi-bulk argc/argv with the
2214 * normal argc/argv of the client structure. */
2215 auxargv = c->argv;
2216 c->argv = c->mbargv;
2217 c->mbargv = auxargv;
2218
2219 auxargc = c->argc;
2220 c->argc = c->mbargc;
2221 c->mbargc = auxargc;
2222
2223 /* We need to set bulklen to something different than -1
2224 * in order for the code below to process the command without
2225 * to try to read the last argument of a bulk command as
2226 * a special argument. */
2227 c->bulklen = 0;
2228 /* continue below and process the command */
2229 } else {
2230 c->bulklen = -1;
2231 return 1;
2232 }
2233 }
2234 }
2235 /* -- end of multi bulk commands processing -- */
2236
ed9b544e 2237 /* The QUIT command is handled as a special case. Normal command
2238 * procs are unable to close the client connection safely */
bb0b03a3 2239 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2240 freeClient(c);
2241 return 0;
2242 }
d5d55fc3 2243
2244 /* Now lookup the command and check ASAP about trivial error conditions
2245 * such wrong arity, bad command name and so forth. */
ed9b544e 2246 cmd = lookupCommand(c->argv[0]->ptr);
2247 if (!cmd) {
2c14807b 2248 addReplySds(c,
2249 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2250 (char*)c->argv[0]->ptr));
ed9b544e 2251 resetClient(c);
2252 return 1;
2253 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2254 (c->argc < -cmd->arity)) {
454d4e43 2255 addReplySds(c,
2256 sdscatprintf(sdsempty(),
2257 "-ERR wrong number of arguments for '%s' command\r\n",
2258 cmd->name));
ed9b544e 2259 resetClient(c);
2260 return 1;
2261 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2262 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2263 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2264
2265 decrRefCount(c->argv[c->argc-1]);
2266 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2267 c->argc--;
2268 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2269 resetClient(c);
2270 return 1;
2271 }
2272 c->argc--;
2273 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2274 /* It is possible that the bulk read is already in the
8d0490e7 2275 * buffer. Check this condition and handle it accordingly.
2276 * This is just a fast path, alternative to call processInputBuffer().
2277 * It's a good idea since the code is small and this condition
2278 * happens most of the times. */
ed9b544e 2279 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2280 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2281 c->argc++;
2282 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2283 } else {
d5d55fc3 2284 /* Otherwise return... there is to read the last argument
2285 * from the socket. */
ed9b544e 2286 return 1;
2287 }
2288 }
942a3961 2289 /* Let's try to encode the bulk object to save space. */
2290 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2291 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2292
e63943a4 2293 /* Check if the user is authenticated */
2294 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2295 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2296 resetClient(c);
2297 return 1;
2298 }
2299
b61a28fe 2300 /* Handle the maxmemory directive */
2301 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2302 zmalloc_used_memory() > server.maxmemory)
2303 {
2304 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2305 resetClient(c);
2306 return 1;
2307 }
2308
d6cc8867 2309 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
ffc6b7f8 2310 if (dictSize(c->pubsub_channels) > 0 &&
2311 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2312 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2313 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2314 resetClient(c);
2315 return 1;
2316 }
2317
ed9b544e 2318 /* Exec the command */
18b6cb76 2319 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
6e469882 2320 queueMultiCommand(c,cmd);
2321 addReply(c,shared.queued);
2322 } else {
d5d55fc3 2323 if (server.vm_enabled && server.vm_max_threads > 0 &&
2324 blockClientOnSwappedKeys(cmd,c)) return 1;
6e469882 2325 call(c,cmd);
2326 }
ed9b544e 2327
2328 /* Prepare the client for the next command */
ed9b544e 2329 resetClient(c);
2330 return 1;
2331}
2332
248ea310 2333static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2334 listNode *ln;
c7df85a4 2335 listIter li;
ed9b544e 2336 int outc = 0, j;
93ea3759 2337 robj **outv;
248ea310 2338 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2339 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2340 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2341 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2342 robj *lenobj;
93ea3759 2343
2344 if (argc <= REDIS_STATIC_ARGS) {
2345 outv = static_outv;
2346 } else {
248ea310 2347 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2348 }
248ea310 2349
2350 lenobj = createObject(REDIS_STRING,
2351 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2352 lenobj->refcount = 0;
2353 outv[outc++] = lenobj;
ed9b544e 2354 for (j = 0; j < argc; j++) {
248ea310 2355 lenobj = createObject(REDIS_STRING,
2356 sdscatprintf(sdsempty(),"$%lu\r\n",
2357 (unsigned long) stringObjectLen(argv[j])));
2358 lenobj->refcount = 0;
2359 outv[outc++] = lenobj;
ed9b544e 2360 outv[outc++] = argv[j];
248ea310 2361 outv[outc++] = shared.crlf;
ed9b544e 2362 }
ed9b544e 2363
40d224a9 2364 /* Increment all the refcounts at start and decrement at end in order to
2365 * be sure to free objects if there is no slave in a replication state
2366 * able to be feed with commands */
2367 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2368 listRewind(slaves,&li);
2369 while((ln = listNext(&li))) {
ed9b544e 2370 redisClient *slave = ln->value;
40d224a9 2371
2372 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2373 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2374
2375 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2376 if (slave->slaveseldb != dictid) {
2377 robj *selectcmd;
2378
2379 switch(dictid) {
2380 case 0: selectcmd = shared.select0; break;
2381 case 1: selectcmd = shared.select1; break;
2382 case 2: selectcmd = shared.select2; break;
2383 case 3: selectcmd = shared.select3; break;
2384 case 4: selectcmd = shared.select4; break;
2385 case 5: selectcmd = shared.select5; break;
2386 case 6: selectcmd = shared.select6; break;
2387 case 7: selectcmd = shared.select7; break;
2388 case 8: selectcmd = shared.select8; break;
2389 case 9: selectcmd = shared.select9; break;
2390 default:
2391 selectcmd = createObject(REDIS_STRING,
2392 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2393 selectcmd->refcount = 0;
2394 break;
2395 }
2396 addReply(slave,selectcmd);
2397 slave->slaveseldb = dictid;
2398 }
2399 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2400 }
40d224a9 2401 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2402 if (outv != static_outv) zfree(outv);
ed9b544e 2403}
2404
638e42ac 2405static void processInputBuffer(redisClient *c) {
ed9b544e 2406again:
4409877e 2407 /* Before to process the input buffer, make sure the client is not
2408 * waitig for a blocking operation such as BLPOP. Note that the first
2409 * iteration the client is never blocked, otherwise the processInputBuffer
2410 * would not be called at all, but after the execution of the first commands
2411 * in the input buffer the client may be blocked, and the "goto again"
2412 * will try to reiterate. The following line will make it return asap. */
92f8e882 2413 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2414 if (c->bulklen == -1) {
2415 /* Read the first line of the query */
2416 char *p = strchr(c->querybuf,'\n');
2417 size_t querylen;
644fafa3 2418
ed9b544e 2419 if (p) {
2420 sds query, *argv;
2421 int argc, j;
e0a62c7f 2422
ed9b544e 2423 query = c->querybuf;
2424 c->querybuf = sdsempty();
2425 querylen = 1+(p-(query));
2426 if (sdslen(query) > querylen) {
2427 /* leave data after the first line of the query in the buffer */
2428 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2429 }
2430 *p = '\0'; /* remove "\n" */
2431 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2432 sdsupdatelen(query);
2433
2434 /* Now we can split the query in arguments */
ed9b544e 2435 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2436 sdsfree(query);
2437
2438 if (c->argv) zfree(c->argv);
2439 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2440
2441 for (j = 0; j < argc; j++) {
ed9b544e 2442 if (sdslen(argv[j])) {
2443 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2444 c->argc++;
2445 } else {
2446 sdsfree(argv[j]);
2447 }
2448 }
2449 zfree(argv);
7c49733c 2450 if (c->argc) {
2451 /* Execute the command. If the client is still valid
2452 * after processCommand() return and there is something
2453 * on the query buffer try to process the next command. */
2454 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2455 } else {
2456 /* Nothing to process, argc == 0. Just process the query
2457 * buffer if it's not empty or return to the caller */
2458 if (sdslen(c->querybuf)) goto again;
2459 }
ed9b544e 2460 return;
644fafa3 2461 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2462 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2463 freeClient(c);
2464 return;
2465 }
2466 } else {
2467 /* Bulk read handling. Note that if we are at this point
2468 the client already sent a command terminated with a newline,
2469 we are reading the bulk data that is actually the last
2470 argument of the command. */
2471 int qbl = sdslen(c->querybuf);
2472
2473 if (c->bulklen <= qbl) {
2474 /* Copy everything but the final CRLF as final argument */
2475 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2476 c->argc++;
2477 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2478 /* Process the command. If the client is still valid after
2479 * the processing and there is more data in the buffer
2480 * try to parse it. */
2481 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2482 return;
2483 }
2484 }
2485}
2486
638e42ac 2487static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2488 redisClient *c = (redisClient*) privdata;
2489 char buf[REDIS_IOBUF_LEN];
2490 int nread;
2491 REDIS_NOTUSED(el);
2492 REDIS_NOTUSED(mask);
2493
2494 nread = read(fd, buf, REDIS_IOBUF_LEN);
2495 if (nread == -1) {
2496 if (errno == EAGAIN) {
2497 nread = 0;
2498 } else {
f870935d 2499 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2500 freeClient(c);
2501 return;
2502 }
2503 } else if (nread == 0) {
f870935d 2504 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2505 freeClient(c);
2506 return;
2507 }
2508 if (nread) {
2509 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2510 c->lastinteraction = time(NULL);
2511 } else {
2512 return;
2513 }
168ac5c6 2514 processInputBuffer(c);
638e42ac 2515}
2516
ed9b544e 2517static int selectDb(redisClient *c, int id) {
2518 if (id < 0 || id >= server.dbnum)
2519 return REDIS_ERR;
3305306f 2520 c->db = &server.db[id];
ed9b544e 2521 return REDIS_OK;
2522}
2523
40d224a9 2524static void *dupClientReplyValue(void *o) {
2525 incrRefCount((robj*)o);
12d090d2 2526 return o;
40d224a9 2527}
2528
/* List "match" method: returns non-zero when the two string objects
 * compare equal according to compareStringObjects(). */
static int listMatchObjects(void *a, void *b) {
    int cmp = compareStringObjects(a,b);

    return cmp == 0;
}
2532
ed9b544e 2533static redisClient *createClient(int fd) {
2534 redisClient *c = zmalloc(sizeof(*c));
2535
2536 anetNonBlock(NULL,fd);
2537 anetTcpNoDelay(NULL,fd);
2538 if (!c) return NULL;
2539 selectDb(c,0);
2540 c->fd = fd;
2541 c->querybuf = sdsempty();
2542 c->argc = 0;
93ea3759 2543 c->argv = NULL;
ed9b544e 2544 c->bulklen = -1;
e8a74421 2545 c->multibulk = 0;
2546 c->mbargc = 0;
2547 c->mbargv = NULL;
ed9b544e 2548 c->sentlen = 0;
2549 c->flags = 0;
2550 c->lastinteraction = time(NULL);
abcb223e 2551 c->authenticated = 0;
40d224a9 2552 c->replstate = REDIS_REPL_NONE;
6b47e12e 2553 c->reply = listCreate();
ed9b544e 2554 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2555 listSetDupMethod(c->reply,dupClientReplyValue);
92f8e882 2556 c->blockingkeys = NULL;
2557 c->blockingkeysnum = 0;
2558 c->io_keys = listCreate();
2559 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2560 c->pubsub_channels = dictCreate(&setDictType,NULL);
2561 c->pubsub_patterns = listCreate();
2562 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2563 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2564 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2565 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2566 freeClient(c);
2567 return NULL;
2568 }
6b47e12e 2569 listAddNodeTail(server.clients,c);
6e469882 2570 initClientMultiState(c);
ed9b544e 2571 return c;
2572}
2573
2574static void addReply(redisClient *c, robj *obj) {
2575 if (listLength(c->reply) == 0 &&
6208b3a7 2576 (c->replstate == REDIS_REPL_NONE ||
2577 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2578 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2579 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2580
2581 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2582 obj = dupStringObject(obj);
2583 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2584 }
9d65a1bb 2585 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2586}
2587
2588static void addReplySds(redisClient *c, sds s) {
2589 robj *o = createObject(REDIS_STRING,s);
2590 addReply(c,o);
2591 decrRefCount(o);
2592}
2593
e2665397 2594static void addReplyDouble(redisClient *c, double d) {
2595 char buf[128];
2596
2597 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2598 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2599 (unsigned long) strlen(buf),buf));
e2665397 2600}
2601
f44dd428 2602static void addReplyLong(redisClient *c, long l) {
2603 char buf[128];
2604 size_t len;
2605
dd88747b 2606 if (l == 0) {
2607 addReply(c,shared.czero);
2608 return;
2609 } else if (l == 1) {
2610 addReply(c,shared.cone);
2611 return;
2612 }
f44dd428 2613 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2614 addReplySds(c,sdsnewlen(buf,len));
2615}
2616
aa7c2934
PN
2617static void addReplyLongLong(redisClient *c, long long ll) {
2618 char buf[128];
2619 size_t len;
2620
2621 if (ll == 0) {
2622 addReply(c,shared.czero);
2623 return;
2624 } else if (ll == 1) {
2625 addReply(c,shared.cone);
2626 return;
2627 }
2628 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2629 addReplySds(c,sdsnewlen(buf,len));
2630}
2631
92b27fe9 2632static void addReplyUlong(redisClient *c, unsigned long ul) {
2633 char buf[128];
2634 size_t len;
2635
dd88747b 2636 if (ul == 0) {
2637 addReply(c,shared.czero);
2638 return;
2639 } else if (ul == 1) {
2640 addReply(c,shared.cone);
2641 return;
2642 }
92b27fe9 2643 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2644 addReplySds(c,sdsnewlen(buf,len));
2645}
2646
942a3961 2647static void addReplyBulkLen(redisClient *c, robj *obj) {
2648 size_t len;
2649
2650 if (obj->encoding == REDIS_ENCODING_RAW) {
2651 len = sdslen(obj->ptr);
2652 } else {
2653 long n = (long)obj->ptr;
2654
e054afda 2655 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2656 len = 1;
2657 if (n < 0) {
2658 len++;
2659 n = -n;
2660 }
2661 while((n = n/10) != 0) {
2662 len++;
2663 }
2664 }
83c6a618 2665 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
942a3961 2666}
2667
dd88747b 2668static void addReplyBulk(redisClient *c, robj *obj) {
2669 addReplyBulkLen(c,obj);
2670 addReply(c,obj);
2671 addReply(c,shared.crlf);
2672}
2673
500ece7c 2674/* In the CONFIG command we need to add vanilla C string as bulk replies */
2675static void addReplyBulkCString(redisClient *c, char *s) {
2676 if (s == NULL) {
2677 addReply(c,shared.nullbulk);
2678 } else {
2679 robj *o = createStringObject(s,strlen(s));
2680 addReplyBulk(c,o);
2681 decrRefCount(o);
2682 }
2683}
2684
ed9b544e 2685static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2686 int cport, cfd;
2687 char cip[128];
285add55 2688 redisClient *c;
ed9b544e 2689 REDIS_NOTUSED(el);
2690 REDIS_NOTUSED(mask);
2691 REDIS_NOTUSED(privdata);
2692
2693 cfd = anetAccept(server.neterr, fd, cip, &cport);
2694 if (cfd == AE_ERR) {
f870935d 2695 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2696 return;
2697 }
f870935d 2698 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2699 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2700 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2701 close(cfd); /* May be already closed, just ingore errors */
2702 return;
2703 }
285add55 2704 /* If maxclient directive is set and this is one client more... close the
2705 * connection. Note that we create the client instead to check before
2706 * for this condition, since now the socket is already set in nonblocking
2707 * mode and we can send an error for free using the Kernel I/O */
2708 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2709 char *err = "-ERR max number of clients reached\r\n";
2710
2711 /* That's a best effort error message, don't check write errors */
fee803ba 2712 if (write(c->fd,err,strlen(err)) == -1) {
2713 /* Nothing to do, Just to avoid the warning... */
2714 }
285add55 2715 freeClient(c);
2716 return;
2717 }
ed9b544e 2718 server.stat_numconnections++;
2719}
2720
2721/* ======================= Redis objects implementation ===================== */
2722
2723static robj *createObject(int type, void *ptr) {
2724 robj *o;
2725
a5819310 2726 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2727 if (listLength(server.objfreelist)) {
2728 listNode *head = listFirst(server.objfreelist);
2729 o = listNodeValue(head);
2730 listDelNode(server.objfreelist,head);
a5819310 2731 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2732 } else {
75680a3c 2733 if (server.vm_enabled) {
a5819310 2734 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2735 o = zmalloc(sizeof(*o));
2736 } else {
2737 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2738 }
ed9b544e 2739 }
ed9b544e 2740 o->type = type;
942a3961 2741 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2742 o->ptr = ptr;
2743 o->refcount = 1;
3a66edc7 2744 if (server.vm_enabled) {
1064ef87 2745 /* Note that this code may run in the context of an I/O thread
2746 * and accessing to server.unixtime in theory is an error
2747 * (no locks). But in practice this is safe, and even if we read
2748 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2749 o->vm.atime = server.unixtime;
2750 o->storage = REDIS_VM_MEMORY;
2751 }
ed9b544e 2752 return o;
2753}
2754
2755static robj *createStringObject(char *ptr, size_t len) {
2756 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2757}
2758
4ef8de8a 2759static robj *dupStringObject(robj *o) {
b9bc0eef 2760 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2761 return createStringObject(o->ptr,sdslen(o->ptr));
2762}
2763
ed9b544e 2764static robj *createListObject(void) {
2765 list *l = listCreate();
2766
ed9b544e 2767 listSetFreeMethod(l,decrRefCount);
2768 return createObject(REDIS_LIST,l);
2769}
2770
2771static robj *createSetObject(void) {
2772 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2773 return createObject(REDIS_SET,d);
2774}
2775
5234952b 2776static robj *createHashObject(void) {
2777 /* All the Hashes start as zipmaps. Will be automatically converted
2778 * into hash tables if there are enough elements or big elements
2779 * inside. */
2780 unsigned char *zm = zipmapNew();
2781 robj *o = createObject(REDIS_HASH,zm);
2782 o->encoding = REDIS_ENCODING_ZIPMAP;
2783 return o;
2784}
2785
1812e024 2786static robj *createZsetObject(void) {
6b47e12e 2787 zset *zs = zmalloc(sizeof(*zs));
2788
2789 zs->dict = dictCreate(&zsetDictType,NULL);
2790 zs->zsl = zslCreate();
2791 return createObject(REDIS_ZSET,zs);
1812e024 2792}
2793
ed9b544e 2794static void freeStringObject(robj *o) {
942a3961 2795 if (o->encoding == REDIS_ENCODING_RAW) {
2796 sdsfree(o->ptr);
2797 }
ed9b544e 2798}
2799
2800static void freeListObject(robj *o) {
2801 listRelease((list*) o->ptr);
2802}
2803
2804static void freeSetObject(robj *o) {
2805 dictRelease((dict*) o->ptr);
2806}
2807
fd8ccf44 2808static void freeZsetObject(robj *o) {
2809 zset *zs = o->ptr;
2810
2811 dictRelease(zs->dict);
2812 zslFree(zs->zsl);
2813 zfree(zs);
2814}
2815
ed9b544e 2816static void freeHashObject(robj *o) {
cbba7dd7 2817 switch (o->encoding) {
2818 case REDIS_ENCODING_HT:
2819 dictRelease((dict*) o->ptr);
2820 break;
2821 case REDIS_ENCODING_ZIPMAP:
2822 zfree(o->ptr);
2823 break;
2824 default:
2825 redisAssert(0);
2826 break;
2827 }
ed9b544e 2828}
2829
2830static void incrRefCount(robj *o) {
2831 o->refcount++;
2832}
2833
2834static void decrRefCount(void *obj) {
2835 robj *o = obj;
94754ccc 2836
970e10bb 2837 /* Object is a key of a swapped out value, or in the process of being
2838 * loaded. */
996cb5f7 2839 if (server.vm_enabled &&
2840 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2841 {
996cb5f7 2842 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 2843 redisAssert(o->type == REDIS_STRING);
a35ddf12 2844 freeStringObject(o);
2845 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 2846 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 2847 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2848 !listAddNodeHead(server.objfreelist,o))
2849 zfree(o);
a5819310 2850 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 2851 server.vm_stats_swapped_objects--;
a35ddf12 2852 return;
2853 }
996cb5f7 2854 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 2855 if (--(o->refcount) == 0) {
996cb5f7 2856 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2857 vmCancelThreadedIOJob(obj);
ed9b544e 2858 switch(o->type) {
2859 case REDIS_STRING: freeStringObject(o); break;
2860 case REDIS_LIST: freeListObject(o); break;
2861 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 2862 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 2863 case REDIS_HASH: freeHashObject(o); break;
78409a0f 2864 default: redisAssert(0); break;
ed9b544e 2865 }
a5819310 2866 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2867 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2868 !listAddNodeHead(server.objfreelist,o))
2869 zfree(o);
a5819310 2870 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2871 }
2872}
2873
942a3961 2874static robj *lookupKey(redisDb *db, robj *key) {
2875 dictEntry *de = dictFind(db->dict,key);
3a66edc7 2876 if (de) {
55cf8433 2877 robj *key = dictGetEntryKey(de);
2878 robj *val = dictGetEntryVal(de);
3a66edc7 2879
55cf8433 2880 if (server.vm_enabled) {
996cb5f7 2881 if (key->storage == REDIS_VM_MEMORY ||
2882 key->storage == REDIS_VM_SWAPPING)
2883 {
2884 /* If we were swapping the object out, stop it, this key
2885 * was requested. */
2886 if (key->storage == REDIS_VM_SWAPPING)
2887 vmCancelThreadedIOJob(key);
55cf8433 2888 /* Update the access time of the key for the aging algorithm. */
2889 key->vm.atime = server.unixtime;
2890 } else {
d5d55fc3 2891 int notify = (key->storage == REDIS_VM_LOADING);
2892
55cf8433 2893 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 2894 redisAssert(val == NULL);
55cf8433 2895 val = vmLoadObject(key);
2896 dictGetEntryVal(de) = val;
d5d55fc3 2897
2898 /* Clients blocked by the VM subsystem may be waiting for
2899 * this key... */
2900 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 2901 }
2902 }
2903 return val;
3a66edc7 2904 } else {
2905 return NULL;
2906 }
942a3961 2907}
2908
2909static robj *lookupKeyRead(redisDb *db, robj *key) {
2910 expireIfNeeded(db,key);
2911 return lookupKey(db,key);
2912}
2913
2914static robj *lookupKeyWrite(redisDb *db, robj *key) {
2915 deleteIfVolatile(db,key);
2916 return lookupKey(db,key);
2917}
2918
92b27fe9 2919static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
2920 robj *o = lookupKeyRead(c->db, key);
2921 if (!o) addReply(c,reply);
2922 return o;
2923}
2924
2925static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
2926 robj *o = lookupKeyWrite(c->db, key);
2927 if (!o) addReply(c,reply);
2928 return o;
2929}
2930
2931static int checkType(redisClient *c, robj *o, int type) {
2932 if (o->type != type) {
2933 addReply(c,shared.wrongtypeerr);
2934 return 1;
2935 }
2936 return 0;
2937}
2938
942a3961 2939static int deleteKey(redisDb *db, robj *key) {
2940 int retval;
2941
2942 /* We need to protect key from destruction: after the first dictDelete()
2943 * it may happen that 'key' is no longer valid if we don't increment
2944 * it's count. This may happen when we get the object reference directly
2945 * from the hash table with dictRandomKey() or dict iterators */
2946 incrRefCount(key);
2947 if (dictSize(db->expires)) dictDelete(db->expires,key);
2948 retval = dictDelete(db->dict,key);
2949 decrRefCount(key);
2950
2951 return retval == DICT_OK;
2952}
2953
724a51b1 2954/* Check if the nul-terminated string 's' can be represented by a long
2955 * (that is, is a number that fits into long without any other space or
2956 * character before or after the digits).
2957 *
2958 * If so, the function returns REDIS_OK and *longval is set to the value
2959 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 2960static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 2961 char buf[32], *endptr;
2962 long value;
2963 int slen;
e0a62c7f 2964
724a51b1 2965 value = strtol(s, &endptr, 10);
2966 if (endptr[0] != '\0') return REDIS_ERR;
2967 slen = snprintf(buf,32,"%ld",value);
2968
2969 /* If the number converted back into a string is not identical
2970 * then it's not possible to encode the string as integer */
f69f2cba 2971 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 2972 if (longval) *longval = value;
2973 return REDIS_OK;
2974}
2975
942a3961 2976/* Try to encode a string object in order to save space */
05df7621 2977static robj *tryObjectEncoding(robj *o) {
942a3961 2978 long value;
942a3961 2979 sds s = o->ptr;
3305306f 2980
942a3961 2981 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 2982 return o; /* Already encoded */
3305306f 2983
05df7621 2984 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 2985 * everywhere in the "object space" of Redis. Encoded objects can only
2986 * appear as "values" (and not, for instance, as keys) */
05df7621 2987 if (o->refcount > 1) return o;
3305306f 2988
942a3961 2989 /* Currently we try to encode only strings */
dfc5e96c 2990 redisAssert(o->type == REDIS_STRING);
94754ccc 2991
724a51b1 2992 /* Check if we can represent this string as a long integer */
05df7621 2993 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 2994
2995 /* Ok, this object can be encoded */
05df7621 2996 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2997 decrRefCount(o);
2998 incrRefCount(shared.integers[value]);
2999 return shared.integers[value];
3000 } else {
3001 o->encoding = REDIS_ENCODING_INT;
3002 sdsfree(o->ptr);
3003 o->ptr = (void*) value;
3004 return o;
3005 }
942a3961 3006}
3007
9d65a1bb 3008/* Get a decoded version of an encoded object (returned as a new object).
3009 * If the object is already raw-encoded just increment the ref count. */
3010static robj *getDecodedObject(robj *o) {
942a3961 3011 robj *dec;
e0a62c7f 3012
9d65a1bb 3013 if (o->encoding == REDIS_ENCODING_RAW) {
3014 incrRefCount(o);
3015 return o;
3016 }
942a3961 3017 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3018 char buf[32];
3019
3020 snprintf(buf,32,"%ld",(long)o->ptr);
3021 dec = createStringObject(buf,strlen(buf));
3022 return dec;
3023 } else {
dfc5e96c 3024 redisAssert(1 != 1);
942a3961 3025 }
3305306f 3026}
3027
d7f43c08 3028/* Compare two string objects via strcmp() or alike.
3029 * Note that the objects may be integer-encoded. In such a case we
3030 * use snprintf() to get a string representation of the numbers on the stack
1fd9bc8a 3031 * and compare the strings, it's much faster than calling getDecodedObject().
3032 *
3033 * Important note: if objects are not integer encoded, but binary-safe strings,
3034 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3035 * binary safe. */
724a51b1 3036static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3037 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3038 char bufa[128], bufb[128], *astr, *bstr;
3039 int bothsds = 1;
724a51b1 3040
e197b441 3041 if (a == b) return 0;
d7f43c08 3042 if (a->encoding != REDIS_ENCODING_RAW) {
3043 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3044 astr = bufa;
3045 bothsds = 0;
724a51b1 3046 } else {
d7f43c08 3047 astr = a->ptr;
724a51b1 3048 }
d7f43c08 3049 if (b->encoding != REDIS_ENCODING_RAW) {
3050 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3051 bstr = bufb;
3052 bothsds = 0;
3053 } else {
3054 bstr = b->ptr;
3055 }
3056 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3057}
3058
0ea663ea 3059static size_t stringObjectLen(robj *o) {
dfc5e96c 3060 redisAssert(o->type == REDIS_STRING);
0ea663ea 3061 if (o->encoding == REDIS_ENCODING_RAW) {
3062 return sdslen(o->ptr);
3063 } else {
3064 char buf[32];
3065
3066 return snprintf(buf,32,"%ld",(long)o->ptr);
3067 }
3068}
3069
bbe025e0
AM
3070static int getDoubleFromObject(redisClient *c, robj *o, double *value) {
3071 double parsedValue;
3072 char *eptr = NULL;
3073
3074 if (o && o->type != REDIS_STRING) {
3075 addReplySds(c,sdsnew("-ERR value is not a double\r\n"));
3076 return REDIS_ERR;
3077 }
3078
3079 if (o == NULL)
3080 parsedValue = 0;
3081 else if (o->encoding == REDIS_ENCODING_RAW)
3082 parsedValue = strtod(o->ptr, &eptr);
3083 else if (o->encoding == REDIS_ENCODING_INT)
3084 parsedValue = (long)o->ptr;
3085 else
3086 redisAssert(1 != 1);
3087
3088 if (eptr != NULL && *eptr != '\0') {
3089 addReplySds(c,sdsnew("-ERR value is not a double\r\n"));
3090 return REDIS_ERR;
3091 }
3092
3093 *value = parsedValue;
3094
3095 return REDIS_OK;
3096}
3097
3098static int getLongLongFromObject(redisClient *c, robj *o, long long *value) {
3099 long long parsedValue;
3100 char *eptr = NULL;
3101
3102 if (o && o->type != REDIS_STRING) {
3103 addReplySds(c,sdsnew("-ERR value is not an integer\r\n"));
3104 return REDIS_ERR;
3105 }
3106
3107 if (o == NULL)
3108 parsedValue = 0;
3109 else if (o->encoding == REDIS_ENCODING_RAW)
3110 parsedValue = strtoll(o->ptr, &eptr, 10);
3111 else if (o->encoding == REDIS_ENCODING_INT)
3112 parsedValue = (long)o->ptr;
3113 else
3114 redisAssert(1 != 1);
3115
3116 if (eptr != NULL && *eptr != '\0') {
3117 addReplySds(c,sdsnew("-ERR value is not an integer\r\n"));
3118 return REDIS_ERR;
3119 }
3120
3121 *value = parsedValue;
3122
3123 return REDIS_OK;
3124}
3125
3126static int getLongFromObject(redisClient *c, robj *o, long *value) {
3127 long long actualValue;
3128
3129 if (getLongLongFromObject(c, o, &actualValue) != REDIS_OK) return REDIS_ERR;
3130
3131 if (actualValue < LONG_MIN || actualValue > LONG_MAX) {
3132 addReplySds(c,sdsnew("-ERR value is out of range\r\n"));
3133 return REDIS_ERR;
3134 }
3135
3136 *value = actualValue;
3137
3138 return REDIS_OK;
3139}
3140
06233c45 3141/*============================ RDB saving/loading =========================== */
ed9b544e 3142
/* Write the one byte 'type' to 'fp'. Returns 0 on success, -1 on write
 * error. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3147
/* Write the time 't' to 'fp' as a 32 bit signed integer in host byte
 * order (the value is truncated to int32_t). Returns 0 on success, -1 on
 * write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t t32;

    t32 = (int32_t) t;
    if (fwrite(&t32,4,1,fp) == 0)
        return -1;
    return 0;
}
3153
e3566d4b 3154/* check rdbLoadLen() comments for more info */
f78fd11b 3155static int rdbSaveLen(FILE *fp, uint32_t len) {
3156 unsigned char buf[2];
3157
3158 if (len < (1<<6)) {
3159 /* Save a 6 bit len */
10c43610 3160 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3161 if (fwrite(buf,1,1,fp) == 0) return -1;
3162 } else if (len < (1<<14)) {
3163 /* Save a 14 bit len */
10c43610 3164 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3165 buf[1] = len&0xFF;
17be1a4a 3166 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3167 } else {
3168 /* Save a 32 bit len */
10c43610 3169 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3170 if (fwrite(buf,1,1,fp) == 0) return -1;
3171 len = htonl(len);
3172 if (fwrite(&len,4,1,fp) == 0) return -1;
3173 }
3174 return 0;
3175}
3176
e3566d4b 3177/* String objects in the form "2391" "-100" without any space and with a
3178 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3179 * encoded as integers to save space */
b1befe6a 3180static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
e3566d4b 3181 long long value;
3182 char *endptr, buf[32];
3183
3184 /* Check if it's possible to encode this value as a number */
3185 value = strtoll(s, &endptr, 10);
3186 if (endptr[0] != '\0') return 0;
3187 snprintf(buf,32,"%lld",value);
3188
3189 /* If the number converted back into a string is not identical
3190 * then it's not possible to encode the string as integer */
b1befe6a 3191 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
e3566d4b 3192
3193 /* Finally check if it fits in our ranges */
3194 if (value >= -(1<<7) && value <= (1<<7)-1) {
3195 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3196 enc[1] = value&0xFF;
3197 return 2;
3198 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3199 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3200 enc[1] = value&0xFF;
3201 enc[2] = (value>>8)&0xFF;
3202 return 3;
3203 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3204 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3205 enc[1] = value&0xFF;
3206 enc[2] = (value>>8)&0xFF;
3207 enc[3] = (value>>16)&0xFF;
3208 enc[4] = (value>>24)&0xFF;
3209 return 5;
3210 } else {
3211 return 0;
3212 }
3213}
3214
b1befe6a 3215static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3216 size_t comprlen, outlen;
774e3047 3217 unsigned char byte;
3218 void *out;
3219
3220 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3221 if (len <= 4) return 0;
3222 outlen = len-4;
3a2694c4 3223 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3224 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3225 if (comprlen == 0) {
88e85998 3226 zfree(out);
774e3047 3227 return 0;
3228 }
3229 /* Data compressed! Let's save it on disk */
3230 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3231 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3232 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3233 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3234 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3235 zfree(out);
774e3047 3236 return comprlen;
3237
3238writeerr:
88e85998 3239 zfree(out);
774e3047 3240 return -1;
3241}
3242
e3566d4b 3243/* Save a string objet as [len][data] on disk. If the object is a string
3244 * representation of an integer value we try to safe it in a special form */
b1befe6a 3245static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3246 int enclen;
10c43610 3247
774e3047 3248 /* Try integer encoding */
e3566d4b 3249 if (len <= 11) {
3250 unsigned char buf[5];
b1befe6a 3251 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3252 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3253 return 0;
3254 }
3255 }
774e3047 3256
3257 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3258 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3259 if (server.rdbcompression && len > 20) {
774e3047 3260 int retval;
3261
b1befe6a 3262 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3263 if (retval == -1) return -1;
3264 if (retval > 0) return 0;
3265 /* retval == 0 means data can't be compressed, save the old way */
3266 }
3267
3268 /* Store verbatim */
10c43610 3269 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3270 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3271 return 0;
3272}
3273
942a3961 3274/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3275static int rdbSaveStringObject(FILE *fp, robj *obj) {
3276 int retval;
942a3961 3277
f2d9f50f 3278 /* Avoid incr/decr ref count business when possible.
3279 * This plays well with copy-on-write given that we are probably
3280 * in a child process (BGSAVE). Also this makes sure key objects
3281 * of swapped objects are not incRefCount-ed (an assert does not allow
3282 * this in order to avoid bugs) */
3283 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3284 obj = getDecodedObject(obj);
b1befe6a 3285 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3286 decrRefCount(obj);
3287 } else {
b1befe6a 3288 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3289 }
9d65a1bb 3290 return retval;
942a3961 3291}
3292
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation.
 * This 8 bit integer has special values in order to specify the following
 * conditions:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error.
 */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char buf[128];
    char *repr = (char*)buf+1;  /* textual form goes after the length byte */
    int len = 1;                /* the length/marker byte is always written */

    if (isnan(val)) {
        buf[0] = 253;
    } else if (isfinite(val)) {
        snprintf(repr,sizeof(buf)-1,"%.17g",val);
        buf[0] = strlen(repr);
        len += buf[0];
    } else {
        buf[0] = (val < 0) ? 255 : 254;
    }
    return (fwrite(buf,len,1,fp) == 0) ? -1 : 0;
}
3319
06233c45 3320/* Save a Redis object. */
3321static int rdbSaveObject(FILE *fp, robj *o) {
3322 if (o->type == REDIS_STRING) {
3323 /* Save a string value */
3324 if (rdbSaveStringObject(fp,o) == -1) return -1;
3325 } else if (o->type == REDIS_LIST) {
3326 /* Save a list value */
3327 list *list = o->ptr;
c7df85a4 3328 listIter li;
06233c45 3329 listNode *ln;
3330
06233c45 3331 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3332 listRewind(list,&li);
3333 while((ln = listNext(&li))) {
06233c45 3334 robj *eleobj = listNodeValue(ln);
3335
3336 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3337 }
3338 } else if (o->type == REDIS_SET) {
3339 /* Save a set value */
3340 dict *set = o->ptr;
3341 dictIterator *di = dictGetIterator(set);
3342 dictEntry *de;
3343
3344 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3345 while((de = dictNext(di)) != NULL) {
3346 robj *eleobj = dictGetEntryKey(de);
3347
3348 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3349 }
3350 dictReleaseIterator(di);
3351 } else if (o->type == REDIS_ZSET) {
3352 /* Save a set value */
3353 zset *zs = o->ptr;
3354 dictIterator *di = dictGetIterator(zs->dict);
3355 dictEntry *de;
3356
3357 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3358 while((de = dictNext(di)) != NULL) {
3359 robj *eleobj = dictGetEntryKey(de);
3360 double *score = dictGetEntryVal(de);
3361
3362 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3363 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3364 }
3365 dictReleaseIterator(di);
b1befe6a 3366 } else if (o->type == REDIS_HASH) {
3367 /* Save a hash value */
3368 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3369 unsigned char *p = zipmapRewind(o->ptr);
3370 unsigned int count = zipmapLen(o->ptr);
3371 unsigned char *key, *val;
3372 unsigned int klen, vlen;
3373
3374 if (rdbSaveLen(fp,count) == -1) return -1;
3375 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3376 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3377 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3378 }
3379 } else {
3380 dictIterator *di = dictGetIterator(o->ptr);
3381 dictEntry *de;
3382
3383 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3384 while((de = dictNext(di)) != NULL) {
3385 robj *key = dictGetEntryKey(de);
3386 robj *val = dictGetEntryVal(de);
3387
3388 if (rdbSaveStringObject(fp,key) == -1) return -1;
3389 if (rdbSaveStringObject(fp,val) == -1) return -1;
3390 }
3391 dictReleaseIterator(di);
3392 }
06233c45 3393 } else {
78409a0f 3394 redisAssert(0);
06233c45 3395 }
3396 return 0;
3397}
3398
3399/* Return the length the object will have on disk if saved with
3400 * the rdbSaveObject() function. Currently we use a trick to get
3401 * this length with very little changes to the code. In the future
3402 * we could switch to a faster solution. */
b9bc0eef 3403static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3404 if (fp == NULL) fp = server.devnull;
06233c45 3405 rewind(fp);
3406 assert(rdbSaveObject(fp,o) != 1);
3407 return ftello(fp);
3408}
3409
06224fec 3410/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3411static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3412 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3413
06224fec 3414 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3415}
3416
ed9b544e 3417/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3418static int rdbSave(char *filename) {
ed9b544e 3419 dictIterator *di = NULL;
3420 dictEntry *de;
ed9b544e 3421 FILE *fp;
3422 char tmpfile[256];
3423 int j;
bb32ede5 3424 time_t now = time(NULL);
ed9b544e 3425
2316bb3b 3426 /* Wait for I/O therads to terminate, just in case this is a
3427 * foreground-saving, to avoid seeking the swap file descriptor at the
3428 * same time. */
3429 if (server.vm_enabled)
3430 waitEmptyIOJobsQueue();
3431
a3b21203 3432 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3433 fp = fopen(tmpfile,"w");
3434 if (!fp) {
3435 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3436 return REDIS_ERR;
3437 }
f78fd11b 3438 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3439 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3440 redisDb *db = server.db+j;
3441 dict *d = db->dict;
3305306f 3442 if (dictSize(d) == 0) continue;
ed9b544e 3443 di = dictGetIterator(d);
3444 if (!di) {
3445 fclose(fp);
3446 return REDIS_ERR;
3447 }
3448
3449 /* Write the SELECT DB opcode */
f78fd11b 3450 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3451 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3452
3453 /* Iterate this DB writing every entry */
3454 while((de = dictNext(di)) != NULL) {
3455 robj *key = dictGetEntryKey(de);
3456 robj *o = dictGetEntryVal(de);
bb32ede5 3457 time_t expiretime = getExpire(db,key);
3458
3459 /* Save the expire time */
3460 if (expiretime != -1) {
3461 /* If this key is already expired skip it */
3462 if (expiretime < now) continue;
3463 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3464 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3465 }
7e69548d 3466 /* Save the key and associated value. This requires special
3467 * handling if the value is swapped out. */
996cb5f7 3468 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3469 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3470 /* Save type, key, value */
3471 if (rdbSaveType(fp,o->type) == -1) goto werr;
3472 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3473 if (rdbSaveObject(fp,o) == -1) goto werr;
3474 } else {
996cb5f7 3475 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3476 robj *po;
7e69548d 3477 /* Get a preview of the object in memory */
3478 po = vmPreviewObject(key);
7e69548d 3479 /* Save type, key, value */
3480 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3481 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3482 if (rdbSaveObject(fp,po) == -1) goto werr;
3483 /* Remove the loaded object from memory */
3484 decrRefCount(po);
7e69548d 3485 }
ed9b544e 3486 }
3487 dictReleaseIterator(di);
3488 }
3489 /* EOF opcode */
f78fd11b 3490 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3491
3492 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3493 fflush(fp);
3494 fsync(fileno(fp));
3495 fclose(fp);
e0a62c7f 3496
ed9b544e 3497 /* Use RENAME to make sure the DB file is changed atomically only
3498 * if the generate DB file is ok. */
3499 if (rename(tmpfile,filename) == -1) {
325d1eb4 3500 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3501 unlink(tmpfile);
3502 return REDIS_ERR;
3503 }
3504 redisLog(REDIS_NOTICE,"DB saved on disk");
3505 server.dirty = 0;
3506 server.lastsave = time(NULL);
3507 return REDIS_OK;
3508
3509werr:
3510 fclose(fp);
3511 unlink(tmpfile);
3512 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3513 if (di) dictReleaseIterator(di);
3514 return REDIS_ERR;
3515}
3516
f78fd11b 3517static int rdbSaveBackground(char *filename) {
ed9b544e 3518 pid_t childpid;
3519
9d65a1bb 3520 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3521 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3522 if ((childpid = fork()) == 0) {
3523 /* Child */
054e426d 3524 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3525 close(server.fd);
f78fd11b 3526 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3527 _exit(0);
ed9b544e 3528 } else {
478c2c6f 3529 _exit(1);
ed9b544e 3530 }
3531 } else {
3532 /* Parent */
5a7c647e 3533 if (childpid == -1) {
3534 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3535 strerror(errno));
3536 return REDIS_ERR;
3537 }
ed9b544e 3538 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3539 server.bgsavechildpid = childpid;
884d4b39 3540 updateDictResizePolicy();
ed9b544e 3541 return REDIS_OK;
3542 }
3543 return REDIS_OK; /* unreached */
3544}
3545
/* Remove the temp file ("temp-<pid>.rdb") used by the saving process
 * with the given pid. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3552
/* Read a one byte type/opcode from 'fp'. Returns the byte value
 * (0-255), or -1 if nothing could be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char type;

    return (fread(&type,1,1,fp) == 0) ? -1 : type;
}
3558
/* Read a time stored as a raw 32 bit signed integer (host byte order)
 * from 'fp'. Returns -1 if the four bytes could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t t32;

    if (fread(&t32,sizeof(t32),1,fp) != 1) return -1;
    return (time_t) t32;
}
3564
e3566d4b 3565/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3566 * of this file for a description of how this are stored on disk.
3567 *
3568 * isencoded is set to 1 if the readed length is not actually a length but
3569 * an "encoding type", check the above comments for more info */
c78a8ccc 3570static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3571 unsigned char buf[2];
3572 uint32_t len;
c78a8ccc 3573 int type;
f78fd11b 3574
e3566d4b 3575 if (isencoded) *isencoded = 0;
c78a8ccc 3576 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3577 type = (buf[0]&0xC0)>>6;
3578 if (type == REDIS_RDB_6BITLEN) {
3579 /* Read a 6 bit len */
3580 return buf[0]&0x3F;
3581 } else if (type == REDIS_RDB_ENCVAL) {
3582 /* Read a 6 bit len encoding type */
3583 if (isencoded) *isencoded = 1;
3584 return buf[0]&0x3F;
3585 } else if (type == REDIS_RDB_14BITLEN) {
3586 /* Read a 14 bit len */
3587 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3588 return ((buf[0]&0x3F)<<8)|buf[1];
3589 } else {
3590 /* Read a 32 bit len */
f78fd11b 3591 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3592 return ntohl(len);
f78fd11b 3593 }
f78fd11b 3594}
3595
e3566d4b 3596static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3597 unsigned char enc[4];
3598 long long val;
3599
3600 if (enctype == REDIS_RDB_ENC_INT8) {
3601 if (fread(enc,1,1,fp) == 0) return NULL;
3602 val = (signed char)enc[0];
3603 } else if (enctype == REDIS_RDB_ENC_INT16) {
3604 uint16_t v;
3605 if (fread(enc,2,1,fp) == 0) return NULL;
3606 v = enc[0]|(enc[1]<<8);
3607 val = (int16_t)v;
3608 } else if (enctype == REDIS_RDB_ENC_INT32) {
3609 uint32_t v;
3610 if (fread(enc,4,1,fp) == 0) return NULL;
3611 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3612 val = (int32_t)v;
3613 } else {
3614 val = 0; /* anti-warning */
78409a0f 3615 redisAssert(0);
e3566d4b 3616 }
3617 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3618}
3619
c78a8ccc 3620static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3621 unsigned int len, clen;
3622 unsigned char *c = NULL;
3623 sds val = NULL;
3624
c78a8ccc 3625 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3626 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3627 if ((c = zmalloc(clen)) == NULL) goto err;
3628 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3629 if (fread(c,clen,1,fp) == 0) goto err;
3630 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3631 zfree(c);
88e85998 3632 return createObject(REDIS_STRING,val);
3633err:
3634 zfree(c);
3635 sdsfree(val);
3636 return NULL;
3637}
3638
c78a8ccc 3639static robj *rdbLoadStringObject(FILE*fp) {
e3566d4b 3640 int isencoded;
3641 uint32_t len;
f78fd11b 3642 sds val;
3643
c78a8ccc 3644 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3645 if (isencoded) {
3646 switch(len) {
3647 case REDIS_RDB_ENC_INT8:
3648 case REDIS_RDB_ENC_INT16:
3649 case REDIS_RDB_ENC_INT32:
bdcb92f2 3650 return rdbLoadIntegerObject(fp,len);
88e85998 3651 case REDIS_RDB_ENC_LZF:
bdcb92f2 3652 return rdbLoadLzfStringObject(fp);
e3566d4b 3653 default:
78409a0f 3654 redisAssert(0);
e3566d4b 3655 }
3656 }
3657
f78fd11b 3658 if (len == REDIS_RDB_LENERR) return NULL;
3659 val = sdsnewlen(NULL,len);
3660 if (len && fread(val,len,1,fp) == 0) {
3661 sdsfree(val);
3662 return NULL;
3663 }
bdcb92f2 3664 return createObject(REDIS_STRING,val);
f78fd11b 3665}
3666
a7866db6 3667/* For information about double serialization check rdbSaveDoubleValue() */
3668static int rdbLoadDoubleValue(FILE *fp, double *val) {
3669 char buf[128];
3670 unsigned char len;
3671
3672 if (fread(&len,1,1,fp) == 0) return -1;
3673 switch(len) {
3674 case 255: *val = R_NegInf; return 0;
3675 case 254: *val = R_PosInf; return 0;
3676 case 253: *val = R_Nan; return 0;
3677 default:
3678 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3679 buf[len] = '\0';
a7866db6 3680 sscanf(buf, "%lg", val);
3681 return 0;
3682 }
3683}
3684
c78a8ccc 3685/* Load a Redis object of the specified type from the specified file.
3686 * On success a newly allocated object is returned, otherwise NULL. */
3687static robj *rdbLoadObject(int type, FILE *fp) {
3688 robj *o;
3689
bcd11906 3690 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 3691 if (type == REDIS_STRING) {
3692 /* Read string value */
3693 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3694 o = tryObjectEncoding(o);
c78a8ccc 3695 } else if (type == REDIS_LIST || type == REDIS_SET) {
3696 /* Read list/set value */
3697 uint32_t listlen;
3698
3699 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3700 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 3701 /* It's faster to expand the dict to the right size asap in order
3702 * to avoid rehashing */
3703 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3704 dictExpand(o->ptr,listlen);
c78a8ccc 3705 /* Load every single element of the list/set */
3706 while(listlen--) {
3707 robj *ele;
3708
3709 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3710 ele = tryObjectEncoding(ele);
c78a8ccc 3711 if (type == REDIS_LIST) {
3712 listAddNodeTail((list*)o->ptr,ele);
3713 } else {
3714 dictAdd((dict*)o->ptr,ele,NULL);
3715 }
3716 }
3717 } else if (type == REDIS_ZSET) {
3718 /* Read list/set value */
ada386b2 3719 size_t zsetlen;
c78a8ccc 3720 zset *zs;
3721
3722 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3723 o = createZsetObject();
3724 zs = o->ptr;
3725 /* Load every single element of the list/set */
3726 while(zsetlen--) {
3727 robj *ele;
3728 double *score = zmalloc(sizeof(double));
3729
3730 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3731 ele = tryObjectEncoding(ele);
c78a8ccc 3732 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3733 dictAdd(zs->dict,ele,score);
3734 zslInsert(zs->zsl,*score,ele);
3735 incrRefCount(ele); /* added to skiplist */
3736 }
ada386b2 3737 } else if (type == REDIS_HASH) {
3738 size_t hashlen;
3739
3740 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3741 o = createHashObject();
3742 /* Too many entries? Use an hash table. */
3743 if (hashlen > server.hash_max_zipmap_entries)
3744 convertToRealHash(o);
3745 /* Load every key/value, then set it into the zipmap or hash
3746 * table, as needed. */
3747 while(hashlen--) {
3748 robj *key, *val;
3749
3750 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3751 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3752 /* If we are using a zipmap and there are too big values
3753 * the object is converted to real hash table encoding. */
3754 if (o->encoding != REDIS_ENCODING_HT &&
3755 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3756 sdslen(val->ptr) > server.hash_max_zipmap_value))
3757 {
3758 convertToRealHash(o);
3759 }
3760
3761 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3762 unsigned char *zm = o->ptr;
3763
3764 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3765 val->ptr,sdslen(val->ptr),NULL);
3766 o->ptr = zm;
3767 decrRefCount(key);
3768 decrRefCount(val);
3769 } else {
05df7621 3770 key = tryObjectEncoding(key);
3771 val = tryObjectEncoding(val);
ada386b2 3772 dictAdd((dict*)o->ptr,key,val);
ada386b2 3773 }
3774 }
c78a8ccc 3775 } else {
78409a0f 3776 redisAssert(0);
c78a8ccc 3777 }
3778 return o;
3779}
3780
f78fd11b 3781static int rdbLoad(char *filename) {
ed9b544e 3782 FILE *fp;
f78fd11b 3783 robj *keyobj = NULL;
3784 uint32_t dbid;
bb32ede5 3785 int type, retval, rdbver;
3305306f 3786 dict *d = server.db[0].dict;
bb32ede5 3787 redisDb *db = server.db+0;
f78fd11b 3788 char buf[1024];
bb32ede5 3789 time_t expiretime = -1, now = time(NULL);
b492cf00 3790 long long loadedkeys = 0;
bb32ede5 3791
ed9b544e 3792 fp = fopen(filename,"r");
3793 if (!fp) return REDIS_ERR;
3794 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 3795 buf[9] = '\0';
3796 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 3797 fclose(fp);
3798 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3799 return REDIS_ERR;
3800 }
f78fd11b 3801 rdbver = atoi(buf+5);
c78a8ccc 3802 if (rdbver != 1) {
f78fd11b 3803 fclose(fp);
3804 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3805 return REDIS_ERR;
3806 }
ed9b544e 3807 while(1) {
3808 robj *o;
3809
3810 /* Read type. */
f78fd11b 3811 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 3812 if (type == REDIS_EXPIRETIME) {
3813 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3814 /* We read the time so we need to read the object type again */
3815 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3816 }
ed9b544e 3817 if (type == REDIS_EOF) break;
3818 /* Handle SELECT DB opcode as a special case */
3819 if (type == REDIS_SELECTDB) {
c78a8ccc 3820 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 3821 goto eoferr;
ed9b544e 3822 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 3823 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 3824 exit(1);
3825 }
bb32ede5 3826 db = server.db+dbid;
3827 d = db->dict;
ed9b544e 3828 continue;
3829 }
3830 /* Read key */
c78a8ccc 3831 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3832 /* Read value */
3833 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
ed9b544e 3834 /* Add the new object in the hash table */
f78fd11b 3835 retval = dictAdd(d,keyobj,o);
ed9b544e 3836 if (retval == DICT_ERR) {
f78fd11b 3837 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
ed9b544e 3838 exit(1);
3839 }
bb32ede5 3840 /* Set the expire time if needed */
3841 if (expiretime != -1) {
3842 setExpire(db,keyobj,expiretime);
3843 /* Delete this key if already expired */
3844 if (expiretime < now) deleteKey(db,keyobj);
3845 expiretime = -1;
3846 }
f78fd11b 3847 keyobj = o = NULL;
b492cf00 3848 /* Handle swapping while loading big datasets when VM is on */
3849 loadedkeys++;
3850 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3851 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 3852 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 3853 }
3854 }
ed9b544e 3855 }
3856 fclose(fp);
3857 return REDIS_OK;
3858
3859eoferr: /* unexpected end of file is handled here with a fatal exit */
e3566d4b 3860 if (keyobj) decrRefCount(keyobj);
f80dff62 3861 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 3862 exit(1);
3863 return REDIS_ERR; /* Just to avoid warning */
3864}
3865
3866/*================================== Commands =============================== */
3867
abcb223e 3868static void authCommand(redisClient *c) {
2e77c2ee 3869 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
3870 c->authenticated = 1;
3871 addReply(c,shared.ok);
3872 } else {
3873 c->authenticated = 0;
fa4c0aba 3874 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
3875 }
3876}
3877
ed9b544e 3878static void pingCommand(redisClient *c) {
3879 addReply(c,shared.pong);
3880}
3881
3882static void echoCommand(redisClient *c) {
dd88747b 3883 addReplyBulk(c,c->argv[1]);
ed9b544e 3884}
3885
3886/*=================================== Strings =============================== */
3887
3888static void setGenericCommand(redisClient *c, int nx) {
3889 int retval;
3890
333fd216 3891 if (nx) deleteIfVolatile(c->db,c->argv[1]);
3305306f 3892 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3893 if (retval == DICT_ERR) {
3894 if (!nx) {
1b03836c 3895 /* If the key is about a swapped value, we want a new key object
3896 * to overwrite the old. So we delete the old key in the database.
3897 * This will also make sure that swap pages about the old object
3898 * will be marked as free. */
ddfaca9d 3899 if (server.vm_enabled && deleteIfSwapped(c->db,c->argv[1]))
1b03836c 3900 incrRefCount(c->argv[1]);
3305306f 3901 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
ed9b544e 3902 incrRefCount(c->argv[2]);
3903 } else {
c937aa89 3904 addReply(c,shared.czero);
ed9b544e 3905 return;
3906 }
3907 } else {
3908 incrRefCount(c->argv[1]);
3909 incrRefCount(c->argv[2]);
3910 }
3911 server.dirty++;
3305306f 3912 removeExpire(c->db,c->argv[1]);
c937aa89 3913 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 3914}
3915
3916static void setCommand(redisClient *c) {
a4d1ba9a 3917 setGenericCommand(c,0);
ed9b544e 3918}
3919
3920static void setnxCommand(redisClient *c) {
a4d1ba9a 3921 setGenericCommand(c,1);
ed9b544e 3922}
3923
322fc7d8 3924static int getGenericCommand(redisClient *c) {
dd88747b 3925 robj *o;
e0a62c7f 3926
dd88747b 3927 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 3928 return REDIS_OK;
dd88747b 3929
3930 if (o->type != REDIS_STRING) {
3931 addReply(c,shared.wrongtypeerr);
3932 return REDIS_ERR;
ed9b544e 3933 } else {
dd88747b 3934 addReplyBulk(c,o);
3935 return REDIS_OK;
ed9b544e 3936 }
3937}
3938
322fc7d8 3939static void getCommand(redisClient *c) {
3940 getGenericCommand(c);
3941}
3942
f6b141c5 3943static void getsetCommand(redisClient *c) {
322fc7d8 3944 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 3945 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
3946 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
3947 } else {
3948 incrRefCount(c->argv[1]);
3949 }
3950 incrRefCount(c->argv[2]);
3951 server.dirty++;
3952 removeExpire(c->db,c->argv[1]);
3953}
3954
70003d28 3955static void mgetCommand(redisClient *c) {
70003d28 3956 int j;
e0a62c7f 3957
c937aa89 3958 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 3959 for (j = 1; j < c->argc; j++) {
3305306f 3960 robj *o = lookupKeyRead(c->db,c->argv[j]);
3961 if (o == NULL) {
c937aa89 3962 addReply(c,shared.nullbulk);
70003d28 3963 } else {
70003d28 3964 if (o->type != REDIS_STRING) {
c937aa89 3965 addReply(c,shared.nullbulk);
70003d28 3966 } else {
dd88747b 3967 addReplyBulk(c,o);
70003d28 3968 }
3969 }
3970 }
3971}
3972
6c446631 3973static void msetGenericCommand(redisClient *c, int nx) {
906573e7 3974 int j, busykeys = 0;
6c446631 3975
3976 if ((c->argc % 2) == 0) {
454d4e43 3977 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 3978 return;
3979 }
3980 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
3981 * set nothing at all if at least one already key exists. */
3982 if (nx) {
3983 for (j = 1; j < c->argc; j += 2) {
906573e7 3984 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
3985 busykeys++;
6c446631 3986 }
3987 }
3988 }
906573e7 3989 if (busykeys) {
3990 addReply(c, shared.czero);
3991 return;
3992 }
6c446631 3993
3994 for (j = 1; j < c->argc; j += 2) {
3995 int retval;
3996
05df7621 3997 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
6c446631 3998 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
3999 if (retval == DICT_ERR) {
4000 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4001 incrRefCount(c->argv[j+1]);
4002 } else {
4003 incrRefCount(c->argv[j]);
4004 incrRefCount(c->argv[j+1]);
4005 }
4006 removeExpire(c->db,c->argv[j]);
4007 }
4008 server.dirty += (c->argc-1)/2;
4009 addReply(c, nx ? shared.cone : shared.ok);
4010}
4011
4012static void msetCommand(redisClient *c) {
4013 msetGenericCommand(c,0);
4014}
4015
4016static void msetnxCommand(redisClient *c) {
4017 msetGenericCommand(c,1);
4018}
4019
d68ed120 4020static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4021 long long value;
4022 int retval;
4023 robj *o;
e0a62c7f 4024
3305306f 4025 o = lookupKeyWrite(c->db,c->argv[1]);
ed9b544e 4026
bbe025e0 4027 if (getLongLongFromObject(c, o, &value) != REDIS_OK) return;
ed9b544e 4028
4029 value += incr;
4030 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
05df7621 4031 o = tryObjectEncoding(o);
3305306f 4032 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 4033 if (retval == DICT_ERR) {
3305306f 4034 dictReplace(c->db->dict,c->argv[1],o);
4035 removeExpire(c->db,c->argv[1]);
ed9b544e 4036 } else {
4037 incrRefCount(c->argv[1]);
4038 }
4039 server.dirty++;
c937aa89 4040 addReply(c,shared.colon);
ed9b544e 4041 addReply(c,o);
4042 addReply(c,shared.crlf);
4043}
4044
4045static void incrCommand(redisClient *c) {
a4d1ba9a 4046 incrDecrCommand(c,1);
ed9b544e 4047}
4048
4049static void decrCommand(redisClient *c) {
a4d1ba9a 4050 incrDecrCommand(c,-1);
ed9b544e 4051}
4052
4053static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4054 long long incr;
4055
4056 if (getLongLongFromObject(c, c->argv[2], &incr) != REDIS_OK) return;
4057
a4d1ba9a 4058 incrDecrCommand(c,incr);
ed9b544e 4059}
4060
4061static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4062 long long incr;
4063
4064 if (getLongLongFromObject(c, c->argv[2], &incr) != REDIS_OK) return;
4065
a4d1ba9a 4066 incrDecrCommand(c,-incr);
ed9b544e 4067}
4068
4b00bebd 4069static void appendCommand(redisClient *c) {
4070 int retval;
4071 size_t totlen;
4072 robj *o;
4073
4074 o = lookupKeyWrite(c->db,c->argv[1]);
4075 if (o == NULL) {
4076 /* Create the key */
4077 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4078 incrRefCount(c->argv[1]);
4079 incrRefCount(c->argv[2]);
4080 totlen = stringObjectLen(c->argv[2]);
4081 } else {
4082 dictEntry *de;
e0a62c7f 4083
4b00bebd 4084 de = dictFind(c->db->dict,c->argv[1]);
4085 assert(de != NULL);
4086
4087 o = dictGetEntryVal(de);
4088 if (o->type != REDIS_STRING) {
4089 addReply(c,shared.wrongtypeerr);
4090 return;
4091 }
4092 /* If the object is specially encoded or shared we have to make
4093 * a copy */
4094 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4095 robj *decoded = getDecodedObject(o);
4096
4097 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4098 decrRefCount(decoded);
4099 dictReplace(c->db->dict,c->argv[1],o);
4100 }
4101 /* APPEND! */
4102 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4103 o->ptr = sdscatlen(o->ptr,
4104 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4105 } else {
4106 o->ptr = sdscatprintf(o->ptr, "%ld",
4107 (unsigned long) c->argv[2]->ptr);
4108 }
4109 totlen = sdslen(o->ptr);
4110 }
4111 server.dirty++;
4112 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4113}
4114
39191553 4115static void substrCommand(redisClient *c) {
4116 robj *o;
4117 long start = atoi(c->argv[2]->ptr);
4118 long end = atoi(c->argv[3]->ptr);
dd88747b 4119 size_t rangelen, strlen;
4120 sds range;
39191553 4121
dd88747b 4122 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4123 checkType(c,o,REDIS_STRING)) return;
39191553 4124
dd88747b 4125 o = getDecodedObject(o);
4126 strlen = sdslen(o->ptr);
8fe7fad7 4127
dd88747b 4128 /* convert negative indexes */
4129 if (start < 0) start = strlen+start;
4130 if (end < 0) end = strlen+end;
4131 if (start < 0) start = 0;
4132 if (end < 0) end = 0;
39191553 4133
dd88747b 4134 /* indexes sanity checks */
4135 if (start > end || (size_t)start >= strlen) {
4136 /* Out of range start or start > end result in null reply */
4137 addReply(c,shared.nullbulk);
4138 decrRefCount(o);
4139 return;
39191553 4140 }
dd88747b 4141 if ((size_t)end >= strlen) end = strlen-1;
4142 rangelen = (end-start)+1;
4143
4144 /* Return the result */
4145 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4146 range = sdsnewlen((char*)o->ptr+start,rangelen);
4147 addReplySds(c,range);
4148 addReply(c,shared.crlf);
4149 decrRefCount(o);
39191553 4150}
4151
ed9b544e 4152/* ========================= Type agnostic commands ========================= */
4153
4154static void delCommand(redisClient *c) {
5109cdff 4155 int deleted = 0, j;
4156
4157 for (j = 1; j < c->argc; j++) {
4158 if (deleteKey(c->db,c->argv[j])) {
4159 server.dirty++;
4160 deleted++;
4161 }
4162 }
dd88747b 4163 addReplyLong(c,deleted);
ed9b544e 4164}
4165
4166static void existsCommand(redisClient *c) {
3305306f 4167 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
ed9b544e 4168}
4169
4170static void selectCommand(redisClient *c) {
4171 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4172
ed9b544e 4173 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4174 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4175 } else {
4176 addReply(c,shared.ok);
4177 }
4178}
4179
4180static void randomkeyCommand(redisClient *c) {
4181 dictEntry *de;
e0a62c7f 4182
3305306f 4183 while(1) {
4184 de = dictGetRandomKey(c->db->dict);
ce7bef07 4185 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4186 }
ed9b544e 4187 if (de == NULL) {
ce7bef07 4188 addReply(c,shared.plus);
ed9b544e 4189 addReply(c,shared.crlf);
4190 } else {
c937aa89 4191 addReply(c,shared.plus);
ed9b544e 4192 addReply(c,dictGetEntryKey(de));
4193 addReply(c,shared.crlf);
4194 }
4195}
4196
4197static void keysCommand(redisClient *c) {
4198 dictIterator *di;
4199 dictEntry *de;
4200 sds pattern = c->argv[1]->ptr;
4201 int plen = sdslen(pattern);
a3f9eec2 4202 unsigned long numkeys = 0;
ed9b544e 4203 robj *lenobj = createObject(REDIS_STRING,NULL);
4204
3305306f 4205 di = dictGetIterator(c->db->dict);
ed9b544e 4206 addReply(c,lenobj);
4207 decrRefCount(lenobj);
4208 while((de = dictNext(di)) != NULL) {
4209 robj *keyobj = dictGetEntryKey(de);
3305306f 4210
ed9b544e 4211 sds key = keyobj->ptr;
4212 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4213 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4214 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4215 addReplyBulk(c,keyobj);
3305306f 4216 numkeys++;
3305306f 4217 }
ed9b544e 4218 }
4219 }
4220 dictReleaseIterator(di);
a3f9eec2 4221 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4222}
4223
4224static void dbsizeCommand(redisClient *c) {
4225 addReplySds(c,
3305306f 4226 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4227}
4228
4229static void lastsaveCommand(redisClient *c) {
4230 addReplySds(c,
c937aa89 4231 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4232}
4233
4234static void typeCommand(redisClient *c) {
3305306f 4235 robj *o;
ed9b544e 4236 char *type;
3305306f 4237
4238 o = lookupKeyRead(c->db,c->argv[1]);
4239 if (o == NULL) {
c937aa89 4240 type = "+none";
ed9b544e 4241 } else {
ed9b544e 4242 switch(o->type) {
c937aa89 4243 case REDIS_STRING: type = "+string"; break;
4244 case REDIS_LIST: type = "+list"; break;
4245 case REDIS_SET: type = "+set"; break;
412a8bce 4246 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4247 case REDIS_HASH: type = "+hash"; break;
4248 default: type = "+unknown"; break;
ed9b544e 4249 }
4250 }
4251 addReplySds(c,sdsnew(type));
4252 addReply(c,shared.crlf);
4253}
4254
4255static void saveCommand(redisClient *c) {
9d65a1bb 4256 if (server.bgsavechildpid != -1) {
05557f6d 4257 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4258 return;
4259 }
f78fd11b 4260 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4261 addReply(c,shared.ok);
4262 } else {
4263 addReply(c,shared.err);
4264 }
4265}
4266
4267static void bgsaveCommand(redisClient *c) {
9d65a1bb 4268 if (server.bgsavechildpid != -1) {
ed9b544e 4269 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4270 return;
4271 }
f78fd11b 4272 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4273 char *status = "+Background saving started\r\n";
4274 addReplySds(c,sdsnew(status));
ed9b544e 4275 } else {
4276 addReply(c,shared.err);
4277 }
4278}
4279
4280static void shutdownCommand(redisClient *c) {
4281 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
a3b21203 4282 /* Kill the saving child if there is a background saving in progress.
4283 We want to avoid race conditions, for instance our saving child may
4284 overwrite the synchronous saving did by SHUTDOWN. */
9d65a1bb 4285 if (server.bgsavechildpid != -1) {
9f3c422c 4286 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4287 kill(server.bgsavechildpid,SIGKILL);
a3b21203 4288 rdbRemoveTempFile(server.bgsavechildpid);
9f3c422c 4289 }
ac945e2d 4290 if (server.appendonly) {
4291 /* Append only file: fsync() the AOF and exit */
4292 fsync(server.appendfd);
054e426d 4293 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4294 exit(0);
ed9b544e 4295 } else {
ac945e2d 4296 /* Snapshotting. Perform a SYNC SAVE and exit */
4297 if (rdbSave(server.dbfilename) == REDIS_OK) {
4298 if (server.daemonize)
4299 unlink(server.pidfile);
4300 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4301 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
054e426d 4302 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4303 exit(0);
4304 } else {
dd88747b 4305 /* Ooops.. error saving! The best we can do is to continue
4306 * operating. Note that if there was a background saving process,
4307 * in the next cron() Redis will be notified that the background
4308 * saving aborted, handling special stuff like slaves pending for
4309 * synchronization... */
e0a62c7f 4310 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
dd88747b 4311 addReplySds(c,
4312 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
ac945e2d 4313 }
ed9b544e 4314 }
4315}
4316
4317static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4318 robj *o;
4319
4320 /* To use the same key as src and dst is probably an error */
4321 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4322 addReply(c,shared.sameobjecterr);
ed9b544e 4323 return;
4324 }
4325
dd88747b 4326 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4327 return;
dd88747b 4328
ed9b544e 4329 incrRefCount(o);
3305306f 4330 deleteIfVolatile(c->db,c->argv[2]);
4331 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4332 if (nx) {
4333 decrRefCount(o);
c937aa89 4334 addReply(c,shared.czero);
ed9b544e 4335 return;
4336 }
3305306f 4337 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4338 } else {
4339 incrRefCount(c->argv[2]);
4340 }
3305306f 4341 deleteKey(c->db,c->argv[1]);
ed9b544e 4342 server.dirty++;
c937aa89 4343 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4344}
4345
4346static void renameCommand(redisClient *c) {
4347 renameGenericCommand(c,0);
4348}
4349
4350static void renamenxCommand(redisClient *c) {
4351 renameGenericCommand(c,1);
4352}
4353
4354static void moveCommand(redisClient *c) {
3305306f 4355 robj *o;
4356 redisDb *src, *dst;
ed9b544e 4357 int srcid;
4358
4359 /* Obtain source and target DB pointers */
3305306f 4360 src = c->db;
4361 srcid = c->db->id;
ed9b544e 4362 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4363 addReply(c,shared.outofrangeerr);
ed9b544e 4364 return;
4365 }
3305306f 4366 dst = c->db;
4367 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4368
4369 /* If the user is moving using as target the same
4370 * DB as the source DB it is probably an error. */
4371 if (src == dst) {
c937aa89 4372 addReply(c,shared.sameobjecterr);
ed9b544e 4373 return;
4374 }
4375
4376 /* Check if the element exists and get a reference */
3305306f 4377 o = lookupKeyWrite(c->db,c->argv[1]);
4378 if (!o) {
c937aa89 4379 addReply(c,shared.czero);
ed9b544e 4380 return;
4381 }
4382
4383 /* Try to add the element to the target DB */
3305306f 4384 deleteIfVolatile(dst,c->argv[1]);
4385 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4386 addReply(c,shared.czero);
ed9b544e 4387 return;
4388 }
3305306f 4389 incrRefCount(c->argv[1]);
ed9b544e 4390 incrRefCount(o);
4391
4392 /* OK! key moved, free the entry in the source DB */
3305306f 4393 deleteKey(src,c->argv[1]);
ed9b544e 4394 server.dirty++;
c937aa89 4395 addReply(c,shared.cone);
ed9b544e 4396}
4397
4398/* =================================== Lists ================================ */
4399static void pushGenericCommand(redisClient *c, int where) {
4400 robj *lobj;
ed9b544e 4401 list *list;
3305306f 4402
4403 lobj = lookupKeyWrite(c->db,c->argv[1]);
4404 if (lobj == NULL) {
95242ab5 4405 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4406 addReply(c,shared.cone);
95242ab5 4407 return;
4408 }
ed9b544e 4409 lobj = createListObject();
4410 list = lobj->ptr;
4411 if (where == REDIS_HEAD) {
6b47e12e 4412 listAddNodeHead(list,c->argv[2]);
ed9b544e 4413 } else {
6b47e12e 4414 listAddNodeTail(list,c->argv[2]);
ed9b544e 4415 }
3305306f 4416 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4417 incrRefCount(c->argv[1]);
4418 incrRefCount(c->argv[2]);
4419 } else {
ed9b544e 4420 if (lobj->type != REDIS_LIST) {
4421 addReply(c,shared.wrongtypeerr);
4422 return;
4423 }
95242ab5 4424 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4425 addReply(c,shared.cone);
95242ab5 4426 return;
4427 }
ed9b544e 4428 list = lobj->ptr;
4429 if (where == REDIS_HEAD) {
6b47e12e 4430 listAddNodeHead(list,c->argv[2]);
ed9b544e 4431 } else {
6b47e12e 4432 listAddNodeTail(list,c->argv[2]);
ed9b544e 4433 }
4434 incrRefCount(c->argv[2]);
4435 }
4436 server.dirty++;
520b5a33 4437 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
ed9b544e 4438}
4439
4440static void lpushCommand(redisClient *c) {
4441 pushGenericCommand(c,REDIS_HEAD);
4442}
4443
4444static void rpushCommand(redisClient *c) {
4445 pushGenericCommand(c,REDIS_TAIL);
4446}
4447
4448static void llenCommand(redisClient *c) {
3305306f 4449 robj *o;
ed9b544e 4450 list *l;
dd88747b 4451
4452 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4453 checkType(c,o,REDIS_LIST)) return;
e0a62c7f 4454
dd88747b 4455 l = o->ptr;
4456 addReplyUlong(c,listLength(l));
ed9b544e 4457}
4458
4459static void lindexCommand(redisClient *c) {
3305306f 4460 robj *o;
ed9b544e 4461 int index = atoi(c->argv[2]->ptr);
dd88747b 4462 list *list;
4463 listNode *ln;
4464
4465 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4466 checkType(c,o,REDIS_LIST)) return;
4467 list = o->ptr;
4468
4469 ln = listIndex(list, index);
4470 if (ln == NULL) {
c937aa89 4471 addReply(c,shared.nullbulk);
ed9b544e 4472 } else {
dd88747b 4473 robj *ele = listNodeValue(ln);
4474 addReplyBulk(c,ele);
ed9b544e 4475 }
4476}
4477
4478static void lsetCommand(redisClient *c) {
3305306f 4479 robj *o;
ed9b544e 4480 int index = atoi(c->argv[2]->ptr);
dd88747b 4481 list *list;
4482 listNode *ln;
4483
4484 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4485 checkType(c,o,REDIS_LIST)) return;
4486 list = o->ptr;
4487
4488 ln = listIndex(list, index);
4489 if (ln == NULL) {
4490 addReply(c,shared.outofrangeerr);
ed9b544e 4491 } else {
dd88747b 4492 robj *ele = listNodeValue(ln);
ed9b544e 4493
dd88747b 4494 decrRefCount(ele);
4495 listNodeValue(ln) = c->argv[3];
4496 incrRefCount(c->argv[3]);
4497 addReply(c,shared.ok);
4498 server.dirty++;
ed9b544e 4499 }
4500}
4501
4502static void popGenericCommand(redisClient *c, int where) {
3305306f 4503 robj *o;
dd88747b 4504 list *list;
4505 listNode *ln;
3305306f 4506
dd88747b 4507 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4508 checkType(c,o,REDIS_LIST)) return;
4509 list = o->ptr;
ed9b544e 4510
dd88747b 4511 if (where == REDIS_HEAD)
4512 ln = listFirst(list);
4513 else
4514 ln = listLast(list);
ed9b544e 4515
dd88747b 4516 if (ln == NULL) {
4517 addReply(c,shared.nullbulk);
4518 } else {
4519 robj *ele = listNodeValue(ln);
4520 addReplyBulk(c,ele);
4521 listDelNode(list,ln);
3ea27d37 4522 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4523 server.dirty++;
ed9b544e 4524 }
4525}
4526
4527static void lpopCommand(redisClient *c) {
4528 popGenericCommand(c,REDIS_HEAD);
4529}
4530
4531static void rpopCommand(redisClient *c) {
4532 popGenericCommand(c,REDIS_TAIL);
4533}
4534
4535static void lrangeCommand(redisClient *c) {
3305306f 4536 robj *o;
ed9b544e 4537 int start = atoi(c->argv[2]->ptr);
4538 int end = atoi(c->argv[3]->ptr);
dd88747b 4539 int llen;
4540 int rangelen, j;
4541 list *list;
4542 listNode *ln;
4543 robj *ele;
4544
4e27f268 4545 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4546 || checkType(c,o,REDIS_LIST)) return;
dd88747b 4547 list = o->ptr;
4548 llen = listLength(list);
4549
4550 /* convert negative indexes */
4551 if (start < 0) start = llen+start;
4552 if (end < 0) end = llen+end;
4553 if (start < 0) start = 0;
4554 if (end < 0) end = 0;
4555
4556 /* indexes sanity checks */
4557 if (start > end || start >= llen) {
4558 /* Out of range start or start > end result in empty list */
4559 addReply(c,shared.emptymultibulk);
4560 return;
4561 }
4562 if (end >= llen) end = llen-1;
4563 rangelen = (end-start)+1;
3305306f 4564
dd88747b 4565 /* Return the result in form of a multi-bulk reply */
4566 ln = listIndex(list, start);
4567 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4568 for (j = 0; j < rangelen; j++) {
4569 ele = listNodeValue(ln);
4570 addReplyBulk(c,ele);
4571 ln = ln->next;
ed9b544e 4572 }
4573}
4574
4575static void ltrimCommand(redisClient *c) {
3305306f 4576 robj *o;
ed9b544e 4577 int start = atoi(c->argv[2]->ptr);
4578 int end = atoi(c->argv[3]->ptr);
dd88747b 4579 int llen;
4580 int j, ltrim, rtrim;
4581 list *list;
4582 listNode *ln;
4583
4584 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4585 checkType(c,o,REDIS_LIST)) return;
4586 list = o->ptr;
4587 llen = listLength(list);
4588
4589 /* convert negative indexes */
4590 if (start < 0) start = llen+start;
4591 if (end < 0) end = llen+end;
4592 if (start < 0) start = 0;
4593 if (end < 0) end = 0;
4594
4595 /* indexes sanity checks */
4596 if (start > end || start >= llen) {
4597 /* Out of range start or start > end result in empty list */
4598 ltrim = llen;
4599 rtrim = 0;
ed9b544e 4600 } else {
dd88747b 4601 if (end >= llen) end = llen-1;
4602 ltrim = start;
4603 rtrim = llen-end-1;
4604 }
ed9b544e 4605
dd88747b 4606 /* Remove list elements to perform the trim */
4607 for (j = 0; j < ltrim; j++) {
4608 ln = listFirst(list);
4609 listDelNode(list,ln);
4610 }
4611 for (j = 0; j < rtrim; j++) {
4612 ln = listLast(list);
4613 listDelNode(list,ln);
ed9b544e 4614 }
3ea27d37 4615 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4616 server.dirty++;
4617 addReply(c,shared.ok);
ed9b544e 4618}
4619
4620static void lremCommand(redisClient *c) {
3305306f 4621 robj *o;
dd88747b 4622 list *list;
4623 listNode *ln, *next;
4624 int toremove = atoi(c->argv[2]->ptr);
4625 int removed = 0;
4626 int fromtail = 0;
a4d1ba9a 4627
dd88747b 4628 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4629 checkType(c,o,REDIS_LIST)) return;
4630 list = o->ptr;
4631
4632 if (toremove < 0) {
4633 toremove = -toremove;
4634 fromtail = 1;
4635 }
4636 ln = fromtail ? list->tail : list->head;
4637 while (ln) {
4638 robj *ele = listNodeValue(ln);
4639
4640 next = fromtail ? ln->prev : ln->next;
4641 if (compareStringObjects(ele,c->argv[3]) == 0) {
4642 listDelNode(list,ln);
4643 server.dirty++;
4644 removed++;
4645 if (toremove && removed == toremove) break;
ed9b544e 4646 }
dd88747b 4647 ln = next;
ed9b544e 4648 }
3ea27d37 4649 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4650 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 4651}
4652
12f9d551 4653/* This is the semantic of this command:
0f5f7e9a 4654 * RPOPLPUSH srclist dstlist:
12f9d551 4655 * IF LLEN(srclist) > 0
4656 * element = RPOP srclist
4657 * LPUSH dstlist element
4658 * RETURN element
4659 * ELSE
4660 * RETURN nil
4661 * END
4662 * END
4663 *
4664 * The idea is to be able to get an element from a list in a reliable way
4665 * since the element is not just returned but pushed against another list
4666 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4667 */
0f5f7e9a 4668static void rpoplpushcommand(redisClient *c) {
12f9d551 4669 robj *sobj;
dd88747b 4670 list *srclist;
4671 listNode *ln;
4672
4673 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4674 checkType(c,sobj,REDIS_LIST)) return;
4675 srclist = sobj->ptr;
4676 ln = listLast(srclist);
12f9d551 4677
dd88747b 4678 if (ln == NULL) {
12f9d551 4679 addReply(c,shared.nullbulk);
4680 } else {
dd88747b 4681 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4682 robj *ele = listNodeValue(ln);
4683 list *dstlist;
e20fb74f 4684
dd88747b 4685 if (dobj && dobj->type != REDIS_LIST) {
4686 addReply(c,shared.wrongtypeerr);
4687 return;
4688 }
12f9d551 4689
dd88747b 4690 /* Add the element to the target list (unless it's directly
4691 * passed to some BLPOP-ing client */
4692 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4693 if (dobj == NULL) {
4694 /* Create the list if the key does not exist */
4695 dobj = createListObject();
4696 dictAdd(c->db->dict,c->argv[2],dobj);
4697 incrRefCount(c->argv[2]);
12f9d551 4698 }
dd88747b 4699 dstlist = dobj->ptr;
4700 listAddNodeHead(dstlist,ele);
4701 incrRefCount(ele);
12f9d551 4702 }
dd88747b 4703
4704 /* Send the element to the client as reply as well */
4705 addReplyBulk(c,ele);
4706
4707 /* Finally remove the element from the source list */
4708 listDelNode(srclist,ln);
3ea27d37 4709 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4710 server.dirty++;
12f9d551 4711 }
4712}
4713
ed9b544e 4714/* ==================================== Sets ================================ */
4715
4716static void saddCommand(redisClient *c) {
ed9b544e 4717 robj *set;
4718
3305306f 4719 set = lookupKeyWrite(c->db,c->argv[1]);
4720 if (set == NULL) {
ed9b544e 4721 set = createSetObject();
3305306f 4722 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 4723 incrRefCount(c->argv[1]);
4724 } else {
ed9b544e 4725 if (set->type != REDIS_SET) {
c937aa89 4726 addReply(c,shared.wrongtypeerr);
ed9b544e 4727 return;
4728 }
4729 }
4730 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4731 incrRefCount(c->argv[2]);
4732 server.dirty++;
c937aa89 4733 addReply(c,shared.cone);
ed9b544e 4734 } else {
c937aa89 4735 addReply(c,shared.czero);
ed9b544e 4736 }
4737}
4738
4739static void sremCommand(redisClient *c) {
3305306f 4740 robj *set;
ed9b544e 4741
dd88747b 4742 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4743 checkType(c,set,REDIS_SET)) return;
4744
4745 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4746 server.dirty++;
4747 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 4748 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4749 addReply(c,shared.cone);
ed9b544e 4750 } else {
dd88747b 4751 addReply(c,shared.czero);
ed9b544e 4752 }
4753}
4754
a4460ef4 4755static void smoveCommand(redisClient *c) {
4756 robj *srcset, *dstset;
4757
4758 srcset = lookupKeyWrite(c->db,c->argv[1]);
4759 dstset = lookupKeyWrite(c->db,c->argv[2]);
4760
4761 /* If the source key does not exist return 0, if it's of the wrong type
4762 * raise an error */
4763 if (srcset == NULL || srcset->type != REDIS_SET) {
4764 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4765 return;
4766 }
4767 /* Error if the destination key is not a set as well */
4768 if (dstset && dstset->type != REDIS_SET) {
4769 addReply(c,shared.wrongtypeerr);
4770 return;
4771 }
4772 /* Remove the element from the source set */
4773 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4774 /* Key not found in the src set! return zero */
4775 addReply(c,shared.czero);
4776 return;
4777 }
3ea27d37 4778 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4779 deleteKey(c->db,c->argv[1]);
a4460ef4 4780 server.dirty++;
4781 /* Add the element to the destination set */
4782 if (!dstset) {
4783 dstset = createSetObject();
4784 dictAdd(c->db->dict,c->argv[2],dstset);
4785 incrRefCount(c->argv[2]);
4786 }
4787 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4788 incrRefCount(c->argv[3]);
4789 addReply(c,shared.cone);
4790}
4791
ed9b544e 4792static void sismemberCommand(redisClient *c) {
3305306f 4793 robj *set;
ed9b544e 4794
dd88747b 4795 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4796 checkType(c,set,REDIS_SET)) return;
4797
4798 if (dictFind(set->ptr,c->argv[2]))
4799 addReply(c,shared.cone);
4800 else
c937aa89 4801 addReply(c,shared.czero);
ed9b544e 4802}
4803
4804static void scardCommand(redisClient *c) {
3305306f 4805 robj *o;
ed9b544e 4806 dict *s;
dd88747b 4807
4808 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4809 checkType(c,o,REDIS_SET)) return;
e0a62c7f 4810
dd88747b 4811 s = o->ptr;
4812 addReplyUlong(c,dictSize(s));
ed9b544e 4813}
4814
12fea928 4815static void spopCommand(redisClient *c) {
4816 robj *set;
4817 dictEntry *de;
4818
dd88747b 4819 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4820 checkType(c,set,REDIS_SET)) return;
4821
4822 de = dictGetRandomKey(set->ptr);
4823 if (de == NULL) {
12fea928 4824 addReply(c,shared.nullbulk);
4825 } else {
dd88747b 4826 robj *ele = dictGetEntryKey(de);
12fea928 4827
dd88747b 4828 addReplyBulk(c,ele);
4829 dictDelete(set->ptr,ele);
4830 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 4831 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4832 server.dirty++;
12fea928 4833 }
4834}
4835
2abb95a9 4836static void srandmemberCommand(redisClient *c) {
4837 robj *set;
4838 dictEntry *de;
4839
dd88747b 4840 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4841 checkType(c,set,REDIS_SET)) return;
4842
4843 de = dictGetRandomKey(set->ptr);
4844 if (de == NULL) {
2abb95a9 4845 addReply(c,shared.nullbulk);
4846 } else {
dd88747b 4847 robj *ele = dictGetEntryKey(de);
2abb95a9 4848
dd88747b 4849 addReplyBulk(c,ele);
2abb95a9 4850 }
4851}
4852
ed9b544e 4853static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4854 dict **d1 = (void*) s1, **d2 = (void*) s2;
4855
3305306f 4856 return dictSize(*d1)-dictSize(*d2);
ed9b544e 4857}
4858
682ac724 4859static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 4860 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4861 dictIterator *di;
4862 dictEntry *de;
4863 robj *lenobj = NULL, *dstset = NULL;
682ac724 4864 unsigned long j, cardinality = 0;
ed9b544e 4865
ed9b544e 4866 for (j = 0; j < setsnum; j++) {
4867 robj *setobj;
3305306f 4868
4869 setobj = dstkey ?
4870 lookupKeyWrite(c->db,setskeys[j]) :
4871 lookupKeyRead(c->db,setskeys[j]);
4872 if (!setobj) {
ed9b544e 4873 zfree(dv);
5faa6025 4874 if (dstkey) {
fdcaae84 4875 if (deleteKey(c->db,dstkey))
4876 server.dirty++;
0d36ded0 4877 addReply(c,shared.czero);
5faa6025 4878 } else {
4e27f268 4879 addReply(c,shared.emptymultibulk);
5faa6025 4880 }
ed9b544e 4881 return;
4882 }
ed9b544e 4883 if (setobj->type != REDIS_SET) {
4884 zfree(dv);
c937aa89 4885 addReply(c,shared.wrongtypeerr);
ed9b544e 4886 return;
4887 }
4888 dv[j] = setobj->ptr;
4889 }
4890 /* Sort sets from the smallest to largest, this will improve our
4891 * algorithm's performace */
4892 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
4893
4894 /* The first thing we should output is the total number of elements...
4895 * since this is a multi-bulk write, but at this stage we don't know
4896 * the intersection set size, so we use a trick, append an empty object
4897 * to the output list and save the pointer to later modify it with the
4898 * right length */
4899 if (!dstkey) {
4900 lenobj = createObject(REDIS_STRING,NULL);
4901 addReply(c,lenobj);
4902 decrRefCount(lenobj);
4903 } else {
4904 /* If we have a target key where to store the resulting set
4905 * create this key with an empty set inside */
4906 dstset = createSetObject();
ed9b544e 4907 }
4908
4909 /* Iterate all the elements of the first (smallest) set, and test
4910 * the element against all the other sets, if at least one set does
4911 * not include the element it is discarded */
4912 di = dictGetIterator(dv[0]);
ed9b544e 4913
4914 while((de = dictNext(di)) != NULL) {
4915 robj *ele;
4916
4917 for (j = 1; j < setsnum; j++)
4918 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
4919 if (j != setsnum)
4920 continue; /* at least one set does not contain the member */
4921 ele = dictGetEntryKey(de);
4922 if (!dstkey) {
dd88747b 4923 addReplyBulk(c,ele);
ed9b544e 4924 cardinality++;
4925 } else {
4926 dictAdd(dstset->ptr,ele,NULL);
4927 incrRefCount(ele);
4928 }
4929 }
4930 dictReleaseIterator(di);
4931
83cdfe18 4932 if (dstkey) {
3ea27d37 4933 /* Store the resulting set into the target, if the intersection
4934 * is not an empty set. */
83cdfe18 4935 deleteKey(c->db,dstkey);
3ea27d37 4936 if (dictSize((dict*)dstset->ptr) > 0) {
4937 dictAdd(c->db->dict,dstkey,dstset);
4938 incrRefCount(dstkey);
d36c4e97 4939 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 4940 } else {
4941 decrRefCount(dstset);
d36c4e97 4942 addReply(c,shared.czero);
3ea27d37 4943 }
40d224a9 4944 server.dirty++;
d36c4e97 4945 } else {
4946 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 4947 }
ed9b544e 4948 zfree(dv);
4949}
4950
4951static void sinterCommand(redisClient *c) {
4952 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
4953}
4954
4955static void sinterstoreCommand(redisClient *c) {
4956 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
4957}
4958
/* Operation selectors passed as the 'op' argument of
 * sunionDiffGenericCommand(). */
#define REDIS_OP_UNION 0
#define REDIS_OP_DIFF 1
#define REDIS_OP_INTER 2
f4f56e1d 4962
4963static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 4964 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4965 dictIterator *di;
4966 dictEntry *de;
f4f56e1d 4967 robj *dstset = NULL;
40d224a9 4968 int j, cardinality = 0;
4969
40d224a9 4970 for (j = 0; j < setsnum; j++) {
4971 robj *setobj;
4972
4973 setobj = dstkey ?
4974 lookupKeyWrite(c->db,setskeys[j]) :
4975 lookupKeyRead(c->db,setskeys[j]);
4976 if (!setobj) {
4977 dv[j] = NULL;
4978 continue;
4979 }
4980 if (setobj->type != REDIS_SET) {
4981 zfree(dv);
4982 addReply(c,shared.wrongtypeerr);
4983 return;
4984 }
4985 dv[j] = setobj->ptr;
4986 }
4987
4988 /* We need a temp set object to store our union. If the dstkey
4989 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
4990 * this set object will be the resulting object to set into the target key*/
4991 dstset = createSetObject();
4992
40d224a9 4993 /* Iterate all the elements of all the sets, add every element a single
4994 * time to the result set */
4995 for (j = 0; j < setsnum; j++) {
51829ed3 4996 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 4997 if (!dv[j]) continue; /* non existing keys are like empty sets */
4998
4999 di = dictGetIterator(dv[j]);
40d224a9 5000
5001 while((de = dictNext(di)) != NULL) {
5002 robj *ele;
5003
5004 /* dictAdd will not add the same element multiple times */
5005 ele = dictGetEntryKey(de);
f4f56e1d 5006 if (op == REDIS_OP_UNION || j == 0) {
5007 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5008 incrRefCount(ele);
40d224a9 5009 cardinality++;
5010 }
f4f56e1d 5011 } else if (op == REDIS_OP_DIFF) {
5012 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5013 cardinality--;
5014 }
40d224a9 5015 }
5016 }
5017 dictReleaseIterator(di);
51829ed3 5018
d36c4e97 5019 /* result set is empty? Exit asap. */
5020 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5021 }
5022
f4f56e1d 5023 /* Output the content of the resulting set, if not in STORE mode */
5024 if (!dstkey) {
5025 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5026 di = dictGetIterator(dstset->ptr);
f4f56e1d 5027 while((de = dictNext(di)) != NULL) {
5028 robj *ele;
5029
5030 ele = dictGetEntryKey(de);
dd88747b 5031 addReplyBulk(c,ele);
f4f56e1d 5032 }
5033 dictReleaseIterator(di);
d36c4e97 5034 decrRefCount(dstset);
83cdfe18
AG
5035 } else {
5036 /* If we have a target key where to store the resulting set
5037 * create this key with the result set inside */
5038 deleteKey(c->db,dstkey);
3ea27d37 5039 if (dictSize((dict*)dstset->ptr) > 0) {
5040 dictAdd(c->db->dict,dstkey,dstset);
5041 incrRefCount(dstkey);
d36c4e97 5042 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5043 } else {
5044 decrRefCount(dstset);
d36c4e97 5045 addReply(c,shared.czero);
3ea27d37 5046 }
40d224a9 5047 server.dirty++;
5048 }
5049 zfree(dv);
5050}
5051
5052static void sunionCommand(redisClient *c) {
f4f56e1d 5053 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 5054}
5055
5056static void sunionstoreCommand(redisClient *c) {
f4f56e1d 5057 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5058}
5059
5060static void sdiffCommand(redisClient *c) {
5061 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5062}
5063
5064static void sdiffstoreCommand(redisClient *c) {
5065 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 5066}
5067
6b47e12e 5068/* ==================================== ZSets =============================== */
5069
5070/* ZSETs are ordered sets using two data structures to hold the same elements
5071 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
5072 * data structure.
5073 *
5074 * The elements are added to an hash table mapping Redis objects to scores.
5075 * At the same time the elements are added to a skip list mapping scores
5076 * to Redis objects (so objects are sorted by scores in this "view"). */
5077
5078/* This skiplist implementation is almost a C translation of the original
5079 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
5080 * Alternative to Balanced Trees", modified in three ways:
5081 * a) this implementation allows for repeated values.
5082 * b) the comparison is not just by key (our 'score') but by satellite data.
5083 * c) there is a back pointer, so it's a doubly linked list with the back
5084 * pointers being only at "level 1". This allows to traverse the list
5085 * from tail to head, useful for ZREVRANGE. */
5086
5087static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5088 zskiplistNode *zn = zmalloc(sizeof(*zn));
5089
5090 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2b37892e
PN
5091 if (level > 0)
5092 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
6b47e12e 5093 zn->score = score;
5094 zn->obj = obj;
5095 return zn;
5096}
5097
5098static zskiplist *zslCreate(void) {
5099 int j;
5100 zskiplist *zsl;
e0a62c7f 5101
6b47e12e 5102 zsl = zmalloc(sizeof(*zsl));
5103 zsl->level = 1;
cc812361 5104 zsl->length = 0;
6b47e12e 5105 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5106 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5107 zsl->header->forward[j] = NULL;
94e543b5 5108
5109 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5110 if (j < ZSKIPLIST_MAXLEVEL-1)
5111 zsl->header->span[j] = 0;
69d95c3e 5112 }
e3870fab 5113 zsl->header->backward = NULL;
5114 zsl->tail = NULL;
6b47e12e 5115 return zsl;
5116}
5117
fd8ccf44 5118static void zslFreeNode(zskiplistNode *node) {
5119 decrRefCount(node->obj);
ad807e6f 5120 zfree(node->forward);
69d95c3e 5121 zfree(node->span);
fd8ccf44 5122 zfree(node);
5123}
5124
5125static void zslFree(zskiplist *zsl) {
ad807e6f 5126 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5127
ad807e6f 5128 zfree(zsl->header->forward);
69d95c3e 5129 zfree(zsl->header->span);
ad807e6f 5130 zfree(zsl->header);
fd8ccf44 5131 while(node) {
599379dd 5132 next = node->forward[0];
fd8ccf44 5133 zslFreeNode(node);
5134 node = next;
5135 }
ad807e6f 5136 zfree(zsl);
fd8ccf44 5137}
5138
6b47e12e 5139static int zslRandomLevel(void) {
5140 int level = 1;
5141 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5142 level += 1;
10c2baa5 5143 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5144}
5145
5146static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5147 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5148 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5149 int i, level;
5150
5151 x = zsl->header;
5152 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5153 /* store rank that is crossed to reach the insert position */
5154 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5155
9d60e6e4 5156 while (x->forward[i] &&
5157 (x->forward[i]->score < score ||
5158 (x->forward[i]->score == score &&
69d95c3e 5159 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5160 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5161 x = x->forward[i];
69d95c3e 5162 }
6b47e12e 5163 update[i] = x;
5164 }
6b47e12e 5165 /* we assume the key is not already inside, since we allow duplicated
5166 * scores, and the re-insertion of score and redis object should never
5167 * happpen since the caller of zslInsert() should test in the hash table
5168 * if the element is already inside or not. */
5169 level = zslRandomLevel();
5170 if (level > zsl->level) {
69d95c3e 5171 for (i = zsl->level; i < level; i++) {
2b37892e 5172 rank[i] = 0;
6b47e12e 5173 update[i] = zsl->header;
2b37892e 5174 update[i]->span[i-1] = zsl->length;
69d95c3e 5175 }
6b47e12e 5176 zsl->level = level;
5177 }
5178 x = zslCreateNode(level,score,obj);
5179 for (i = 0; i < level; i++) {
5180 x->forward[i] = update[i]->forward[i];
5181 update[i]->forward[i] = x;
69d95c3e
PN
5182
5183 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5184 if (i > 0) {
5185 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5186 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5187 }
6b47e12e 5188 }
69d95c3e
PN
5189
5190 /* increment span for untouched levels */
5191 for (i = level; i < zsl->level; i++) {
2b37892e 5192 update[i]->span[i-1]++;
69d95c3e
PN
5193 }
5194
bb975144 5195 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5196 if (x->forward[0])
5197 x->forward[0]->backward = x;
5198 else
5199 zsl->tail = x;
cc812361 5200 zsl->length++;
6b47e12e 5201}
5202
84105336
PN
5203/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5204void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5205 int i;
5206 for (i = 0; i < zsl->level; i++) {
5207 if (update[i]->forward[i] == x) {
5208 if (i > 0) {
5209 update[i]->span[i-1] += x->span[i-1] - 1;
5210 }
5211 update[i]->forward[i] = x->forward[i];
5212 } else {
5213 /* invariant: i > 0, because update[0]->forward[0]
5214 * is always equal to x */
5215 update[i]->span[i-1] -= 1;
5216 }
5217 }
5218 if (x->forward[0]) {
5219 x->forward[0]->backward = x->backward;
5220 } else {
5221 zsl->tail = x->backward;
5222 }
5223 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5224 zsl->level--;
5225 zsl->length--;
5226}
5227
50c55df5 5228/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5229static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5230 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5231 int i;
5232
5233 x = zsl->header;
5234 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5235 while (x->forward[i] &&
5236 (x->forward[i]->score < score ||
5237 (x->forward[i]->score == score &&
5238 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5239 x = x->forward[i];
5240 update[i] = x;
5241 }
5242 /* We may have multiple elements with the same score, what we need
5243 * is to find the element with both the right score and object. */
5244 x = x->forward[0];
50c55df5 5245 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
84105336 5246 zslDeleteNode(zsl, x, update);
9d60e6e4 5247 zslFreeNode(x);
9d60e6e4 5248 return 1;
5249 } else {
5250 return 0; /* not found */
e197b441 5251 }
5252 return 0; /* not found */
fd8ccf44 5253}
5254
1807985b 5255/* Delete all the elements with score between min and max from the skiplist.
5256 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5257 * Note that this function takes the reference to the hash table view of the
5258 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5259static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5260 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5261 unsigned long removed = 0;
5262 int i;
5263
5264 x = zsl->header;
5265 for (i = zsl->level-1; i >= 0; i--) {
5266 while (x->forward[i] && x->forward[i]->score < min)
5267 x = x->forward[i];
5268 update[i] = x;
5269 }
5270 /* We may have multiple elements with the same score, what we need
5271 * is to find the element with both the right score and object. */
5272 x = x->forward[0];
5273 while (x && x->score <= max) {
84105336
PN
5274 zskiplistNode *next = x->forward[0];
5275 zslDeleteNode(zsl, x, update);
1807985b 5276 dictDelete(dict,x->obj);
5277 zslFreeNode(x);
1807985b 5278 removed++;
5279 x = next;
5280 }
5281 return removed; /* not found */
5282}
1807985b 5283
9212eafd 5284/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5285 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5286static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5287 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5288 unsigned long traversed = 0, removed = 0;
5289 int i;
5290
9212eafd
PN
5291 x = zsl->header;
5292 for (i = zsl->level-1; i >= 0; i--) {
5293 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5294 traversed += i > 0 ? x->span[i-1] : 1;
5295 x = x->forward[i];
1807985b 5296 }
9212eafd
PN
5297 update[i] = x;
5298 }
5299
5300 traversed++;
5301 x = x->forward[0];
5302 while (x && traversed <= end) {
84105336
PN
5303 zskiplistNode *next = x->forward[0];
5304 zslDeleteNode(zsl, x, update);
1807985b 5305 dictDelete(dict,x->obj);
5306 zslFreeNode(x);
1807985b 5307 removed++;
9212eafd 5308 traversed++;
1807985b 5309 x = next;
5310 }
9212eafd 5311 return removed;
1807985b 5312}
5313
50c55df5 5314/* Find the first node having a score equal or greater than the specified one.
5315 * Returns NULL if there is no match. */
5316static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5317 zskiplistNode *x;
5318 int i;
5319
5320 x = zsl->header;
5321 for (i = zsl->level-1; i >= 0; i--) {
5322 while (x->forward[i] && x->forward[i]->score < score)
5323 x = x->forward[i];
5324 }
5325 /* We may have multiple elements with the same score, what we need
5326 * is to find the element with both the right score and object. */
5327 return x->forward[0];
5328}
5329
27b0ccca
PN
5330/* Find the rank for an element by both score and key.
5331 * Returns 0 when the element cannot be found, rank otherwise.
5332 * Note that the rank is 1-based due to the span of zsl->header to the
5333 * first element. */
5334static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5335 zskiplistNode *x;
5336 unsigned long rank = 0;
5337 int i;
5338
5339 x = zsl->header;
5340 for (i = zsl->level-1; i >= 0; i--) {
5341 while (x->forward[i] &&
5342 (x->forward[i]->score < score ||
5343 (x->forward[i]->score == score &&
5344 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5345 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5346 x = x->forward[i];
5347 }
5348
5349 /* x might be equal to zsl->header, so test if obj is non-NULL */
5350 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5351 return rank;
5352 }
5353 }
5354 return 0;
5355}
5356
e74825c2
PN
5357/* Finds an element by its rank. The rank argument needs to be 1-based. */
5358zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5359 zskiplistNode *x;
5360 unsigned long traversed = 0;
5361 int i;
5362
5363 x = zsl->header;
5364 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 5365 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5366 {
a50ea45c 5367 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5368 x = x->forward[i];
5369 }
e74825c2
PN
5370 if (traversed == rank) {
5371 return x;
5372 }
5373 }
5374 return NULL;
5375}
5376
fd8ccf44 5377/* The actual Z-commands implementations */
5378
7db723ad 5379/* This generic command implements both ZADD and ZINCRBY.
e2665397 5380 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5381 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5382static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5383 robj *zsetobj;
5384 zset *zs;
5385 double *score;
5386
e2665397 5387 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5388 if (zsetobj == NULL) {
5389 zsetobj = createZsetObject();
e2665397 5390 dictAdd(c->db->dict,key,zsetobj);
5391 incrRefCount(key);
fd8ccf44 5392 } else {
5393 if (zsetobj->type != REDIS_ZSET) {
5394 addReply(c,shared.wrongtypeerr);
5395 return;
5396 }
5397 }
fd8ccf44 5398 zs = zsetobj->ptr;
e2665397 5399
7db723ad 5400 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5401 * needs to handle the two different conditions. It's all about setting
5402 * '*score', that is, the new score to set, to the right value. */
5403 score = zmalloc(sizeof(double));
5404 if (doincrement) {
5405 dictEntry *de;
5406
5407 /* Read the old score. If the element was not present starts from 0 */
5408 de = dictFind(zs->dict,ele);
5409 if (de) {
5410 double *oldscore = dictGetEntryVal(de);
5411 *score = *oldscore + scoreval;
5412 } else {
5413 *score = scoreval;
5414 }
5415 } else {
5416 *score = scoreval;
5417 }
5418
5419 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5420 * to both ZADD and ZINCRBY... */
e2665397 5421 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5422 /* case 1: New element */
e2665397 5423 incrRefCount(ele); /* added to hash */
5424 zslInsert(zs->zsl,*score,ele);
5425 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5426 server.dirty++;
e2665397 5427 if (doincrement)
e2665397 5428 addReplyDouble(c,*score);
91d71bfc 5429 else
5430 addReply(c,shared.cone);
fd8ccf44 5431 } else {
5432 dictEntry *de;
5433 double *oldscore;
e0a62c7f 5434
fd8ccf44 5435 /* case 2: Score update operation */
e2665397 5436 de = dictFind(zs->dict,ele);
dfc5e96c 5437 redisAssert(de != NULL);
fd8ccf44 5438 oldscore = dictGetEntryVal(de);
5439 if (*score != *oldscore) {
5440 int deleted;
5441
e2665397 5442 /* Remove and insert the element in the skip list with new score */
5443 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5444 redisAssert(deleted != 0);
e2665397 5445 zslInsert(zs->zsl,*score,ele);
5446 incrRefCount(ele);
5447 /* Update the score in the hash table */
5448 dictReplace(zs->dict,ele,score);
fd8ccf44 5449 server.dirty++;
2161a965 5450 } else {
5451 zfree(score);
fd8ccf44 5452 }
e2665397 5453 if (doincrement)
5454 addReplyDouble(c,*score);
5455 else
5456 addReply(c,shared.czero);
fd8ccf44 5457 }
5458}
5459
e2665397 5460static void zaddCommand(redisClient *c) {
5461 double scoreval;
5462
bbe025e0
AM
5463 if (getDoubleFromObject(c, c->argv[2], &scoreval) != REDIS_OK) return;
5464
e2665397 5465 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5466}
5467
7db723ad 5468static void zincrbyCommand(redisClient *c) {
e2665397 5469 double scoreval;
5470
bbe025e0
AM
5471 if (getDoubleFromObject(c, c->argv[2], &scoreval) != REDIS_OK) return;
5472
e2665397 5473 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5474}
5475
1b7106e7 5476static void zremCommand(redisClient *c) {
5477 robj *zsetobj;
5478 zset *zs;
dd88747b 5479 dictEntry *de;
5480 double *oldscore;
5481 int deleted;
1b7106e7 5482
dd88747b 5483 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5484 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 5485
dd88747b 5486 zs = zsetobj->ptr;
5487 de = dictFind(zs->dict,c->argv[2]);
5488 if (de == NULL) {
5489 addReply(c,shared.czero);
5490 return;
1b7106e7 5491 }
dd88747b 5492 /* Delete from the skiplist */
5493 oldscore = dictGetEntryVal(de);
5494 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5495 redisAssert(deleted != 0);
5496
5497 /* Delete from the hash table */
5498 dictDelete(zs->dict,c->argv[2]);
5499 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5500 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5501 server.dirty++;
5502 addReply(c,shared.cone);
1b7106e7 5503}
5504
1807985b 5505static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
5506 double min;
5507 double max;
dd88747b 5508 long deleted;
1807985b 5509 robj *zsetobj;
5510 zset *zs;
5511
bbe025e0
AM
5512 if ((getDoubleFromObject(c, c->argv[2], &min) != REDIS_OK) ||
5513 (getDoubleFromObject(c, c->argv[3], &max) != REDIS_OK)) return;
5514
dd88747b 5515 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5516 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 5517
dd88747b 5518 zs = zsetobj->ptr;
5519 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5520 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5521 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5522 server.dirty += deleted;
5523 addReplyLong(c,deleted);
1807985b 5524}
5525
9212eafd 5526static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
5527 long start;
5528 long end;
dd88747b 5529 int llen;
5530 long deleted;
9212eafd
PN
5531 robj *zsetobj;
5532 zset *zs;
5533
bbe025e0
AM
5534 if ((getLongFromObject(c, c->argv[2], &start) != REDIS_OK) ||
5535 (getLongFromObject(c, c->argv[3], &end) != REDIS_OK)) return;
5536
dd88747b 5537 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5538 checkType(c,zsetobj,REDIS_ZSET)) return;
5539 zs = zsetobj->ptr;
5540 llen = zs->zsl->length;
9212eafd 5541
dd88747b 5542 /* convert negative indexes */
5543 if (start < 0) start = llen+start;
5544 if (end < 0) end = llen+end;
5545 if (start < 0) start = 0;
5546 if (end < 0) end = 0;
9212eafd 5547
dd88747b 5548 /* indexes sanity checks */
5549 if (start > end || start >= llen) {
5550 addReply(c,shared.czero);
5551 return;
9212eafd 5552 }
dd88747b 5553 if (end >= llen) end = llen-1;
5554
5555 /* increment start and end because zsl*Rank functions
5556 * use 1-based rank */
5557 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5558 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5559 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5560 server.dirty += deleted;
5561 addReplyLong(c, deleted);
9212eafd
PN
5562}
5563
8f92e768
PN
5564typedef struct {
5565 dict *dict;
5566 double weight;
5567} zsetopsrc;
5568
5569static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5570 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5571 unsigned long size1, size2;
5572 size1 = d1->dict ? dictSize(d1->dict) : 0;
5573 size2 = d2->dict ? dictSize(d2->dict) : 0;
5574 return size1 - size2;
5575}
5576
d2764cd6
PN
/* How the scores of an element present in several inputs are combined. */
#define REDIS_AGGR_SUM 1
#define REDIS_AGGR_MIN 2
#define REDIS_AGGR_MAX 3

/* Fold 'val' into '*target' according to 'aggregate' (one of REDIS_AGGR_*):
 * the sum, the smaller or the larger of the two values is stored back into
 * '*target'. Any other 'aggregate' value triggers an assertion. */
inline static void zunionInterAggregate(double *target, double val, int aggregate) {
    switch (aggregate) {
    case REDIS_AGGR_SUM:
        *target = *target + val;
        break;
    case REDIS_AGGR_MIN:
        if (val < *target) *target = val;
        break;
    case REDIS_AGGR_MAX:
        if (val > *target) *target = val;
        break;
    default:
        /* safety net */
        redisAssert(0 != 0);
        break;
    }
}
5593
2830ca53 5594static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
8f92e768 5595 int i, j, zsetnum;
d2764cd6 5596 int aggregate = REDIS_AGGR_SUM;
8f92e768 5597 zsetopsrc *src;
2830ca53
PN
5598 robj *dstobj;
5599 zset *dstzset;
b287c9bb
PN
5600 dictIterator *di;
5601 dictEntry *de;
5602
2830ca53
PN
5603 /* expect zsetnum input keys to be given */
5604 zsetnum = atoi(c->argv[2]->ptr);
5605 if (zsetnum < 1) {
5606 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5607 return;
b287c9bb 5608 }
2830ca53
PN
5609
5610 /* test if the expected number of keys would overflow */
5611 if (3+zsetnum > c->argc) {
b287c9bb
PN
5612 addReply(c,shared.syntaxerr);
5613 return;
5614 }
5615
2830ca53 5616 /* read keys to be used for input */
b9eed483 5617 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
2830ca53 5618 for (i = 0, j = 3; i < zsetnum; i++, j++) {
b287c9bb
PN
5619 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5620 if (!zsetobj) {
8f92e768 5621 src[i].dict = NULL;
b287c9bb
PN
5622 } else {
5623 if (zsetobj->type != REDIS_ZSET) {
8f92e768 5624 zfree(src);
b287c9bb
PN
5625 addReply(c,shared.wrongtypeerr);
5626 return;
5627 }
8f92e768 5628 src[i].dict = ((zset*)zsetobj->ptr)->dict;
b287c9bb 5629 }
2830ca53
PN
5630
5631 /* default all weights to 1 */
8f92e768 5632 src[i].weight = 1.0;
b287c9bb
PN
5633 }
5634
2830ca53
PN
5635 /* parse optional extra arguments */
5636 if (j < c->argc) {
d2764cd6 5637 int remaining = c->argc - j;
b287c9bb 5638
2830ca53 5639 while (remaining) {
d2764cd6 5640 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 5641 j++; remaining--;
2830ca53 5642 for (i = 0; i < zsetnum; i++, j++, remaining--) {
bbe025e0
AM
5643 if (getDoubleFromObject(c, c->argv[j], &src[i].weight) != REDIS_OK)
5644 return;
2830ca53 5645 }
d2764cd6
PN
5646 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5647 j++; remaining--;
5648 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5649 aggregate = REDIS_AGGR_SUM;
5650 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5651 aggregate = REDIS_AGGR_MIN;
5652 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5653 aggregate = REDIS_AGGR_MAX;
5654 } else {
5655 zfree(src);
5656 addReply(c,shared.syntaxerr);
5657 return;
5658 }
5659 j++; remaining--;
2830ca53 5660 } else {
8f92e768 5661 zfree(src);
2830ca53
PN
5662 addReply(c,shared.syntaxerr);
5663 return;
5664 }
5665 }
5666 }
b287c9bb 5667
d2764cd6
PN
5668 /* sort sets from the smallest to largest, this will improve our
5669 * algorithm's performance */
5670 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5671
2830ca53
PN
5672 dstobj = createZsetObject();
5673 dstzset = dstobj->ptr;
5674
5675 if (op == REDIS_OP_INTER) {
8f92e768
PN
5676 /* skip going over all entries if the smallest zset is NULL or empty */
5677 if (src[0].dict && dictSize(src[0].dict) > 0) {
5678 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5679 * from small to large, all src[i > 0].dict are non-empty too */
5680 di = dictGetIterator(src[0].dict);
2830ca53 5681 while((de = dictNext(di)) != NULL) {
d2764cd6
PN
5682 double *score = zmalloc(sizeof(double)), value;
5683 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5684
d2764cd6
PN
5685 for (j = 1; j < zsetnum; j++) {
5686 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5687 if (other) {
d2764cd6
PN
5688 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5689 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5690 } else {
5691 break;
5692 }
5693 }
b287c9bb 5694
2830ca53 5695 /* skip entry when not present in every source dict */
8f92e768 5696 if (j != zsetnum) {
2830ca53
PN
5697 zfree(score);
5698 } else {
5699 robj *o = dictGetEntryKey(de);
5700 dictAdd(dstzset->dict,o,score);
5701 incrRefCount(o); /* added to dictionary */
5702 zslInsert(dstzset->zsl,*score,o);
5703 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
5704 }
5705 }
2830ca53
PN
5706 dictReleaseIterator(di);
5707 }
5708 } else if (op == REDIS_OP_UNION) {
5709 for (i = 0; i < zsetnum; i++) {
8f92e768 5710 if (!src[i].dict) continue;
2830ca53 5711
8f92e768 5712 di = dictGetIterator(src[i].dict);
2830ca53
PN
5713 while((de = dictNext(di)) != NULL) {
5714 /* skip key when already processed */
5715 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5716
d2764cd6
PN
5717 double *score = zmalloc(sizeof(double)), value;
5718 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5719
d2764cd6
PN
5720 /* because the zsets are sorted by size, its only possible
5721 * for sets at larger indices to hold this entry */
5722 for (j = (i+1); j < zsetnum; j++) {
5723 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5724 if (other) {
d2764cd6
PN
5725 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5726 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5727 }
5728 }
b287c9bb 5729
2830ca53
PN
5730 robj *o = dictGetEntryKey(de);
5731 dictAdd(dstzset->dict,o,score);
5732 incrRefCount(o); /* added to dictionary */
5733 zslInsert(dstzset->zsl,*score,o);
5734 incrRefCount(o); /* added to skiplist */
5735 }
5736 dictReleaseIterator(di);
b287c9bb 5737 }
2830ca53
PN
5738 } else {
5739 /* unknown operator */
5740 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
5741 }
5742
5743 deleteKey(c->db,dstkey);
3ea27d37 5744 if (dstzset->zsl->length) {
5745 dictAdd(c->db->dict,dstkey,dstobj);
5746 incrRefCount(dstkey);
5747 addReplyLong(c, dstzset->zsl->length);
5748 server.dirty++;
5749 } else {
8bca8773 5750 decrRefCount(dstobj);
3ea27d37 5751 addReply(c, shared.czero);
5752 }
8f92e768 5753 zfree(src);
b287c9bb
PN
5754}
5755
2830ca53
PN
5756static void zunionCommand(redisClient *c) {
5757 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
5758}
5759
2830ca53
PN
5760static void zinterCommand(redisClient *c) {
5761 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
5762}
5763
e3870fab 5764static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 5765 robj *o;
bbe025e0
AM
5766 long start;
5767 long end;
752da584 5768 int withscores = 0;
dd88747b 5769 int llen;
5770 int rangelen, j;
5771 zset *zsetobj;
5772 zskiplist *zsl;
5773 zskiplistNode *ln;
5774 robj *ele;
752da584 5775
bbe025e0
AM
5776 if ((getLongFromObject(c, c->argv[2], &start) != REDIS_OK) ||
5777 (getLongFromObject(c, c->argv[3], &end) != REDIS_OK)) return;
5778
752da584 5779 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5780 withscores = 1;
5781 } else if (c->argc >= 5) {
5782 addReply(c,shared.syntaxerr);
5783 return;
5784 }
cc812361 5785
4e27f268 5786 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5787 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 5788 zsetobj = o->ptr;
5789 zsl = zsetobj->zsl;
5790 llen = zsl->length;
cc812361 5791
dd88747b 5792 /* convert negative indexes */
5793 if (start < 0) start = llen+start;
5794 if (end < 0) end = llen+end;
5795 if (start < 0) start = 0;
5796 if (end < 0) end = 0;
cc812361 5797
dd88747b 5798 /* indexes sanity checks */
5799 if (start > end || start >= llen) {
5800 /* Out of range start or start > end result in empty list */
5801 addReply(c,shared.emptymultibulk);
5802 return;
5803 }
5804 if (end >= llen) end = llen-1;
5805 rangelen = (end-start)+1;
cc812361 5806
dd88747b 5807 /* check if starting point is trivial, before searching
5808 * the element in log(N) time */
5809 if (reverse) {
5810 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5811 } else {
5812 ln = start == 0 ?
5813 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5814 }
cc812361 5815
dd88747b 5816 /* Return the result in form of a multi-bulk reply */
5817 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5818 withscores ? (rangelen*2) : rangelen));
5819 for (j = 0; j < rangelen; j++) {
5820 ele = ln->obj;
5821 addReplyBulk(c,ele);
5822 if (withscores)
5823 addReplyDouble(c,ln->score);
5824 ln = reverse ? ln->backward : ln->forward[0];
cc812361 5825 }
5826}
5827
e3870fab 5828static void zrangeCommand(redisClient *c) {
5829 zrangeGenericCommand(c,0);
5830}
5831
5832static void zrevrangeCommand(redisClient *c) {
5833 zrangeGenericCommand(c,1);
5834}
5835
f44dd428 5836/* This command implements both ZRANGEBYSCORE and ZCOUNT.
5837 * If justcount is non-zero, just the count is returned. */
5838static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 5839 robj *o;
f44dd428 5840 double min, max;
5841 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 5842 int offset = 0, limit = -1;
0500ef27
SH
5843 int withscores = 0;
5844 int badsyntax = 0;
5845
f44dd428 5846 /* Parse the min-max interval. If one of the values is prefixed
5847 * by the "(" character, it's considered "open". For instance
5848 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5849 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5850 if (((char*)c->argv[2]->ptr)[0] == '(') {
5851 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5852 minex = 1;
5853 } else {
5854 min = strtod(c->argv[2]->ptr,NULL);
5855 }
5856 if (((char*)c->argv[3]->ptr)[0] == '(') {
5857 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5858 maxex = 1;
5859 } else {
5860 max = strtod(c->argv[3]->ptr,NULL);
5861 }
5862
5863 /* Parse "WITHSCORES": note that if the command was called with
5864 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5865 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 5866 if (c->argc == 5 || c->argc == 8) {
3a3978b1 5867 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5868 withscores = 1;
5869 else
5870 badsyntax = 1;
0500ef27 5871 }
3a3978b1 5872 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 5873 badsyntax = 1;
0500ef27 5874 if (badsyntax) {
454d4e43 5875 addReplySds(c,
5876 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 5877 return;
0500ef27
SH
5878 }
5879
f44dd428 5880 /* Parse "LIMIT" */
0500ef27 5881 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 5882 addReply(c,shared.syntaxerr);
5883 return;
0500ef27 5884 } else if (c->argc == (7 + withscores)) {
80181f78 5885 offset = atoi(c->argv[5]->ptr);
5886 limit = atoi(c->argv[6]->ptr);
0b13687c 5887 if (offset < 0) offset = 0;
80181f78 5888 }
50c55df5 5889
f44dd428 5890 /* Ok, lookup the key and get the range */
50c55df5 5891 o = lookupKeyRead(c->db,c->argv[1]);
5892 if (o == NULL) {
4e27f268 5893 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 5894 } else {
5895 if (o->type != REDIS_ZSET) {
5896 addReply(c,shared.wrongtypeerr);
5897 } else {
5898 zset *zsetobj = o->ptr;
5899 zskiplist *zsl = zsetobj->zsl;
5900 zskiplistNode *ln;
f44dd428 5901 robj *ele, *lenobj = NULL;
5902 unsigned long rangelen = 0;
50c55df5 5903
f44dd428 5904 /* Get the first node with the score >= min, or with
5905 * score > min if 'minex' is true. */
50c55df5 5906 ln = zslFirstWithScore(zsl,min);
f44dd428 5907 while (minex && ln && ln->score == min) ln = ln->forward[0];
5908
50c55df5 5909 if (ln == NULL) {
5910 /* No element matching the speciifed interval */
f44dd428 5911 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 5912 return;
5913 }
5914
5915 /* We don't know in advance how many matching elements there
5916 * are in the list, so we push this object that will represent
5917 * the multi-bulk length in the output buffer, and will "fix"
5918 * it later */
f44dd428 5919 if (!justcount) {
5920 lenobj = createObject(REDIS_STRING,NULL);
5921 addReply(c,lenobj);
5922 decrRefCount(lenobj);
5923 }
50c55df5 5924
f44dd428 5925 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 5926 if (offset) {
5927 offset--;
5928 ln = ln->forward[0];
5929 continue;
5930 }
5931 if (limit == 0) break;
f44dd428 5932 if (!justcount) {
5933 ele = ln->obj;
dd88747b 5934 addReplyBulk(c,ele);
f44dd428 5935 if (withscores)
5936 addReplyDouble(c,ln->score);
5937 }
50c55df5 5938 ln = ln->forward[0];
5939 rangelen++;
80181f78 5940 if (limit > 0) limit--;
50c55df5 5941 }
f44dd428 5942 if (justcount) {
5943 addReplyLong(c,(long)rangelen);
5944 } else {
5945 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
5946 withscores ? (rangelen*2) : rangelen);
5947 }
50c55df5 5948 }
5949 }
5950}
5951
f44dd428 5952static void zrangebyscoreCommand(redisClient *c) {
5953 genericZrangebyscoreCommand(c,0);
5954}
5955
5956static void zcountCommand(redisClient *c) {
5957 genericZrangebyscoreCommand(c,1);
5958}
5959
3c41331e 5960static void zcardCommand(redisClient *c) {
e197b441 5961 robj *o;
5962 zset *zs;
dd88747b 5963
5964 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
5965 checkType(c,o,REDIS_ZSET)) return;
5966
5967 zs = o->ptr;
5968 addReplyUlong(c,zs->zsl->length);
e197b441 5969}
5970
6e333bbe 5971static void zscoreCommand(redisClient *c) {
5972 robj *o;
5973 zset *zs;
dd88747b 5974 dictEntry *de;
5975
5976 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5977 checkType(c,o,REDIS_ZSET)) return;
5978
5979 zs = o->ptr;
5980 de = dictFind(zs->dict,c->argv[2]);
5981 if (!de) {
96d8b4ee 5982 addReply(c,shared.nullbulk);
6e333bbe 5983 } else {
dd88747b 5984 double *score = dictGetEntryVal(de);
6e333bbe 5985
dd88747b 5986 addReplyDouble(c,*score);
6e333bbe 5987 }
5988}
5989
798d9e55 5990static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 5991 robj *o;
dd88747b 5992 zset *zs;
5993 zskiplist *zsl;
5994 dictEntry *de;
5995 unsigned long rank;
5996 double *score;
5997
5998 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
5999 checkType(c,o,REDIS_ZSET)) return;
6000
6001 zs = o->ptr;
6002 zsl = zs->zsl;
6003 de = dictFind(zs->dict,c->argv[2]);
6004 if (!de) {
69d95c3e
PN
6005 addReply(c,shared.nullbulk);
6006 return;
6007 }
69d95c3e 6008
dd88747b 6009 score = dictGetEntryVal(de);
6010 rank = zslGetRank(zsl, *score, c->argv[2]);
6011 if (rank) {
6012 if (reverse) {
6013 addReplyLong(c, zsl->length - rank);
27b0ccca 6014 } else {
dd88747b 6015 addReplyLong(c, rank-1);
69d95c3e 6016 }
dd88747b 6017 } else {
6018 addReply(c,shared.nullbulk);
978c2c94 6019 }
6020}
6021
798d9e55
PN
6022static void zrankCommand(redisClient *c) {
6023 zrankGenericCommand(c, 0);
6024}
6025
6026static void zrevrankCommand(redisClient *c) {
6027 zrankGenericCommand(c, 1);
6028}
6029
cbba7dd7 6030/* =================================== Hashes =============================== */
978c2c94 6031static void hsetCommand(redisClient *c) {
6032 int update = 0;
6033 robj *o = lookupKeyWrite(c->db,c->argv[1]);
6034
6035 if (o == NULL) {
6036 o = createHashObject();
6037 dictAdd(c->db->dict,c->argv[1],o);
6038 incrRefCount(c->argv[1]);
6039 } else {
6040 if (o->type != REDIS_HASH) {
6041 addReply(c,shared.wrongtypeerr);
6042 return;
6043 }
6044 }
bae2c7ec 6045 /* We want to convert the zipmap into an hash table right now if the
6046 * entry to be added is too big. Note that we check if the object
6047 * is integer encoded before to try fetching the length in the test below.
6048 * This is because integers are small, but currently stringObjectLen()
6049 * performs a slow conversion: not worth it. */
6050 if (o->encoding == REDIS_ENCODING_ZIPMAP &&
6051 ((c->argv[2]->encoding == REDIS_ENCODING_RAW &&
6052 sdslen(c->argv[2]->ptr) > server.hash_max_zipmap_value) ||
6053 (c->argv[3]->encoding == REDIS_ENCODING_RAW &&
6054 sdslen(c->argv[3]->ptr) > server.hash_max_zipmap_value)))
6055 {
6056 convertToRealHash(o);
6057 }
6058
978c2c94 6059 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6060 unsigned char *zm = o->ptr;
b1befe6a 6061 robj *valobj = getDecodedObject(c->argv[3]);
978c2c94 6062
6063 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
b1befe6a 6064 valobj->ptr,sdslen(valobj->ptr),&update);
6065 decrRefCount(valobj);
cbba7dd7 6066 o->ptr = zm;
bae2c7ec 6067
e9484a85
PN
6068 /* And here there is the second check for hash conversion. */
6069 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
bae2c7ec 6070 convertToRealHash(o);
978c2c94 6071 } else {
05df7621 6072 c->argv[2] = tryObjectEncoding(c->argv[2]);
bae2c7ec 6073 /* note that c->argv[3] is already encoded, as the latest arg
6074 * of a bulk command is always integer encoded if possible. */
2069d06a 6075 if (dictReplace(o->ptr,c->argv[2],c->argv[3])) {
978c2c94 6076 incrRefCount(c->argv[2]);
6077 } else {
6078 update = 1;
6079 }
6080 incrRefCount(c->argv[3]);
6081 }
6082 server.dirty++;
6083 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",update == 0));
6084}
6085
d33278d1
PN
6086static void hmsetCommand(redisClient *c) {
6087 int i;
6088 robj *o, *key, *val;
6089
6090 if ((c->argc % 2) == 1) {
6091 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6092 return;
6093 }
6094
6095 if ((o = lookupKeyWrite(c->db,c->argv[1])) == NULL) {
6096 o = createHashObject();
6097 dictAdd(c->db->dict,c->argv[1],o);
6098 incrRefCount(c->argv[1]);
6099 } else {
6100 if (o->type != REDIS_HASH) {
6101 addReply(c,shared.wrongtypeerr);
6102 return;
6103 }
6104 }
6105
6106 /* We want to convert the zipmap into an hash table right now if the
6107 * entry to be added is too big. */
6108 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6109 for (i = 2; i < c->argc; i+=2) {
6110 if ((c->argv[i]->encoding == REDIS_ENCODING_RAW &&
6111 sdslen(c->argv[i]->ptr) > server.hash_max_zipmap_value) ||
6112 (c->argv[i+1]->encoding == REDIS_ENCODING_RAW &&
6113 sdslen(c->argv[i+1]->ptr) > server.hash_max_zipmap_value)) {
6114 convertToRealHash(o);
6115 break;
6116 }
6117 }
6118 }
6119
6120 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6121 unsigned char *zm = o->ptr;
6122
6123 for (i = 2; i < c->argc; i+=2) {
6124 key = getDecodedObject(c->argv[i]);
6125 val = getDecodedObject(c->argv[i+1]);
6126 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
6127 val->ptr,sdslen(val->ptr),NULL);
6128 decrRefCount(key);
6129 decrRefCount(val);
6130 o->ptr = zm;
6131 }
6132
6133 /* And here there is the second check for hash conversion. */
6134 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
6135 convertToRealHash(o);
6136 } else {
6137 for (i = 2; i < c->argc; i+=2) {
6138 key = tryObjectEncoding(c->argv[i]);
6139 val = tryObjectEncoding(c->argv[i+1]);
6140 if (dictReplace(o->ptr,key,val)) {
6141 incrRefCount(key);
6142 }
6143 incrRefCount(val);
6144 }
6145 }
6146
6147 addReply(c, shared.ok);
6148}
6149
01426b05 6150static void hincrbyCommand(redisClient *c) {
01426b05
PN
6151 long long value = 0, incr = 0;
6152 robj *o = lookupKeyWrite(c->db,c->argv[1]);
6153
6154 if (o == NULL) {
6155 o = createHashObject();
6156 dictAdd(c->db->dict,c->argv[1],o);
6157 incrRefCount(c->argv[1]);
6158 } else {
6159 if (o->type != REDIS_HASH) {
6160 addReply(c,shared.wrongtypeerr);
6161 return;
6162 }
6163 }
6164
bbe025e0
AM
6165 if (getLongLongFromObject(c, c->argv[3], &incr) != REDIS_OK) return;
6166
01426b05
PN
6167 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6168 unsigned char *zm = o->ptr;
6169 unsigned char *zval;
6170 unsigned int zvlen;
6171
6172 /* Find value if already present in hash */
6173 if (zipmapGet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
6174 &zval,&zvlen)) {
6175 /* strtoll needs the char* to have a trailing \0, but
6176 * the zipmap doesn't include them. */
6177 sds szval = sdsnewlen(zval, zvlen);
6178 value = strtoll(szval,NULL,10);
6179 sdsfree(szval);
6180 }
6181
6182 value += incr;
6183 sds svalue = sdscatprintf(sdsempty(),"%lld",value);
6184 zm = zipmapSet(zm,c->argv[2]->ptr,sdslen(c->argv[2]->ptr),
e9484a85 6185 (unsigned char*)svalue,sdslen(svalue),NULL);
01426b05
PN
6186 sdsfree(svalue);
6187 o->ptr = zm;
6188
e9484a85
PN
6189 /* Check if the zipmap needs to be converted. */
6190 if (zipmapLen(zm) > server.hash_max_zipmap_entries)
01426b05
PN
6191 convertToRealHash(o);
6192 } else {
6193 robj *hval;
6194 dictEntry *de;
6195
6196 /* Find value if already present in hash */
6197 de = dictFind(o->ptr,c->argv[2]);
6198 if (de != NULL) {
6199 hval = dictGetEntryVal(de);
6200 if (hval->encoding == REDIS_ENCODING_RAW)
6201 value = strtoll(hval->ptr,NULL,10);
6202 else if (hval->encoding == REDIS_ENCODING_INT)
6203 value = (long)hval->ptr;
6204 else
6205 redisAssert(1 != 1);
6206 }
6207
6208 value += incr;
6209 hval = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
05df7621 6210 hval = tryObjectEncoding(hval);
01426b05
PN
6211 if (dictReplace(o->ptr,c->argv[2],hval)) {
6212 incrRefCount(c->argv[2]);
6213 }
6214 }
6215
6216 server.dirty++;
aa7c2934 6217 addReplyLongLong(c, value);
01426b05
PN
6218}
6219
978c2c94 6220static void hgetCommand(redisClient *c) {
dd88747b 6221 robj *o;
978c2c94 6222
dd88747b 6223 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6224 checkType(c,o,REDIS_HASH)) return;
6225
6226 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6227 unsigned char *zm = o->ptr;
6228 unsigned char *val;
6229 unsigned int vlen;
164ee595 6230 robj *field;
dd88747b 6231
164ee595 6232 field = getDecodedObject(c->argv[2]);
6233 if (zipmapGet(zm,field->ptr,sdslen(field->ptr), &val,&vlen)) {
dd88747b 6234 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6235 addReplySds(c,sdsnewlen(val,vlen));
6236 addReply(c,shared.crlf);
164ee595 6237 decrRefCount(field);
dd88747b 6238 return;
6239 } else {
6240 addReply(c,shared.nullbulk);
164ee595 6241 decrRefCount(field);
bcd11906 6242 return;
6243 }
dd88747b 6244 } else {
6245 struct dictEntry *de;
bcd11906 6246
dd88747b 6247 de = dictFind(o->ptr,c->argv[2]);
6248 if (de == NULL) {
6249 addReply(c,shared.nullbulk);
978c2c94 6250 } else {
dd88747b 6251 robj *e = dictGetEntryVal(de);
978c2c94 6252
dd88747b 6253 addReplyBulk(c,e);
978c2c94 6254 }
69d95c3e 6255 }
69d95c3e
PN
6256}
6257
09aeb579
PN
6258static void hmgetCommand(redisClient *c) {
6259 int i;
6260
6261 robj *o = lookupKeyRead(c->db, c->argv[1]);
6262 if (o == NULL) {
6263 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6264 for (i = 2; i < c->argc; i++) {
6265 addReply(c,shared.nullbulk);
6266 }
6267 return;
6268 } else {
6269 if (o->type != REDIS_HASH) {
6270 addReply(c,shared.wrongtypeerr);
6271 return;
6272 }
6273 }
6274
6275 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
6276 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6277 unsigned char *zm = o->ptr;
6278 unsigned char *v;
6279 unsigned int vlen;
6280 robj *field;
6281
6282 for (i = 2; i < c->argc; i++) {
6283 field = getDecodedObject(c->argv[i]);
6284 if (zipmapGet(zm,field->ptr,sdslen(field->ptr),&v,&vlen)) {
6285 addReplySds(c,sdscatprintf(sdsempty(),"$%u\r\n", vlen));
6286 addReplySds(c,sdsnewlen(v,vlen));
6287 addReply(c,shared.crlf);
6288 } else {
6289 addReply(c,shared.nullbulk);
6290 }
6291 decrRefCount(field);
6292 }
6293 } else {
6294 dictEntry *de;
6295
6296 for (i = 2; i < c->argc; i++) {
6297 de = dictFind(o->ptr,c->argv[i]);
6298 if (de != NULL) {
6299 addReplyBulk(c,(robj*)dictGetEntryVal(de));
6300 } else {
6301 addReply(c,shared.nullbulk);
6302 }
6303 }
6304 }
6305}
6306
07efaf74 6307static void hdelCommand(redisClient *c) {
dd88747b 6308 robj *o;
6309 int deleted = 0;
07efaf74 6310
dd88747b 6311 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6312 checkType(c,o,REDIS_HASH)) return;
07efaf74 6313
dd88747b 6314 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
2a1198b4 6315 robj *field = getDecodedObject(c->argv[2]);
6316
dd88747b 6317 o->ptr = zipmapDel((unsigned char*) o->ptr,
2a1198b4 6318 (unsigned char*) field->ptr,
6319 sdslen(field->ptr), &deleted);
6320 decrRefCount(field);
3ea27d37 6321 if (zipmapLen((unsigned char*) o->ptr) == 0)
6322 deleteKey(c->db,c->argv[1]);
dd88747b 6323 } else {
6324 deleted = dictDelete((dict*)o->ptr,c->argv[2]) == DICT_OK;
3ea27d37 6325 if (htNeedsResize(o->ptr)) dictResize(o->ptr);
6326 if (dictSize((dict*)o->ptr) == 0) deleteKey(c->db,c->argv[1]);
07efaf74 6327 }
c77169b7 6328 if (deleted) server.dirty++;
dd88747b 6329 addReply(c,deleted ? shared.cone : shared.czero);
07efaf74 6330}
6331
92b27fe9 6332static void hlenCommand(redisClient *c) {
6333 robj *o;
6334 unsigned long len;
6335
dd88747b 6336 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 6337 checkType(c,o,REDIS_HASH)) return;
6338
6339 len = (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6340 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6341 addReplyUlong(c,len);
6342}
6343
78409a0f 6344#define REDIS_GETALL_KEYS 1
6345#define REDIS_GETALL_VALS 2
6346static void genericHgetallCommand(redisClient *c, int flags) {
6347 robj *o, *lenobj;
6348 unsigned long count = 0;
6349
4e27f268 6350 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 6351 || checkType(c,o,REDIS_HASH)) return;
6352
6353 lenobj = createObject(REDIS_STRING,NULL);
6354 addReply(c,lenobj);
6355 decrRefCount(lenobj);
6356
6357 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6358 unsigned char *p = zipmapRewind(o->ptr);
6359 unsigned char *field, *val;
6360 unsigned int flen, vlen;
6361
6362 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
6363 robj *aux;
6364
6365 if (flags & REDIS_GETALL_KEYS) {
6366 aux = createStringObject((char*)field,flen);
6367 addReplyBulk(c,aux);
6368 decrRefCount(aux);
6369 count++;
6370 }
6371 if (flags & REDIS_GETALL_VALS) {
6372 aux = createStringObject((char*)val,vlen);
6373 addReplyBulk(c,aux);
6374 decrRefCount(aux);
6375 count++;
6376 }
6377 }
6378 } else {
6379 dictIterator *di = dictGetIterator(o->ptr);
6380 dictEntry *de;
6381
6382 while((de = dictNext(di)) != NULL) {
6383 robj *fieldobj = dictGetEntryKey(de);
6384 robj *valobj = dictGetEntryVal(de);
6385
6386 if (flags & REDIS_GETALL_KEYS) {
6387 addReplyBulk(c,fieldobj);
6388 count++;
6389 }
6390 if (flags & REDIS_GETALL_VALS) {
6391 addReplyBulk(c,valobj);
6392 count++;
6393 }
6394 }
6395 dictReleaseIterator(di);
6396 }
6397 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6398}
6399
6400static void hkeysCommand(redisClient *c) {
6401 genericHgetallCommand(c,REDIS_GETALL_KEYS);
6402}
6403
6404static void hvalsCommand(redisClient *c) {
6405 genericHgetallCommand(c,REDIS_GETALL_VALS);
6406}
6407
6408static void hgetallCommand(redisClient *c) {
6409 genericHgetallCommand(c,REDIS_GETALL_KEYS|REDIS_GETALL_VALS);
6410}
6411
a86f14b1 6412static void hexistsCommand(redisClient *c) {
6413 robj *o;
6414 int exists = 0;
6415
6416 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6417 checkType(c,o,REDIS_HASH)) return;
6418
6419 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6420 robj *field;
6421 unsigned char *zm = o->ptr;
6422
6423 field = getDecodedObject(c->argv[2]);
6424 exists = zipmapExists(zm,field->ptr,sdslen(field->ptr));
6425 decrRefCount(field);
6426 } else {
6427 exists = dictFind(o->ptr,c->argv[2]) != NULL;
6428 }
6429 addReply(c,exists ? shared.cone : shared.czero);
6430}
6431
ada386b2 6432static void convertToRealHash(robj *o) {
6433 unsigned char *key, *val, *p, *zm = o->ptr;
6434 unsigned int klen, vlen;
6435 dict *dict = dictCreate(&hashDictType,NULL);
6436
6437 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6438 p = zipmapRewind(zm);
6439 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6440 robj *keyobj, *valobj;
6441
6442 keyobj = createStringObject((char*)key,klen);
6443 valobj = createStringObject((char*)val,vlen);
05df7621 6444 keyobj = tryObjectEncoding(keyobj);
6445 valobj = tryObjectEncoding(valobj);
ada386b2 6446 dictAdd(dict,keyobj,valobj);
6447 }
6448 o->encoding = REDIS_ENCODING_HT;
6449 o->ptr = dict;
6450 zfree(zm);
6451}
6452
6b47e12e 6453/* ========================= Non type-specific commands ==================== */
6454
ed9b544e 6455static void flushdbCommand(redisClient *c) {
ca37e9cd 6456 server.dirty += dictSize(c->db->dict);
3305306f 6457 dictEmpty(c->db->dict);
6458 dictEmpty(c->db->expires);
ed9b544e 6459 addReply(c,shared.ok);
ed9b544e 6460}
6461
6462static void flushallCommand(redisClient *c) {
ca37e9cd 6463 server.dirty += emptyDb();
ed9b544e 6464 addReply(c,shared.ok);
500ece7c 6465 if (server.bgsavechildpid != -1) {
6466 kill(server.bgsavechildpid,SIGKILL);
6467 rdbRemoveTempFile(server.bgsavechildpid);
6468 }
f78fd11b 6469 rdbSave(server.dbfilename);
ca37e9cd 6470 server.dirty++;
ed9b544e 6471}
6472
56906eef 6473static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 6474 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 6475 so->type = type;
6476 so->pattern = pattern;
6477 return so;
6478}
6479
6480/* Return the value associated to the key with a name obtained
6481 * substituting the first occurence of '*' in 'pattern' with 'subst' */
56906eef 6482static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
ed9b544e 6483 char *p;
6484 sds spat, ssub;
6485 robj keyobj;
6486 int prefixlen, sublen, postfixlen;
ed9b544e 6487 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6488 struct {
f1017b3f 6489 long len;
6490 long free;
ed9b544e 6491 char buf[REDIS_SORTKEY_MAX+1];
6492 } keyname;
6493
28173a49 6494 /* If the pattern is "#" return the substitution object itself in order
6495 * to implement the "SORT ... GET #" feature. */
6496 spat = pattern->ptr;
6497 if (spat[0] == '#' && spat[1] == '\0') {
6498 return subst;
6499 }
6500
6501 /* The substitution object may be specially encoded. If so we create
9d65a1bb 6502 * a decoded object on the fly. Otherwise getDecodedObject will just
6503 * increment the ref count, that we'll decrement later. */
6504 subst = getDecodedObject(subst);
942a3961 6505
ed9b544e 6506 ssub = subst->ptr;
6507 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6508 p = strchr(spat,'*');
ed5a857a 6509 if (!p) {
6510 decrRefCount(subst);
6511 return NULL;
6512 }
ed9b544e 6513
6514 prefixlen = p-spat;
6515 sublen = sdslen(ssub);
6516 postfixlen = sdslen(spat)-(prefixlen+1);
6517 memcpy(keyname.buf,spat,prefixlen);
6518 memcpy(keyname.buf+prefixlen,ssub,sublen);
6519 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6520 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6521 keyname.len = prefixlen+sublen+postfixlen;
6522
dfc5e96c 6523 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2))
942a3961 6524 decrRefCount(subst);
6525
a4d1ba9a 6526 /* printf("lookup '%s' => %p\n", keyname.buf,de); */
3305306f 6527 return lookupKeyRead(db,&keyobj);
ed9b544e 6528}
6529
6530/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6531 * the additional parameter is not standard but a BSD-specific we have to
6532 * pass sorting parameters via the global 'server' structure */
6533static int sortCompare(const void *s1, const void *s2) {
6534 const redisSortObject *so1 = s1, *so2 = s2;
6535 int cmp;
6536
6537 if (!server.sort_alpha) {
6538 /* Numeric sorting. Here it's trivial as we precomputed scores */
6539 if (so1->u.score > so2->u.score) {
6540 cmp = 1;
6541 } else if (so1->u.score < so2->u.score) {
6542 cmp = -1;
6543 } else {
6544 cmp = 0;
6545 }
6546 } else {
6547 /* Alphanumeric sorting */
6548 if (server.sort_bypattern) {
6549 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6550 /* At least one compare object is NULL */
6551 if (so1->u.cmpobj == so2->u.cmpobj)
6552 cmp = 0;
6553 else if (so1->u.cmpobj == NULL)
6554 cmp = -1;
6555 else
6556 cmp = 1;
6557 } else {
6558 /* We have both the objects, use strcoll */
6559 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6560 }
6561 } else {
6562 /* Compare elements directly */
9d65a1bb 6563 robj *dec1, *dec2;
6564
6565 dec1 = getDecodedObject(so1->obj);
6566 dec2 = getDecodedObject(so2->obj);
6567 cmp = strcoll(dec1->ptr,dec2->ptr);
6568 decrRefCount(dec1);
6569 decrRefCount(dec2);
ed9b544e 6570 }
6571 }
6572 return server.sort_desc ? -cmp : cmp;
6573}
6574
6575/* The SORT command is the most complex command in Redis. Warning: this code
6576 * is optimized for speed and a bit less for readability */
6577static void sortCommand(redisClient *c) {
ed9b544e 6578 list *operations;
6579 int outputlen = 0;
6580 int desc = 0, alpha = 0;
6581 int limit_start = 0, limit_count = -1, start, end;
6582 int j, dontsort = 0, vectorlen;
6583 int getop = 0; /* GET operation counter */
443c6409 6584 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 6585 redisSortObject *vector; /* Resulting vector to sort */
6586
6587 /* Lookup the key to sort. It must be of the right types */
3305306f 6588 sortval = lookupKeyRead(c->db,c->argv[1]);
6589 if (sortval == NULL) {
4e27f268 6590 addReply(c,shared.emptymultibulk);
ed9b544e 6591 return;
6592 }
a5eb649b 6593 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6594 sortval->type != REDIS_ZSET)
6595 {
c937aa89 6596 addReply(c,shared.wrongtypeerr);
ed9b544e 6597 return;
6598 }
6599
6600 /* Create a list of operations to perform for every sorted element.
6601 * Operations can be GET/DEL/INCR/DECR */
6602 operations = listCreate();
092dac2a 6603 listSetFreeMethod(operations,zfree);
ed9b544e 6604 j = 2;
6605
6606 /* Now we need to protect sortval incrementing its count, in the future
6607 * SORT may have options able to overwrite/delete keys during the sorting
6608 * and the sorted key itself may get destroied */
6609 incrRefCount(sortval);
6610
6611 /* The SORT command has an SQL-alike syntax, parse it */
6612 while(j < c->argc) {
6613 int leftargs = c->argc-j-1;
6614 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6615 desc = 0;
6616 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6617 desc = 1;
6618 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6619 alpha = 1;
6620 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6621 limit_start = atoi(c->argv[j+1]->ptr);
6622 limit_count = atoi(c->argv[j+2]->ptr);
6623 j+=2;
443c6409 6624 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6625 storekey = c->argv[j+1];
6626 j++;
ed9b544e 6627 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6628 sortby = c->argv[j+1];
6629 /* If the BY pattern does not contain '*', i.e. it is constant,
6630 * we don't need to sort nor to lookup the weight keys. */
6631 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6632 j++;
6633 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6634 listAddNodeTail(operations,createSortOperation(
6635 REDIS_SORT_GET,c->argv[j+1]));
6636 getop++;
6637 j++;
ed9b544e 6638 } else {
6639 decrRefCount(sortval);
6640 listRelease(operations);
c937aa89 6641 addReply(c,shared.syntaxerr);
ed9b544e 6642 return;
6643 }
6644 j++;
6645 }
6646
6647 /* Load the sorting vector with all the objects to sort */
a5eb649b 6648 switch(sortval->type) {
6649 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6650 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6651 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
dfc5e96c 6652 default: vectorlen = 0; redisAssert(0); /* Avoid GCC warning */
a5eb649b 6653 }
ed9b544e 6654 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 6655 j = 0;
a5eb649b 6656
ed9b544e 6657 if (sortval->type == REDIS_LIST) {
6658 list *list = sortval->ptr;
6208b3a7 6659 listNode *ln;
c7df85a4 6660 listIter li;
6208b3a7 6661
c7df85a4 6662 listRewind(list,&li);
6663 while((ln = listNext(&li))) {
ed9b544e 6664 robj *ele = ln->value;
6665 vector[j].obj = ele;
6666 vector[j].u.score = 0;
6667 vector[j].u.cmpobj = NULL;
ed9b544e 6668 j++;
6669 }
6670 } else {
a5eb649b 6671 dict *set;
ed9b544e 6672 dictIterator *di;
6673 dictEntry *setele;
6674
a5eb649b 6675 if (sortval->type == REDIS_SET) {
6676 set = sortval->ptr;
6677 } else {
6678 zset *zs = sortval->ptr;
6679 set = zs->dict;
6680 }
6681
ed9b544e 6682 di = dictGetIterator(set);
ed9b544e 6683 while((setele = dictNext(di)) != NULL) {
6684 vector[j].obj = dictGetEntryKey(setele);
6685 vector[j].u.score = 0;
6686 vector[j].u.cmpobj = NULL;
6687 j++;
6688 }
6689 dictReleaseIterator(di);
6690 }
dfc5e96c 6691 redisAssert(j == vectorlen);
ed9b544e 6692
6693 /* Now it's time to load the right scores in the sorting vector */
6694 if (dontsort == 0) {
6695 for (j = 0; j < vectorlen; j++) {
6696 if (sortby) {
6697 robj *byval;
6698
3305306f 6699 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
ed9b544e 6700 if (!byval || byval->type != REDIS_STRING) continue;
6701 if (alpha) {
9d65a1bb 6702 vector[j].u.cmpobj = getDecodedObject(byval);
ed9b544e 6703 } else {
942a3961 6704 if (byval->encoding == REDIS_ENCODING_RAW) {
6705 vector[j].u.score = strtod(byval->ptr,NULL);
6706 } else {
9d65a1bb 6707 /* Don't need to decode the object if it's
6708 * integer-encoded (the only encoding supported) so
6709 * far. We can just cast it */
f1017b3f 6710 if (byval->encoding == REDIS_ENCODING_INT) {
942a3961 6711 vector[j].u.score = (long)byval->ptr;
f1017b3f 6712 } else
dfc5e96c 6713 redisAssert(1 != 1);
942a3961 6714 }
ed9b544e 6715 }
6716 } else {
942a3961 6717 if (!alpha) {
6718 if (vector[j].obj->encoding == REDIS_ENCODING_RAW)
6719 vector[j].u.score = strtod(vector[j].obj->ptr,NULL);
6720 else {
6721 if (vector[j].obj->encoding == REDIS_ENCODING_INT)
6722 vector[j].u.score = (long) vector[j].obj->ptr;
6723 else
dfc5e96c 6724 redisAssert(1 != 1);
942a3961 6725 }
6726 }
ed9b544e 6727 }
6728 }
6729 }
6730
6731 /* We are ready to sort the vector... perform a bit of sanity check
6732 * on the LIMIT option too. We'll use a partial version of quicksort. */
6733 start = (limit_start < 0) ? 0 : limit_start;
6734 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6735 if (start >= vectorlen) {
6736 start = vectorlen-1;
6737 end = vectorlen-2;
6738 }
6739 if (end >= vectorlen) end = vectorlen-1;
6740
6741 if (dontsort == 0) {
6742 server.sort_desc = desc;
6743 server.sort_alpha = alpha;
6744 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 6745 if (sortby && (start != 0 || end != vectorlen-1))
6746 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6747 else
6748 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 6749 }
6750
6751 /* Send command output to the output buffer, performing the specified
6752 * GET/DEL/INCR/DECR operations if any. */
6753 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 6754 if (storekey == NULL) {
6755 /* STORE option not specified, sent the sorting result to client */
6756 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6757 for (j = start; j <= end; j++) {
6758 listNode *ln;
c7df85a4 6759 listIter li;
6760
dd88747b 6761 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 6762 listRewind(operations,&li);
6763 while((ln = listNext(&li))) {
443c6409 6764 redisSortOperation *sop = ln->value;
6765 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6766 vector[j].obj);
6767
6768 if (sop->type == REDIS_SORT_GET) {
6769 if (!val || val->type != REDIS_STRING) {
6770 addReply(c,shared.nullbulk);
6771 } else {
dd88747b 6772 addReplyBulk(c,val);
443c6409 6773 }
6774 } else {
dfc5e96c 6775 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 6776 }
6777 }
ed9b544e 6778 }
443c6409 6779 } else {
6780 robj *listObject = createListObject();
6781 list *listPtr = (list*) listObject->ptr;
6782
6783 /* STORE option specified, set the sorting result as a List object */
6784 for (j = start; j <= end; j++) {
6785 listNode *ln;
c7df85a4 6786 listIter li;
6787
443c6409 6788 if (!getop) {
6789 listAddNodeTail(listPtr,vector[j].obj);
6790 incrRefCount(vector[j].obj);
6791 }
c7df85a4 6792 listRewind(operations,&li);
6793 while((ln = listNext(&li))) {
443c6409 6794 redisSortOperation *sop = ln->value;
6795 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6796 vector[j].obj);
6797
6798 if (sop->type == REDIS_SORT_GET) {
6799 if (!val || val->type != REDIS_STRING) {
6800 listAddNodeTail(listPtr,createStringObject("",0));
6801 } else {
6802 listAddNodeTail(listPtr,val);
6803 incrRefCount(val);
6804 }
ed9b544e 6805 } else {
dfc5e96c 6806 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 6807 }
ed9b544e 6808 }
ed9b544e 6809 }
121796f7 6810 if (dictReplace(c->db->dict,storekey,listObject)) {
6811 incrRefCount(storekey);
6812 }
443c6409 6813 /* Note: we add 1 because the DB is dirty anyway since even if the
6814 * SORT result is empty a new key is set and maybe the old content
6815 * replaced. */
6816 server.dirty += 1+outputlen;
6817 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 6818 }
6819
6820 /* Cleanup */
6821 decrRefCount(sortval);
6822 listRelease(operations);
6823 for (j = 0; j < vectorlen; j++) {
6824 if (sortby && alpha && vector[j].u.cmpobj)
6825 decrRefCount(vector[j].u.cmpobj);
6826 }
6827 zfree(vector);
6828}
6829
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 4.00K, 100.00M, 2.00G, 1.50T and so forth.
 *
 * 's' must point to a buffer large enough for the result (the INFO code in
 * this file passes a 64 byte buffer). Units are powers of 1024; everything
 * from 1 TiB upwards is expressed in T, so the buffer is always written.
 * Previously amounts of 1 TiB or more left 's' untouched. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
6850
1c85b79f 6851/* Create the string returned by the INFO command. This is decoupled
6852 * by the INFO command itself as we need to report the same information
6853 * on memory corruption problems. */
6854static sds genRedisInfoString(void) {
ed9b544e 6855 sds info;
6856 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 6857 int j;
ec6c7a1d 6858 char hmem[64];
55a8298f 6859
b72f6a4b 6860 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 6861 info = sdscatprintf(sdsempty(),
6862 "redis_version:%s\r\n"
f1017b3f 6863 "arch_bits:%s\r\n"
7a932b74 6864 "multiplexing_api:%s\r\n"
0d7170a4 6865 "process_id:%ld\r\n"
682ac724 6866 "uptime_in_seconds:%ld\r\n"
6867 "uptime_in_days:%ld\r\n"
ed9b544e 6868 "connected_clients:%d\r\n"
6869 "connected_slaves:%d\r\n"
f86a74e9 6870 "blocked_clients:%d\r\n"
5fba9f71 6871 "used_memory:%zu\r\n"
ec6c7a1d 6872 "used_memory_human:%s\r\n"
ed9b544e 6873 "changes_since_last_save:%lld\r\n"
be2bb6b0 6874 "bgsave_in_progress:%d\r\n"
682ac724 6875 "last_save_time:%ld\r\n"
b3fad521 6876 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 6877 "total_connections_received:%lld\r\n"
6878 "total_commands_processed:%lld\r\n"
2a6a2ed1 6879 "expired_keys:%lld\r\n"
55a8298f 6880 "hash_max_zipmap_entries:%ld\r\n"
6881 "hash_max_zipmap_value:%ld\r\n"
ffc6b7f8 6882 "pubsub_channels:%ld\r\n"
6883 "pubsub_patterns:%u\r\n"
7d98e08c 6884 "vm_enabled:%d\r\n"
a0f643ea 6885 "role:%s\r\n"
ed9b544e 6886 ,REDIS_VERSION,
f1017b3f 6887 (sizeof(long) == 8) ? "64" : "32",
7a932b74 6888 aeGetApiName(),
0d7170a4 6889 (long) getpid(),
a0f643ea 6890 uptime,
6891 uptime/(3600*24),
ed9b544e 6892 listLength(server.clients)-listLength(server.slaves),
6893 listLength(server.slaves),
d5d55fc3 6894 server.blpop_blocked_clients,
b72f6a4b 6895 zmalloc_used_memory(),
ec6c7a1d 6896 hmem,
ed9b544e 6897 server.dirty,
9d65a1bb 6898 server.bgsavechildpid != -1,
ed9b544e 6899 server.lastsave,
b3fad521 6900 server.bgrewritechildpid != -1,
ed9b544e 6901 server.stat_numconnections,
6902 server.stat_numcommands,
2a6a2ed1 6903 server.stat_expiredkeys,
55a8298f 6904 server.hash_max_zipmap_entries,
6905 server.hash_max_zipmap_value,
ffc6b7f8 6906 dictSize(server.pubsub_channels),
6907 listLength(server.pubsub_patterns),
7d98e08c 6908 server.vm_enabled != 0,
a0f643ea 6909 server.masterhost == NULL ? "master" : "slave"
ed9b544e 6910 );
a0f643ea 6911 if (server.masterhost) {
6912 info = sdscatprintf(info,
6913 "master_host:%s\r\n"
6914 "master_port:%d\r\n"
6915 "master_link_status:%s\r\n"
6916 "master_last_io_seconds_ago:%d\r\n"
6917 ,server.masterhost,
6918 server.masterport,
6919 (server.replstate == REDIS_REPL_CONNECTED) ?
6920 "up" : "down",
f72b934d 6921 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 6922 );
6923 }
7d98e08c 6924 if (server.vm_enabled) {
1064ef87 6925 lockThreadedIO();
7d98e08c 6926 info = sdscatprintf(info,
6927 "vm_conf_max_memory:%llu\r\n"
6928 "vm_conf_page_size:%llu\r\n"
6929 "vm_conf_pages:%llu\r\n"
6930 "vm_stats_used_pages:%llu\r\n"
6931 "vm_stats_swapped_objects:%llu\r\n"
6932 "vm_stats_swappin_count:%llu\r\n"
6933 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 6934 "vm_stats_io_newjobs_len:%lu\r\n"
6935 "vm_stats_io_processing_len:%lu\r\n"
6936 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 6937 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 6938 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 6939 ,(unsigned long long) server.vm_max_memory,
6940 (unsigned long long) server.vm_page_size,
6941 (unsigned long long) server.vm_pages,
6942 (unsigned long long) server.vm_stats_used_pages,
6943 (unsigned long long) server.vm_stats_swapped_objects,
6944 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 6945 (unsigned long long) server.vm_stats_swapouts,
6946 (unsigned long) listLength(server.io_newjobs),
6947 (unsigned long) listLength(server.io_processing),
6948 (unsigned long) listLength(server.io_processed),
d5d55fc3 6949 (unsigned long) server.io_active_threads,
6950 (unsigned long) server.vm_blocked_clients
7d98e08c 6951 );
1064ef87 6952 unlockThreadedIO();
7d98e08c 6953 }
c3cb078d 6954 for (j = 0; j < server.dbnum; j++) {
6955 long long keys, vkeys;
6956
6957 keys = dictSize(server.db[j].dict);
6958 vkeys = dictSize(server.db[j].expires);
6959 if (keys || vkeys) {
9d65a1bb 6960 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 6961 j, keys, vkeys);
6962 }
6963 }
1c85b79f 6964 return info;
6965}
6966
6967static void infoCommand(redisClient *c) {
6968 sds info = genRedisInfoString();
83c6a618 6969 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
6970 (unsigned long)sdslen(info)));
ed9b544e 6971 addReplySds(c,info);
70003d28 6972 addReply(c,shared.crlf);
ed9b544e 6973}
6974
3305306f 6975static void monitorCommand(redisClient *c) {
6976 /* ignore MONITOR if aleady slave or in monitor mode */
6977 if (c->flags & REDIS_SLAVE) return;
6978
6979 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
6980 c->slaveseldb = 0;
6b47e12e 6981 listAddNodeTail(server.monitors,c);
3305306f 6982 addReply(c,shared.ok);
6983}
6984
6985/* ================================= Expire ================================= */
6986static int removeExpire(redisDb *db, robj *key) {
6987 if (dictDelete(db->expires,key) == DICT_OK) {
6988 return 1;
6989 } else {
6990 return 0;
6991 }
6992}
6993
6994static int setExpire(redisDb *db, robj *key, time_t when) {
6995 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
6996 return 0;
6997 } else {
6998 incrRefCount(key);
6999 return 1;
7000 }
7001}
7002
bb32ede5 7003/* Return the expire time of the specified key, or -1 if no expire
7004 * is associated with this key (i.e. the key is non volatile) */
7005static time_t getExpire(redisDb *db, robj *key) {
7006 dictEntry *de;
7007
7008 /* No expire? return ASAP */
7009 if (dictSize(db->expires) == 0 ||
7010 (de = dictFind(db->expires,key)) == NULL) return -1;
7011
7012 return (time_t) dictGetEntryVal(de);
7013}
7014
3305306f 7015static int expireIfNeeded(redisDb *db, robj *key) {
7016 time_t when;
7017 dictEntry *de;
7018
7019 /* No expire? return ASAP */
7020 if (dictSize(db->expires) == 0 ||
7021 (de = dictFind(db->expires,key)) == NULL) return 0;
7022
7023 /* Lookup the expire */
7024 when = (time_t) dictGetEntryVal(de);
7025 if (time(NULL) <= when) return 0;
7026
7027 /* Delete the key */
7028 dictDelete(db->expires,key);
2a6a2ed1 7029 server.stat_expiredkeys++;
3305306f 7030 return dictDelete(db->dict,key) == DICT_OK;
7031}
7032
7033static int deleteIfVolatile(redisDb *db, robj *key) {
7034 dictEntry *de;
7035
7036 /* No expire? return ASAP */
7037 if (dictSize(db->expires) == 0 ||
7038 (de = dictFind(db->expires,key)) == NULL) return 0;
7039
7040 /* Delete the key */
0c66a471 7041 server.dirty++;
2a6a2ed1 7042 server.stat_expiredkeys++;
3305306f 7043 dictDelete(db->expires,key);
7044 return dictDelete(db->dict,key) == DICT_OK;
7045}
7046
bbe025e0 7047static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 7048 dictEntry *de;
bbe025e0
AM
7049 time_t seconds;
7050
7051 if (getLongFromObject(c, param, &seconds) != REDIS_OK) return;
7052
7053 seconds -= offset;
3305306f 7054
802e8373 7055 de = dictFind(c->db->dict,key);
3305306f 7056 if (de == NULL) {
7057 addReply(c,shared.czero);
7058 return;
7059 }
43e5ccdf 7060 if (seconds < 0) {
7061 if (deleteKey(c->db,key)) server.dirty++;
7062 addReply(c, shared.cone);
3305306f 7063 return;
7064 } else {
7065 time_t when = time(NULL)+seconds;
802e8373 7066 if (setExpire(c->db,key,when)) {
3305306f 7067 addReply(c,shared.cone);
77423026 7068 server.dirty++;
7069 } else {
3305306f 7070 addReply(c,shared.czero);
77423026 7071 }
3305306f 7072 return;
7073 }
7074}
7075
802e8373 7076static void expireCommand(redisClient *c) {
bbe025e0 7077 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 7078}
7079
7080static void expireatCommand(redisClient *c) {
bbe025e0 7081 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 7082}
7083
fd88489a 7084static void ttlCommand(redisClient *c) {
7085 time_t expire;
7086 int ttl = -1;
7087
7088 expire = getExpire(c->db,c->argv[1]);
7089 if (expire != -1) {
7090 ttl = (int) (expire-time(NULL));
7091 if (ttl < 0) ttl = -1;
7092 }
7093 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7094}
7095
6e469882 7096/* ================================ MULTI/EXEC ============================== */
7097
7098/* Client state initialization for MULTI/EXEC */
7099static void initClientMultiState(redisClient *c) {
7100 c->mstate.commands = NULL;
7101 c->mstate.count = 0;
7102}
7103
7104/* Release all the resources associated with MULTI/EXEC state */
7105static void freeClientMultiState(redisClient *c) {
7106 int j;
7107
7108 for (j = 0; j < c->mstate.count; j++) {
7109 int i;
7110 multiCmd *mc = c->mstate.commands+j;
7111
7112 for (i = 0; i < mc->argc; i++)
7113 decrRefCount(mc->argv[i]);
7114 zfree(mc->argv);
7115 }
7116 zfree(c->mstate.commands);
7117}
7118
7119/* Add a new command into the MULTI commands queue */
7120static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7121 multiCmd *mc;
7122 int j;
7123
7124 c->mstate.commands = zrealloc(c->mstate.commands,
7125 sizeof(multiCmd)*(c->mstate.count+1));
7126 mc = c->mstate.commands+c->mstate.count;
7127 mc->cmd = cmd;
7128 mc->argc = c->argc;
7129 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7130 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7131 for (j = 0; j < c->argc; j++)
7132 incrRefCount(mc->argv[j]);
7133 c->mstate.count++;
7134}
7135
7136static void multiCommand(redisClient *c) {
7137 c->flags |= REDIS_MULTI;
36c548f0 7138 addReply(c,shared.ok);
6e469882 7139}
7140
18b6cb76
DJ
7141static void discardCommand(redisClient *c) {
7142 if (!(c->flags & REDIS_MULTI)) {
7143 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7144 return;
7145 }
7146
7147 freeClientMultiState(c);
7148 initClientMultiState(c);
7149 c->flags &= (~REDIS_MULTI);
7150 addReply(c,shared.ok);
7151}
7152
6e469882 7153static void execCommand(redisClient *c) {
7154 int j;
7155 robj **orig_argv;
7156 int orig_argc;
7157
7158 if (!(c->flags & REDIS_MULTI)) {
7159 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7160 return;
7161 }
7162
7163 orig_argv = c->argv;
7164 orig_argc = c->argc;
7165 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7166 for (j = 0; j < c->mstate.count; j++) {
7167 c->argc = c->mstate.commands[j].argc;
7168 c->argv = c->mstate.commands[j].argv;
7169 call(c,c->mstate.commands[j].cmd);
7170 }
7171 c->argv = orig_argv;
7172 c->argc = orig_argc;
7173 freeClientMultiState(c);
7174 initClientMultiState(c);
7175 c->flags &= (~REDIS_MULTI);
7176}
7177
4409877e 7178/* =========================== Blocking Operations ========================= */
7179
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
7208
7209/* Set a client in blocking mode for the specified key, with the specified
7210 * timeout */
b177fd30 7211static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 7212 dictEntry *de;
7213 list *l;
b177fd30 7214 int j;
4409877e 7215
b177fd30 7216 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7217 c->blockingkeysnum = numkeys;
4409877e 7218 c->blockingto = timeout;
b177fd30 7219 for (j = 0; j < numkeys; j++) {
7220 /* Add the key in the client structure, to map clients -> keys */
7221 c->blockingkeys[j] = keys[j];
7222 incrRefCount(keys[j]);
4409877e 7223
b177fd30 7224 /* And in the other "side", to map keys -> clients */
7225 de = dictFind(c->db->blockingkeys,keys[j]);
7226 if (de == NULL) {
7227 int retval;
7228
7229 /* For every key we take a list of clients blocked for it */
7230 l = listCreate();
7231 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7232 incrRefCount(keys[j]);
7233 assert(retval == DICT_OK);
7234 } else {
7235 l = dictGetEntryVal(de);
7236 }
7237 listAddNodeTail(l,c);
4409877e 7238 }
b177fd30 7239 /* Mark the client as a blocked client */
4409877e 7240 c->flags |= REDIS_BLOCKED;
d5d55fc3 7241 server.blpop_blocked_clients++;
4409877e 7242}
7243
7244/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 7245static void unblockClientWaitingData(redisClient *c) {
4409877e 7246 dictEntry *de;
7247 list *l;
b177fd30 7248 int j;
4409877e 7249
b177fd30 7250 assert(c->blockingkeys != NULL);
7251 /* The client may wait for multiple keys, so unblock it for every key. */
7252 for (j = 0; j < c->blockingkeysnum; j++) {
7253 /* Remove this client from the list of clients waiting for this key. */
7254 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7255 assert(de != NULL);
7256 l = dictGetEntryVal(de);
7257 listDelNode(l,listSearchKey(l,c));
7258 /* If the list is empty we need to remove it to avoid wasting memory */
7259 if (listLength(l) == 0)
7260 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7261 decrRefCount(c->blockingkeys[j]);
7262 }
7263 /* Cleanup the client structure */
7264 zfree(c->blockingkeys);
7265 c->blockingkeys = NULL;
4409877e 7266 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 7267 server.blpop_blocked_clients--;
5921aa36 7268 /* We want to process data if there is some command waiting
b0d8747d 7269 * in the input buffer. Note that this is safe even if
7270 * unblockClientWaitingData() gets called from freeClient() because
7271 * freeClient() will be smart enough to call this function
7272 * *after* c->querybuf was set to NULL. */
4409877e 7273 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7274}
7275
7276/* This should be called from any function PUSHing into lists.
7277 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7278 * 'ele' is the element pushed.
7279 *
7280 * If the function returns 0 there was no client waiting for a list push
7281 * against this key.
7282 *
7283 * If the function returns 1 there was a client waiting for a list push
7284 * against this key, the element was passed to this client thus it's not
7285 * needed to actually add it to the list and the caller should return asap. */
7286static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7287 struct dictEntry *de;
7288 redisClient *receiver;
7289 list *l;
7290 listNode *ln;
7291
7292 de = dictFind(c->db->blockingkeys,key);
7293 if (de == NULL) return 0;
7294 l = dictGetEntryVal(de);
7295 ln = listFirst(l);
7296 assert(ln != NULL);
7297 receiver = ln->value;
4409877e 7298
b177fd30 7299 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 7300 addReplyBulk(receiver,key);
7301 addReplyBulk(receiver,ele);
b0d8747d 7302 unblockClientWaitingData(receiver);
4409877e 7303 return 1;
7304}
7305
7306/* Blocking RPOP/LPOP */
7307static void blockingPopGenericCommand(redisClient *c, int where) {
7308 robj *o;
7309 time_t timeout;
b177fd30 7310 int j;
4409877e 7311
b177fd30 7312 for (j = 1; j < c->argc-1; j++) {
7313 o = lookupKeyWrite(c->db,c->argv[j]);
7314 if (o != NULL) {
7315 if (o->type != REDIS_LIST) {
7316 addReply(c,shared.wrongtypeerr);
4409877e 7317 return;
b177fd30 7318 } else {
7319 list *list = o->ptr;
7320 if (listLength(list) != 0) {
7321 /* If the list contains elements fall back to the usual
7322 * non-blocking POP operation */
7323 robj *argv[2], **orig_argv;
7324 int orig_argc;
e0a62c7f 7325
b177fd30 7326 /* We need to alter the command arguments before to call
7327 * popGenericCommand() as the command takes a single key. */
7328 orig_argv = c->argv;
7329 orig_argc = c->argc;
7330 argv[1] = c->argv[j];
7331 c->argv = argv;
7332 c->argc = 2;
7333
7334 /* Also the return value is different, we need to output
7335 * the multi bulk reply header and the key name. The
7336 * "real" command will add the last element (the value)
7337 * for us. If this souds like an hack to you it's just
7338 * because it is... */
7339 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 7340 addReplyBulk(c,argv[1]);
b177fd30 7341 popGenericCommand(c,where);
7342
7343 /* Fix the client structure with the original stuff */
7344 c->argv = orig_argv;
7345 c->argc = orig_argc;
7346 return;
7347 }
4409877e 7348 }
7349 }
7350 }
7351 /* If the list is empty or the key does not exists we must block */
b177fd30 7352 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 7353 if (timeout > 0) timeout += time(NULL);
b177fd30 7354 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 7355}
7356
7357static void blpopCommand(redisClient *c) {
7358 blockingPopGenericCommand(c,REDIS_HEAD);
7359}
7360
7361static void brpopCommand(redisClient *c) {
7362 blockingPopGenericCommand(c,REDIS_TAIL);
7363}
7364
ed9b544e 7365/* =============================== Replication ============================= */
7366
a4d1ba9a 7367static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7368 ssize_t nwritten, ret = size;
7369 time_t start = time(NULL);
7370
7371 timeout++;
7372 while(size) {
7373 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7374 nwritten = write(fd,ptr,size);
7375 if (nwritten == -1) return -1;
7376 ptr += nwritten;
7377 size -= nwritten;
7378 }
7379 if ((time(NULL)-start) > timeout) {
7380 errno = ETIMEDOUT;
7381 return -1;
7382 }
7383 }
7384 return ret;
7385}
7386
a4d1ba9a 7387static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7388 ssize_t nread, totread = 0;
7389 time_t start = time(NULL);
7390
7391 timeout++;
7392 while(size) {
7393 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7394 nread = read(fd,ptr,size);
7395 if (nread == -1) return -1;
7396 ptr += nread;
7397 size -= nread;
7398 totread += nread;
7399 }
7400 if ((time(NULL)-start) > timeout) {
7401 errno = ETIMEDOUT;
7402 return -1;
7403 }
7404 }
7405 return totread;
7406}
7407
/* Read a line from 'fd' into the buffer 'ptr' of 'size' bytes, one byte at
 * a time through syncRead() (so 'timeout' applies to every single byte).
 *
 * The line terminator ("\n", optionally preceded by "\r") is not stored and
 * the buffer is always nul terminated when size > 0. At most size-1 bytes
 * are stored: a longer line is truncated and the function returns once the
 * buffer is full, leaving the rest of the line unread. The remaining space
 * is tracked on every stored byte, otherwise an over-long line from the
 * peer would be written past the end of the buffer.
 *
 * Returns the number of bytes stored before the terminator was seen (a
 * stripped "\r" is still counted), or -1 if syncRead() fails. With
 * size <= 0 nothing is read and 0 is returned. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    if (size <= 0) return 0;
    *ptr = '\0';
    size--; /* Reserve room for the nul terminator. */
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        }
        *ptr++ = c;
        *ptr = '\0';
        nread++;
        size--;
    }
    return nread;
}
7428
7429static void syncCommand(redisClient *c) {
40d224a9 7430 /* ignore SYNC if aleady slave or in monitor mode */
7431 if (c->flags & REDIS_SLAVE) return;
7432
7433 /* SYNC can't be issued when the server has pending data to send to
7434 * the client about already issued commands. We need a fresh reply
7435 * buffer registering the differences between the BGSAVE and the current
7436 * dataset, so that we can copy to other slaves if needed. */
7437 if (listLength(c->reply) != 0) {
7438 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7439 return;
7440 }
7441
7442 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7443 /* Here we need to check if there is a background saving operation
7444 * in progress, or if it is required to start one */
9d65a1bb 7445 if (server.bgsavechildpid != -1) {
40d224a9 7446 /* Ok a background save is in progress. Let's check if it is a good
7447 * one for replication, i.e. if there is another slave that is
7448 * registering differences since the server forked to save */
7449 redisClient *slave;
7450 listNode *ln;
c7df85a4 7451 listIter li;
40d224a9 7452
c7df85a4 7453 listRewind(server.slaves,&li);
7454 while((ln = listNext(&li))) {
40d224a9 7455 slave = ln->value;
7456 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 7457 }
7458 if (ln) {
7459 /* Perfect, the server is already registering differences for
7460 * another slave. Set the right state, and copy the buffer. */
7461 listRelease(c->reply);
7462 c->reply = listDup(slave->reply);
40d224a9 7463 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7464 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7465 } else {
7466 /* No way, we need to wait for the next BGSAVE in order to
7467 * register differences */
7468 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7469 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7470 }
7471 } else {
7472 /* Ok we don't have a BGSAVE in progress, let's start one */
7473 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7474 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7475 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7476 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7477 return;
7478 }
7479 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7480 }
6208b3a7 7481 c->repldbfd = -1;
40d224a9 7482 c->flags |= REDIS_SLAVE;
7483 c->slaveseldb = 0;
6b47e12e 7484 listAddNodeTail(server.slaves,c);
40d224a9 7485 return;
7486}
7487
6208b3a7 7488static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7489 redisClient *slave = privdata;
7490 REDIS_NOTUSED(el);
7491 REDIS_NOTUSED(mask);
7492 char buf[REDIS_IOBUF_LEN];
7493 ssize_t nwritten, buflen;
7494
7495 if (slave->repldboff == 0) {
7496 /* Write the bulk write count before to transfer the DB. In theory here
7497 * we don't know how much room there is in the output buffer of the
7498 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7499 * operations) will never be smaller than the few bytes we need. */
7500 sds bulkcount;
7501
7502 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7503 slave->repldbsize);
7504 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7505 {
7506 sdsfree(bulkcount);
7507 freeClient(slave);
7508 return;
7509 }
7510 sdsfree(bulkcount);
7511 }
7512 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7513 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7514 if (buflen <= 0) {
7515 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7516 (buflen == 0) ? "premature EOF" : strerror(errno));
7517 freeClient(slave);
7518 return;
7519 }
7520 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 7521 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 7522 strerror(errno));
7523 freeClient(slave);
7524 return;
7525 }
7526 slave->repldboff += nwritten;
7527 if (slave->repldboff == slave->repldbsize) {
7528 close(slave->repldbfd);
7529 slave->repldbfd = -1;
7530 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7531 slave->replstate = REDIS_REPL_ONLINE;
7532 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 7533 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 7534 freeClient(slave);
7535 return;
7536 }
7537 addReplySds(slave,sdsempty());
7538 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7539 }
7540}
ed9b544e 7541
a3b21203 7542/* This function is called at the end of every backgrond saving.
7543 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7544 * otherwise REDIS_ERR is passed to the function.
7545 *
7546 * The goal of this function is to handle slaves waiting for a successful
7547 * background saving in order to perform non-blocking synchronization. */
7548static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 7549 listNode *ln;
7550 int startbgsave = 0;
c7df85a4 7551 listIter li;
ed9b544e 7552
c7df85a4 7553 listRewind(server.slaves,&li);
7554 while((ln = listNext(&li))) {
6208b3a7 7555 redisClient *slave = ln->value;
ed9b544e 7556
6208b3a7 7557 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7558 startbgsave = 1;
7559 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7560 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 7561 struct redis_stat buf;
e0a62c7f 7562
6208b3a7 7563 if (bgsaveerr != REDIS_OK) {
7564 freeClient(slave);
7565 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7566 continue;
7567 }
7568 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 7569 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 7570 freeClient(slave);
7571 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7572 continue;
7573 }
7574 slave->repldboff = 0;
7575 slave->repldbsize = buf.st_size;
7576 slave->replstate = REDIS_REPL_SEND_BULK;
7577 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 7578 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 7579 freeClient(slave);
7580 continue;
7581 }
7582 }
ed9b544e 7583 }
6208b3a7 7584 if (startbgsave) {
7585 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 7586 listIter li;
7587
7588 listRewind(server.slaves,&li);
6208b3a7 7589 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 7590 while((ln = listNext(&li))) {
6208b3a7 7591 redisClient *slave = ln->value;
ed9b544e 7592
6208b3a7 7593 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7594 freeClient(slave);
7595 }
7596 }
7597 }
ed9b544e 7598}
7599
7600static int syncWithMaster(void) {
d0ccebcf 7601 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 7602 long dumpsize;
ed9b544e 7603 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 7604 int dfd, maxtries = 5;
ed9b544e 7605
7606 if (fd == -1) {
7607 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7608 strerror(errno));
7609 return REDIS_ERR;
7610 }
d0ccebcf 7611
7612 /* AUTH with the master if required. */
7613 if(server.masterauth) {
7614 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7615 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7616 close(fd);
7617 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7618 strerror(errno));
7619 return REDIS_ERR;
7620 }
7621 /* Read the AUTH result. */
7622 if (syncReadLine(fd,buf,1024,3600) == -1) {
7623 close(fd);
7624 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7625 strerror(errno));
7626 return REDIS_ERR;
7627 }
7628 if (buf[0] != '+') {
7629 close(fd);
7630 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7631 return REDIS_ERR;
7632 }
7633 }
7634
ed9b544e 7635 /* Issue the SYNC command */
7636 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7637 close(fd);
7638 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7639 strerror(errno));
7640 return REDIS_ERR;
7641 }
7642 /* Read the bulk write count */
8c4d91fc 7643 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 7644 close(fd);
7645 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7646 strerror(errno));
7647 return REDIS_ERR;
7648 }
4aa701c1 7649 if (buf[0] != '$') {
7650 close(fd);
7651 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7652 return REDIS_ERR;
7653 }
18e61fa2 7654 dumpsize = strtol(buf+1,NULL,10);
7655 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 7656 /* Read the bulk write data on a temp file */
8c5abee8 7657 while(maxtries--) {
7658 snprintf(tmpfile,256,
7659 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7660 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7661 if (dfd != -1) break;
5de9ad7c 7662 sleep(1);
8c5abee8 7663 }
ed9b544e 7664 if (dfd == -1) {
7665 close(fd);
7666 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7667 return REDIS_ERR;
7668 }
7669 while(dumpsize) {
7670 int nread, nwritten;
7671
7672 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7673 if (nread == -1) {
7674 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7675 strerror(errno));
7676 close(fd);
7677 close(dfd);
7678 return REDIS_ERR;
7679 }
7680 nwritten = write(dfd,buf,nread);
7681 if (nwritten == -1) {
7682 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7683 close(fd);
7684 close(dfd);
7685 return REDIS_ERR;
7686 }
7687 dumpsize -= nread;
7688 }
7689 close(dfd);
7690 if (rename(tmpfile,server.dbfilename) == -1) {
7691 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7692 unlink(tmpfile);
7693 close(fd);
7694 return REDIS_ERR;
7695 }
7696 emptyDb();
f78fd11b 7697 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 7698 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7699 close(fd);
7700 return REDIS_ERR;
7701 }
7702 server.master = createClient(fd);
7703 server.master->flags |= REDIS_MASTER;
179b3952 7704 server.master->authenticated = 1;
ed9b544e 7705 server.replstate = REDIS_REPL_CONNECTED;
7706 return REDIS_OK;
7707}
7708
321b0e13 7709static void slaveofCommand(redisClient *c) {
7710 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7711 !strcasecmp(c->argv[2]->ptr,"one")) {
7712 if (server.masterhost) {
7713 sdsfree(server.masterhost);
7714 server.masterhost = NULL;
7715 if (server.master) freeClient(server.master);
7716 server.replstate = REDIS_REPL_NONE;
7717 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7718 }
7719 } else {
7720 sdsfree(server.masterhost);
7721 server.masterhost = sdsdup(c->argv[1]->ptr);
7722 server.masterport = atoi(c->argv[2]->ptr);
7723 if (server.master) freeClient(server.master);
7724 server.replstate = REDIS_REPL_CONNECT;
7725 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7726 server.masterhost, server.masterport);
7727 }
7728 addReply(c,shared.ok);
7729}
7730
3fd78bcd 7731/* ============================ Maxmemory directive ======================== */
7732
a5819310 7733/* Try to free one object form the pre-allocated objects free list.
7734 * This is useful under low mem conditions as by default we take 1 million
7735 * free objects allocated. On success REDIS_OK is returned, otherwise
7736 * REDIS_ERR. */
7737static int tryFreeOneObjectFromFreelist(void) {
f870935d 7738 robj *o;
7739
a5819310 7740 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7741 if (listLength(server.objfreelist)) {
7742 listNode *head = listFirst(server.objfreelist);
7743 o = listNodeValue(head);
7744 listDelNode(server.objfreelist,head);
7745 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7746 zfree(o);
7747 return REDIS_OK;
7748 } else {
7749 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7750 return REDIS_ERR;
7751 }
f870935d 7752}
7753
3fd78bcd 7754/* This function gets called when 'maxmemory' is set on the config file to limit
7755 * the max memory used by the server, and we are out of memory.
7756 * This function will try to, in order:
7757 *
7758 * - Free objects from the free list
7759 * - Try to remove keys with an EXPIRE set
7760 *
7761 * It is not possible to free enough memory to reach used-memory < maxmemory
7762 * the server will start refusing commands that will enlarge even more the
7763 * memory usage.
7764 */
7765static void freeMemoryIfNeeded(void) {
7766 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 7767 int j, k, freed = 0;
7768
7769 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7770 for (j = 0; j < server.dbnum; j++) {
7771 int minttl = -1;
7772 robj *minkey = NULL;
7773 struct dictEntry *de;
7774
7775 if (dictSize(server.db[j].expires)) {
7776 freed = 1;
7777 /* From a sample of three keys drop the one nearest to
7778 * the natural expire */
7779 for (k = 0; k < 3; k++) {
7780 time_t t;
7781
7782 de = dictGetRandomKey(server.db[j].expires);
7783 t = (time_t) dictGetEntryVal(de);
7784 if (minttl == -1 || t < minttl) {
7785 minkey = dictGetEntryKey(de);
7786 minttl = t;
3fd78bcd 7787 }
3fd78bcd 7788 }
a5819310 7789 deleteKey(server.db+j,minkey);
3fd78bcd 7790 }
3fd78bcd 7791 }
a5819310 7792 if (!freed) return; /* nothing to free... */
3fd78bcd 7793 }
7794}
7795
f80dff62 7796/* ============================== Append Only file ========================== */
7797
7798static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7799 sds buf = sdsempty();
7800 int j;
7801 ssize_t nwritten;
7802 time_t now;
7803 robj *tmpargv[3];
7804
7805 /* The DB this command was targetting is not the same as the last command
7806 * we appendend. To issue a SELECT command is needed. */
7807 if (dictid != server.appendseldb) {
7808 char seldb[64];
7809
7810 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 7811 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 7812 (unsigned long)strlen(seldb),seldb);
f80dff62 7813 server.appendseldb = dictid;
7814 }
7815
7816 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7817 * EXPIREs into EXPIREATs calls */
7818 if (cmd->proc == expireCommand) {
7819 long when;
7820
7821 tmpargv[0] = createStringObject("EXPIREAT",8);
7822 tmpargv[1] = argv[1];
7823 incrRefCount(argv[1]);
7824 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7825 tmpargv[2] = createObject(REDIS_STRING,
7826 sdscatprintf(sdsempty(),"%ld",when));
7827 argv = tmpargv;
7828 }
7829
7830 /* Append the actual command */
7831 buf = sdscatprintf(buf,"*%d\r\n",argc);
7832 for (j = 0; j < argc; j++) {
7833 robj *o = argv[j];
7834
9d65a1bb 7835 o = getDecodedObject(o);
83c6a618 7836 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
f80dff62 7837 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7838 buf = sdscatlen(buf,"\r\n",2);
9d65a1bb 7839 decrRefCount(o);
f80dff62 7840 }
7841
7842 /* Free the objects from the modified argv for EXPIREAT */
7843 if (cmd->proc == expireCommand) {
7844 for (j = 0; j < 3; j++)
7845 decrRefCount(argv[j]);
7846 }
7847
7848 /* We want to perform a single write. This should be guaranteed atomic
7849 * at least if the filesystem we are writing is a real physical one.
7850 * While this will save us against the server being killed I don't think
7851 * there is much to do about the whole server stopping for power problems
7852 * or alike */
7853 nwritten = write(server.appendfd,buf,sdslen(buf));
7854 if (nwritten != (signed)sdslen(buf)) {
7855 /* Ooops, we are in troubles. The best thing to do for now is
7856 * to simply exit instead to give the illusion that everything is
7857 * working as expected. */
7858 if (nwritten == -1) {
7859 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
7860 } else {
7861 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
7862 }
7863 exit(1);
7864 }
85a83172 7865 /* If a background append only file rewriting is in progress we want to
7866 * accumulate the differences between the child DB and the current one
7867 * in a buffer, so that when the child process will do its work we
7868 * can append the differences to the new append only file. */
7869 if (server.bgrewritechildpid != -1)
7870 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
7871
7872 sdsfree(buf);
f80dff62 7873 now = time(NULL);
7874 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
7875 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
7876 now-server.lastfsync > 1))
7877 {
7878 fsync(server.appendfd); /* Let's try to get this data on the disk */
7879 server.lastfsync = now;
7880 }
7881}
7882
7883/* In Redis commands are always executed in the context of a client, so in
7884 * order to load the append only file we need to create a fake client. */
7885static struct redisClient *createFakeClient(void) {
7886 struct redisClient *c = zmalloc(sizeof(*c));
7887
7888 selectDb(c,0);
7889 c->fd = -1;
7890 c->querybuf = sdsempty();
7891 c->argc = 0;
7892 c->argv = NULL;
7893 c->flags = 0;
9387d17d 7894 /* We set the fake client as a slave waiting for the synchronization
7895 * so that Redis will not try to send replies to this client. */
7896 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 7897 c->reply = listCreate();
7898 listSetFreeMethod(c->reply,decrRefCount);
7899 listSetDupMethod(c->reply,dupClientReplyValue);
7900 return c;
7901}
7902
7903static void freeFakeClient(struct redisClient *c) {
7904 sdsfree(c->querybuf);
7905 listRelease(c->reply);
7906 zfree(c);
7907}
7908
7909/* Replay the append log file. On error REDIS_OK is returned. On non fatal
7910 * error (the append only file is zero-length) REDIS_ERR is returned. On
7911 * fatal error an error message is logged and the program exists. */
7912int loadAppendOnlyFile(char *filename) {
7913 struct redisClient *fakeClient;
7914 FILE *fp = fopen(filename,"r");
7915 struct redis_stat sb;
b492cf00 7916 unsigned long long loadedkeys = 0;
f80dff62 7917
7918 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
7919 return REDIS_ERR;
7920
7921 if (fp == NULL) {
7922 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
7923 exit(1);
7924 }
7925
7926 fakeClient = createFakeClient();
7927 while(1) {
7928 int argc, j;
7929 unsigned long len;
7930 robj **argv;
7931 char buf[128];
7932 sds argsds;
7933 struct redisCommand *cmd;
7934
7935 if (fgets(buf,sizeof(buf),fp) == NULL) {
7936 if (feof(fp))
7937 break;
7938 else
7939 goto readerr;
7940 }
7941 if (buf[0] != '*') goto fmterr;
7942 argc = atoi(buf+1);
7943 argv = zmalloc(sizeof(robj*)*argc);
7944 for (j = 0; j < argc; j++) {
7945 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
7946 if (buf[0] != '$') goto fmterr;
7947 len = strtol(buf+1,NULL,10);
7948 argsds = sdsnewlen(NULL,len);
0f151ef1 7949 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 7950 argv[j] = createObject(REDIS_STRING,argsds);
7951 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
7952 }
7953
7954 /* Command lookup */
7955 cmd = lookupCommand(argv[0]->ptr);
7956 if (!cmd) {
7957 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
7958 exit(1);
7959 }
bdcb92f2 7960 /* Try object encoding */
f80dff62 7961 if (cmd->flags & REDIS_CMD_BULK)
05df7621 7962 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 7963 /* Run the command in the context of a fake client */
7964 fakeClient->argc = argc;
7965 fakeClient->argv = argv;
7966 cmd->proc(fakeClient);
7967 /* Discard the reply objects list from the fake client */
7968 while(listLength(fakeClient->reply))
7969 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
7970 /* Clean up, ready for the next command */
7971 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
7972 zfree(argv);
b492cf00 7973 /* Handle swapping while loading big datasets when VM is on */
7974 loadedkeys++;
7975 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
7976 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 7977 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 7978 }
7979 }
f80dff62 7980 }
7981 fclose(fp);
7982 freeFakeClient(fakeClient);
7983 return REDIS_OK;
7984
7985readerr:
7986 if (feof(fp)) {
7987 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
7988 } else {
7989 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
7990 }
7991 exit(1);
7992fmterr:
7993 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
7994 exit(1);
7995}
7996
9d65a1bb 7997/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
9c8e3cee 7998static int fwriteBulkObject(FILE *fp, robj *obj) {
9d65a1bb 7999 char buf[128];
b9bc0eef 8000 int decrrc = 0;
8001
f2d9f50f 8002 /* Avoid the incr/decr ref count business if possible to help
8003 * copy-on-write (we are often in a child process when this function
8004 * is called).
8005 * Also makes sure that key objects don't get incrRefCount-ed when VM
8006 * is enabled */
8007 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 8008 obj = getDecodedObject(obj);
8009 decrrc = 1;
8010 }
9d65a1bb 8011 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8012 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 8013 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8014 goto err;
9d65a1bb 8015 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 8016 if (decrrc) decrRefCount(obj);
9d65a1bb 8017 return 1;
8018err:
b9bc0eef 8019 if (decrrc) decrRefCount(obj);
9d65a1bb 8020 return 0;
8021}
8022
/* Write a binary-safe string into a file in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * 's' points to the payload and 'len' is its length in bytes (the payload
 * is not required to be null terminated and may contain any byte).
 * Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned, so it must be formatted with %lu. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* With a zero length payload there is nothing to write, and fwrite()
     * would return 0 anyway, so skip it instead of reporting an error. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8034
/* Write a double into a file in the bulk format $<count>\r\n<payload>\r\n.
 * The number is formatted with "%.17g".
 *
 * Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload buffer already includes the trailing CRLF, that is not
     * part of the advertised count: this is why we subtract two. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8045
/* Write a long into a file in the bulk format $<count>\r\n<payload>\r\n.
 *
 * Returns 1 on success, 0 if any of the writes failed. */
static int fwriteBulkLong(FILE *fp, long l) {
    char payload[128], header[128];
    size_t paylen;

    /* The payload buffer already includes the trailing CRLF, that is not
     * part of the advertised count: this is why we subtract two. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    paylen = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)paylen-2);
    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,paylen,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8056
8057/* Write a sequence of commands able to fully rebuild the dataset into
8058 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8059static int rewriteAppendOnlyFile(char *filename) {
8060 dictIterator *di = NULL;
8061 dictEntry *de;
8062 FILE *fp;
8063 char tmpfile[256];
8064 int j;
8065 time_t now = time(NULL);
8066
8067 /* Note that we have to use a different temp name here compared to the
8068 * one used by rewriteAppendOnlyFileBackground() function. */
8069 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8070 fp = fopen(tmpfile,"w");
8071 if (!fp) {
8072 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8073 return REDIS_ERR;
8074 }
8075 for (j = 0; j < server.dbnum; j++) {
8076 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8077 redisDb *db = server.db+j;
8078 dict *d = db->dict;
8079 if (dictSize(d) == 0) continue;
8080 di = dictGetIterator(d);
8081 if (!di) {
8082 fclose(fp);
8083 return REDIS_ERR;
8084 }
8085
8086 /* SELECT the new DB */
8087 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 8088 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 8089
8090 /* Iterate this DB writing every entry */
8091 while((de = dictNext(di)) != NULL) {
e7546c63 8092 robj *key, *o;
8093 time_t expiretime;
8094 int swapped;
8095
8096 key = dictGetEntryKey(de);
b9bc0eef 8097 /* If the value for this key is swapped, load a preview in memory.
8098 * We use a "swapped" flag to remember if we need to free the
8099 * value object instead to just increment the ref count anyway
8100 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 8101 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8102 key->storage == REDIS_VM_SWAPPING) {
e7546c63 8103 o = dictGetEntryVal(de);
8104 swapped = 0;
8105 } else {
8106 o = vmPreviewObject(key);
e7546c63 8107 swapped = 1;
8108 }
8109 expiretime = getExpire(db,key);
9d65a1bb 8110
8111 /* Save the key and associated value */
9d65a1bb 8112 if (o->type == REDIS_STRING) {
8113 /* Emit a SET command */
8114 char cmd[]="*3\r\n$3\r\nSET\r\n";
8115 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8116 /* Key and value */
9c8e3cee 8117 if (fwriteBulkObject(fp,key) == 0) goto werr;
8118 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 8119 } else if (o->type == REDIS_LIST) {
8120 /* Emit the RPUSHes needed to rebuild the list */
8121 list *list = o->ptr;
8122 listNode *ln;
c7df85a4 8123 listIter li;
9d65a1bb 8124
c7df85a4 8125 listRewind(list,&li);
8126 while((ln = listNext(&li))) {
9d65a1bb 8127 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8128 robj *eleobj = listNodeValue(ln);
8129
8130 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8131 if (fwriteBulkObject(fp,key) == 0) goto werr;
8132 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8133 }
8134 } else if (o->type == REDIS_SET) {
8135 /* Emit the SADDs needed to rebuild the set */
8136 dict *set = o->ptr;
8137 dictIterator *di = dictGetIterator(set);
8138 dictEntry *de;
8139
8140 while((de = dictNext(di)) != NULL) {
8141 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8142 robj *eleobj = dictGetEntryKey(de);
8143
8144 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8145 if (fwriteBulkObject(fp,key) == 0) goto werr;
8146 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8147 }
8148 dictReleaseIterator(di);
8149 } else if (o->type == REDIS_ZSET) {
8150 /* Emit the ZADDs needed to rebuild the sorted set */
8151 zset *zs = o->ptr;
8152 dictIterator *di = dictGetIterator(zs->dict);
8153 dictEntry *de;
8154
8155 while((de = dictNext(di)) != NULL) {
8156 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8157 robj *eleobj = dictGetEntryKey(de);
8158 double *score = dictGetEntryVal(de);
8159
8160 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8161 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8162 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 8163 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8164 }
8165 dictReleaseIterator(di);
9c8e3cee 8166 } else if (o->type == REDIS_HASH) {
8167 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8168
8169 /* Emit the HSETs needed to rebuild the hash */
8170 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8171 unsigned char *p = zipmapRewind(o->ptr);
8172 unsigned char *field, *val;
8173 unsigned int flen, vlen;
8174
8175 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8176 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8177 if (fwriteBulkObject(fp,key) == 0) goto werr;
8178 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8179 return -1;
8180 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8181 return -1;
8182 }
8183 } else {
8184 dictIterator *di = dictGetIterator(o->ptr);
8185 dictEntry *de;
8186
8187 while((de = dictNext(di)) != NULL) {
8188 robj *field = dictGetEntryKey(de);
8189 robj *val = dictGetEntryVal(de);
8190
8191 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8192 if (fwriteBulkObject(fp,key) == 0) goto werr;
8193 if (fwriteBulkObject(fp,field) == -1) return -1;
8194 if (fwriteBulkObject(fp,val) == -1) return -1;
8195 }
8196 dictReleaseIterator(di);
8197 }
9d65a1bb 8198 } else {
78409a0f 8199 redisAssert(0);
9d65a1bb 8200 }
8201 /* Save the expire time */
8202 if (expiretime != -1) {
e96e4fbf 8203 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 8204 /* If this key is already expired skip it */
8205 if (expiretime < now) continue;
8206 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8207 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8208 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8209 }
b9bc0eef 8210 if (swapped) decrRefCount(o);
9d65a1bb 8211 }
8212 dictReleaseIterator(di);
8213 }
8214
8215 /* Make sure data will not remain on the OS's output buffers */
8216 fflush(fp);
8217 fsync(fileno(fp));
8218 fclose(fp);
e0a62c7f 8219
9d65a1bb 8220 /* Use RENAME to make sure the DB file is changed atomically only
8221 * if the generate DB file is ok. */
8222 if (rename(tmpfile,filename) == -1) {
8223 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8224 unlink(tmpfile);
8225 return REDIS_ERR;
8226 }
8227 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8228 return REDIS_OK;
8229
8230werr:
8231 fclose(fp);
8232 unlink(tmpfile);
e96e4fbf 8233 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 8234 if (di) dictReleaseIterator(di);
8235 return REDIS_ERR;
8236}
8237
8238/* This is how rewriting of the append only file in background works:
8239 *
8240 * 1) The user calls BGREWRITEAOF
8241 * 2) Redis calls this function, that forks():
8242 * 2a) the child rewrite the append only file in a temp file.
8243 * 2b) the parent accumulates differences in server.bgrewritebuf.
8244 * 3) When the child finished '2a' exists.
8245 * 4) The parent will trap the exit code, if it's OK, will append the
8246 * data accumulated into server.bgrewritebuf into the temp file, and
8247 * finally will rename(2) the temp file in the actual file name.
8248 * The the new file is reopened as the new append only file. Profit!
8249 */
8250static int rewriteAppendOnlyFileBackground(void) {
8251 pid_t childpid;
8252
8253 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 8254 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 8255 if ((childpid = fork()) == 0) {
8256 /* Child */
8257 char tmpfile[256];
9d65a1bb 8258
054e426d 8259 if (server.vm_enabled) vmReopenSwapFile();
8260 close(server.fd);
9d65a1bb 8261 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8262 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 8263 _exit(0);
9d65a1bb 8264 } else {
478c2c6f 8265 _exit(1);
9d65a1bb 8266 }
8267 } else {
8268 /* Parent */
8269 if (childpid == -1) {
8270 redisLog(REDIS_WARNING,
8271 "Can't rewrite append only file in background: fork: %s",
8272 strerror(errno));
8273 return REDIS_ERR;
8274 }
8275 redisLog(REDIS_NOTICE,
8276 "Background append only file rewriting started by pid %d",childpid);
8277 server.bgrewritechildpid = childpid;
884d4b39 8278 updateDictResizePolicy();
85a83172 8279 /* We set appendseldb to -1 in order to force the next call to the
8280 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8281 * accumulated by the parent into server.bgrewritebuf will start
8282 * with a SELECT statement and it will be safe to merge. */
8283 server.appendseldb = -1;
9d65a1bb 8284 return REDIS_OK;
8285 }
8286 return REDIS_OK; /* unreached */
8287}
8288
8289static void bgrewriteaofCommand(redisClient *c) {
8290 if (server.bgrewritechildpid != -1) {
8291 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8292 return;
8293 }
8294 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 8295 char *status = "+Background append only file rewriting started\r\n";
8296 addReplySds(c,sdsnew(status));
9d65a1bb 8297 } else {
8298 addReply(c,shared.err);
8299 }
8300}
8301
/* Remove the temp file that the background rewriting child with pid
 * 'childpid' writes to (see rewriteAppendOnlyFileBackground() for the
 * naming scheme). */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8308
996cb5f7 8309/* Virtual Memory is composed mainly of two subsystems:
8310 * - Blocking Virtual Memory
8311 * - Threaded Virtual Memory I/O
8312 * The two parts are not fully decoupled, but functions are split among two
8313 * different sections of the source code (delimited by comments) in order to
8314 * make more clear what functionality is about the blocking VM and what about
8315 * the threaded (not blocking) VM.
8316 *
8317 * Redis VM design:
8318 *
8319 * Redis VM is a blocking VM (one that blocks reading swapped values from
8320 * disk into memory when a value swapped out is needed in memory) that is made
8321 * unblocking by trying to examine the command argument vector in order to
8322 * load in background values that will likely be needed in order to exec
8323 * the command. The command is executed only once all the relevant keys
8324 * are loaded into memory.
8325 *
8326 * This is basically almost as simple as a blocking VM, but almost as
8327 * parallel as a fully non-blocking VM.
8328 */
8329
8330/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 8331
8332/* substitute the first occurrence of '%p' with the process pid in the
8333 * swap file name. */
8334static void expandVmSwapFilename(void) {
8335 char *p = strstr(server.vm_swap_file,"%p");
8336 sds new;
e0a62c7f 8337
054e426d 8338 if (!p) return;
8339 new = sdsempty();
8340 *p = '\0';
8341 new = sdscat(new,server.vm_swap_file);
8342 new = sdscatprintf(new,"%ld",(long) getpid());
8343 new = sdscat(new,p+2);
8344 zfree(server.vm_swap_file);
8345 server.vm_swap_file = new;
8346}
8347
75680a3c 8348static void vmInit(void) {
8349 off_t totsize;
996cb5f7 8350 int pipefds[2];
bcaa7a4f 8351 size_t stacksize;
75680a3c 8352
4ad37480 8353 if (server.vm_max_threads != 0)
8354 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8355
054e426d 8356 expandVmSwapFilename();
8357 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
6fa987e3 8358 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8359 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8360 }
75680a3c 8361 if (server.vm_fp == NULL) {
6fa987e3 8362 redisLog(REDIS_WARNING,
8363 "Impossible to open the swap file: %s. Exiting.",
8364 strerror(errno));
75680a3c 8365 exit(1);
8366 }
8367 server.vm_fd = fileno(server.vm_fp);
8368 server.vm_next_page = 0;
8369 server.vm_near_pages = 0;
7d98e08c 8370 server.vm_stats_used_pages = 0;
8371 server.vm_stats_swapped_objects = 0;
8372 server.vm_stats_swapouts = 0;
8373 server.vm_stats_swapins = 0;
75680a3c 8374 totsize = server.vm_pages*server.vm_page_size;
8375 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8376 if (ftruncate(server.vm_fd,totsize) == -1) {
8377 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8378 strerror(errno));
8379 exit(1);
8380 } else {
8381 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8382 }
7d30035d 8383 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 8384 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 8385 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 8386 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 8387
996cb5f7 8388 /* Initialize threaded I/O (used by Virtual Memory) */
8389 server.io_newjobs = listCreate();
8390 server.io_processing = listCreate();
8391 server.io_processed = listCreate();
d5d55fc3 8392 server.io_ready_clients = listCreate();
92f8e882 8393 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 8394 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8395 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 8396 server.io_active_threads = 0;
996cb5f7 8397 if (pipe(pipefds) == -1) {
8398 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8399 ,strerror(errno));
8400 exit(1);
8401 }
8402 server.io_ready_pipe_read = pipefds[0];
8403 server.io_ready_pipe_write = pipefds[1];
8404 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 8405 /* LZF requires a lot of stack */
8406 pthread_attr_init(&server.io_threads_attr);
8407 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8408 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8409 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 8410 /* Listen for events in the threaded I/O pipe */
8411 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8412 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8413 oom("creating file event");
75680a3c 8414}
8415
06224fec 8416/* Mark the page as used */
8417static void vmMarkPageUsed(off_t page) {
8418 off_t byte = page/8;
8419 int bit = page&7;
970e10bb 8420 redisAssert(vmFreePage(page) == 1);
06224fec 8421 server.vm_bitmap[byte] |= 1<<bit;
8422}
8423
8424/* Mark N contiguous pages as used, with 'page' being the first. */
8425static void vmMarkPagesUsed(off_t page, off_t count) {
8426 off_t j;
8427
8428 for (j = 0; j < count; j++)
7d30035d 8429 vmMarkPageUsed(page+j);
7d98e08c 8430 server.vm_stats_used_pages += count;
7c775e09 8431 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8432 (long long)count, (long long)page);
06224fec 8433}
8434
8435/* Mark the page as free */
8436static void vmMarkPageFree(off_t page) {
8437 off_t byte = page/8;
8438 int bit = page&7;
970e10bb 8439 redisAssert(vmFreePage(page) == 0);
06224fec 8440 server.vm_bitmap[byte] &= ~(1<<bit);
8441}
8442
8443/* Mark N contiguous pages as free, with 'page' being the first. */
8444static void vmMarkPagesFree(off_t page, off_t count) {
8445 off_t j;
8446
8447 for (j = 0; j < count; j++)
7d30035d 8448 vmMarkPageFree(page+j);
7d98e08c 8449 server.vm_stats_used_pages -= count;
7c775e09 8450 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8451 (long long)count, (long long)page);
06224fec 8452}
8453
8454/* Test if the page is free */
8455static int vmFreePage(off_t page) {
8456 off_t byte = page/8;
8457 int bit = page&7;
7d30035d 8458 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 8459}
8460
8461/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 8462 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 8463 * REDIS_ERR is returned.
06224fec 8464 *
8465 * This function uses a simple algorithm: we try to allocate
8466 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8467 * again from the start of the swap file searching for free spaces.
8468 *
8469 * If it looks pretty clear that there are no free pages near our offset
8470 * we try to find less populated places doing a forward jump of
8471 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8472 * without hurry, and then we jump again and so forth...
e0a62c7f 8473 *
06224fec 8474 * This function can be improved using a free list to avoid to guess
8475 * too much, since we could collect data about freed pages.
8476 *
8477 * note: I implemented this function just after watching an episode of
8478 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8479 */
c7df85a4 8480static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 8481 off_t base, offset = 0, since_jump = 0, numfree = 0;
8482
8483 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8484 server.vm_near_pages = 0;
8485 server.vm_next_page = 0;
8486 }
8487 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8488 base = server.vm_next_page;
8489
8490 while(offset < server.vm_pages) {
8491 off_t this = base+offset;
8492
8493 /* If we overflow, restart from page zero */
8494 if (this >= server.vm_pages) {
8495 this -= server.vm_pages;
8496 if (this == 0) {
8497 /* Just overflowed, what we found on tail is no longer
8498 * interesting, as it's no longer contiguous. */
8499 numfree = 0;
8500 }
8501 }
8502 if (vmFreePage(this)) {
8503 /* This is a free page */
8504 numfree++;
8505 /* Already got N free pages? Return to the caller, with success */
8506 if (numfree == n) {
7d30035d 8507 *first = this-(n-1);
8508 server.vm_next_page = this+1;
7c775e09 8509 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 8510 return REDIS_OK;
06224fec 8511 }
8512 } else {
8513 /* The current one is not a free page */
8514 numfree = 0;
8515 }
8516
8517 /* Fast-forward if the current page is not free and we already
8518 * searched enough near this place. */
8519 since_jump++;
8520 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8521 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8522 since_jump = 0;
8523 /* Note that even if we rewind after the jump, we are don't need
8524 * to make sure numfree is set to zero as we only jump *if* it
8525 * is set to zero. */
8526 } else {
8527 /* Otherwise just check the next page */
8528 offset++;
8529 }
8530 }
3a66edc7 8531 return REDIS_ERR;
8532}
8533
a5819310 8534/* Write the specified object at the specified page of the swap file */
8535static int vmWriteObjectOnSwap(robj *o, off_t page) {
8536 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8537 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8538 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8539 redisLog(REDIS_WARNING,
9ebed7cf 8540 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 8541 strerror(errno));
8542 return REDIS_ERR;
8543 }
8544 rdbSaveObject(server.vm_fp,o);
ba76a8f9 8545 fflush(server.vm_fp);
a5819310 8546 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8547 return REDIS_OK;
8548}
8549
3a66edc7 8550/* Swap the 'val' object relative to 'key' into disk. Store all the information
8551 * needed to later retrieve the object into the key object.
8552 * If we can't find enough contiguous empty pages to swap the object on disk
8553 * REDIS_ERR is returned. */
a69a0c9c 8554static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 8555 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 8556 off_t page;
8557
8558 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 8559 assert(key->refcount == 1);
3a66edc7 8560 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 8561 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 8562 key->vm.page = page;
8563 key->vm.usedpages = pages;
8564 key->storage = REDIS_VM_SWAPPED;
d894161b 8565 key->vtype = val->type;
3a66edc7 8566 decrRefCount(val); /* Deallocate the object from memory. */
8567 vmMarkPagesUsed(page,pages);
7d30035d 8568 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8569 (unsigned char*) key->ptr,
8570 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 8571 server.vm_stats_swapped_objects++;
8572 server.vm_stats_swapouts++;
3a66edc7 8573 return REDIS_OK;
8574}
8575
a5819310 8576static robj *vmReadObjectFromSwap(off_t page, int type) {
8577 robj *o;
3a66edc7 8578
a5819310 8579 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8580 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 8581 redisLog(REDIS_WARNING,
d5d55fc3 8582 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 8583 strerror(errno));
478c2c6f 8584 _exit(1);
3a66edc7 8585 }
a5819310 8586 o = rdbLoadObject(type,server.vm_fp);
8587 if (o == NULL) {
d5d55fc3 8588 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 8589 _exit(1);
3a66edc7 8590 }
a5819310 8591 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8592 return o;
8593}
8594
8595/* Load the value object relative to the 'key' object from swap to memory.
8596 * The newly allocated object is returned.
8597 *
8598 * If preview is true the unserialized object is returned to the caller but
8599 * no changes are made to the key object, nor the pages are marked as freed */
8600static robj *vmGenericLoadObject(robj *key, int preview) {
8601 robj *val;
8602
d5d55fc3 8603 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 8604 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 8605 if (!preview) {
8606 key->storage = REDIS_VM_MEMORY;
8607 key->vm.atime = server.unixtime;
8608 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8609 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8610 (unsigned char*) key->ptr);
7d98e08c 8611 server.vm_stats_swapped_objects--;
38aba9a1 8612 } else {
8613 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8614 (unsigned char*) key->ptr);
7e69548d 8615 }
7d98e08c 8616 server.vm_stats_swapins++;
3a66edc7 8617 return val;
06224fec 8618}
8619
7e69548d 8620/* Plain object loading, from swap to memory */
8621static robj *vmLoadObject(robj *key) {
996cb5f7 8622 /* If we are loading the object in background, stop it, we
8623 * need to load this object synchronously ASAP. */
8624 if (key->storage == REDIS_VM_LOADING)
8625 vmCancelThreadedIOJob(key);
7e69548d 8626 return vmGenericLoadObject(key,0);
8627}
8628
8629/* Just load the value on disk, without to modify the key.
8630 * This is useful when we want to perform some operation on the value
8631 * without to really bring it from swap to memory, like while saving the
8632 * dataset or rewriting the append only log. */
8633static robj *vmPreviewObject(robj *key) {
8634 return vmGenericLoadObject(key,1);
8635}
8636
4ef8de8a 8637/* How a good candidate is this object for swapping?
8638 * The better candidate it is, the greater the returned value.
8639 *
8640 * Currently we try to perform a fast estimation of the object size in
8641 * memory, and combine it with aging informations.
8642 *
8643 * Basically swappability = idle-time * log(estimated size)
8644 *
8645 * Bigger objects are preferred over smaller objects, but not
8646 * proportionally, this is why we use the logarithm. This algorithm is
8647 * just a first try and will probably be tuned later. */
8648static double computeObjectSwappability(robj *o) {
8649 time_t age = server.unixtime - o->vm.atime;
8650 long asize = 0;
8651 list *l;
8652 dict *d;
8653 struct dictEntry *de;
8654 int z;
8655
8656 if (age <= 0) return 0;
8657 switch(o->type) {
8658 case REDIS_STRING:
8659 if (o->encoding != REDIS_ENCODING_RAW) {
8660 asize = sizeof(*o);
8661 } else {
8662 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8663 }
8664 break;
8665 case REDIS_LIST:
8666 l = o->ptr;
8667 listNode *ln = listFirst(l);
8668
8669 asize = sizeof(list);
8670 if (ln) {
8671 robj *ele = ln->value;
8672 long elesize;
8673
8674 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8675 (sizeof(*o)+sdslen(ele->ptr)) :
8676 sizeof(*o);
8677 asize += (sizeof(listNode)+elesize)*listLength(l);
8678 }
8679 break;
8680 case REDIS_SET:
8681 case REDIS_ZSET:
8682 z = (o->type == REDIS_ZSET);
8683 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8684
8685 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8686 if (z) asize += sizeof(zset)-sizeof(dict);
8687 if (dictSize(d)) {
8688 long elesize;
8689 robj *ele;
8690
8691 de = dictGetRandomKey(d);
8692 ele = dictGetEntryKey(de);
8693 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8694 (sizeof(*o)+sdslen(ele->ptr)) :
8695 sizeof(*o);
8696 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8697 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8698 }
8699 break;
a97b9060 8700 case REDIS_HASH:
8701 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8702 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8703 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8704 unsigned int klen, vlen;
8705 unsigned char *key, *val;
8706
8707 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8708 klen = 0;
8709 vlen = 0;
8710 }
8711 asize = len*(klen+vlen+3);
8712 } else if (o->encoding == REDIS_ENCODING_HT) {
8713 d = o->ptr;
8714 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8715 if (dictSize(d)) {
8716 long elesize;
8717 robj *ele;
8718
8719 de = dictGetRandomKey(d);
8720 ele = dictGetEntryKey(de);
8721 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8722 (sizeof(*o)+sdslen(ele->ptr)) :
8723 sizeof(*o);
8724 ele = dictGetEntryVal(de);
8725 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8726 (sizeof(*o)+sdslen(ele->ptr)) :
8727 sizeof(*o);
8728 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8729 }
8730 }
8731 break;
4ef8de8a 8732 }
c8c72447 8733 return (double)age*log(1+asize);
4ef8de8a 8734}
8735
8736/* Try to swap an object that's a good candidate for swapping.
8737 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 8738 * to swap any object at all.
8739 *
8740 * If 'usethreaded' is true, Redis will try to swap the object in background
8741 * using I/O threads. */
8742static int vmSwapOneObject(int usethreads) {
4ef8de8a 8743 int j, i;
8744 struct dictEntry *best = NULL;
8745 double best_swappability = 0;
b9bc0eef 8746 redisDb *best_db = NULL;
4ef8de8a 8747 robj *key, *val;
8748
8749 for (j = 0; j < server.dbnum; j++) {
8750 redisDb *db = server.db+j;
b72f6a4b 8751 /* Why maxtries is set to 100?
8752 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8753 * are swappable objects */
b0d8747d 8754 int maxtries = 100;
4ef8de8a 8755
8756 if (dictSize(db->dict) == 0) continue;
8757 for (i = 0; i < 5; i++) {
8758 dictEntry *de;
8759 double swappability;
8760
e3cadb8a 8761 if (maxtries) maxtries--;
4ef8de8a 8762 de = dictGetRandomKey(db->dict);
8763 key = dictGetEntryKey(de);
8764 val = dictGetEntryVal(de);
1064ef87 8765 /* Only swap objects that are currently in memory.
8766 *
8767 * Also don't swap shared objects if threaded VM is on, as we
8768 * try to ensure that the main thread does not touch the
8769 * object while the I/O thread is using it, but we can't
8770 * control other keys without adding additional mutex. */
8771 if (key->storage != REDIS_VM_MEMORY ||
8772 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 8773 if (maxtries) i--; /* don't count this try */
8774 continue;
8775 }
4ef8de8a 8776 swappability = computeObjectSwappability(val);
8777 if (!best || swappability > best_swappability) {
8778 best = de;
8779 best_swappability = swappability;
b9bc0eef 8780 best_db = db;
4ef8de8a 8781 }
8782 }
8783 }
7c775e09 8784 if (best == NULL) return REDIS_ERR;
4ef8de8a 8785 key = dictGetEntryKey(best);
8786 val = dictGetEntryVal(best);
8787
e3cadb8a 8788 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 8789 key->ptr, best_swappability);
8790
8791 /* Unshare the key if needed */
8792 if (key->refcount > 1) {
8793 robj *newkey = dupStringObject(key);
8794 decrRefCount(key);
8795 key = dictGetEntryKey(best) = newkey;
8796 }
8797 /* Swap it */
a69a0c9c 8798 if (usethreads) {
b9bc0eef 8799 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 8800 return REDIS_OK;
8801 } else {
a69a0c9c 8802 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8803 dictGetEntryVal(best) = NULL;
8804 return REDIS_OK;
8805 } else {
8806 return REDIS_ERR;
8807 }
4ef8de8a 8808 }
8809}
8810
/* Swap one object out in a blocking way: vmSwapOneObject() called with
 * 'usethreads' set to false. */
static int vmSwapOneObjectBlocking() {
    int retval = vmSwapOneObject(0);

    return retval;
}
8814
/* Swap one object out using the I/O threads: vmSwapOneObject() called with
 * 'usethreads' set to true. */
static int vmSwapOneObjectThreaded() {
    int retval = vmSwapOneObject(1);

    return retval;
}
8818
7e69548d 8819/* Return true if it's safe to swap out objects in a given moment.
8820 * Basically we don't want to swap objects out while there is a BGSAVE
8821 * or a BGAEOREWRITE running in backgroud. */
8822static int vmCanSwapOut(void) {
8823 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8824}
8825
1b03836c 8826/* Delete a key if swapped. Returns 1 if the key was found, was swapped
8827 * and was deleted. Otherwise 0 is returned. */
8828static int deleteIfSwapped(redisDb *db, robj *key) {
8829 dictEntry *de;
8830 robj *foundkey;
8831
8832 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8833 foundkey = dictGetEntryKey(de);
8834 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8835 deleteKey(db,key);
8836 return 1;
8837}
8838
996cb5f7 8839/* =================== Virtual Memory - Threaded I/O ======================= */
8840
b9bc0eef 8841static void freeIOJob(iojob *j) {
d5d55fc3 8842 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8843 j->type == REDIS_IOJOB_DO_SWAP ||
8844 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 8845 decrRefCount(j->val);
78ebe4c8 8846 /* We don't decrRefCount the j->key field as we did't incremented
8847 * the count creating IO Jobs. This is because the key field here is
8848 * just used as an indentifier and if a key is removed the Job should
8849 * never be touched again. */
b9bc0eef 8850 zfree(j);
8851}
8852
/* Main-thread handler that post-processes jobs completed by the I/O threads.
 *
 * el, mask and privdata are unused. fd is the read side of the pipe: one
 * byte is read per completed job, and for each byte the oldest job is
 * popped from server.io_processed while holding the threaded I/O lock.
 *
 * On the first iteration a budget is computed as
 * listLength(server.io_processed) * REDIS_MAX_COMPLETED_JOBS_PROCESSED / 100
 * (at least 1); the function returns as soon as that many jobs have been
 * handled. Jobs flagged as canceled are freed and skipped without being
 * counted against the budget.
 *
 * Per job type:
 *  - LOAD: the key is marked as in memory, its swap pages are freed, the
 *    loaded value is installed in the dictionary and clients blocked on the
 *    key are handled.
 *  - PREPARE_SWAP: if swapping is currently allowed and contiguous pages
 *    are found, the pages are marked used and the job is queued again as
 *    DO_SWAP; otherwise the job is freed and the key is put back to
 *    REDIS_VM_MEMORY.
 *  - DO_SWAP: the page location is recorded in the key, the in-memory
 *    value is released, and more swap jobs may be queued while memory use
 *    is still above server.vm_max_memory.
 *
 * A read(2) failure other than EAGAIN is logged as a warning. */
996cb5f7 8853/* Every time a thread finished a Job, it writes a byte into the write side
8854 * of an unix pipe in order to "awake" the main thread, and this function
8855 * is called. */
8856static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
8857 int mask)
8858{
8859 char buf[1];
b0d8747d 8860 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 8861 REDIS_NOTUSED(el);
8862 REDIS_NOTUSED(mask);
8863 REDIS_NOTUSED(privdata);
8864
8865 /* For every byte we read in the read side of the pipe, there is one
8866 * I/O job completed to process. */
8867 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 8868 iojob *j;
8869 listNode *ln;
8870 robj *key;
8871 struct dictEntry *de;
8872
996cb5f7 8873 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 8874
8875 /* Get the processed element (the oldest one) */
8876 lockThreadedIO();
1064ef87 8877 assert(listLength(server.io_processed) != 0);
/* The budget is computed only once, on the first job of this call. */
f6c0bba8 8878 if (toprocess == -1) {
8879 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
8880 if (toprocess <= 0) toprocess = 1;
8881 }
b9bc0eef 8882 ln = listFirst(server.io_processed);
8883 j = ln->value;
8884 listDelNode(server.io_processed,ln);
8885 unlockThreadedIO();
8886 /* If this job is marked as canceled, just ignore it */
/* NOTE(review): the I/O thread also sets j->canceled when
 * vmWriteObjectOnSwap() fails; such a job is dropped here without touching
 * key->storage or the reserved pages -- confirm this is intended. */
8887 if (j->canceled) {
8888 freeIOJob(j);
8889 continue;
8890 }
8891 /* Post process it in the main thread, as there are things we
8892 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 8893 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 8894 de = dictFind(j->db->dict,j->key);
8895 assert(de != NULL);
8896 key = dictGetEntryKey(de);
8897 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 8898 redisDb *db;
8899
b9bc0eef 8900 /* Key loaded, bring it at home */
8901 key->storage = REDIS_VM_MEMORY;
8902 key->vm.atime = server.unixtime;
8903 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8904 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
8905 (unsigned char*) key->ptr);
8906 server.vm_stats_swapped_objects--;
8907 server.vm_stats_swapins++;
d5d55fc3 8908 dictGetEntryVal(de) = j->val;
/* The dictionary takes its own reference; freeIOJob() below drops the
 * one held by the job. */
8909 incrRefCount(j->val);
/* Save the db pointer: j is freed before the blocked clients are handled. */
8910 db = j->db;
b9bc0eef 8911 freeIOJob(j);
d5d55fc3 8912 /* Handle clients waiting for this key to be loaded. */
8913 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 8914 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
8915 /* Now we know the amount of pages required to swap this object.
8916 * Let's find some space for it, and queue this task again
8917 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 8918 if (!vmCanSwapOut() ||
8919 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
8920 {
8921 /* Ooops... no space or we can't swap as there is
8922 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 8923 freeIOJob(j);
054e426d 8924 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 8925 } else {
c7df85a4 8926 /* Note that we need to mark these pages as used now,
8927 * if the job will be canceled, we'll mark them as freed
8928 * again. */
8929 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 8930 j->type = REDIS_IOJOB_DO_SWAP;
8931 lockThreadedIO();
8932 queueIOJob(j);
8933 unlockThreadedIO();
8934 }
8935 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
8936 robj *val;
8937
8938 /* Key swapped. We can finally free some memory. */
/* Debugging aid: dump the job state to stdout right before the assertion
 * below fails.
 * NOTE(review): val->ptr is printed with %s, which presumably assumes a
 * string value -- confirm. */
6c96ba7d 8939 if (key->storage != REDIS_VM_SWAPPING) {
8940 printf("key->storage: %d\n",key->storage);
8941 printf("key->name: %s\n",(char*)key->ptr);
8942 printf("key->refcount: %d\n",key->refcount);
8943 printf("val: %p\n",(void*)j->val);
8944 printf("val->type: %d\n",j->val->type);
8945 printf("val->ptr: %s\n",(char*)j->val->ptr);
8946 }
8947 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 8948 val = dictGetEntryVal(de);
8949 key->vm.page = j->page;
8950 key->vm.usedpages = j->pages;
8951 key->storage = REDIS_VM_SWAPPED;
8952 key->vtype = j->val->type;
8953 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 8954 dictGetEntryVal(de) = NULL;
b9bc0eef 8955 redisLog(REDIS_DEBUG,
8956 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
8957 (unsigned char*) key->ptr,
8958 (unsigned long long) j->page, (unsigned long long) j->pages);
8959 server.vm_stats_swapped_objects++;
8960 server.vm_stats_swapouts++;
8961 freeIOJob(j);
f11b8647 8962 /* Put a few more swap requests in queue if we are still
8963 * out of memory */
/* trytoswap is cleared after the first failed attempt so that later jobs
 * handled by this same call do not retry. */
b0d8747d 8964 if (trytoswap && vmCanSwapOut() &&
8965 zmalloc_used_memory() > server.vm_max_memory)
8966 {
f11b8647 8967 int more = 1;
8968 while(more) {
8969 lockThreadedIO();
8970 more = listLength(server.io_newjobs) <
8971 (unsigned) server.vm_max_threads;
8972 unlockThreadedIO();
8973 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 8974 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
8975 trytoswap = 0;
8976 break;
8977 }
f11b8647 8978 }
8979 }
b9bc0eef 8980 }
c953f24b 8981 processed++;
/* Stop once the budget for this call is exhausted. */
f6c0bba8 8982 if (processed == toprocess) return;
996cb5f7 8983 }
8984 if (retval < 0 && errno != EAGAIN) {
8985 redisLog(REDIS_WARNING,
8986 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
8987 strerror(errno));
8988 }
8989}
8990
8991static void lockThreadedIO(void) {
8992 pthread_mutex_lock(&server.io_mutex);
8993}
8994
8995static void unlockThreadedIO(void) {
8996 pthread_mutex_unlock(&server.io_mutex);
8997}
8998
/* Cancel the pending threaded I/O job whose key is the object 'o'.
 *
 * 'o' must be in the REDIS_VM_LOADING or REDIS_VM_SWAPPING state. The three
 * job queues are scanned, under the threaded I/O lock, for the first job
 * not already canceled whose key pointer is 'o':
 *  - found in io_newjobs: the job is freed and unlinked;
 *  - found in io_processing: the lock is released, the function sleeps
 *    briefly and the whole scan is restarted until the job has moved on;
 *  - found in io_processed: the job is only flagged as canceled.
 * For a DO_SWAP job that is not being processed the reserved pages are
 * marked free. Finally the storage state of 'o' is rolled back
 * (LOADING -> SWAPPED, SWAPPING -> MEMORY).
 *
 * Not finding any matching job is treated as a programming error. */
8999/* Remove the specified object from the threaded I/O queue if still not
9000 * processed, otherwise make sure to flag it as canceled. */
9001static void vmCancelThreadedIOJob(robj *o) {
9002 list *lists[3] = {
6c96ba7d 9003 server.io_newjobs, /* 0 */
9004 server.io_processing, /* 1 */
9005 server.io_processed /* 2 */
996cb5f7 9006 };
9007 int i;
9008
9009 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
2e111efe 9010again:
996cb5f7 9011 lockThreadedIO();
9012 /* Search for a matching key in one of the queues */
9013 for (i = 0; i < 3; i++) {
9014 listNode *ln;
c7df85a4 9015 listIter li;
996cb5f7 9016
c7df85a4 9017 listRewind(lists[i],&li);
9018 while ((ln = listNext(&li)) != NULL) {
996cb5f7 9019 iojob *job = ln->value;
9020
6c96ba7d 9021 if (job->canceled) continue; /* Skip this, already canceled. */
/* Jobs are matched by key pointer identity, not by key content. */
78ebe4c8 9022 if (job->key == o) {
970e10bb 9023 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
9024 (void*)job, (char*)o->ptr, job->type, i);
427a2153 9025 /* Mark the pages as free since the swap didn't happen
9026 * or happened but is now discarded. */
970e10bb 9027 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 9028 vmMarkPagesFree(job->page,job->pages);
9029 /* Cancel the job. It depends on the list the job is
9030 * living in. */
996cb5f7 9031 switch(i) {
9032 case 0: /* io_newjobs */
6c96ba7d 9033 /* If the job was yet not processed the best thing to do
996cb5f7 9034 * is to remove it from the queue at all */
6c96ba7d 9035 freeIOJob(job);
996cb5f7 9036 listDelNode(lists[i],ln);
9037 break;
9038 case 1: /* io_processing */
d5d55fc3 9039 /* Oh Shi- the thread is messing with the Job:
9040 *
9041 * Probably it's accessing the object if this is a
9042 * PREPARE_SWAP or DO_SWAP job.
9043 * If it's a LOAD job it may be reading from disk and
9044 * if we don't wait for the job to terminate before to
9045 * cancel it, maybe in a few microseconds data can be
9046 * corrupted in this pages. So the short story is:
9047 *
9048 * Better to wait for the job to move into the
9049 * next queue (processed)... */
9050
9051 /* We try again and again until the job is completed. */
9052 unlockThreadedIO();
9053 /* But let's wait some time for the I/O thread
9054 * to finish with this job. After all this condition
9055 * should be very rare. */
9056 usleep(1);
9057 goto again;
996cb5f7 9058 case 2: /* io_processed */
2e111efe 9059 /* The job was already processed, that's easy...
9060 * just mark it as canceled so that we'll ignore it
9061 * when processing completed jobs. */
996cb5f7 9062 job->canceled = 1;
9063 break;
9064 }
c7df85a4 9065 /* Finally we have to adjust the storage type of the object
9066 * in order to "UNDO" the operation. */
996cb5f7 9067 if (o->storage == REDIS_VM_LOADING)
9068 o->storage = REDIS_VM_SWAPPED;
9069 else if (o->storage == REDIS_VM_SWAPPING)
9070 o->storage = REDIS_VM_MEMORY;
9071 unlockThreadedIO();
9072 return;
9073 }
9074 }
9075 }
9076 unlockThreadedIO();
/* Always-false assertion: reaching this point means no job matched 'o'. */
9077 assert(1 != 1); /* We should never reach this */
9078}
9079
b9bc0eef 9080static void *IOThreadEntryPoint(void *arg) {
9081 iojob *j;
9082 listNode *ln;
9083 REDIS_NOTUSED(arg);
9084
9085 pthread_detach(pthread_self());
9086 while(1) {
9087 /* Get a new job to process */
9088 lockThreadedIO();
9089 if (listLength(server.io_newjobs) == 0) {
9090 /* No new jobs in queue, exit. */
9ebed7cf 9091 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9092 (long) pthread_self());
b9bc0eef 9093 server.io_active_threads--;
9094 unlockThreadedIO();
9095 return NULL;
9096 }
9097 ln = listFirst(server.io_newjobs);
9098 j = ln->value;
9099 listDelNode(server.io_newjobs,ln);
9100 /* Add the job in the processing queue */
9101 j->thread = pthread_self();
9102 listAddNodeTail(server.io_processing,j);
9103 ln = listLast(server.io_processing); /* We use ln later to remove it */
9104 unlockThreadedIO();
9ebed7cf 9105 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9106 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 9107
9108 /* Process the Job */
9109 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9110 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 9111 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9112 FILE *fp = fopen("/dev/null","w+");
9113 j->pages = rdbSavedObjectPages(j->val,fp);
9114 fclose(fp);
9115 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 9116 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9117 j->canceled = 1;
b9bc0eef 9118 }
9119
9120 /* Done: insert the job into the processed queue */
9ebed7cf 9121 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9122 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 9123 lockThreadedIO();
9124 listDelNode(server.io_processing,ln);
9125 listAddNodeTail(server.io_processed,j);
9126 unlockThreadedIO();
e0a62c7f 9127
b9bc0eef 9128 /* Signal the main thread there is new stuff to process */
9129 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9130 }
9131 return NULL; /* never reached */
9132}
9133
9134static void spawnIOThread(void) {
9135 pthread_t thread;
478c2c6f 9136 sigset_t mask, omask;
a97b9060 9137 int err;
b9bc0eef 9138
478c2c6f 9139 sigemptyset(&mask);
9140 sigaddset(&mask,SIGCHLD);
9141 sigaddset(&mask,SIGHUP);
9142 sigaddset(&mask,SIGPIPE);
9143 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 9144 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9145 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9146 strerror(err));
9147 usleep(1000000);
9148 }
478c2c6f 9149 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 9150 server.io_active_threads++;
9151}
9152
4ee9488d 9153/* We need to wait for the last thread to exit before we are able to
9154 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 9155static void waitEmptyIOJobsQueue(void) {
4ee9488d 9156 while(1) {
76b7233a 9157 int io_processed_len;
9158
4ee9488d 9159 lockThreadedIO();
054e426d 9160 if (listLength(server.io_newjobs) == 0 &&
9161 listLength(server.io_processing) == 0 &&
9162 server.io_active_threads == 0)
9163 {
4ee9488d 9164 unlockThreadedIO();
9165 return;
9166 }
76b7233a 9167 /* While waiting for empty jobs queue condition we post-process some
9168 * finshed job, as I/O threads may be hanging trying to write against
9169 * the io_ready_pipe_write FD but there are so much pending jobs that
9170 * it's blocking. */
9171 io_processed_len = listLength(server.io_processed);
4ee9488d 9172 unlockThreadedIO();
76b7233a 9173 if (io_processed_len) {
9174 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9175 usleep(1000); /* 1 millisecond */
9176 } else {
9177 usleep(10000); /* 10 milliseconds */
9178 }
4ee9488d 9179 }
9180}
9181
054e426d 9182static void vmReopenSwapFile(void) {
478c2c6f 9183 /* Note: we don't close the old one as we are in the child process
9184 * and don't want to mess at all with the original file object. */
054e426d 9185 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9186 if (server.vm_fp == NULL) {
9187 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9188 server.vm_swap_file);
478c2c6f 9189 _exit(1);
054e426d 9190 }
9191 server.vm_fd = fileno(server.vm_fp);
9192}
9193
b9bc0eef 9194/* This function must be called while with threaded IO locked */
9195static void queueIOJob(iojob *j) {
6c96ba7d 9196 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9197 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 9198 listAddNodeTail(server.io_newjobs,j);
9199 if (server.io_active_threads < server.vm_max_threads)
9200 spawnIOThread();
9201}
9202
9203static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9204 iojob *j;
e0a62c7f 9205
b9bc0eef 9206 assert(key->storage == REDIS_VM_MEMORY);
9207 assert(key->refcount == 1);
9208
9209 j = zmalloc(sizeof(*j));
9210 j->type = REDIS_IOJOB_PREPARE_SWAP;
9211 j->db = db;
78ebe4c8 9212 j->key = key;
b9bc0eef 9213 j->val = val;
9214 incrRefCount(val);
9215 j->canceled = 0;
9216 j->thread = (pthread_t) -1;
f11b8647 9217 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 9218
9219 lockThreadedIO();
9220 queueIOJob(j);
9221 unlockThreadedIO();
9222 return REDIS_OK;
9223}
9224
b0d8747d 9225/* ============ Virtual Memory - Blocking clients on missing keys =========== */
9226
d5d55fc3 9227/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9228 * If there is not already a job loading the key, it is craeted.
9229 * The key is added to the io_keys list in the client structure, and also
9230 * in the hash table mapping swapped keys to waiting clients, that is,
9231 * server.io_waited_keys. */
9232static int waitForSwappedKey(redisClient *c, robj *key) {
9233 struct dictEntry *de;
9234 robj *o;
9235 list *l;
9236
9237 /* If the key does not exist or is already in RAM we don't need to
9238 * block the client at all. */
9239 de = dictFind(c->db->dict,key);
9240 if (de == NULL) return 0;
9241 o = dictGetEntryKey(de);
9242 if (o->storage == REDIS_VM_MEMORY) {
9243 return 0;
9244 } else if (o->storage == REDIS_VM_SWAPPING) {
9245 /* We were swapping the key, undo it! */
9246 vmCancelThreadedIOJob(o);
9247 return 0;
9248 }
e0a62c7f 9249
d5d55fc3 9250 /* OK: the key is either swapped, or being loaded just now. */
9251
9252 /* Add the key to the list of keys this client is waiting for.
9253 * This maps clients to keys they are waiting for. */
9254 listAddNodeTail(c->io_keys,key);
9255 incrRefCount(key);
9256
9257 /* Add the client to the swapped keys => clients waiting map. */
9258 de = dictFind(c->db->io_keys,key);
9259 if (de == NULL) {
9260 int retval;
9261
9262 /* For every key we take a list of clients blocked for it */
9263 l = listCreate();
9264 retval = dictAdd(c->db->io_keys,key,l);
9265 incrRefCount(key);
9266 assert(retval == DICT_OK);
9267 } else {
9268 l = dictGetEntryVal(de);
9269 }
9270 listAddNodeTail(l,c);
9271
9272 /* Are we already loading the key from disk? If not create a job */
9273 if (o->storage == REDIS_VM_SWAPPED) {
9274 iojob *j;
9275
9276 o->storage = REDIS_VM_LOADING;
9277 j = zmalloc(sizeof(*j));
9278 j->type = REDIS_IOJOB_LOAD;
9279 j->db = c->db;
78ebe4c8 9280 j->key = o;
d5d55fc3 9281 j->key->vtype = o->vtype;
9282 j->page = o->vm.page;
9283 j->val = NULL;
9284 j->canceled = 0;
9285 j->thread = (pthread_t) -1;
9286 lockThreadedIO();
9287 queueIOJob(j);
9288 unlockThreadedIO();
9289 }
9290 return 1;
9291}
9292
76583ea4
PN
9293/* Preload keys needed for the ZUNION and ZINTER commands. */
9294static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9295 int i, num;
9296 num = atoi(c->argv[2]->ptr);
9297 for (i = 0; i < num; i++) {
9298 waitForSwappedKey(c,c->argv[3+i]);
9299 }
9300}
9301
b0d8747d 9302/* Is this client attempting to run a command against swapped keys?
d5d55fc3 9303 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 9304 *
d5d55fc3 9305 * The important idea about this function is that it can fail! If keys will
9306 * still be swapped when the client is resumed, this key lookups will
9307 * just block loading keys from disk. In practical terms this should only
9308 * happen with SORT BY command or if there is a bug in this function.
9309 *
9310 * Return 1 if the client is marked as blocked, 0 if the client can
9311 * continue as the keys it is going to access appear to be in memory. */
9312static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
7c775e09 9313 int j, last;
9314
76583ea4
PN
9315 if (cmd->vm_preload_proc != NULL) {
9316 cmd->vm_preload_proc(c);
9317 } else {
9318 if (cmd->vm_firstkey == 0) return 0;
9319 last = cmd->vm_lastkey;
9320 if (last < 0) last = c->argc+last;
9321 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9322 waitForSwappedKey(c,c->argv[j]);
9323 }
9324
d5d55fc3 9325 /* If the client was blocked for at least one key, mark it as blocked. */
9326 if (listLength(c->io_keys)) {
9327 c->flags |= REDIS_IO_WAIT;
9328 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9329 server.vm_blocked_clients++;
9330 return 1;
9331 } else {
9332 return 0;
9333 }
9334}
9335
9336/* Remove the 'key' from the list of blocked keys for a given client.
9337 *
9338 * The function returns 1 when there are no longer blocking keys after
9339 * the current one was removed (and the client can be unblocked). */
9340static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9341 list *l;
9342 listNode *ln;
9343 listIter li;
9344 struct dictEntry *de;
9345
9346 /* Remove the key from the list of keys this client is waiting for. */
9347 listRewind(c->io_keys,&li);
9348 while ((ln = listNext(&li)) != NULL) {
9349 if (compareStringObjects(ln->value,key) == 0) {
9350 listDelNode(c->io_keys,ln);
9351 break;
9352 }
9353 }
9354 assert(ln != NULL);
9355
9356 /* Remove the client form the key => waiting clients map. */
9357 de = dictFind(c->db->io_keys,key);
9358 assert(de != NULL);
9359 l = dictGetEntryVal(de);
9360 ln = listSearchKey(l,c);
9361 assert(ln != NULL);
9362 listDelNode(l,ln);
9363 if (listLength(l) == 0)
9364 dictDelete(c->db->io_keys,key);
9365
9366 return listLength(c->io_keys) == 0;
9367}
9368
9369static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9370 struct dictEntry *de;
9371 list *l;
9372 listNode *ln;
9373 int len;
9374
9375 de = dictFind(db->io_keys,key);
9376 if (!de) return;
9377
9378 l = dictGetEntryVal(de);
9379 len = listLength(l);
9380 /* Note: we can't use something like while(listLength(l)) as the list
9381 * can be freed by the calling function when we remove the last element. */
9382 while (len--) {
9383 ln = listFirst(l);
9384 redisClient *c = ln->value;
9385
9386 if (dontWaitForSwappedKey(c,key)) {
9387 /* Put the client in the list of clients ready to go as we
9388 * loaded all the keys about it. */
9389 listAddNodeTail(server.io_ready_clients,c);
9390 }
9391 }
b0d8747d 9392}
b0d8747d 9393
500ece7c 9394/* =========================== Remote Configuration ========================= */
9395
9396static void configSetCommand(redisClient *c) {
9397 robj *o = getDecodedObject(c->argv[3]);
9398 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9399 zfree(server.dbfilename);
9400 server.dbfilename = zstrdup(o->ptr);
9401 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9402 zfree(server.requirepass);
9403 server.requirepass = zstrdup(o->ptr);
9404 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9405 zfree(server.masterauth);
9406 server.masterauth = zstrdup(o->ptr);
9407 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9408 server.maxmemory = strtoll(o->ptr, NULL, 10);
9409 } else {
9410 addReplySds(c,sdscatprintf(sdsempty(),
9411 "-ERR not supported CONFIG parameter %s\r\n",
9412 (char*)c->argv[2]->ptr));
9413 decrRefCount(o);
9414 return;
9415 }
9416 decrRefCount(o);
9417 addReply(c,shared.ok);
9418}
9419
9420static void configGetCommand(redisClient *c) {
9421 robj *o = getDecodedObject(c->argv[2]);
9422 robj *lenobj = createObject(REDIS_STRING,NULL);
9423 char *pattern = o->ptr;
9424 int matches = 0;
9425
9426 addReply(c,lenobj);
9427 decrRefCount(lenobj);
9428
9429 if (stringmatch(pattern,"dbfilename",0)) {
9430 addReplyBulkCString(c,"dbfilename");
9431 addReplyBulkCString(c,server.dbfilename);
9432 matches++;
9433 }
9434 if (stringmatch(pattern,"requirepass",0)) {
9435 addReplyBulkCString(c,"requirepass");
9436 addReplyBulkCString(c,server.requirepass);
9437 matches++;
9438 }
9439 if (stringmatch(pattern,"masterauth",0)) {
9440 addReplyBulkCString(c,"masterauth");
9441 addReplyBulkCString(c,server.masterauth);
9442 matches++;
9443 }
9444 if (stringmatch(pattern,"maxmemory",0)) {
9445 char buf[128];
9446
9447 snprintf(buf,128,"%llu\n",server.maxmemory);
9448 addReplyBulkCString(c,"maxmemory");
9449 addReplyBulkCString(c,buf);
9450 matches++;
9451 }
9452 decrRefCount(o);
9453 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9454}
9455
9456static void configCommand(redisClient *c) {
9457 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9458 if (c->argc != 4) goto badarity;
9459 configSetCommand(c);
9460 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9461 if (c->argc != 3) goto badarity;
9462 configGetCommand(c);
9463 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9464 if (c->argc != 2) goto badarity;
9465 server.stat_numcommands = 0;
9466 server.stat_numconnections = 0;
9467 server.stat_expiredkeys = 0;
9468 server.stat_starttime = time(NULL);
9469 addReply(c,shared.ok);
9470 } else {
9471 addReplySds(c,sdscatprintf(sdsempty(),
9472 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9473 }
9474 return;
9475
9476badarity:
9477 addReplySds(c,sdscatprintf(sdsempty(),
9478 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9479 (char*) c->argv[1]->ptr));
9480}
9481
befec3cd 9482/* =========================== Pubsub implementation ======================== */
9483
ffc6b7f8 9484static void freePubsubPattern(void *p) {
9485 pubsubPattern *pat = p;
9486
9487 decrRefCount(pat->pattern);
9488 zfree(pat);
9489}
9490
9491static int listMatchPubsubPattern(void *a, void *b) {
9492 pubsubPattern *pa = a, *pb = b;
9493
9494 return (pa->client == pb->client) &&
9495 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9496}
9497
9498/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9499 * 0 if the client was already subscribed to that channel. */
9500static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 9501 struct dictEntry *de;
9502 list *clients = NULL;
9503 int retval = 0;
9504
ffc6b7f8 9505 /* Add the channel to the client -> channels hash table */
9506 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 9507 retval = 1;
ffc6b7f8 9508 incrRefCount(channel);
9509 /* Add the client to the channel -> list of clients hash table */
9510 de = dictFind(server.pubsub_channels,channel);
befec3cd 9511 if (de == NULL) {
9512 clients = listCreate();
ffc6b7f8 9513 dictAdd(server.pubsub_channels,channel,clients);
9514 incrRefCount(channel);
befec3cd 9515 } else {
9516 clients = dictGetEntryVal(de);
9517 }
9518 listAddNodeTail(clients,c);
9519 }
9520 /* Notify the client */
9521 addReply(c,shared.mbulk3);
9522 addReply(c,shared.subscribebulk);
ffc6b7f8 9523 addReplyBulk(c,channel);
9524 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 9525 return retval;
9526}
9527
ffc6b7f8 9528/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9529 * 0 if the client was not subscribed to the specified channel. */
9530static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 9531 struct dictEntry *de;
9532 list *clients;
9533 listNode *ln;
9534 int retval = 0;
9535
ffc6b7f8 9536 /* Remove the channel from the client -> channels hash table */
9537 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 9538 we have in the hash tables. Protect it... */
ffc6b7f8 9539 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 9540 retval = 1;
ffc6b7f8 9541 /* Remove the client from the channel -> clients list hash table */
9542 de = dictFind(server.pubsub_channels,channel);
befec3cd 9543 assert(de != NULL);
9544 clients = dictGetEntryVal(de);
9545 ln = listSearchKey(clients,c);
9546 assert(ln != NULL);
9547 listDelNode(clients,ln);
ff767a75 9548 if (listLength(clients) == 0) {
9549 /* Free the list and associated hash entry at all if this was
9550 * the latest client, so that it will be possible to abuse
ffc6b7f8 9551 * Redis PUBSUB creating millions of channels. */
9552 dictDelete(server.pubsub_channels,channel);
ff767a75 9553 }
befec3cd 9554 }
9555 /* Notify the client */
9556 if (notify) {
9557 addReply(c,shared.mbulk3);
9558 addReply(c,shared.unsubscribebulk);
ffc6b7f8 9559 addReplyBulk(c,channel);
9560 addReplyLong(c,dictSize(c->pubsub_channels)+
9561 listLength(c->pubsub_patterns));
9562
9563 }
9564 decrRefCount(channel); /* it is finally safe to release it */
9565 return retval;
9566}
9567
9568/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9569static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9570 int retval = 0;
9571
9572 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9573 retval = 1;
9574 pubsubPattern *pat;
9575 listAddNodeTail(c->pubsub_patterns,pattern);
9576 incrRefCount(pattern);
9577 pat = zmalloc(sizeof(*pat));
9578 pat->pattern = getDecodedObject(pattern);
9579 pat->client = c;
9580 listAddNodeTail(server.pubsub_patterns,pat);
9581 }
9582 /* Notify the client */
9583 addReply(c,shared.mbulk3);
9584 addReply(c,shared.psubscribebulk);
9585 addReplyBulk(c,pattern);
9586 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9587 return retval;
9588}
9589
9590/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9591 * 0 if the client was not subscribed to the specified channel. */
9592static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9593 listNode *ln;
9594 pubsubPattern pat;
9595 int retval = 0;
9596
9597 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9598 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9599 retval = 1;
9600 listDelNode(c->pubsub_patterns,ln);
9601 pat.client = c;
9602 pat.pattern = pattern;
9603 ln = listSearchKey(server.pubsub_patterns,&pat);
9604 listDelNode(server.pubsub_patterns,ln);
9605 }
9606 /* Notify the client */
9607 if (notify) {
9608 addReply(c,shared.mbulk3);
9609 addReply(c,shared.punsubscribebulk);
9610 addReplyBulk(c,pattern);
9611 addReplyLong(c,dictSize(c->pubsub_channels)+
9612 listLength(c->pubsub_patterns));
befec3cd 9613 }
ffc6b7f8 9614 decrRefCount(pattern);
befec3cd 9615 return retval;
9616}
9617
ffc6b7f8 9618/* Unsubscribe from all the channels. Return the number of channels the
9619 * client was subscribed from. */
9620static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9621 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 9622 dictEntry *de;
9623 int count = 0;
9624
9625 while((de = dictNext(di)) != NULL) {
ffc6b7f8 9626 robj *channel = dictGetEntryKey(de);
befec3cd 9627
ffc6b7f8 9628 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 9629 }
9630 dictReleaseIterator(di);
9631 return count;
9632}
9633
ffc6b7f8 9634/* Unsubscribe from all the patterns. Return the number of patterns the
9635 * client was subscribed from. */
9636static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9637 listNode *ln;
9638 listIter li;
9639 int count = 0;
9640
9641 listRewind(c->pubsub_patterns,&li);
9642 while ((ln = listNext(&li)) != NULL) {
9643 robj *pattern = ln->value;
9644
9645 count += pubsubUnsubscribePattern(c,pattern,notify);
9646 }
9647 return count;
9648}
9649
befec3cd 9650/* Publish a message */
ffc6b7f8 9651static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 9652 int receivers = 0;
9653 struct dictEntry *de;
ffc6b7f8 9654 listNode *ln;
9655 listIter li;
befec3cd 9656
ffc6b7f8 9657 /* Send to clients listening for that channel */
9658 de = dictFind(server.pubsub_channels,channel);
befec3cd 9659 if (de) {
9660 list *list = dictGetEntryVal(de);
9661 listNode *ln;
9662 listIter li;
9663
9664 listRewind(list,&li);
9665 while ((ln = listNext(&li)) != NULL) {
9666 redisClient *c = ln->value;
9667
9668 addReply(c,shared.mbulk3);
9669 addReply(c,shared.messagebulk);
ffc6b7f8 9670 addReplyBulk(c,channel);
befec3cd 9671 addReplyBulk(c,message);
9672 receivers++;
9673 }
9674 }
ffc6b7f8 9675 /* Send to clients listening to matching channels */
9676 if (listLength(server.pubsub_patterns)) {
9677 listRewind(server.pubsub_patterns,&li);
9678 channel = getDecodedObject(channel);
9679 while ((ln = listNext(&li)) != NULL) {
9680 pubsubPattern *pat = ln->value;
9681
9682 if (stringmatchlen((char*)pat->pattern->ptr,
9683 sdslen(pat->pattern->ptr),
9684 (char*)channel->ptr,
9685 sdslen(channel->ptr),0)) {
9686 addReply(pat->client,shared.mbulk3);
9687 addReply(pat->client,shared.messagebulk);
9688 addReplyBulk(pat->client,channel);
9689 addReplyBulk(pat->client,message);
9690 receivers++;
9691 }
9692 }
9693 decrRefCount(channel);
9694 }
befec3cd 9695 return receivers;
9696}
9697
9698static void subscribeCommand(redisClient *c) {
9699 int j;
9700
9701 for (j = 1; j < c->argc; j++)
ffc6b7f8 9702 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 9703}
9704
9705static void unsubscribeCommand(redisClient *c) {
9706 if (c->argc == 1) {
ffc6b7f8 9707 pubsubUnsubscribeAllChannels(c,1);
9708 return;
9709 } else {
9710 int j;
9711
9712 for (j = 1; j < c->argc; j++)
9713 pubsubUnsubscribeChannel(c,c->argv[j],1);
9714 }
9715}
9716
9717static void psubscribeCommand(redisClient *c) {
9718 int j;
9719
9720 for (j = 1; j < c->argc; j++)
9721 pubsubSubscribePattern(c,c->argv[j]);
9722}
9723
9724static void punsubscribeCommand(redisClient *c) {
9725 if (c->argc == 1) {
9726 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 9727 return;
9728 } else {
9729 int j;
9730
9731 for (j = 1; j < c->argc; j++)
ffc6b7f8 9732 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 9733 }
9734}
9735
9736static void publishCommand(redisClient *c) {
9737 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9738 addReplyLong(c,receivers);
9739}
9740
7f957c92 9741/* ================================= Debugging ============================== */
9742
9743static void debugCommand(redisClient *c) {
9744 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9745 *((char*)-1) = 'x';
210e29f7 9746 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9747 if (rdbSave(server.dbfilename) != REDIS_OK) {
9748 addReply(c,shared.err);
9749 return;
9750 }
9751 emptyDb();
9752 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9753 addReply(c,shared.err);
9754 return;
9755 }
9756 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9757 addReply(c,shared.ok);
71c2b467 9758 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9759 emptyDb();
9760 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9761 addReply(c,shared.err);
9762 return;
9763 }
9764 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9765 addReply(c,shared.ok);
333298da 9766 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9767 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9768 robj *key, *val;
9769
9770 if (!de) {
9771 addReply(c,shared.nokeyerr);
9772 return;
9773 }
9774 key = dictGetEntryKey(de);
9775 val = dictGetEntryVal(de);
59146ef3 9776 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9777 key->storage == REDIS_VM_SWAPPING)) {
07efaf74 9778 char *strenc;
9779 char buf[128];
9780
9781 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9782 strenc = strencoding[val->encoding];
9783 } else {
9784 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9785 strenc = buf;
9786 }
ace06542 9787 addReplySds(c,sdscatprintf(sdsempty(),
9788 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 9789 "encoding:%s serializedlength:%lld\r\n",
682ac724 9790 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 9791 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 9792 } else {
9793 addReplySds(c,sdscatprintf(sdsempty(),
9794 "+Key at:%p refcount:%d, value swapped at: page %llu "
9795 "using %llu pages\r\n",
9796 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9797 (unsigned long long) key->vm.usedpages));
9798 }
78ebe4c8 9799 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
9800 lookupKeyRead(c->db,c->argv[2]);
9801 addReply(c,shared.ok);
7d30035d 9802 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9803 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9804 robj *key, *val;
9805
9806 if (!server.vm_enabled) {
9807 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9808 return;
9809 }
9810 if (!de) {
9811 addReply(c,shared.nokeyerr);
9812 return;
9813 }
9814 key = dictGetEntryKey(de);
9815 val = dictGetEntryVal(de);
4ef8de8a 9816 /* If the key is shared we want to create a copy */
9817 if (key->refcount > 1) {
9818 robj *newkey = dupStringObject(key);
9819 decrRefCount(key);
9820 key = dictGetEntryKey(de) = newkey;
9821 }
9822 /* Swap it */
7d30035d 9823 if (key->storage != REDIS_VM_MEMORY) {
9824 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 9825 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 9826 dictGetEntryVal(de) = NULL;
9827 addReply(c,shared.ok);
9828 } else {
9829 addReply(c,shared.err);
9830 }
7f957c92 9831 } else {
333298da 9832 addReplySds(c,sdsnew(
bdcb92f2 9833 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 9834 }
9835}
56906eef 9836
6c96ba7d 9837static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 9838 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
6c96ba7d 9839 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
dfc5e96c 9840#ifdef HAVE_BACKTRACE
9841 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9842 *((char*)-1) = 'x';
9843#endif
9844}
9845
bcfc686d 9846/* =================================== Main! ================================ */
56906eef 9847
bcfc686d 9848#ifdef __linux__
/* Reads the first line of /proc/sys/vm/overcommit_memory and returns the
 * integer it starts with.
 *
 * Returns -1 when the file cannot be opened, when nothing can be read from
 * it, or when the line does not begin with a number. The last case matters
 * because a parse failure must not be mistaken for the real value 0. */
int linuxOvercommitMemoryValue(void) {
    char buf[64];
    char *line, *end;
    long value;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (!fp) return -1;
    line = fgets(buf,sizeof(buf),fp);
    fclose(fp);
    if (line == NULL) return -1;

    value = strtol(buf,&end,10);
    if (end == buf) return -1; /* no digits were consumed */
    return (int)value;
}
9862
9863void linuxOvercommitMemoryWarning(void) {
9864 if (linuxOvercommitMemoryValue() == 0) {
9865 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low condition memory. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
9866 }
9867}
9868#endif /* __linux__ */
9869
9870static void daemonize(void) {
9871 int fd;
9872 FILE *fp;
9873
9874 if (fork() != 0) exit(0); /* parent exits */
9875 setsid(); /* create a new session */
9876
9877 /* Every output goes to /dev/null. If Redis is daemonized but
9878 * the 'logfile' is set to 'stdout' in the configuration file
9879 * it will not log at all. */
9880 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
9881 dup2(fd, STDIN_FILENO);
9882 dup2(fd, STDOUT_FILENO);
9883 dup2(fd, STDERR_FILENO);
9884 if (fd > STDERR_FILENO) close(fd);
9885 }
9886 /* Try to write the pid file */
9887 fp = fopen(server.pidfile,"w");
9888 if (fp) {
9889 fprintf(fp,"%d\n",getpid());
9890 fclose(fp);
56906eef 9891 }
56906eef 9892}
9893
42ab0172
AO
9894static void version() {
9895 printf("Redis server version %s\n", REDIS_VERSION);
9896 exit(0);
9897}
9898
723fb69b
AO
/* Writes the two-line command line synopsis to standard error and
 * terminates the process with exit status 1. Never returns. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
    fputs(" ./redis-server - (read config from stdin)\n",stderr);
    exit(1);
}
9904
bcfc686d 9905int main(int argc, char **argv) {
9651a787 9906 time_t start;
9907
bcfc686d 9908 initServerConfig();
9909 if (argc == 2) {
44efe66e 9910 if (strcmp(argv[1], "-v") == 0 ||
9911 strcmp(argv[1], "--version") == 0) version();
9912 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 9913 resetServerSaveParams();
9914 loadServerConfig(argv[1]);
723fb69b
AO
9915 } else if ((argc > 2)) {
9916 usage();
bcfc686d 9917 } else {
9918 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
9919 }
bcfc686d 9920 if (server.daemonize) daemonize();
71c54b21 9921 initServer();
bcfc686d 9922 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
9923#ifdef __linux__
9924 linuxOvercommitMemoryWarning();
9925#endif
9651a787 9926 start = time(NULL);
bcfc686d 9927 if (server.appendonly) {
9928 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 9929 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 9930 } else {
9931 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 9932 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 9933 }
bcfc686d 9934 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 9935 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 9936 aeMain(server.el);
9937 aeDeleteEventLoop(server.el);
9938 return 0;
9939}
9940
9941/* ============================= Backtrace support ========================= */
9942
9943#ifdef HAVE_BACKTRACE
9944static char *findFuncName(void *pointer, unsigned long *offset);
9945
56906eef 9946static void *getMcontextEip(ucontext_t *uc) {
9947#if defined(__FreeBSD__)
9948 return (void*) uc->uc_mcontext.mc_eip;
9949#elif defined(__dietlibc__)
9950 return (void*) uc->uc_mcontext.eip;
06db1f50 9951#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 9952 #if __x86_64__
9953 return (void*) uc->uc_mcontext->__ss.__rip;
9954 #else
56906eef 9955 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 9956 #endif
06db1f50 9957#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 9958 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 9959 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 9960 #else
9961 return (void*) uc->uc_mcontext->__ss.__eip;
e0a62c7f 9962 #endif
54bac49d 9963#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 9964 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 9965#elif defined(__ia64__) /* Linux IA64 */
9966 return (void*) uc->uc_mcontext.sc_ip;
9967#else
9968 return NULL;
56906eef 9969#endif
9970}
9971
9972static void segvHandler(int sig, siginfo_t *info, void *secret) {
9973 void *trace[100];
9974 char **messages = NULL;
9975 int i, trace_size = 0;
9976 unsigned long offset=0;
56906eef 9977 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 9978 sds infostring;
56906eef 9979 REDIS_NOTUSED(info);
9980
9981 redisLog(REDIS_WARNING,
9982 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 9983 infostring = genRedisInfoString();
9984 redisLog(REDIS_WARNING, "%s",infostring);
9985 /* It's not safe to sdsfree() the returned string under memory
9986 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 9987
56906eef 9988 trace_size = backtrace(trace, 100);
de96dbfe 9989 /* overwrite sigaction with caller's address */
b91cf5ef 9990 if (getMcontextEip(uc) != NULL) {
9991 trace[1] = getMcontextEip(uc);
9992 }
56906eef 9993 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 9994
d76412d1 9995 for (i=1; i<trace_size; ++i) {
56906eef 9996 char *fn = findFuncName(trace[i], &offset), *p;
9997
9998 p = strchr(messages[i],'+');
9999 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
10000 redisLog(REDIS_WARNING,"%s", messages[i]);
10001 } else {
10002 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10003 }
10004 }
b177fd30 10005 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 10006 _exit(0);
fe3bbfbe 10007}
56906eef 10008
10009static void setupSigSegvAction(void) {
10010 struct sigaction act;
10011
10012 sigemptyset (&act.sa_mask);
10013 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10014 * is used. Otherwise, sa_handler is used */
10015 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10016 act.sa_sigaction = segvHandler;
10017 sigaction (SIGSEGV, &act, NULL);
10018 sigaction (SIGBUS, &act, NULL);
12fea928 10019 sigaction (SIGFPE, &act, NULL);
10020 sigaction (SIGILL, &act, NULL);
10021 sigaction (SIGBUS, &act, NULL);
e65fdc78 10022 return;
56906eef 10023}
e65fdc78 10024
bcfc686d 10025#include "staticsymbols.h"
10026/* This function try to convert a pointer into a function name. It's used in
10027 * oreder to provide a backtrace under segmentation fault that's able to
10028 * display functions declared as static (otherwise the backtrace is useless). */
10029static char *findFuncName(void *pointer, unsigned long *offset){
10030 int i, ret = -1;
10031 unsigned long off, minoff = 0;
ed9b544e 10032
bcfc686d 10033 /* Try to match against the Symbol with the smallest offset */
10034 for (i=0; symsTable[i].pointer; i++) {
10035 unsigned long lp = (unsigned long) pointer;
0bc03378 10036
bcfc686d 10037 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10038 off=lp-symsTable[i].pointer;
10039 if (ret < 0 || off < minoff) {
10040 minoff=off;
10041 ret=i;
10042 }
10043 }
0bc03378 10044 }
bcfc686d 10045 if (ret == -1) return NULL;
10046 *offset = minoff;
10047 return symsTable[ret].name;
0bc03378 10048}
bcfc686d 10049#else /* HAVE_BACKTRACE */
/* Variant used when HAVE_BACKTRACE is not defined: no handler is
 * installed, the function exists only so that callers still link. */
static void setupSigSegvAction(void) {
    /* Intentionally empty. */
}
bcfc686d 10052#endif /* HAVE_BACKTRACE */
0bc03378 10053
ed9b544e 10054
ed9b544e 10055
bcfc686d 10056/* The End */
10057
10058
ed9b544e 10059