]> git.saurik.com Git - redis.git/blame - redis.c
Initial support for quoted strings in redis-cli
[redis.git] / redis.c
CommitLineData
ed9b544e 1/*
12d090d2 2 * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
ed9b544e 3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * * Neither the name of Redis nor the names of its contributors may be used
14 * to endorse or promote products derived from this software without
15 * specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 * POSSIBILITY OF SUCH DAMAGE.
28 */
29
24df7698 30#define REDIS_VERSION "1.3.10"
23d4709d 31
32#include "fmacros.h"
fbf9bcdb 33#include "config.h"
ed9b544e 34
35#include <stdio.h>
36#include <stdlib.h>
37#include <string.h>
38#include <time.h>
39#include <unistd.h>
c9468bcf 40#define __USE_POSIX199309
54bac49d 41#define __USE_UNIX98
ed9b544e 42#include <signal.h>
fbf9bcdb 43
44#ifdef HAVE_BACKTRACE
c9468bcf 45#include <execinfo.h>
46#include <ucontext.h>
fbf9bcdb 47#endif /* HAVE_BACKTRACE */
48
ed9b544e 49#include <sys/wait.h>
50#include <errno.h>
51#include <assert.h>
52#include <ctype.h>
53#include <stdarg.h>
54#include <inttypes.h>
55#include <arpa/inet.h>
56#include <sys/stat.h>
57#include <fcntl.h>
58#include <sys/time.h>
59#include <sys/resource.h>
2895e862 60#include <sys/uio.h>
f78fd11b 61#include <limits.h>
a7866db6 62#include <math.h>
92f8e882 63#include <pthread.h>
0bc1b2f6 64
65#if defined(__sun)
5043dff3 66#include "solarisfixes.h"
67#endif
ed9b544e 68
c9468bcf 69#include "redis.h"
ed9b544e 70#include "ae.h" /* Event driven programming library */
71#include "sds.h" /* Dynamic safe strings */
72#include "anet.h" /* Networking the easy way */
73#include "dict.h" /* Hash tables */
74#include "adlist.h" /* Linked lists */
75#include "zmalloc.h" /* total memory usage aware version of malloc/free */
5f5b9840 76#include "lzf.h" /* LZF compression library */
77#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
5234952b 78#include "zipmap.h"
ed9b544e 79
80/* Error codes */
81#define REDIS_OK 0
82#define REDIS_ERR -1
83
84/* Static server configuration */
85#define REDIS_SERVERPORT 6379 /* TCP port */
86#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
6208b3a7 87#define REDIS_IOBUF_LEN 1024
ed9b544e 88#define REDIS_LOADBUF_LEN 1024
248ea310 89#define REDIS_STATIC_ARGS 8
ed9b544e 90#define REDIS_DEFAULT_DBNUM 16
91#define REDIS_CONFIGLINE_MAX 1024
92#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
93#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
8ca3e9d1 94#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
6f376729 95#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
2895e862 96#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
97
98/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
99#define REDIS_WRITEV_THRESHOLD 3
100/* Max number of iovecs used for each writev call */
101#define REDIS_WRITEV_IOVEC_COUNT 256
ed9b544e 102
103/* Hash table parameters */
104#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
ed9b544e 105
106/* Command flags */
3fd78bcd 107#define REDIS_CMD_BULK 1 /* Bulk write command */
108#define REDIS_CMD_INLINE 2 /* Inline command */
109/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
110 this flag will return an error when the 'maxmemory' option is set in the
111 config file and the server is using more than maxmemory bytes of memory.
112 In short these commands are denied on low memory conditions. */
113#define REDIS_CMD_DENYOOM 4
4005fef1 114#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
ed9b544e 115
116/* Object types */
117#define REDIS_STRING 0
118#define REDIS_LIST 1
119#define REDIS_SET 2
1812e024 120#define REDIS_ZSET 3
121#define REDIS_HASH 4
f78fd11b 122
5234952b 123/* Object encodings. Some kinds of objects, like Strings and Hashes, can be
124 * internally represented in multiple ways. The 'encoding' field of the object
125 * is set to one of these values for this object. */
942a3961 126#define REDIS_ENCODING_RAW 0 /* Raw representation */
127#define REDIS_ENCODING_INT 1 /* Encoded as integer */
5234952b 128#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
129#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
942a3961 130
/* Human readable names of the object encodings. The entries are listed in the
 * same order as the REDIS_ENCODING_* values (RAW=0, INT=1, ZIPMAP=2, HT=3), so
 * the table is presumably indexed by an object's 'encoding' field; keep both
 * lists in sync when adding an encoding. */
07efaf74 131static char* strencoding[] = {
132 "raw", "int", "zipmap", "hashtable"
133};
134
f78fd11b 135/* Object types only used for dumping to disk */
bb32ede5 136#define REDIS_EXPIRETIME 253
ed9b544e 137#define REDIS_SELECTDB 254
138#define REDIS_EOF 255
139
f78fd11b 140/* Defines related to the dump file format. To store 32 bits lengths for short
141 * keys requires a lot of space, so we check the most significant 2 bits of
142 * the first byte to interpret the length:
143 *
144 * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
145 * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
146 * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
a4d1ba9a 147 * 11|000000 this means: specially encoded object will follow. The six bits
148 * number specify the kind of object that follows.
149 * See the REDIS_RDB_ENC_* defines.
f78fd11b 150 *
10c43610 151 * Lengths up to 63 are stored using a single byte, most DB keys, and many
152 * values, will fit inside. */
f78fd11b 153#define REDIS_RDB_6BITLEN 0
154#define REDIS_RDB_14BITLEN 1
155#define REDIS_RDB_32BITLEN 2
17be1a4a 156#define REDIS_RDB_ENCVAL 3
f78fd11b 157#define REDIS_RDB_LENERR UINT_MAX
158
a4d1ba9a 159/* When a length of a string object stored on disk has the first two bits
160 * set, the remaining six bits specify a special encoding for the object
161 * according to the following defines: */
162#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
163#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
164#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
774e3047 165#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
a4d1ba9a 166
75680a3c 167/* Virtual memory object->where field. */
168#define REDIS_VM_MEMORY 0 /* The object is on memory */
169#define REDIS_VM_SWAPPED 1 /* The object is on disk */
170#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
171#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
172
06224fec 173/* Virtual memory static configuration stuff.
174 * Check vmFindContiguousPages() to know more about these magic numbers. */
175#define REDIS_VM_MAX_NEAR_PAGES 65536
176#define REDIS_VM_MAX_RANDOM_JUMP 4096
92f8e882 177#define REDIS_VM_MAX_THREADS 32
bcaa7a4f 178#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
f6c0bba8 179/* The following is the *percentage* of completed I/O jobs to process when the
180 * handler is called. While Virtual Memory I/O operations are performed by
181 * threads, these operations must be processed by the main thread when completed
182 * in order to take effect. */
c953f24b 183#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
06224fec 184
ed9b544e 185/* Client flags */
d5d55fc3 186#define REDIS_SLAVE 1 /* This client is a slave server */
187#define REDIS_MASTER 2 /* This client is a master server */
188#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
189#define REDIS_MULTI 8 /* This client is in a MULTI context */
190#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
191#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
ed9b544e 192
40d224a9 193/* Slave replication state - slave side */
ed9b544e 194#define REDIS_REPL_NONE 0 /* No active replication */
195#define REDIS_REPL_CONNECT 1 /* Must connect to master */
196#define REDIS_REPL_CONNECTED 2 /* Connected to master */
197
40d224a9 198/* Slave replication state - from the point of view of master
199 * Note that in SEND_BULK and ONLINE state the slave receives new updates
200 * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
201 * to start the next background saving in order to send updates to it. */
202#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
203#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
204#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
205#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
206
ed9b544e 207/* List related stuff */
208#define REDIS_HEAD 0
209#define REDIS_TAIL 1
210
211/* Sort operations */
212#define REDIS_SORT_GET 0
443c6409 213#define REDIS_SORT_ASC 1
214#define REDIS_SORT_DESC 2
ed9b544e 215#define REDIS_SORTKEY_MAX 1024
216
217/* Log levels */
218#define REDIS_DEBUG 0
f870935d 219#define REDIS_VERBOSE 1
220#define REDIS_NOTICE 2
221#define REDIS_WARNING 3
ed9b544e 222
223/* Anti-warning macro... */
224#define REDIS_NOTUSED(V) ((void) V)
225
6b47e12e 226#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
227#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
ed9b544e 228
48f0308a 229/* Append only defines */
230#define APPENDFSYNC_NO 0
231#define APPENDFSYNC_ALWAYS 1
232#define APPENDFSYNC_EVERYSEC 2
233
cbba7dd7 234/* Hashes related defaults */
235#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
236#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
237
dfc5e96c 238/* We can print the stacktrace, so our assert is defined this way:
 *
 * redisAssert(_e) evaluates _e; when it is false it calls _redisAssert() with
 * the stringified expression, the file name and the line number, and then
 * terminates the process with _exit(1).
 *
 * redisPanic(_e) unconditionally calls _redisPanic() with the stringified
 * argument, the file name and the line number, and then calls _exit(1).
 * NOTE(review): its expansion is a bare comma expression without enclosing
 * parentheses, so it is only safe when used as a full statement. */
478c2c6f 239#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
c651fd9e 240#define redisPanic(_e) _redisPanic(#_e,__FILE__,__LINE__),_exit(1)
6c96ba7d 241static void _redisAssert(char *estr, char *file, int line);
c651fd9e 242static void _redisPanic(char *msg, char *file, int line);
dfc5e96c 243
ed9b544e 244/*================================= Data types ============================== */
245
246/* A redis object, that is a type able to hold a string / list / set */
75680a3c 247
248/* The VM object structure: on-disk position of an object (first page and
 * number of pages) plus its last access time.
 * NOTE(review): the trailing declarator below also defines a file-scope
 * variable named 'vm' of this type. It looks unintentional; confirm that
 * nothing references it before removing it. */
249struct redisObjectVM {
3a66edc7 250 off_t page; /* the page at which the object is stored on disk */
251 off_t usedpages; /* number of pages used on disk */
252 time_t atime; /* Last access time */
75680a3c 253} vm;
254
255/* The actual Redis Object */
ed9b544e 256typedef struct redisObject {
ed9b544e 257 void *ptr; /* Payload; its meaning depends on type and encoding */
942a3961 258 unsigned char type; /* REDIS_STRING, REDIS_LIST, ... */
259 unsigned char encoding; /* One of the REDIS_ENCODING_* values */
d894161b 260 unsigned char storage; /* If this object is a key, where is the value?
261 * REDIS_VM_MEMORY, REDIS_VM_SWAPPED, ... */
262 unsigned char vtype; /* If this object is a key, and value is swapped out,
263 * this is the type of the swapped out object. */
ed9b544e 264 int refcount;
75680a3c 265 /* VM fields, these are only allocated if VM is active, otherwise the
266 * object allocation function will just allocate
267 * sizeof(redisObject) minus sizeof(redisObjectVM), so using
268 * Redis without VM active will not have any overhead.
 * NOTE(review): that truncated allocation presumably relies on 'vm'
 * staying the last member of the structure. */
269 struct redisObjectVM vm;
ed9b544e 270} robj;
271
/* Macro used to initialize a Redis object allocated on the stack.
 * Note that this macro is kept near the structure definition to make sure
 * we'll update it when the structure is changed, to avoid bugs like
 * bug #85 introduced exactly in this way.
 *
 * _var is an lvalue of type robj (not a pointer); _ptr is the value stored in
 * its 'ptr' field. The object is set up as a raw-encoded REDIS_STRING with a
 * reference count of 1. The 'storage' field is written only when VM is
 * enabled, and the remaining fields are left untouched.
 *
 * The expansion deliberately does NOT end with a semicolon: the caller
 * supplies it, so the macro behaves like a single statement and can be used
 * as the body of an if/else without breaking the else branch. Arguments are
 * parenthesized so that arbitrary expressions can be passed safely. */
#define initStaticStringObject(_var,_ptr) do { \
    (_var).refcount = 1; \
    (_var).type = REDIS_STRING; \
    (_var).encoding = REDIS_ENCODING_RAW; \
    (_var).ptr = (_ptr); \
    if (server.vm_enabled) (_var).storage = REDIS_VM_MEMORY; \
} while(0)
283
/* One Redis database: the key space plus the per-database dictionaries used
 * for expires and for clients blocked on keys. */
3305306f 284typedef struct redisDb {
4409877e 285 dict *dict; /* The keyspace for this DB */
286 dict *expires; /* Timeout of keys with a timeout set */
287 dict *blockingkeys; /* Keys with clients waiting for data (BLPOP) */
d5d55fc3 288 dict *io_keys; /* Keys with clients waiting for VM I/O */
3305306f 289 int id; /* Database number (presumably its index in server.db) */
290} redisDb;
291
6e469882 292/* Client MULTI/EXEC state */
/* A single command recorded in a client's MULTI state: its argument vector
 * and the command table entry it refers to. */
293typedef struct multiCmd {
294 robj **argv; /* Command arguments */
295 int argc; /* Number of elements in argv */
296 struct redisCommand *cmd; /* Command table entry */
297} multiCmd;
298
/* The list of commands accumulated by a client in a MULTI context. */
299typedef struct multiState {
300 multiCmd *commands; /* Array of MULTI commands */
301 int count; /* Total number of MULTI commands */
302} multiState;
303
ed9b544e 304/* With multiplexing we need to keep per-client state.
305 * Clients are kept in a linked list. */
306typedef struct redisClient {
307 int fd;
3305306f 308 redisDb *db;
ed9b544e 309 int dictid;
310 sds querybuf;
e8a74421 311 robj **argv, **mbargv;
312 int argc, mbargc;
40d224a9 313 int bulklen; /* bulk read len. -1 if not in bulk read mode */
e8a74421 314 int multibulk; /* multi bulk command format active */
ed9b544e 315 list *reply;
316 int sentlen;
317 time_t lastinteraction; /* time of the last interaction, used for timeout */
d5d55fc3 318 int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
40d224a9 319 int slaveseldb; /* slave selected db, if this client is a slave */
320 int authenticated; /* when requirepass is non-NULL */
321 int replstate; /* replication state if this is a slave */
322 int repldbfd; /* replication DB file descriptor */
6e469882 323 long repldboff; /* replication DB file offset */
40d224a9 324 off_t repldbsize; /* replication DB file size */
6e469882 325 multiState mstate; /* MULTI/EXEC state */
d5d55fc3 326 robj **blockingkeys; /* The keys we are waiting on to terminate a blocking
4409877e 327 * operation such as BLPOP. Otherwise NULL. */
b177fd30 328 int blockingkeysnum; /* Number of blocking keys */
4409877e 329 time_t blockingto; /* Blocking operation timeout. If UNIX current time
330 * is >= blockingto then the operation timed out. */
92f8e882 331 list *io_keys; /* Keys this client is waiting to be loaded from the
332 * swap file in order to continue. */
ffc6b7f8 333 dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
334 list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
ed9b544e 335} redisClient;
336
/* One save point: a (seconds, changes) pair. The server keeps an array of
 * these in server.saveparams. Presumably a background save is triggered once
 * 'changes' modifications happened within 'seconds' seconds -- confirm against
 * the code that reads the array. */
337struct saveparam {
338 time_t seconds;
339 int changes;
340};
341
342/* Global server state structure */
343struct redisServer {
344 int port;
345 int fd;
3305306f 346 redisDb *db;
ed9b544e 347 long long dirty; /* changes to DB from the last save */
348 list *clients;
87eca727 349 list *slaves, *monitors;
ed9b544e 350 char neterr[ANET_ERR_LEN];
351 aeEventLoop *el;
352 int cronloops; /* number of times the cron function run */
353 list *objfreelist; /* A list of freed objects to avoid malloc() */
354 time_t lastsave; /* Unix time of last successful save */
ed9b544e 355 /* Fields used only for stats */
356 time_t stat_starttime; /* server start time */
357 long long stat_numcommands; /* number of processed commands */
358 long long stat_numconnections; /* number of connections received */
2a6a2ed1 359 long long stat_expiredkeys; /* number of expired keys */
ed9b544e 360 /* Configuration */
361 int verbosity;
362 int glueoutputbuf;
363 int maxidletime;
364 int dbnum;
365 int daemonize;
44b38ef4 366 int appendonly;
48f0308a 367 int appendfsync;
368 time_t lastfsync;
44b38ef4 369 int appendfd;
370 int appendseldb;
ed329fcf 371 char *pidfile;
9f3c422c 372 pid_t bgsavechildpid;
9d65a1bb 373 pid_t bgrewritechildpid;
374 sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
ed9b544e 375 struct saveparam *saveparams;
376 int saveparamslen;
377 char *logfile;
378 char *bindaddr;
379 char *dbfilename;
44b38ef4 380 char *appendfilename;
abcb223e 381 char *requirepass;
121f70cf 382 int rdbcompression;
8ca3e9d1 383 int activerehashing;
ed9b544e 384 /* Replication related */
385 int isslave;
d0ccebcf 386 char *masterauth;
ed9b544e 387 char *masterhost;
388 int masterport;
40d224a9 389 redisClient *master; /* client that is master for this slave */
ed9b544e 390 int replstate;
285add55 391 unsigned int maxclients;
4ef8de8a 392 unsigned long long maxmemory;
d5d55fc3 393 unsigned int blpop_blocked_clients;
394 unsigned int vm_blocked_clients;
ed9b544e 395 /* Sort parameters - qsort_r() is only available under BSD so we
396 * have to keep this state global, in order to pass it to sortCompare() */
397 int sort_desc;
398 int sort_alpha;
399 int sort_bypattern;
75680a3c 400 /* Virtual memory configuration */
401 int vm_enabled;
054e426d 402 char *vm_swap_file;
75680a3c 403 off_t vm_page_size;
404 off_t vm_pages;
4ef8de8a 405 unsigned long long vm_max_memory;
cbba7dd7 406 /* Hashes config */
407 size_t hash_max_zipmap_entries;
408 size_t hash_max_zipmap_value;
75680a3c 409 /* Virtual memory state */
410 FILE *vm_fp;
411 int vm_fd;
412 off_t vm_next_page; /* Next probably empty page */
413 off_t vm_near_pages; /* Number of pages allocated sequentially */
06224fec 414 unsigned char *vm_bitmap; /* Bitmap of free/used pages */
3a66edc7 415 time_t unixtime; /* Unix time sampled every second. */
92f8e882 416 /* Virtual memory I/O threads stuff */
92f8e882 417 /* An I/O thread processes an element taken from the io_newjobs queue and
996cb5f7 418 * puts the result of the operation in the io_processed list. While the
419 * job is being processed, it's put on the io_processing queue. */
420 list *io_newjobs; /* List of VM I/O jobs yet to be processed */
421 list *io_processing; /* List of VM I/O jobs being processed */
422 list *io_processed; /* List of VM I/O jobs already processed */
d5d55fc3 423 list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
996cb5f7 424 pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
a5819310 425 pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
426 pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
bcaa7a4f 427 pthread_attr_t io_threads_attr; /* attributes for threads creation */
92f8e882 428 int io_active_threads; /* Number of running I/O threads */
429 int vm_max_threads; /* Max number of I/O threads running at the same time */
996cb5f7 430 /* Our main thread is blocked on the event loop, looking for sockets ready
431 * to be read or written, so when a threaded I/O operation is ready to be
432 * processed by the main thread, the I/O thread will use a unix pipe to
433 * awake the main thread. The following are the two pipe FDs. */
434 int io_ready_pipe_read;
435 int io_ready_pipe_write;
7d98e08c 436 /* Virtual memory stats */
437 unsigned long long vm_stats_used_pages;
438 unsigned long long vm_stats_swapped_objects;
439 unsigned long long vm_stats_swapouts;
440 unsigned long long vm_stats_swapins;
befec3cd 441 /* Pubsub */
ffc6b7f8 442 dict *pubsub_channels; /* Map channels to list of subscribed clients */
443 list *pubsub_patterns; /* A list of pubsub_patterns */
befec3cd 444 /* Misc */
b9bc0eef 445 FILE *devnull;
ed9b544e 446};
447
/* Associates a client with a pattern object. NOTE(review): presumably these
 * are the elements stored in the pubsub_patterns lists -- confirm. */
ffc6b7f8 448typedef struct pubsubPattern {
449 redisClient *client;
450 robj *pattern;
451} pubsubPattern;
452
/* Signature shared by every command implementation: it receives the client
 * that issued the command and returns nothing. */
ed9b544e 453typedef void redisCommandProc(redisClient *c);
/* One entry of the command table (see cmdTable). */
454struct redisCommand {
455 char *name; /* Command name, e.g. "get" */
456 redisCommandProc *proc; /* Function implementing the command */
457 int arity;
458 int flags; /* Bitwise OR of REDIS_CMD_* flags */
76583ea4
PN
459 /* Use a function to determine which keys need to be loaded
460 * in the background prior to executing this command. Takes precedence
461 * over vm_firstkey and others, ignored when NULL */
462 redisCommandProc *vm_preload_proc;
7c775e09 463 /* What keys should be loaded in background when calling this command? */
464 int vm_firstkey; /* The first argument that's a key (0 = no keys) */
465 int vm_lastkey; /* The last argument that's a key */
466 int vm_keystep; /* The step between first and last key */
ed9b544e 467};
468
/* A (name, pointer) pair. NOTE(review): presumably a function name and its
 * address, used to resolve symbols when printing a stack trace -- confirm
 * against the code that uses this table. */
de96dbfe 469struct redisFunctionSym {
470 char *name;
56906eef 471 unsigned long pointer;
de96dbfe 472};
473
/* An element handled by SORT: the object itself plus, in a union, either a
 * numeric score or another object to compare by. Which union member is valid
 * is not recorded here; presumably it depends on the server.sort_* flags --
 * confirm against sortCompare(). */
ed9b544e 474typedef struct _redisSortObject {
475 robj *obj;
476 union {
477 double score;
478 robj *cmpobj;
479 } u;
480} redisSortObject;
481
/* An operation requested in a SORT call together with its pattern.
 * 'type' is presumably one of the REDIS_SORT_* constants -- confirm. */
482typedef struct _redisSortOperation {
483 int type;
484 robj *pattern;
485} redisSortOperation;
486
6b47e12e 487/* ZSETs use a specialized version of Skiplists */
488
/* A skiplist node holding one (score, object) pair. 'forward' and 'span' are
 * pointers to separately sized storage, presumably one entry per level of the
 * node -- confirm against the node creation code. */
489typedef struct zskiplistNode {
490 struct zskiplistNode **forward;
e3870fab 491 struct zskiplistNode *backward;
912b9165 492 unsigned int *span;
6b47e12e 493 double score;
494 robj *obj;
495} zskiplistNode;
496
/* The skiplist itself: header and tail nodes, a length counter and the
 * current level (see ZSKIPLIST_MAXLEVEL for the upper bound). */
497typedef struct zskiplist {
e3870fab 498 struct zskiplistNode *header, *tail;
d13f767c 499 unsigned long length;
6b47e12e 500 int level;
501} zskiplist;
502
/* A sorted set is represented by a hash table paired with a skiplist. */
1812e024 503typedef struct zset {
504 dict *dict;
6b47e12e 505 zskiplist *zsl;
1812e024 506} zset;
507
6b47e12e 508/* Our shared "common" objects: a single global instance named 'shared'
 * holding pointers to preallocated objects, plus an array of
 * REDIS_SHARED_INTEGERS integer objects. */
509
05df7621 510#define REDIS_SHARED_INTEGERS 10000
ed9b544e 511struct sharedObjectsStruct {
c937aa89 512 robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *pong, *space,
6e469882 513 *colon, *nullbulk, *nullmultibulk, *queued,
c937aa89 514 *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
515 *outofrangeerr, *plus,
ed9b544e 516 *select0, *select1, *select2, *select3, *select4,
befec3cd 517 *select5, *select6, *select7, *select8, *select9,
c8d0ea0e 518 *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
519 *mbulk4, *psubscribebulk, *punsubscribebulk,
520 *integers[REDIS_SHARED_INTEGERS];
ed9b544e 521} shared;
522
a7866db6 523/* Global vars that are actually used as constants. The following double
524 * values are used for double on-disk serialization, and are initialized
525 * at runtime to avoid strange compiler optimizations. */
526
527static double R_Zero, R_PosInf, R_NegInf, R_Nan;
528
92f8e882 529/* VM threaded I/O request message */
b9bc0eef 530#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
531#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
532#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
d5d55fc3 533typedef struct iojob {
996cb5f7 534 int type; /* Request type, REDIS_IOJOB_* */
b9bc0eef 535 redisDb *db;/* Redis database */
92f8e882 536 robj *key; /* This I/O request is about swapping this key */
b9bc0eef 537 robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
92f8e882 538 * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
539 off_t page; /* Swap page where to read/write the object */
248ea310 540 off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
996cb5f7 541 int canceled; /* True if this command was canceled by blocking side of VM */
542 pthread_t thread; /* ID of the thread processing this entry */
543} iojob;
92f8e882 544
ed9b544e 545/*================================ Prototypes =============================== */
546
547static void freeStringObject(robj *o);
548static void freeListObject(robj *o);
549static void freeSetObject(robj *o);
550static void decrRefCount(void *o);
551static robj *createObject(int type, void *ptr);
552static void freeClient(redisClient *c);
f78fd11b 553static int rdbLoad(char *filename);
ed9b544e 554static void addReply(redisClient *c, robj *obj);
555static void addReplySds(redisClient *c, sds s);
556static void incrRefCount(robj *o);
f78fd11b 557static int rdbSaveBackground(char *filename);
ed9b544e 558static robj *createStringObject(char *ptr, size_t len);
4ef8de8a 559static robj *dupStringObject(robj *o);
248ea310 560static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
44b38ef4 561static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
ed9b544e 562static int syncWithMaster(void);
05df7621 563static robj *tryObjectEncoding(robj *o);
9d65a1bb 564static robj *getDecodedObject(robj *o);
3305306f 565static int removeExpire(redisDb *db, robj *key);
566static int expireIfNeeded(redisDb *db, robj *key);
567static int deleteIfVolatile(redisDb *db, robj *key);
1b03836c 568static int deleteIfSwapped(redisDb *db, robj *key);
94754ccc 569static int deleteKey(redisDb *db, robj *key);
bb32ede5 570static time_t getExpire(redisDb *db, robj *key);
571static int setExpire(redisDb *db, robj *key, time_t when);
a3b21203 572static void updateSlavesWaitingBgsave(int bgsaveerr);
3fd78bcd 573static void freeMemoryIfNeeded(void);
de96dbfe 574static int processCommand(redisClient *c);
56906eef 575static void setupSigSegvAction(void);
a3b21203 576static void rdbRemoveTempFile(pid_t childpid);
9d65a1bb 577static void aofRemoveTempFile(pid_t childpid);
0ea663ea 578static size_t stringObjectLen(robj *o);
638e42ac 579static void processInputBuffer(redisClient *c);
6b47e12e 580static zskiplist *zslCreate(void);
fd8ccf44 581static void zslFree(zskiplist *zsl);
2b59cfdf 582static void zslInsert(zskiplist *zsl, double score, robj *obj);
2895e862 583static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
6e469882 584static void initClientMultiState(redisClient *c);
585static void freeClientMultiState(redisClient *c);
586static void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
b0d8747d 587static void unblockClientWaitingData(redisClient *c);
4409877e 588static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
75680a3c 589static void vmInit(void);
a35ddf12 590static void vmMarkPagesFree(off_t page, off_t count);
55cf8433 591static robj *vmLoadObject(robj *key);
7e69548d 592static robj *vmPreviewObject(robj *key);
a69a0c9c 593static int vmSwapOneObjectBlocking(void);
594static int vmSwapOneObjectThreaded(void);
7e69548d 595static int vmCanSwapOut(void);
a5819310 596static int tryFreeOneObjectFromFreelist(void);
996cb5f7 597static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
598static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
599static void vmCancelThreadedIOJob(robj *o);
b9bc0eef 600static void lockThreadedIO(void);
601static void unlockThreadedIO(void);
602static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
603static void freeIOJob(iojob *j);
604static void queueIOJob(iojob *j);
a5819310 605static int vmWriteObjectOnSwap(robj *o, off_t page);
606static robj *vmReadObjectFromSwap(off_t page, int type);
054e426d 607static void waitEmptyIOJobsQueue(void);
608static void vmReopenSwapFile(void);
970e10bb 609static int vmFreePage(off_t page);
76583ea4 610static void zunionInterBlockClientOnSwappedKeys(redisClient *c);
d5d55fc3 611static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c);
612static int dontWaitForSwappedKey(redisClient *c, robj *key);
613static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
614static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
615static struct redisCommand *lookupCommand(char *name);
616static void call(redisClient *c, struct redisCommand *cmd);
617static void resetClient(redisClient *c);
ada386b2 618static void convertToRealHash(robj *o);
ffc6b7f8 619static int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
620static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
621static void freePubsubPattern(void *p);
622static int listMatchPubsubPattern(void *a, void *b);
623static int compareStringObjects(robj *a, robj *b);
befec3cd 624static void usage();
ed9b544e 625
abcb223e 626static void authCommand(redisClient *c);
ed9b544e 627static void pingCommand(redisClient *c);
628static void echoCommand(redisClient *c);
629static void setCommand(redisClient *c);
630static void setnxCommand(redisClient *c);
526d00a5 631static void setexCommand(redisClient *c);
ed9b544e 632static void getCommand(redisClient *c);
633static void delCommand(redisClient *c);
634static void existsCommand(redisClient *c);
635static void incrCommand(redisClient *c);
636static void decrCommand(redisClient *c);
637static void incrbyCommand(redisClient *c);
638static void decrbyCommand(redisClient *c);
639static void selectCommand(redisClient *c);
640static void randomkeyCommand(redisClient *c);
641static void keysCommand(redisClient *c);
642static void dbsizeCommand(redisClient *c);
643static void lastsaveCommand(redisClient *c);
644static void saveCommand(redisClient *c);
645static void bgsaveCommand(redisClient *c);
9d65a1bb 646static void bgrewriteaofCommand(redisClient *c);
ed9b544e 647static void shutdownCommand(redisClient *c);
648static void moveCommand(redisClient *c);
649static void renameCommand(redisClient *c);
650static void renamenxCommand(redisClient *c);
651static void lpushCommand(redisClient *c);
652static void rpushCommand(redisClient *c);
653static void lpopCommand(redisClient *c);
654static void rpopCommand(redisClient *c);
655static void llenCommand(redisClient *c);
656static void lindexCommand(redisClient *c);
657static void lrangeCommand(redisClient *c);
658static void ltrimCommand(redisClient *c);
659static void typeCommand(redisClient *c);
660static void lsetCommand(redisClient *c);
661static void saddCommand(redisClient *c);
662static void sremCommand(redisClient *c);
a4460ef4 663static void smoveCommand(redisClient *c);
ed9b544e 664static void sismemberCommand(redisClient *c);
665static void scardCommand(redisClient *c);
12fea928 666static void spopCommand(redisClient *c);
2abb95a9 667static void srandmemberCommand(redisClient *c);
ed9b544e 668static void sinterCommand(redisClient *c);
669static void sinterstoreCommand(redisClient *c);
40d224a9 670static void sunionCommand(redisClient *c);
671static void sunionstoreCommand(redisClient *c);
f4f56e1d 672static void sdiffCommand(redisClient *c);
673static void sdiffstoreCommand(redisClient *c);
ed9b544e 674static void syncCommand(redisClient *c);
675static void flushdbCommand(redisClient *c);
676static void flushallCommand(redisClient *c);
677static void sortCommand(redisClient *c);
678static void lremCommand(redisClient *c);
0f5f7e9a 679static void rpoplpushcommand(redisClient *c);
ed9b544e 680static void infoCommand(redisClient *c);
70003d28 681static void mgetCommand(redisClient *c);
87eca727 682static void monitorCommand(redisClient *c);
3305306f 683static void expireCommand(redisClient *c);
802e8373 684static void expireatCommand(redisClient *c);
f6b141c5 685static void getsetCommand(redisClient *c);
fd88489a 686static void ttlCommand(redisClient *c);
321b0e13 687static void slaveofCommand(redisClient *c);
7f957c92 688static void debugCommand(redisClient *c);
f6b141c5 689static void msetCommand(redisClient *c);
690static void msetnxCommand(redisClient *c);
fd8ccf44 691static void zaddCommand(redisClient *c);
7db723ad 692static void zincrbyCommand(redisClient *c);
cc812361 693static void zrangeCommand(redisClient *c);
50c55df5 694static void zrangebyscoreCommand(redisClient *c);
f44dd428 695static void zcountCommand(redisClient *c);
e3870fab 696static void zrevrangeCommand(redisClient *c);
3c41331e 697static void zcardCommand(redisClient *c);
1b7106e7 698static void zremCommand(redisClient *c);
6e333bbe 699static void zscoreCommand(redisClient *c);
1807985b 700static void zremrangebyscoreCommand(redisClient *c);
6e469882 701static void multiCommand(redisClient *c);
702static void execCommand(redisClient *c);
18b6cb76 703static void discardCommand(redisClient *c);
4409877e 704static void blpopCommand(redisClient *c);
705static void brpopCommand(redisClient *c);
4b00bebd 706static void appendCommand(redisClient *c);
39191553 707static void substrCommand(redisClient *c);
69d95c3e 708static void zrankCommand(redisClient *c);
798d9e55 709static void zrevrankCommand(redisClient *c);
978c2c94 710static void hsetCommand(redisClient *c);
1f1c7695 711static void hsetnxCommand(redisClient *c);
978c2c94 712static void hgetCommand(redisClient *c);
09aeb579
PN
713static void hmsetCommand(redisClient *c);
714static void hmgetCommand(redisClient *c);
07efaf74 715static void hdelCommand(redisClient *c);
92b27fe9 716static void hlenCommand(redisClient *c);
9212eafd 717static void zremrangebyrankCommand(redisClient *c);
2830ca53
PN
718static void zunionCommand(redisClient *c);
719static void zinterCommand(redisClient *c);
78409a0f 720static void hkeysCommand(redisClient *c);
721static void hvalsCommand(redisClient *c);
722static void hgetallCommand(redisClient *c);
a86f14b1 723static void hexistsCommand(redisClient *c);
500ece7c 724static void configCommand(redisClient *c);
01426b05 725static void hincrbyCommand(redisClient *c);
befec3cd 726static void subscribeCommand(redisClient *c);
727static void unsubscribeCommand(redisClient *c);
ffc6b7f8 728static void psubscribeCommand(redisClient *c);
729static void punsubscribeCommand(redisClient *c);
befec3cd 730static void publishCommand(redisClient *c);
f6b141c5 731
ed9b544e 732/*================================= Globals ================================= */
733
734/* Global vars */
735static struct redisServer server; /* server global state */
736static struct redisCommand cmdTable[] = {
76583ea4
PN
737 {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
738 {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
739 {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
526d00a5 740 {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
741 {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
742 {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
743 {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
744 {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
745 {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
746 {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
747 {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
748 {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
749 {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
750 {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
751 {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
752 {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
753 {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
754 {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
755 {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
756 {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
757 {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
758 {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
759 {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
760 {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
761 {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
762 {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
763 {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
764 {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
765 {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
766 {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
767 {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
768 {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
769 {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
770 {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
771 {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
772 {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
773 {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
774 {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
775 {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
776 {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
777 {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
778 {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
779 {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
780 {"zunion",zunionCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
781 {"zinter",zinterCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
782 {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
783 {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
784 {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
785 {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
786 {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
787 {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
788 {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
789 {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
790 {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
1f1c7695 791 {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 792 {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
d33278d1 793 {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
09aeb579 794 {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
01426b05 795 {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
76583ea4
PN
796 {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
797 {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
798 {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
799 {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
800 {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
4583c4f0 801 {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
76583ea4
PN
802 {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
803 {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
804 {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
805 {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
806 {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
807 {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
808 {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
809 {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
810 {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
811 {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
812 {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
813 {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
814 {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
815 {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
816 {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
817 {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
818 {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
819 {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
820 {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
821 {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
822 {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
823 {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
824 {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
825 {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
958cd5f3 826 {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,0,0,0},
76583ea4
PN
827 {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
828 {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
829 {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
830 {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
831 {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
832 {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
833 {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
834 {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
835 {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
836 {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
500ece7c 837 {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
befec3cd 838 {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
839 {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
ffc6b7f8 840 {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
841 {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
4005fef1 842 {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
76583ea4 843 {NULL,NULL,0,0,NULL,0,0,0}
ed9b544e 844};
bcfc686d 845
ed9b544e 846/*============================ Utility functions ============================ */
847
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *
 *   *       any sequence of characters, including the empty one
 *   ?       exactly one character
 *   [abc]   one character of the set; [a-z] is a range, [^...] negates
 *   \x      the character x taken literally
 *
 * When 'nocase' is non-zero, literals, set members and ranges are compared
 * after tolower(); escaped characters inside a set are always compared
 * exactly.
 *
 * Every construct that consumes a character fails when the string is
 * already exhausted, and the pattern is never read past 'patternLen'.
 *
 * Returns 1 on match, 0 otherwise. */
static int stringmatchlen(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* Collapse runs of '*' without peeking past the pattern end. */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* Try the rest of the pattern at every remaining position. */
            while(stringLen) {
                if (stringmatchlen(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* A character class always consumes one character. */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen >= 2 && pattern[0] == '\\') {
                    /* Escaped set member: compare the next byte exactly. */
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (patternLen == 0) {
                    /* Unterminated class: step back so that the common
                     * advance after the switch lands on the pattern end. */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* A literal needs a character to compare against. */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* String consumed: only trailing '*' may remain in the pattern. */
            while(patternLen && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
970
/* Convenience wrapper around stringmatchlen() for NUL-terminated strings:
 * both lengths are taken with strlen(). */
static int stringmatch(const char *pattern, const char *string, int nocase) {
    int plen = strlen(pattern);
    int slen = strlen(string);

    return stringmatchlen(pattern,plen,string,slen,nocase);
}
974
/* Convert a string representing an amount of memory into the number of
 * bytes, so for instance memtoll("1gb") will return 1073741824 that is
 * (1024*1024*1024).
 *
 * The input is an optional '-' sign, a run of decimal digits and an
 * optional case-insensitive unit: "b" (or nothing) = 1, "k" = 1000,
 * "kb" = 1024, "m" = 1000^2, "mb" = 1024^2, "g" = 1000^3, "gb" = 1024^3.
 *
 * If 'err' is not NULL, *err is set to 0 on success and to 1 on a parsing
 * error. With an unknown unit the numeric part is still returned (multiplier
 * of 1); when the numeric part does not fit the internal buffer LLONG_MAX is
 * returned. */
static long long memtoll(const char *p, int *err) {
    const char *u;
    char buf[128];
    long mul; /* unit multiplier */
    long long val;
    unsigned int digits;

    if (err) *err = 0;
    /* Search the first non digit character. */
    u = p;
    if (*u == '-') u++;
    /* isdigit() is only defined for unsigned char values and EOF. */
    while(*u && isdigit((unsigned char)*u)) u++;
    if (*u == '\0' || !strcasecmp(u,"b")) {
        mul = 1;
    } else if (!strcasecmp(u,"k")) {
        mul = 1000;
    } else if (!strcasecmp(u,"kb")) {
        mul = 1024;
    } else if (!strcasecmp(u,"m")) {
        mul = 1000*1000;
    } else if (!strcasecmp(u,"mb")) {
        mul = 1024*1024;
    } else if (!strcasecmp(u,"g")) {
        mul = 1000L*1000*1000;
    } else if (!strcasecmp(u,"gb")) {
        mul = 1024L*1024*1024;
    } else {
        if (err) *err = 1;
        mul = 1;
    }
    /* Copy sign and digits into a NUL-terminated buffer for strtoll(). */
    digits = u-p;
    if (digits >= sizeof(buf)) {
        if (err) *err = 1;
        return LLONG_MAX;
    }
    memcpy(buf,p,digits);
    buf[digits] = '\0';
    val = strtoll(buf,NULL,10);
    return val*mul;
}
1021
56906eef 1022static void redisLog(int level, const char *fmt, ...) {
ed9b544e 1023 va_list ap;
1024 FILE *fp;
1025
1026 fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
1027 if (!fp) return;
1028
1029 va_start(ap, fmt);
1030 if (level >= server.verbosity) {
6766f45e 1031 char *c = ".-*#";
1904ecc1 1032 char buf[64];
1033 time_t now;
1034
1035 now = time(NULL);
6c9385e0 1036 strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
054e426d 1037 fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
ed9b544e 1038 vfprintf(fp, fmt, ap);
1039 fprintf(fp,"\n");
1040 fflush(fp);
1041 }
1042 va_end(ap);
1043
1044 if (server.logfile) fclose(fp);
1045}
1046
1047/*====================== Hash table type implementation ==================== */
1048
/* This is a hash table type that uses the SDS dynamic strings library as
 * keys and redis objects as values (objects can hold SDS strings,
 * lists, sets). */
1052
/* Dict destructor callback that releases 'val' with zfree().
 * 'privdata' is ignored. */
static void dictVanillaFree(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    zfree(val);
}
1058
4409877e 1059static void dictListDestructor(void *privdata, void *val)
1060{
1061 DICT_NOTUSED(privdata);
1062 listRelease((list*)val);
1063}
1064
ed9b544e 1065static int sdsDictKeyCompare(void *privdata, const void *key1,
1066 const void *key2)
1067{
1068 int l1,l2;
1069 DICT_NOTUSED(privdata);
1070
1071 l1 = sdslen((sds)key1);
1072 l2 = sdslen((sds)key2);
1073 if (l1 != l2) return 0;
1074 return memcmp(key1, key2, l1) == 0;
1075}
1076
/* Dict destructor callback for redis objects: drops one reference.
 * NULL is tolerated because values of swapped out keys are set to NULL.
 * 'privdata' is ignored. */
static void dictRedisObjectDestructor(void *privdata, void *val)
{
    DICT_NOTUSED(privdata);

    if (val != NULL) decrRefCount(val);
}
1084
942a3961 1085static int dictObjKeyCompare(void *privdata, const void *key1,
ed9b544e 1086 const void *key2)
1087{
1088 const robj *o1 = key1, *o2 = key2;
1089 return sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1090}
1091
942a3961 1092static unsigned int dictObjHash(const void *key) {
ed9b544e 1093 const robj *o = key;
1094 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1095}
1096
942a3961 1097static int dictEncObjKeyCompare(void *privdata, const void *key1,
1098 const void *key2)
1099{
9d65a1bb 1100 robj *o1 = (robj*) key1, *o2 = (robj*) key2;
1101 int cmp;
942a3961 1102
2a1198b4 1103 if (o1->encoding == REDIS_ENCODING_INT &&
1104 o2->encoding == REDIS_ENCODING_INT &&
db5946fc 1105 o1->ptr == o2->ptr) return 1;
2a1198b4 1106
9d65a1bb 1107 o1 = getDecodedObject(o1);
1108 o2 = getDecodedObject(o2);
1109 cmp = sdsDictKeyCompare(privdata,o1->ptr,o2->ptr);
1110 decrRefCount(o1);
1111 decrRefCount(o2);
1112 return cmp;
942a3961 1113}
1114
1115static unsigned int dictEncObjHash(const void *key) {
9d65a1bb 1116 robj *o = (robj*) key;
942a3961 1117
ed9e4966 1118 if (o->encoding == REDIS_ENCODING_RAW) {
1119 return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1120 } else {
1121 if (o->encoding == REDIS_ENCODING_INT) {
1122 char buf[32];
1123 int len;
1124
1125 len = snprintf(buf,32,"%ld",(long)o->ptr);
1126 return dictGenHashFunction((unsigned char*)buf, len);
1127 } else {
1128 unsigned int hash;
1129
1130 o = getDecodedObject(o);
1131 hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
1132 decrRefCount(o);
1133 return hash;
1134 }
1135 }
942a3961 1136}
1137
f2d9f50f 1138/* Sets type and expires */
ed9b544e 1139static dictType setDictType = {
942a3961 1140 dictEncObjHash, /* hash function */
ed9b544e 1141 NULL, /* key dup */
1142 NULL, /* val dup */
942a3961 1143 dictEncObjKeyCompare, /* key compare */
ed9b544e 1144 dictRedisObjectDestructor, /* key destructor */
1145 NULL /* val destructor */
1146};
1147
f2d9f50f 1148/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
1812e024 1149static dictType zsetDictType = {
1150 dictEncObjHash, /* hash function */
1151 NULL, /* key dup */
1152 NULL, /* val dup */
1153 dictEncObjKeyCompare, /* key compare */
1154 dictRedisObjectDestructor, /* key destructor */
da0a1620 1155 dictVanillaFree /* val destructor of malloc(sizeof(double)) */
1812e024 1156};
1157
f2d9f50f 1158/* Db->dict */
5234952b 1159static dictType dbDictType = {
942a3961 1160 dictObjHash, /* hash function */
ed9b544e 1161 NULL, /* key dup */
1162 NULL, /* val dup */
942a3961 1163 dictObjKeyCompare, /* key compare */
ed9b544e 1164 dictRedisObjectDestructor, /* key destructor */
1165 dictRedisObjectDestructor /* val destructor */
1166};
1167
f2d9f50f 1168/* Db->expires */
1169static dictType keyptrDictType = {
1170 dictObjHash, /* hash function */
1171 NULL, /* key dup */
1172 NULL, /* val dup */
1173 dictObjKeyCompare, /* key compare */
1174 dictRedisObjectDestructor, /* key destructor */
1175 NULL /* val destructor */
1176};
1177
5234952b 1178/* Hash type hash table (note that small hashes are represented with zimpaps) */
1179static dictType hashDictType = {
1180 dictEncObjHash, /* hash function */
1181 NULL, /* key dup */
1182 NULL, /* val dup */
1183 dictEncObjKeyCompare, /* key compare */
1184 dictRedisObjectDestructor, /* key destructor */
1185 dictRedisObjectDestructor /* val destructor */
1186};
1187
4409877e 1188/* Keylist hash table type has unencoded redis objects as keys and
d5d55fc3 1189 * lists as values. It's used for blocking operations (BLPOP) and to
1190 * map swapped keys to a list of clients waiting for this keys to be loaded. */
4409877e 1191static dictType keylistDictType = {
1192 dictObjHash, /* hash function */
1193 NULL, /* key dup */
1194 NULL, /* val dup */
1195 dictObjKeyCompare, /* key compare */
1196 dictRedisObjectDestructor, /* key destructor */
1197 dictListDestructor /* val destructor */
1198};
1199
42ab0172
AO
1200static void version();
1201
ed9b544e 1202/* ========================= Random utility functions ======================= */
1203
1204/* Redis generally does not try to recover from out of memory conditions
1205 * when allocating objects or strings, it is not clear if it will be possible
1206 * to report this condition to the client since the networking layer itself
1207 * is based on heap allocation for send buffers, so we simply abort.
1208 * At least the code will be simpler to read... */
1209static void oom(const char *msg) {
71c54b21 1210 redisLog(REDIS_WARNING, "%s: Out of memory\n",msg);
ed9b544e 1211 sleep(1);
1212 abort();
1213}
1214
1215/* ====================== Redis server networking stuff ===================== */
56906eef 1216static void closeTimedoutClients(void) {
ed9b544e 1217 redisClient *c;
ed9b544e 1218 listNode *ln;
1219 time_t now = time(NULL);
c7df85a4 1220 listIter li;
ed9b544e 1221
c7df85a4 1222 listRewind(server.clients,&li);
1223 while ((ln = listNext(&li)) != NULL) {
ed9b544e 1224 c = listNodeValue(ln);
f86a74e9 1225 if (server.maxidletime &&
1226 !(c->flags & REDIS_SLAVE) && /* no timeout for slaves */
c7cf2ec9 1227 !(c->flags & REDIS_MASTER) && /* no timeout for masters */
ffc6b7f8 1228 dictSize(c->pubsub_channels) == 0 && /* no timeout for pubsub */
1229 listLength(c->pubsub_patterns) == 0 &&
d6cc8867 1230 (now - c->lastinteraction > server.maxidletime))
f86a74e9 1231 {
f870935d 1232 redisLog(REDIS_VERBOSE,"Closing idle client");
ed9b544e 1233 freeClient(c);
f86a74e9 1234 } else if (c->flags & REDIS_BLOCKED) {
58d976b8 1235 if (c->blockingto != 0 && c->blockingto < now) {
b177fd30 1236 addReply(c,shared.nullmultibulk);
b0d8747d 1237 unblockClientWaitingData(c);
f86a74e9 1238 }
ed9b544e 1239 }
1240 }
ed9b544e 1241}
1242
12fea928 1243static int htNeedsResize(dict *dict) {
1244 long long size, used;
1245
1246 size = dictSlots(dict);
1247 used = dictSize(dict);
1248 return (size && used && size > DICT_HT_INITIAL_SIZE &&
1249 (used*100/size < REDIS_HT_MINFILL));
1250}
1251
0bc03378 1252/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
1253 * we resize the hash table to save memory */
56906eef 1254static void tryResizeHashTables(void) {
0bc03378 1255 int j;
1256
1257 for (j = 0; j < server.dbnum; j++) {
5413c40d 1258 if (htNeedsResize(server.db[j].dict))
0bc03378 1259 dictResize(server.db[j].dict);
12fea928 1260 if (htNeedsResize(server.db[j].expires))
1261 dictResize(server.db[j].expires);
0bc03378 1262 }
1263}
1264
8ca3e9d1 1265/* Our hash table implementation performs rehashing incrementally while
1266 * we write/read from the hash table. Still if the server is idle, the hash
1267 * table will use two tables for a long time. So we try to use 1 millisecond
1268 * of CPU time at every serverCron() loop in order to rehash some key. */
1269static void incrementallyRehash(void) {
1270 int j;
1271
1272 for (j = 0; j < server.dbnum; j++) {
1273 if (dictIsRehashing(server.db[j].dict)) {
1274 dictRehashMilliseconds(server.db[j].dict,1);
1275 break; /* already used our millisecond for this loop... */
1276 }
1277 }
1278}
1279
9d65a1bb 1280/* A background saving child (BGSAVE) terminated its work. Handle this. */
1281void backgroundSaveDoneHandler(int statloc) {
1282 int exitcode = WEXITSTATUS(statloc);
1283 int bysignal = WIFSIGNALED(statloc);
1284
1285 if (!bysignal && exitcode == 0) {
1286 redisLog(REDIS_NOTICE,
1287 "Background saving terminated with success");
1288 server.dirty = 0;
1289 server.lastsave = time(NULL);
1290 } else if (!bysignal && exitcode != 0) {
1291 redisLog(REDIS_WARNING, "Background saving error");
1292 } else {
1293 redisLog(REDIS_WARNING,
454eea7c 1294 "Background saving terminated by signal %d", WTERMSIG(statloc));
9d65a1bb 1295 rdbRemoveTempFile(server.bgsavechildpid);
1296 }
1297 server.bgsavechildpid = -1;
1298 /* Possibly there are slaves waiting for a BGSAVE in order to be served
1299 * (the first stage of SYNC is a bulk transfer of dump.rdb) */
1300 updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
1301}
1302
1303/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1304 * Handle this. */
1305void backgroundRewriteDoneHandler(int statloc) {
1306 int exitcode = WEXITSTATUS(statloc);
1307 int bysignal = WIFSIGNALED(statloc);
1308
1309 if (!bysignal && exitcode == 0) {
1310 int fd;
1311 char tmpfile[256];
1312
1313 redisLog(REDIS_NOTICE,
1314 "Background append only file rewriting terminated with success");
1315 /* Now it's time to flush the differences accumulated by the parent */
1316 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
1317 fd = open(tmpfile,O_WRONLY|O_APPEND);
1318 if (fd == -1) {
1319 redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
1320 goto cleanup;
1321 }
1322 /* Flush our data... */
1323 if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
1324 (signed) sdslen(server.bgrewritebuf)) {
1325 redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
1326 close(fd);
1327 goto cleanup;
1328 }
b32627cd 1329 redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
9d65a1bb 1330 /* Now our work is to rename the temp file into the stable file. And
1331 * switch the file descriptor used by the server for append only. */
1332 if (rename(tmpfile,server.appendfilename) == -1) {
1333 redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
1334 close(fd);
1335 goto cleanup;
1336 }
1337 /* Mission completed... almost */
1338 redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
1339 if (server.appendfd != -1) {
1340 /* If append only is actually enabled... */
1341 close(server.appendfd);
1342 server.appendfd = fd;
1343 fsync(fd);
85a83172 1344 server.appendseldb = -1; /* Make sure it will issue SELECT */
9d65a1bb 1345 redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
1346 } else {
1347 /* If append only is disabled we just generate a dump in this
1348 * format. Why not? */
1349 close(fd);
1350 }
1351 } else if (!bysignal && exitcode != 0) {
1352 redisLog(REDIS_WARNING, "Background append only file rewriting error");
1353 } else {
1354 redisLog(REDIS_WARNING,
454eea7c 1355 "Background append only file rewriting terminated by signal %d",
1356 WTERMSIG(statloc));
9d65a1bb 1357 }
1358cleanup:
1359 sdsfree(server.bgrewritebuf);
1360 server.bgrewritebuf = sdsempty();
1361 aofRemoveTempFile(server.bgrewritechildpid);
1362 server.bgrewritechildpid = -1;
1363}
1364
884d4b39 1365/* This function is called once a background process of some kind terminates,
1366 * as we want to avoid resizing the hash tables when there is a child in order
1367 * to play well with copy-on-write (otherwise when a resize happens lots of
1368 * memory pages are copied). The goal of this function is to update the ability
1369 * for dict.c to resize the hash tables accordingly to the fact we have o not
1370 * running childs. */
1371static void updateDictResizePolicy(void) {
1372 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1)
1373 dictEnableResize();
1374 else
1375 dictDisableResize();
1376}
1377
56906eef 1378static int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
94754ccc 1379 int j, loops = server.cronloops++;
ed9b544e 1380 REDIS_NOTUSED(eventLoop);
1381 REDIS_NOTUSED(id);
1382 REDIS_NOTUSED(clientData);
1383
3a66edc7 1384 /* We take a cached value of the unix time in the global state because
1385 * with virtual memory and aging there is to store the current time
1386 * in objects at every object access, and accuracy is not needed.
1387 * To access a global var is faster than calling time(NULL) */
1388 server.unixtime = time(NULL);
1389
0bc03378 1390 /* Show some info about non-empty databases */
ed9b544e 1391 for (j = 0; j < server.dbnum; j++) {
dec423d9 1392 long long size, used, vkeys;
94754ccc 1393
3305306f 1394 size = dictSlots(server.db[j].dict);
1395 used = dictSize(server.db[j].dict);
94754ccc 1396 vkeys = dictSize(server.db[j].expires);
1763929f 1397 if (!(loops % 50) && (used || vkeys)) {
f870935d 1398 redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
a4d1ba9a 1399 /* dictPrintStats(server.dict); */
ed9b544e 1400 }
ed9b544e 1401 }
1402
0bc03378 1403 /* We don't want to resize the hash tables while a bacground saving
1404 * is in progress: the saving child is created using fork() that is
1405 * implemented with a copy-on-write semantic in most modern systems, so
1406 * if we resize the HT while there is the saving child at work actually
1407 * a lot of memory movements in the parent will cause a lot of pages
1408 * copied. */
8ca3e9d1 1409 if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
1410 if (!(loops % 10)) tryResizeHashTables();
1411 if (server.activerehashing) incrementallyRehash();
884d4b39 1412 }
0bc03378 1413
ed9b544e 1414 /* Show information about connected clients */
1763929f 1415 if (!(loops % 50)) {
bdcb92f2 1416 redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
ed9b544e 1417 listLength(server.clients)-listLength(server.slaves),
1418 listLength(server.slaves),
bdcb92f2 1419 zmalloc_used_memory());
ed9b544e 1420 }
1421
1422 /* Close connections of timedout clients */
1763929f 1423 if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
ed9b544e 1424 closeTimedoutClients();
1425
9d65a1bb 1426 /* Check if a background saving or AOF rewrite in progress terminated */
1427 if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
ed9b544e 1428 int statloc;
9d65a1bb 1429 pid_t pid;
1430
1431 if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
1432 if (pid == server.bgsavechildpid) {
1433 backgroundSaveDoneHandler(statloc);
ed9b544e 1434 } else {
9d65a1bb 1435 backgroundRewriteDoneHandler(statloc);
ed9b544e 1436 }
884d4b39 1437 updateDictResizePolicy();
ed9b544e 1438 }
1439 } else {
1440 /* If there is not a background saving in progress check if
1441 * we have to save now */
1442 time_t now = time(NULL);
1443 for (j = 0; j < server.saveparamslen; j++) {
1444 struct saveparam *sp = server.saveparams+j;
1445
1446 if (server.dirty >= sp->changes &&
1447 now-server.lastsave > sp->seconds) {
1448 redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
1449 sp->changes, sp->seconds);
f78fd11b 1450 rdbSaveBackground(server.dbfilename);
ed9b544e 1451 break;
1452 }
1453 }
1454 }
94754ccc 1455
f2324293 1456 /* Try to expire a few timed out keys. The algorithm used is adaptive and
1457 * will use few CPU cycles if there are few expiring keys, otherwise
1458 * it will get more aggressive to avoid that too much memory is used by
1459 * keys that can be removed from the keyspace. */
94754ccc 1460 for (j = 0; j < server.dbnum; j++) {
f2324293 1461 int expired;
94754ccc 1462 redisDb *db = server.db+j;
94754ccc 1463
f2324293 1464 /* Continue to expire if at the end of the cycle more than 25%
1465 * of the keys were expired. */
1466 do {
4ef8de8a 1467 long num = dictSize(db->expires);
94754ccc 1468 time_t now = time(NULL);
1469
f2324293 1470 expired = 0;
94754ccc 1471 if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
1472 num = REDIS_EXPIRELOOKUPS_PER_CRON;
1473 while (num--) {
1474 dictEntry *de;
1475 time_t t;
1476
1477 if ((de = dictGetRandomKey(db->expires)) == NULL) break;
1478 t = (time_t) dictGetEntryVal(de);
1479 if (now > t) {
1480 deleteKey(db,dictGetEntryKey(de));
f2324293 1481 expired++;
2a6a2ed1 1482 server.stat_expiredkeys++;
94754ccc 1483 }
1484 }
f2324293 1485 } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
94754ccc 1486 }
1487
4ef8de8a 1488 /* Swap a few keys on disk if we are over the memory limit and VM
f870935d 1489 * is enbled. Try to free objects from the free list first. */
7e69548d 1490 if (vmCanSwapOut()) {
1491 while (server.vm_enabled && zmalloc_used_memory() >
f870935d 1492 server.vm_max_memory)
1493 {
72e9fd40 1494 int retval;
1495
a5819310 1496 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
72e9fd40 1497 retval = (server.vm_max_threads == 0) ?
1498 vmSwapOneObjectBlocking() :
1499 vmSwapOneObjectThreaded();
1763929f 1500 if (retval == REDIS_ERR && !(loops % 300) &&
72e9fd40 1501 zmalloc_used_memory() >
1502 (server.vm_max_memory+server.vm_max_memory/10))
1503 {
1504 redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
7e69548d 1505 }
72e9fd40 1506 /* Note that when using threade I/O we free just one object,
1507 * because anyway when the I/O thread in charge to swap this
1508 * object out will finish, the handler of completed jobs
1509 * will try to swap more objects if we are still out of memory. */
1510 if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
4ef8de8a 1511 }
1512 }
1513
ed9b544e 1514 /* Check if we should connect to a MASTER */
1763929f 1515 if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
ed9b544e 1516 redisLog(REDIS_NOTICE,"Connecting to MASTER...");
1517 if (syncWithMaster() == REDIS_OK) {
1518 redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
1519 }
1520 }
1763929f 1521 return 100;
ed9b544e 1522}
1523
d5d55fc3 1524/* This function gets called every time Redis is entering the
1525 * main loop of the event driven library, that is, before to sleep
1526 * for ready file descriptors. */
1527static void beforeSleep(struct aeEventLoop *eventLoop) {
1528 REDIS_NOTUSED(eventLoop);
1529
1530 if (server.vm_enabled && listLength(server.io_ready_clients)) {
1531 listIter li;
1532 listNode *ln;
1533
1534 listRewind(server.io_ready_clients,&li);
1535 while((ln = listNext(&li))) {
1536 redisClient *c = ln->value;
1537 struct redisCommand *cmd;
1538
1539 /* Resume the client. */
1540 listDelNode(server.io_ready_clients,ln);
1541 c->flags &= (~REDIS_IO_WAIT);
1542 server.vm_blocked_clients--;
1543 aeCreateFileEvent(server.el, c->fd, AE_READABLE,
1544 readQueryFromClient, c);
1545 cmd = lookupCommand(c->argv[0]->ptr);
1546 assert(cmd != NULL);
1547 call(c,cmd);
1548 resetClient(c);
1549 /* There may be more data to process in the input buffer. */
1550 if (c->querybuf && sdslen(c->querybuf) > 0)
1551 processInputBuffer(c);
1552 }
1553 }
1554}
1555
ed9b544e 1556static void createSharedObjects(void) {
05df7621 1557 int j;
1558
ed9b544e 1559 shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
1560 shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
1561 shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
c937aa89 1562 shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
1563 shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
1564 shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
1565 shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
1566 shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
1567 shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
ed9b544e 1568 shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
6e469882 1569 shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
ed9b544e 1570 shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
1571 "-ERR Operation against a key holding the wrong kind of value\r\n"));
ed9b544e 1572 shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
1573 "-ERR no such key\r\n"));
ed9b544e 1574 shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
1575 "-ERR syntax error\r\n"));
c937aa89 1576 shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
1577 "-ERR source and destination objects are the same\r\n"));
1578 shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
1579 "-ERR index out of range\r\n"));
ed9b544e 1580 shared.space = createObject(REDIS_STRING,sdsnew(" "));
c937aa89 1581 shared.colon = createObject(REDIS_STRING,sdsnew(":"));
1582 shared.plus = createObject(REDIS_STRING,sdsnew("+"));
ed9b544e 1583 shared.select0 = createStringObject("select 0\r\n",10);
1584 shared.select1 = createStringObject("select 1\r\n",10);
1585 shared.select2 = createStringObject("select 2\r\n",10);
1586 shared.select3 = createStringObject("select 3\r\n",10);
1587 shared.select4 = createStringObject("select 4\r\n",10);
1588 shared.select5 = createStringObject("select 5\r\n",10);
1589 shared.select6 = createStringObject("select 6\r\n",10);
1590 shared.select7 = createStringObject("select 7\r\n",10);
1591 shared.select8 = createStringObject("select 8\r\n",10);
1592 shared.select9 = createStringObject("select 9\r\n",10);
befec3cd 1593 shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13);
c8d0ea0e 1594 shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
befec3cd 1595 shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
fc46bb71 1596 shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
ffc6b7f8 1597 shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
1598 shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
befec3cd 1599 shared.mbulk3 = createStringObject("*3\r\n",4);
c8d0ea0e 1600 shared.mbulk4 = createStringObject("*4\r\n",4);
05df7621 1601 for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
1602 shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j);
1603 shared.integers[j]->encoding = REDIS_ENCODING_INT;
1604 }
ed9b544e 1605}
1606
1607static void appendServerSaveParams(time_t seconds, int changes) {
1608 server.saveparams = zrealloc(server.saveparams,sizeof(struct saveparam)*(server.saveparamslen+1));
ed9b544e 1609 server.saveparams[server.saveparamslen].seconds = seconds;
1610 server.saveparams[server.saveparamslen].changes = changes;
1611 server.saveparamslen++;
1612}
1613
bcfc686d 1614static void resetServerSaveParams() {
ed9b544e 1615 zfree(server.saveparams);
1616 server.saveparams = NULL;
1617 server.saveparamslen = 0;
1618}
1619
1620static void initServerConfig() {
1621 server.dbnum = REDIS_DEFAULT_DBNUM;
1622 server.port = REDIS_SERVERPORT;
f870935d 1623 server.verbosity = REDIS_VERBOSE;
ed9b544e 1624 server.maxidletime = REDIS_MAXIDLETIME;
1625 server.saveparams = NULL;
1626 server.logfile = NULL; /* NULL = log on standard output */
1627 server.bindaddr = NULL;
1628 server.glueoutputbuf = 1;
1629 server.daemonize = 0;
44b38ef4 1630 server.appendonly = 0;
4e141d5a 1631 server.appendfsync = APPENDFSYNC_ALWAYS;
48f0308a 1632 server.lastfsync = time(NULL);
44b38ef4 1633 server.appendfd = -1;
1634 server.appendseldb = -1; /* Make sure the first time will not match */
500ece7c 1635 server.pidfile = zstrdup("/var/run/redis.pid");
1636 server.dbfilename = zstrdup("dump.rdb");
1637 server.appendfilename = zstrdup("appendonly.aof");
abcb223e 1638 server.requirepass = NULL;
b0553789 1639 server.rdbcompression = 1;
8ca3e9d1 1640 server.activerehashing = 1;
285add55 1641 server.maxclients = 0;
d5d55fc3 1642 server.blpop_blocked_clients = 0;
3fd78bcd 1643 server.maxmemory = 0;
75680a3c 1644 server.vm_enabled = 0;
054e426d 1645 server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
75680a3c 1646 server.vm_page_size = 256; /* 256 bytes per page */
1647 server.vm_pages = 1024*1024*100; /* 104 millions of pages */
1648 server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
92f8e882 1649 server.vm_max_threads = 4;
d5d55fc3 1650 server.vm_blocked_clients = 0;
cbba7dd7 1651 server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
1652 server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
75680a3c 1653
bcfc686d 1654 resetServerSaveParams();
ed9b544e 1655
1656 appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
1657 appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
1658 appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
1659 /* Replication related */
1660 server.isslave = 0;
d0ccebcf 1661 server.masterauth = NULL;
ed9b544e 1662 server.masterhost = NULL;
1663 server.masterport = 6379;
1664 server.master = NULL;
1665 server.replstate = REDIS_REPL_NONE;
a7866db6 1666
1667 /* Double constants initialization */
1668 R_Zero = 0.0;
1669 R_PosInf = 1.0/R_Zero;
1670 R_NegInf = -1.0/R_Zero;
1671 R_Nan = R_Zero/R_Zero;
ed9b544e 1672}
1673
1674static void initServer() {
1675 int j;
1676
1677 signal(SIGHUP, SIG_IGN);
1678 signal(SIGPIPE, SIG_IGN);
fe3bbfbe 1679 setupSigSegvAction();
ed9b544e 1680
b9bc0eef 1681 server.devnull = fopen("/dev/null","w");
1682 if (server.devnull == NULL) {
1683 redisLog(REDIS_WARNING, "Can't open /dev/null: %s", server.neterr);
1684 exit(1);
1685 }
ed9b544e 1686 server.clients = listCreate();
1687 server.slaves = listCreate();
87eca727 1688 server.monitors = listCreate();
ed9b544e 1689 server.objfreelist = listCreate();
1690 createSharedObjects();
1691 server.el = aeCreateEventLoop();
3305306f 1692 server.db = zmalloc(sizeof(redisDb)*server.dbnum);
ed9b544e 1693 server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
1694 if (server.fd == -1) {
1695 redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
1696 exit(1);
1697 }
3305306f 1698 for (j = 0; j < server.dbnum; j++) {
5234952b 1699 server.db[j].dict = dictCreate(&dbDictType,NULL);
f2d9f50f 1700 server.db[j].expires = dictCreate(&keyptrDictType,NULL);
4409877e 1701 server.db[j].blockingkeys = dictCreate(&keylistDictType,NULL);
d5d55fc3 1702 if (server.vm_enabled)
1703 server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
3305306f 1704 server.db[j].id = j;
1705 }
ffc6b7f8 1706 server.pubsub_channels = dictCreate(&keylistDictType,NULL);
1707 server.pubsub_patterns = listCreate();
1708 listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
1709 listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
ed9b544e 1710 server.cronloops = 0;
9f3c422c 1711 server.bgsavechildpid = -1;
9d65a1bb 1712 server.bgrewritechildpid = -1;
1713 server.bgrewritebuf = sdsempty();
ed9b544e 1714 server.lastsave = time(NULL);
1715 server.dirty = 0;
ed9b544e 1716 server.stat_numcommands = 0;
1717 server.stat_numconnections = 0;
2a6a2ed1 1718 server.stat_expiredkeys = 0;
ed9b544e 1719 server.stat_starttime = time(NULL);
3a66edc7 1720 server.unixtime = time(NULL);
d8f8b666 1721 aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
996cb5f7 1722 if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
1723 acceptHandler, NULL) == AE_ERR) oom("creating file event");
44b38ef4 1724
1725 if (server.appendonly) {
3bb225d6 1726 server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
44b38ef4 1727 if (server.appendfd == -1) {
1728 redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
1729 strerror(errno));
1730 exit(1);
1731 }
1732 }
75680a3c 1733
1734 if (server.vm_enabled) vmInit();
ed9b544e 1735}
1736
1737/* Empty the whole database */
ca37e9cd 1738static long long emptyDb() {
ed9b544e 1739 int j;
ca37e9cd 1740 long long removed = 0;
ed9b544e 1741
3305306f 1742 for (j = 0; j < server.dbnum; j++) {
ca37e9cd 1743 removed += dictSize(server.db[j].dict);
3305306f 1744 dictEmpty(server.db[j].dict);
1745 dictEmpty(server.db[j].expires);
1746 }
ca37e9cd 1747 return removed;
ed9b544e 1748}
1749
/* Map the strings "yes" / "no" (compared ignoring case) to 1 / 0.
 * Any other string yields -1. */
static int yesnotoi(char *s) {
    if (strcasecmp(s,"yes") == 0) return 1;
    if (strcasecmp(s,"no") == 0) return 0;
    return -1;
}
1755
ed9b544e 1756/* I agree, this is a very rudimental way to load a configuration...
1757 will improve later if the config gets more complex */
1758static void loadServerConfig(char *filename) {
c9a111ac 1759 FILE *fp;
ed9b544e 1760 char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
1761 int linenum = 0;
1762 sds line = NULL;
c9a111ac 1763
1764 if (filename[0] == '-' && filename[1] == '\0')
1765 fp = stdin;
1766 else {
1767 if ((fp = fopen(filename,"r")) == NULL) {
9a22de82 1768 redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
c9a111ac 1769 exit(1);
1770 }
ed9b544e 1771 }
c9a111ac 1772
ed9b544e 1773 while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
1774 sds *argv;
1775 int argc, j;
1776
1777 linenum++;
1778 line = sdsnew(buf);
1779 line = sdstrim(line," \t\r\n");
1780
1781 /* Skip comments and blank lines*/
1782 if (line[0] == '#' || line[0] == '\0') {
1783 sdsfree(line);
1784 continue;
1785 }
1786
1787 /* Split into arguments */
1788 argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
1789 sdstolower(argv[0]);
1790
1791 /* Execute config directives */
bb0b03a3 1792 if (!strcasecmp(argv[0],"timeout") && argc == 2) {
ed9b544e 1793 server.maxidletime = atoi(argv[1]);
0150db36 1794 if (server.maxidletime < 0) {
ed9b544e 1795 err = "Invalid timeout value"; goto loaderr;
1796 }
bb0b03a3 1797 } else if (!strcasecmp(argv[0],"port") && argc == 2) {
ed9b544e 1798 server.port = atoi(argv[1]);
1799 if (server.port < 1 || server.port > 65535) {
1800 err = "Invalid port"; goto loaderr;
1801 }
bb0b03a3 1802 } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
ed9b544e 1803 server.bindaddr = zstrdup(argv[1]);
bb0b03a3 1804 } else if (!strcasecmp(argv[0],"save") && argc == 3) {
ed9b544e 1805 int seconds = atoi(argv[1]);
1806 int changes = atoi(argv[2]);
1807 if (seconds < 1 || changes < 0) {
1808 err = "Invalid save parameters"; goto loaderr;
1809 }
1810 appendServerSaveParams(seconds,changes);
bb0b03a3 1811 } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
ed9b544e 1812 if (chdir(argv[1]) == -1) {
1813 redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
1814 argv[1], strerror(errno));
1815 exit(1);
1816 }
bb0b03a3 1817 } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
1818 if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
f870935d 1819 else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
bb0b03a3 1820 else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
1821 else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
ed9b544e 1822 else {
1823 err = "Invalid log level. Must be one of debug, notice, warning";
1824 goto loaderr;
1825 }
bb0b03a3 1826 } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
c9a111ac 1827 FILE *logfp;
ed9b544e 1828
1829 server.logfile = zstrdup(argv[1]);
bb0b03a3 1830 if (!strcasecmp(server.logfile,"stdout")) {
ed9b544e 1831 zfree(server.logfile);
1832 server.logfile = NULL;
1833 }
1834 if (server.logfile) {
1835 /* Test if we are able to open the file. The server will not
1836 * be able to abort just for this problem later... */
c9a111ac 1837 logfp = fopen(server.logfile,"a");
1838 if (logfp == NULL) {
ed9b544e 1839 err = sdscatprintf(sdsempty(),
1840 "Can't open the log file: %s", strerror(errno));
1841 goto loaderr;
1842 }
c9a111ac 1843 fclose(logfp);
ed9b544e 1844 }
bb0b03a3 1845 } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
ed9b544e 1846 server.dbnum = atoi(argv[1]);
1847 if (server.dbnum < 1) {
1848 err = "Invalid number of databases"; goto loaderr;
1849 }
b3f83f12
JZ
1850 } else if (!strcasecmp(argv[0],"include") && argc == 2) {
1851 loadServerConfig(argv[1]);
285add55 1852 } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
1853 server.maxclients = atoi(argv[1]);
3fd78bcd 1854 } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
2b619329 1855 server.maxmemory = memtoll(argv[1],NULL);
bb0b03a3 1856 } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
ed9b544e 1857 server.masterhost = sdsnew(argv[1]);
1858 server.masterport = atoi(argv[2]);
1859 server.replstate = REDIS_REPL_CONNECT;
d0ccebcf 1860 } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
1861 server.masterauth = zstrdup(argv[1]);
bb0b03a3 1862 } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
85dd2f3a 1863 if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
ed9b544e 1864 err = "argument must be 'yes' or 'no'"; goto loaderr;
1865 }
121f70cf 1866 } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
1867 if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
8ca3e9d1 1868 err = "argument must be 'yes' or 'no'"; goto loaderr;
1869 }
1870 } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
1871 if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
121f70cf 1872 err = "argument must be 'yes' or 'no'"; goto loaderr;
1873 }
bb0b03a3 1874 } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
85dd2f3a 1875 if ((server.daemonize = yesnotoi(argv[1])) == -1) {
ed9b544e 1876 err = "argument must be 'yes' or 'no'"; goto loaderr;
1877 }
44b38ef4 1878 } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
1879 if ((server.appendonly = yesnotoi(argv[1])) == -1) {
1880 err = "argument must be 'yes' or 'no'"; goto loaderr;
1881 }
48f0308a 1882 } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
1766c6da 1883 if (!strcasecmp(argv[1],"no")) {
48f0308a 1884 server.appendfsync = APPENDFSYNC_NO;
1766c6da 1885 } else if (!strcasecmp(argv[1],"always")) {
48f0308a 1886 server.appendfsync = APPENDFSYNC_ALWAYS;
1766c6da 1887 } else if (!strcasecmp(argv[1],"everysec")) {
48f0308a 1888 server.appendfsync = APPENDFSYNC_EVERYSEC;
1889 } else {
1890 err = "argument must be 'no', 'always' or 'everysec'";
1891 goto loaderr;
1892 }
bb0b03a3 1893 } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
054e426d 1894 server.requirepass = zstrdup(argv[1]);
bb0b03a3 1895 } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
500ece7c 1896 zfree(server.pidfile);
054e426d 1897 server.pidfile = zstrdup(argv[1]);
bb0b03a3 1898 } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
500ece7c 1899 zfree(server.dbfilename);
054e426d 1900 server.dbfilename = zstrdup(argv[1]);
75680a3c 1901 } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
1902 if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
1903 err = "argument must be 'yes' or 'no'"; goto loaderr;
1904 }
054e426d 1905 } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
fefed597 1906 zfree(server.vm_swap_file);
054e426d 1907 server.vm_swap_file = zstrdup(argv[1]);
4ef8de8a 1908 } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
2b619329 1909 server.vm_max_memory = memtoll(argv[1],NULL);
4ef8de8a 1910 } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
2b619329 1911 server.vm_page_size = memtoll(argv[1], NULL);
4ef8de8a 1912 } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
2b619329 1913 server.vm_pages = memtoll(argv[1], NULL);
92f8e882 1914 } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
1915 server.vm_max_threads = strtoll(argv[1], NULL, 10);
cbba7dd7 1916 } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
2b619329 1917 server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
cbba7dd7 1918 } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
2b619329 1919 server.hash_max_zipmap_value = memtoll(argv[1], NULL);
ed9b544e 1920 } else {
1921 err = "Bad directive or wrong number of arguments"; goto loaderr;
1922 }
1923 for (j = 0; j < argc; j++)
1924 sdsfree(argv[j]);
1925 zfree(argv);
1926 sdsfree(line);
1927 }
c9a111ac 1928 if (fp != stdin) fclose(fp);
ed9b544e 1929 return;
1930
1931loaderr:
1932 fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
1933 fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
1934 fprintf(stderr, ">>> '%s'\n", line);
1935 fprintf(stderr, "%s\n", err);
1936 exit(1);
1937}
1938
1939static void freeClientArgv(redisClient *c) {
1940 int j;
1941
1942 for (j = 0; j < c->argc; j++)
1943 decrRefCount(c->argv[j]);
e8a74421 1944 for (j = 0; j < c->mbargc; j++)
1945 decrRefCount(c->mbargv[j]);
ed9b544e 1946 c->argc = 0;
e8a74421 1947 c->mbargc = 0;
ed9b544e 1948}
1949
1950static void freeClient(redisClient *c) {
1951 listNode *ln;
1952
4409877e 1953 /* Note that if the client we are freeing is blocked into a blocking
b0d8747d 1954 * call, we have to set querybuf to NULL *before* to call
1955 * unblockClientWaitingData() to avoid processInputBuffer() will get
1956 * called. Also it is important to remove the file events after
1957 * this, because this call adds the READABLE event. */
4409877e 1958 sdsfree(c->querybuf);
1959 c->querybuf = NULL;
1960 if (c->flags & REDIS_BLOCKED)
b0d8747d 1961 unblockClientWaitingData(c);
4409877e 1962
ffc6b7f8 1963 /* Unsubscribe from all the pubsub channels */
1964 pubsubUnsubscribeAllChannels(c,0);
1965 pubsubUnsubscribeAllPatterns(c,0);
1966 dictRelease(c->pubsub_channels);
1967 listRelease(c->pubsub_patterns);
befec3cd 1968 /* Obvious cleanup */
ed9b544e 1969 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
1970 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
ed9b544e 1971 listRelease(c->reply);
1972 freeClientArgv(c);
1973 close(c->fd);
92f8e882 1974 /* Remove from the list of clients */
ed9b544e 1975 ln = listSearchKey(server.clients,c);
dfc5e96c 1976 redisAssert(ln != NULL);
ed9b544e 1977 listDelNode(server.clients,ln);
d5d55fc3 1978 /* Remove from the list of clients waiting for swapped keys */
1979 if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
1980 ln = listSearchKey(server.io_ready_clients,c);
1981 if (ln) {
1982 listDelNode(server.io_ready_clients,ln);
1983 server.vm_blocked_clients--;
1984 }
1985 }
1986 while (server.vm_enabled && listLength(c->io_keys)) {
1987 ln = listFirst(c->io_keys);
1988 dontWaitForSwappedKey(c,ln->value);
92f8e882 1989 }
b3e3d0d7 1990 listRelease(c->io_keys);
befec3cd 1991 /* Master/slave cleanup */
ed9b544e 1992 if (c->flags & REDIS_SLAVE) {
6208b3a7 1993 if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
1994 close(c->repldbfd);
87eca727 1995 list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
1996 ln = listSearchKey(l,c);
dfc5e96c 1997 redisAssert(ln != NULL);
87eca727 1998 listDelNode(l,ln);
ed9b544e 1999 }
2000 if (c->flags & REDIS_MASTER) {
2001 server.master = NULL;
2002 server.replstate = REDIS_REPL_CONNECT;
2003 }
befec3cd 2004 /* Release memory */
93ea3759 2005 zfree(c->argv);
e8a74421 2006 zfree(c->mbargv);
6e469882 2007 freeClientMultiState(c);
ed9b544e 2008 zfree(c);
2009}
2010
cc30e368 2011#define GLUEREPLY_UP_TO (1024)
ed9b544e 2012static void glueReplyBuffersIfNeeded(redisClient *c) {
c28b42ac 2013 int copylen = 0;
2014 char buf[GLUEREPLY_UP_TO];
6208b3a7 2015 listNode *ln;
c7df85a4 2016 listIter li;
ed9b544e 2017 robj *o;
2018
c7df85a4 2019 listRewind(c->reply,&li);
2020 while((ln = listNext(&li))) {
c28b42ac 2021 int objlen;
2022
ed9b544e 2023 o = ln->value;
c28b42ac 2024 objlen = sdslen(o->ptr);
2025 if (copylen + objlen <= GLUEREPLY_UP_TO) {
2026 memcpy(buf+copylen,o->ptr,objlen);
2027 copylen += objlen;
ed9b544e 2028 listDelNode(c->reply,ln);
c28b42ac 2029 } else {
2030 if (copylen == 0) return;
2031 break;
ed9b544e 2032 }
ed9b544e 2033 }
c28b42ac 2034 /* Now the output buffer is empty, add the new single element */
2035 o = createObject(REDIS_STRING,sdsnewlen(buf,copylen));
2036 listAddNodeHead(c->reply,o);
ed9b544e 2037}
2038
2039static void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2040 redisClient *c = privdata;
2041 int nwritten = 0, totwritten = 0, objlen;
2042 robj *o;
2043 REDIS_NOTUSED(el);
2044 REDIS_NOTUSED(mask);
2045
2895e862 2046 /* Use writev() if we have enough buffers to send */
7ea870c0 2047 if (!server.glueoutputbuf &&
e0a62c7f 2048 listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
7ea870c0 2049 !(c->flags & REDIS_MASTER))
2895e862 2050 {
2051 sendReplyToClientWritev(el, fd, privdata, mask);
2052 return;
2053 }
2895e862 2054
ed9b544e 2055 while(listLength(c->reply)) {
c28b42ac 2056 if (server.glueoutputbuf && listLength(c->reply) > 1)
2057 glueReplyBuffersIfNeeded(c);
2058
ed9b544e 2059 o = listNodeValue(listFirst(c->reply));
2060 objlen = sdslen(o->ptr);
2061
2062 if (objlen == 0) {
2063 listDelNode(c->reply,listFirst(c->reply));
2064 continue;
2065 }
2066
2067 if (c->flags & REDIS_MASTER) {
6f376729 2068 /* Don't reply to a master */
ed9b544e 2069 nwritten = objlen - c->sentlen;
2070 } else {
a4d1ba9a 2071 nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
ed9b544e 2072 if (nwritten <= 0) break;
2073 }
2074 c->sentlen += nwritten;
2075 totwritten += nwritten;
2076 /* If we fully sent the object on head go to the next one */
2077 if (c->sentlen == objlen) {
2078 listDelNode(c->reply,listFirst(c->reply));
2079 c->sentlen = 0;
2080 }
6f376729 2081 /* Note that we avoid to send more thank REDIS_MAX_WRITE_PER_EVENT
12f9d551 2082 * bytes, in a single threaded server it's a good idea to serve
6f376729 2083 * other clients as well, even if a very large request comes from
2084 * super fast link that is always able to accept data (in real world
12f9d551 2085 * scenario think about 'KEYS *' against the loopback interfae) */
6f376729 2086 if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
ed9b544e 2087 }
2088 if (nwritten == -1) {
2089 if (errno == EAGAIN) {
2090 nwritten = 0;
2091 } else {
f870935d 2092 redisLog(REDIS_VERBOSE,
ed9b544e 2093 "Error writing to client: %s", strerror(errno));
2094 freeClient(c);
2095 return;
2096 }
2097 }
2098 if (totwritten > 0) c->lastinteraction = time(NULL);
2099 if (listLength(c->reply) == 0) {
2100 c->sentlen = 0;
2101 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2102 }
2103}
2104
2895e862 2105static void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
2106{
2107 redisClient *c = privdata;
2108 int nwritten = 0, totwritten = 0, objlen, willwrite;
2109 robj *o;
2110 struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
2111 int offset, ion = 0;
2112 REDIS_NOTUSED(el);
2113 REDIS_NOTUSED(mask);
2114
2115 listNode *node;
2116 while (listLength(c->reply)) {
2117 offset = c->sentlen;
2118 ion = 0;
2119 willwrite = 0;
2120
2121 /* fill-in the iov[] array */
2122 for(node = listFirst(c->reply); node; node = listNextNode(node)) {
2123 o = listNodeValue(node);
2124 objlen = sdslen(o->ptr);
2125
e0a62c7f 2126 if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
2895e862 2127 break;
2128
2129 if(ion == REDIS_WRITEV_IOVEC_COUNT)
2130 break; /* no more iovecs */
2131
2132 iov[ion].iov_base = ((char*)o->ptr) + offset;
2133 iov[ion].iov_len = objlen - offset;
2134 willwrite += objlen - offset;
2135 offset = 0; /* just for the first item */
2136 ion++;
2137 }
2138
2139 if(willwrite == 0)
2140 break;
2141
2142 /* write all collected blocks at once */
2143 if((nwritten = writev(fd, iov, ion)) < 0) {
2144 if (errno != EAGAIN) {
f870935d 2145 redisLog(REDIS_VERBOSE,
2895e862 2146 "Error writing to client: %s", strerror(errno));
2147 freeClient(c);
2148 return;
2149 }
2150 break;
2151 }
2152
2153 totwritten += nwritten;
2154 offset = c->sentlen;
2155
2156 /* remove written robjs from c->reply */
2157 while (nwritten && listLength(c->reply)) {
2158 o = listNodeValue(listFirst(c->reply));
2159 objlen = sdslen(o->ptr);
2160
2161 if(nwritten >= objlen - offset) {
2162 listDelNode(c->reply, listFirst(c->reply));
2163 nwritten -= objlen - offset;
2164 c->sentlen = 0;
2165 } else {
2166 /* partial write */
2167 c->sentlen += nwritten;
2168 break;
2169 }
2170 offset = 0;
2171 }
2172 }
2173
e0a62c7f 2174 if (totwritten > 0)
2895e862 2175 c->lastinteraction = time(NULL);
2176
2177 if (listLength(c->reply) == 0) {
2178 c->sentlen = 0;
2179 aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
2180 }
2181}
2182
ed9b544e 2183static struct redisCommand *lookupCommand(char *name) {
2184 int j = 0;
2185 while(cmdTable[j].name != NULL) {
bb0b03a3 2186 if (!strcasecmp(name,cmdTable[j].name)) return &cmdTable[j];
ed9b544e 2187 j++;
2188 }
2189 return NULL;
2190}
2191
2192/* resetClient prepare the client to process the next command */
2193static void resetClient(redisClient *c) {
2194 freeClientArgv(c);
2195 c->bulklen = -1;
e8a74421 2196 c->multibulk = 0;
ed9b544e 2197}
2198
6e469882 2199/* Call() is the core of Redis execution of a command */
2200static void call(redisClient *c, struct redisCommand *cmd) {
2201 long long dirty;
2202
2203 dirty = server.dirty;
2204 cmd->proc(c);
4005fef1 2205 dirty = server.dirty-dirty;
2206
2207 if (server.appendonly && dirty)
6e469882 2208 feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
4005fef1 2209 if ((dirty || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
2210 listLength(server.slaves))
248ea310 2211 replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
6e469882 2212 if (listLength(server.monitors))
248ea310 2213 replicationFeedSlaves(server.monitors,c->db->id,c->argv,c->argc);
6e469882 2214 server.stat_numcommands++;
2215}
2216
/* If this function gets called we already read a whole
 * command, arguments are in the client argv/argc fields.
 * processCommand() executes the command or prepares the
 * server for a bulk read from the client.
 *
 * If 1 is returned the client is still alive and valid and
 * other operations can be performed by the caller. Otherwise
 * if 0 is returned the client was destroyed (i.e. after QUIT). */
2225static int processCommand(redisClient *c) {
2226 struct redisCommand *cmd;
ed9b544e 2227
3fd78bcd 2228 /* Free some memory if needed (maxmemory setting) */
2229 if (server.maxmemory) freeMemoryIfNeeded();
2230
e8a74421 2231 /* Handle the multi bulk command type. This is an alternative protocol
2232 * supported by Redis in order to receive commands that are composed of
2233 * multiple binary-safe "bulk" arguments. The latency of processing is
2234 * a bit higher but this allows things like multi-sets, so if this
2235 * protocol is used only for MSET and similar commands this is a big win. */
2236 if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
2237 c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
2238 if (c->multibulk <= 0) {
2239 resetClient(c);
2240 return 1;
2241 } else {
2242 decrRefCount(c->argv[c->argc-1]);
2243 c->argc--;
2244 return 1;
2245 }
2246 } else if (c->multibulk) {
2247 if (c->bulklen == -1) {
2248 if (((char*)c->argv[0]->ptr)[0] != '$') {
2249 addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
2250 resetClient(c);
2251 return 1;
2252 } else {
2253 int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
2254 decrRefCount(c->argv[0]);
2255 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2256 c->argc--;
2257 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2258 resetClient(c);
2259 return 1;
2260 }
2261 c->argc--;
2262 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2263 return 1;
2264 }
2265 } else {
2266 c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
2267 c->mbargv[c->mbargc] = c->argv[0];
2268 c->mbargc++;
2269 c->argc--;
2270 c->multibulk--;
2271 if (c->multibulk == 0) {
2272 robj **auxargv;
2273 int auxargc;
2274
2275 /* Here we need to swap the multi-bulk argc/argv with the
2276 * normal argc/argv of the client structure. */
2277 auxargv = c->argv;
2278 c->argv = c->mbargv;
2279 c->mbargv = auxargv;
2280
2281 auxargc = c->argc;
2282 c->argc = c->mbargc;
2283 c->mbargc = auxargc;
2284
2285 /* We need to set bulklen to something different than -1
2286 * in order for the code below to process the command without
2287 * to try to read the last argument of a bulk command as
2288 * a special argument. */
2289 c->bulklen = 0;
2290 /* continue below and process the command */
2291 } else {
2292 c->bulklen = -1;
2293 return 1;
2294 }
2295 }
2296 }
2297 /* -- end of multi bulk commands processing -- */
2298
ed9b544e 2299 /* The QUIT command is handled as a special case. Normal command
2300 * procs are unable to close the client connection safely */
bb0b03a3 2301 if (!strcasecmp(c->argv[0]->ptr,"quit")) {
ed9b544e 2302 freeClient(c);
2303 return 0;
2304 }
d5d55fc3 2305
2306 /* Now lookup the command and check ASAP about trivial error conditions
2307 * such wrong arity, bad command name and so forth. */
ed9b544e 2308 cmd = lookupCommand(c->argv[0]->ptr);
2309 if (!cmd) {
2c14807b 2310 addReplySds(c,
2311 sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
2312 (char*)c->argv[0]->ptr));
ed9b544e 2313 resetClient(c);
2314 return 1;
2315 } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
2316 (c->argc < -cmd->arity)) {
454d4e43 2317 addReplySds(c,
2318 sdscatprintf(sdsempty(),
2319 "-ERR wrong number of arguments for '%s' command\r\n",
2320 cmd->name));
ed9b544e 2321 resetClient(c);
2322 return 1;
2323 } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
d5d55fc3 2324 /* This is a bulk command, we have to read the last argument yet. */
ed9b544e 2325 int bulklen = atoi(c->argv[c->argc-1]->ptr);
2326
2327 decrRefCount(c->argv[c->argc-1]);
2328 if (bulklen < 0 || bulklen > 1024*1024*1024) {
2329 c->argc--;
2330 addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
2331 resetClient(c);
2332 return 1;
2333 }
2334 c->argc--;
2335 c->bulklen = bulklen+2; /* add two bytes for CR+LF */
2336 /* It is possible that the bulk read is already in the
8d0490e7 2337 * buffer. Check this condition and handle it accordingly.
2338 * This is just a fast path, alternative to call processInputBuffer().
2339 * It's a good idea since the code is small and this condition
2340 * happens most of the times. */
ed9b544e 2341 if ((signed)sdslen(c->querybuf) >= c->bulklen) {
2342 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2343 c->argc++;
2344 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
2345 } else {
d5d55fc3 2346 /* Otherwise return... there is to read the last argument
2347 * from the socket. */
ed9b544e 2348 return 1;
2349 }
2350 }
942a3961 2351 /* Let's try to encode the bulk object to save space. */
2352 if (cmd->flags & REDIS_CMD_BULK)
05df7621 2353 c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
942a3961 2354
e63943a4 2355 /* Check if the user is authenticated */
2356 if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
2357 addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
2358 resetClient(c);
2359 return 1;
2360 }
2361
b61a28fe 2362 /* Handle the maxmemory directive */
2363 if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
2364 zmalloc_used_memory() > server.maxmemory)
2365 {
2366 addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
2367 resetClient(c);
2368 return 1;
2369 }
2370
d6cc8867 2371 /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
e6cca5db 2372 if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
2373 &&
ffc6b7f8 2374 cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
2375 cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
2376 addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
d6cc8867 2377 resetClient(c);
2378 return 1;
2379 }
2380
ed9b544e 2381 /* Exec the command */
18b6cb76 2382 if (c->flags & REDIS_MULTI && cmd->proc != execCommand && cmd->proc != discardCommand) {
6e469882 2383 queueMultiCommand(c,cmd);
2384 addReply(c,shared.queued);
2385 } else {
d5d55fc3 2386 if (server.vm_enabled && server.vm_max_threads > 0 &&
2387 blockClientOnSwappedKeys(cmd,c)) return 1;
6e469882 2388 call(c,cmd);
2389 }
ed9b544e 2390
2391 /* Prepare the client for the next command */
ed9b544e 2392 resetClient(c);
2393 return 1;
2394}
2395
248ea310 2396static void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
6208b3a7 2397 listNode *ln;
c7df85a4 2398 listIter li;
ed9b544e 2399 int outc = 0, j;
93ea3759 2400 robj **outv;
248ea310 2401 /* We need 1+(ARGS*3) objects since commands are using the new protocol
2402 * and we one 1 object for the first "*<count>\r\n" multibulk count, then
2403 * for every additional object we have "$<count>\r\n" + object + "\r\n". */
2404 robj *static_outv[REDIS_STATIC_ARGS*3+1];
2405 robj *lenobj;
93ea3759 2406
2407 if (argc <= REDIS_STATIC_ARGS) {
2408 outv = static_outv;
2409 } else {
248ea310 2410 outv = zmalloc(sizeof(robj*)*(argc*3+1));
93ea3759 2411 }
248ea310 2412
2413 lenobj = createObject(REDIS_STRING,
2414 sdscatprintf(sdsempty(), "*%d\r\n", argc));
2415 lenobj->refcount = 0;
2416 outv[outc++] = lenobj;
ed9b544e 2417 for (j = 0; j < argc; j++) {
248ea310 2418 lenobj = createObject(REDIS_STRING,
2419 sdscatprintf(sdsempty(),"$%lu\r\n",
2420 (unsigned long) stringObjectLen(argv[j])));
2421 lenobj->refcount = 0;
2422 outv[outc++] = lenobj;
ed9b544e 2423 outv[outc++] = argv[j];
248ea310 2424 outv[outc++] = shared.crlf;
ed9b544e 2425 }
ed9b544e 2426
40d224a9 2427 /* Increment all the refcounts at start and decrement at end in order to
2428 * be sure to free objects if there is no slave in a replication state
2429 * able to be feed with commands */
2430 for (j = 0; j < outc; j++) incrRefCount(outv[j]);
c7df85a4 2431 listRewind(slaves,&li);
2432 while((ln = listNext(&li))) {
ed9b544e 2433 redisClient *slave = ln->value;
40d224a9 2434
2435 /* Don't feed slaves that are still waiting for BGSAVE to start */
6208b3a7 2436 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
40d224a9 2437
2438 /* Feed all the other slaves, MONITORs and so on */
ed9b544e 2439 if (slave->slaveseldb != dictid) {
2440 robj *selectcmd;
2441
2442 switch(dictid) {
2443 case 0: selectcmd = shared.select0; break;
2444 case 1: selectcmd = shared.select1; break;
2445 case 2: selectcmd = shared.select2; break;
2446 case 3: selectcmd = shared.select3; break;
2447 case 4: selectcmd = shared.select4; break;
2448 case 5: selectcmd = shared.select5; break;
2449 case 6: selectcmd = shared.select6; break;
2450 case 7: selectcmd = shared.select7; break;
2451 case 8: selectcmd = shared.select8; break;
2452 case 9: selectcmd = shared.select9; break;
2453 default:
2454 selectcmd = createObject(REDIS_STRING,
2455 sdscatprintf(sdsempty(),"select %d\r\n",dictid));
2456 selectcmd->refcount = 0;
2457 break;
2458 }
2459 addReply(slave,selectcmd);
2460 slave->slaveseldb = dictid;
2461 }
2462 for (j = 0; j < outc; j++) addReply(slave,outv[j]);
ed9b544e 2463 }
40d224a9 2464 for (j = 0; j < outc; j++) decrRefCount(outv[j]);
93ea3759 2465 if (outv != static_outv) zfree(outv);
ed9b544e 2466}
2467
638e42ac 2468static void processInputBuffer(redisClient *c) {
ed9b544e 2469again:
4409877e 2470 /* Before to process the input buffer, make sure the client is not
2471 * waitig for a blocking operation such as BLPOP. Note that the first
2472 * iteration the client is never blocked, otherwise the processInputBuffer
2473 * would not be called at all, but after the execution of the first commands
2474 * in the input buffer the client may be blocked, and the "goto again"
2475 * will try to reiterate. The following line will make it return asap. */
92f8e882 2476 if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
ed9b544e 2477 if (c->bulklen == -1) {
2478 /* Read the first line of the query */
2479 char *p = strchr(c->querybuf,'\n');
2480 size_t querylen;
644fafa3 2481
ed9b544e 2482 if (p) {
2483 sds query, *argv;
2484 int argc, j;
e0a62c7f 2485
ed9b544e 2486 query = c->querybuf;
2487 c->querybuf = sdsempty();
2488 querylen = 1+(p-(query));
2489 if (sdslen(query) > querylen) {
2490 /* leave data after the first line of the query in the buffer */
2491 c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
2492 }
2493 *p = '\0'; /* remove "\n" */
2494 if (*(p-1) == '\r') *(p-1) = '\0'; /* and "\r" if any */
2495 sdsupdatelen(query);
2496
2497 /* Now we can split the query in arguments */
ed9b544e 2498 argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
93ea3759 2499 sdsfree(query);
2500
2501 if (c->argv) zfree(c->argv);
2502 c->argv = zmalloc(sizeof(robj*)*argc);
93ea3759 2503
2504 for (j = 0; j < argc; j++) {
ed9b544e 2505 if (sdslen(argv[j])) {
2506 c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
2507 c->argc++;
2508 } else {
2509 sdsfree(argv[j]);
2510 }
2511 }
2512 zfree(argv);
7c49733c 2513 if (c->argc) {
2514 /* Execute the command. If the client is still valid
2515 * after processCommand() return and there is something
2516 * on the query buffer try to process the next command. */
2517 if (processCommand(c) && sdslen(c->querybuf)) goto again;
2518 } else {
2519 /* Nothing to process, argc == 0. Just process the query
2520 * buffer if it's not empty or return to the caller */
2521 if (sdslen(c->querybuf)) goto again;
2522 }
ed9b544e 2523 return;
644fafa3 2524 } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
f870935d 2525 redisLog(REDIS_VERBOSE, "Client protocol error");
ed9b544e 2526 freeClient(c);
2527 return;
2528 }
2529 } else {
2530 /* Bulk read handling. Note that if we are at this point
2531 the client already sent a command terminated with a newline,
2532 we are reading the bulk data that is actually the last
2533 argument of the command. */
2534 int qbl = sdslen(c->querybuf);
2535
2536 if (c->bulklen <= qbl) {
2537 /* Copy everything but the final CRLF as final argument */
2538 c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
2539 c->argc++;
2540 c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
638e42ac 2541 /* Process the command. If the client is still valid after
2542 * the processing and there is more data in the buffer
2543 * try to parse it. */
2544 if (processCommand(c) && sdslen(c->querybuf)) goto again;
ed9b544e 2545 return;
2546 }
2547 }
2548}
2549
638e42ac 2550static void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
2551 redisClient *c = (redisClient*) privdata;
2552 char buf[REDIS_IOBUF_LEN];
2553 int nread;
2554 REDIS_NOTUSED(el);
2555 REDIS_NOTUSED(mask);
2556
2557 nread = read(fd, buf, REDIS_IOBUF_LEN);
2558 if (nread == -1) {
2559 if (errno == EAGAIN) {
2560 nread = 0;
2561 } else {
f870935d 2562 redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
638e42ac 2563 freeClient(c);
2564 return;
2565 }
2566 } else if (nread == 0) {
f870935d 2567 redisLog(REDIS_VERBOSE, "Client closed connection");
638e42ac 2568 freeClient(c);
2569 return;
2570 }
2571 if (nread) {
2572 c->querybuf = sdscatlen(c->querybuf, buf, nread);
2573 c->lastinteraction = time(NULL);
2574 } else {
2575 return;
2576 }
168ac5c6 2577 processInputBuffer(c);
638e42ac 2578}
2579
ed9b544e 2580static int selectDb(redisClient *c, int id) {
2581 if (id < 0 || id >= server.dbnum)
2582 return REDIS_ERR;
3305306f 2583 c->db = &server.db[id];
ed9b544e 2584 return REDIS_OK;
2585}
2586
40d224a9 2587static void *dupClientReplyValue(void *o) {
2588 incrRefCount((robj*)o);
12d090d2 2589 return o;
40d224a9 2590}
2591
/* List match method: returns non-zero when the two string objects
 * compare equal according to compareStringObjects(). */
static int listMatchObjects(void *a, void *b) {
    int cmp = compareStringObjects(a,b);

    return cmp == 0;
}
2595
ed9b544e 2596static redisClient *createClient(int fd) {
2597 redisClient *c = zmalloc(sizeof(*c));
2598
2599 anetNonBlock(NULL,fd);
2600 anetTcpNoDelay(NULL,fd);
2601 if (!c) return NULL;
2602 selectDb(c,0);
2603 c->fd = fd;
2604 c->querybuf = sdsempty();
2605 c->argc = 0;
93ea3759 2606 c->argv = NULL;
ed9b544e 2607 c->bulklen = -1;
e8a74421 2608 c->multibulk = 0;
2609 c->mbargc = 0;
2610 c->mbargv = NULL;
ed9b544e 2611 c->sentlen = 0;
2612 c->flags = 0;
2613 c->lastinteraction = time(NULL);
abcb223e 2614 c->authenticated = 0;
40d224a9 2615 c->replstate = REDIS_REPL_NONE;
6b47e12e 2616 c->reply = listCreate();
ed9b544e 2617 listSetFreeMethod(c->reply,decrRefCount);
40d224a9 2618 listSetDupMethod(c->reply,dupClientReplyValue);
92f8e882 2619 c->blockingkeys = NULL;
2620 c->blockingkeysnum = 0;
2621 c->io_keys = listCreate();
2622 listSetFreeMethod(c->io_keys,decrRefCount);
ffc6b7f8 2623 c->pubsub_channels = dictCreate(&setDictType,NULL);
2624 c->pubsub_patterns = listCreate();
2625 listSetFreeMethod(c->pubsub_patterns,decrRefCount);
2626 listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
ed9b544e 2627 if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
266373b2 2628 readQueryFromClient, c) == AE_ERR) {
ed9b544e 2629 freeClient(c);
2630 return NULL;
2631 }
6b47e12e 2632 listAddNodeTail(server.clients,c);
6e469882 2633 initClientMultiState(c);
ed9b544e 2634 return c;
2635}
2636
2637static void addReply(redisClient *c, robj *obj) {
2638 if (listLength(c->reply) == 0 &&
6208b3a7 2639 (c->replstate == REDIS_REPL_NONE ||
2640 c->replstate == REDIS_REPL_ONLINE) &&
ed9b544e 2641 aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
266373b2 2642 sendReplyToClient, c) == AE_ERR) return;
e3cadb8a 2643
2644 if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
2645 obj = dupStringObject(obj);
2646 obj->refcount = 0; /* getDecodedObject() will increment the refcount */
2647 }
9d65a1bb 2648 listAddNodeTail(c->reply,getDecodedObject(obj));
ed9b544e 2649}
2650
2651static void addReplySds(redisClient *c, sds s) {
2652 robj *o = createObject(REDIS_STRING,s);
2653 addReply(c,o);
2654 decrRefCount(o);
2655}
2656
e2665397 2657static void addReplyDouble(redisClient *c, double d) {
2658 char buf[128];
2659
2660 snprintf(buf,sizeof(buf),"%.17g",d);
682ac724 2661 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
83c6a618 2662 (unsigned long) strlen(buf),buf));
e2665397 2663}
2664
f44dd428 2665static void addReplyLong(redisClient *c, long l) {
2666 char buf[128];
2667 size_t len;
2668
dd88747b 2669 if (l == 0) {
2670 addReply(c,shared.czero);
2671 return;
2672 } else if (l == 1) {
2673 addReply(c,shared.cone);
2674 return;
2675 }
f44dd428 2676 len = snprintf(buf,sizeof(buf),":%ld\r\n",l);
2677 addReplySds(c,sdsnewlen(buf,len));
2678}
2679
aa7c2934
PN
2680static void addReplyLongLong(redisClient *c, long long ll) {
2681 char buf[128];
2682 size_t len;
2683
2684 if (ll == 0) {
2685 addReply(c,shared.czero);
2686 return;
2687 } else if (ll == 1) {
2688 addReply(c,shared.cone);
2689 return;
2690 }
2691 len = snprintf(buf,sizeof(buf),":%lld\r\n",ll);
2692 addReplySds(c,sdsnewlen(buf,len));
2693}
2694
92b27fe9 2695static void addReplyUlong(redisClient *c, unsigned long ul) {
2696 char buf[128];
2697 size_t len;
2698
dd88747b 2699 if (ul == 0) {
2700 addReply(c,shared.czero);
2701 return;
2702 } else if (ul == 1) {
2703 addReply(c,shared.cone);
2704 return;
2705 }
92b27fe9 2706 len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
2707 addReplySds(c,sdsnewlen(buf,len));
2708}
2709
942a3961 2710static void addReplyBulkLen(redisClient *c, robj *obj) {
2711 size_t len;
2712
2713 if (obj->encoding == REDIS_ENCODING_RAW) {
2714 len = sdslen(obj->ptr);
2715 } else {
2716 long n = (long)obj->ptr;
2717
e054afda 2718 /* Compute how many bytes will take this integer as a radix 10 string */
942a3961 2719 len = 1;
2720 if (n < 0) {
2721 len++;
2722 n = -n;
2723 }
2724 while((n = n/10) != 0) {
2725 len++;
2726 }
2727 }
83c6a618 2728 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",(unsigned long)len));
942a3961 2729}
2730
dd88747b 2731static void addReplyBulk(redisClient *c, robj *obj) {
2732 addReplyBulkLen(c,obj);
2733 addReply(c,obj);
2734 addReply(c,shared.crlf);
2735}
2736
500ece7c 2737/* In the CONFIG command we need to add vanilla C string as bulk replies */
2738static void addReplyBulkCString(redisClient *c, char *s) {
2739 if (s == NULL) {
2740 addReply(c,shared.nullbulk);
2741 } else {
2742 robj *o = createStringObject(s,strlen(s));
2743 addReplyBulk(c,o);
2744 decrRefCount(o);
2745 }
2746}
2747
ed9b544e 2748static void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
2749 int cport, cfd;
2750 char cip[128];
285add55 2751 redisClient *c;
ed9b544e 2752 REDIS_NOTUSED(el);
2753 REDIS_NOTUSED(mask);
2754 REDIS_NOTUSED(privdata);
2755
2756 cfd = anetAccept(server.neterr, fd, cip, &cport);
2757 if (cfd == AE_ERR) {
f870935d 2758 redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
ed9b544e 2759 return;
2760 }
f870935d 2761 redisLog(REDIS_VERBOSE,"Accepted %s:%d", cip, cport);
285add55 2762 if ((c = createClient(cfd)) == NULL) {
ed9b544e 2763 redisLog(REDIS_WARNING,"Error allocating resoures for the client");
2764 close(cfd); /* May be already closed, just ingore errors */
2765 return;
2766 }
285add55 2767 /* If maxclient directive is set and this is one client more... close the
2768 * connection. Note that we create the client instead to check before
2769 * for this condition, since now the socket is already set in nonblocking
2770 * mode and we can send an error for free using the Kernel I/O */
2771 if (server.maxclients && listLength(server.clients) > server.maxclients) {
2772 char *err = "-ERR max number of clients reached\r\n";
2773
2774 /* That's a best effort error message, don't check write errors */
fee803ba 2775 if (write(c->fd,err,strlen(err)) == -1) {
2776 /* Nothing to do, Just to avoid the warning... */
2777 }
285add55 2778 freeClient(c);
2779 return;
2780 }
ed9b544e 2781 server.stat_numconnections++;
2782}
2783
2784/* ======================= Redis objects implementation ===================== */
2785
2786static robj *createObject(int type, void *ptr) {
2787 robj *o;
2788
a5819310 2789 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2790 if (listLength(server.objfreelist)) {
2791 listNode *head = listFirst(server.objfreelist);
2792 o = listNodeValue(head);
2793 listDelNode(server.objfreelist,head);
a5819310 2794 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2795 } else {
75680a3c 2796 if (server.vm_enabled) {
a5819310 2797 pthread_mutex_unlock(&server.obj_freelist_mutex);
75680a3c 2798 o = zmalloc(sizeof(*o));
2799 } else {
2800 o = zmalloc(sizeof(*o)-sizeof(struct redisObjectVM));
2801 }
ed9b544e 2802 }
ed9b544e 2803 o->type = type;
942a3961 2804 o->encoding = REDIS_ENCODING_RAW;
ed9b544e 2805 o->ptr = ptr;
2806 o->refcount = 1;
3a66edc7 2807 if (server.vm_enabled) {
1064ef87 2808 /* Note that this code may run in the context of an I/O thread
2809 * and accessing to server.unixtime in theory is an error
2810 * (no locks). But in practice this is safe, and even if we read
2811 * garbage Redis will not fail, as it's just a statistical info */
3a66edc7 2812 o->vm.atime = server.unixtime;
2813 o->storage = REDIS_VM_MEMORY;
2814 }
ed9b544e 2815 return o;
2816}
2817
2818static robj *createStringObject(char *ptr, size_t len) {
2819 return createObject(REDIS_STRING,sdsnewlen(ptr,len));
2820}
2821
3f973463
PN
2822static robj *createStringObjectFromLongLong(long long value) {
2823 robj *o;
2824 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
2825 incrRefCount(shared.integers[value]);
2826 o = shared.integers[value];
2827 } else {
2828 o = createObject(REDIS_STRING, NULL);
2829 if (value >= LONG_MIN && value <= LONG_MAX) {
2830 o->encoding = REDIS_ENCODING_INT;
2831 o->ptr = (void*)((long)value);
2832 } else {
2833 o->ptr = sdscatprintf(sdsempty(),"%lld",value);
2834 }
2835 }
2836 return o;
2837}
2838
4ef8de8a 2839static robj *dupStringObject(robj *o) {
b9bc0eef 2840 assert(o->encoding == REDIS_ENCODING_RAW);
4ef8de8a 2841 return createStringObject(o->ptr,sdslen(o->ptr));
2842}
2843
ed9b544e 2844static robj *createListObject(void) {
2845 list *l = listCreate();
2846
ed9b544e 2847 listSetFreeMethod(l,decrRefCount);
2848 return createObject(REDIS_LIST,l);
2849}
2850
2851static robj *createSetObject(void) {
2852 dict *d = dictCreate(&setDictType,NULL);
ed9b544e 2853 return createObject(REDIS_SET,d);
2854}
2855
5234952b 2856static robj *createHashObject(void) {
2857 /* All the Hashes start as zipmaps. Will be automatically converted
2858 * into hash tables if there are enough elements or big elements
2859 * inside. */
2860 unsigned char *zm = zipmapNew();
2861 robj *o = createObject(REDIS_HASH,zm);
2862 o->encoding = REDIS_ENCODING_ZIPMAP;
2863 return o;
2864}
2865
1812e024 2866static robj *createZsetObject(void) {
6b47e12e 2867 zset *zs = zmalloc(sizeof(*zs));
2868
2869 zs->dict = dictCreate(&zsetDictType,NULL);
2870 zs->zsl = zslCreate();
2871 return createObject(REDIS_ZSET,zs);
1812e024 2872}
2873
ed9b544e 2874static void freeStringObject(robj *o) {
942a3961 2875 if (o->encoding == REDIS_ENCODING_RAW) {
2876 sdsfree(o->ptr);
2877 }
ed9b544e 2878}
2879
2880static void freeListObject(robj *o) {
2881 listRelease((list*) o->ptr);
2882}
2883
2884static void freeSetObject(robj *o) {
2885 dictRelease((dict*) o->ptr);
2886}
2887
fd8ccf44 2888static void freeZsetObject(robj *o) {
2889 zset *zs = o->ptr;
2890
2891 dictRelease(zs->dict);
2892 zslFree(zs->zsl);
2893 zfree(zs);
2894}
2895
ed9b544e 2896static void freeHashObject(robj *o) {
cbba7dd7 2897 switch (o->encoding) {
2898 case REDIS_ENCODING_HT:
2899 dictRelease((dict*) o->ptr);
2900 break;
2901 case REDIS_ENCODING_ZIPMAP:
2902 zfree(o->ptr);
2903 break;
2904 default:
f83c6cb5 2905 redisPanic("Unknown hash encoding type");
cbba7dd7 2906 break;
2907 }
ed9b544e 2908}
2909
2910static void incrRefCount(robj *o) {
2911 o->refcount++;
2912}
2913
2914static void decrRefCount(void *obj) {
2915 robj *o = obj;
94754ccc 2916
c651fd9e 2917 if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
970e10bb 2918 /* Object is a key of a swapped out value, or in the process of being
2919 * loaded. */
996cb5f7 2920 if (server.vm_enabled &&
2921 (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
2922 {
996cb5f7 2923 if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(obj);
f2b8ab34 2924 redisAssert(o->type == REDIS_STRING);
a35ddf12 2925 freeStringObject(o);
2926 vmMarkPagesFree(o->vm.page,o->vm.usedpages);
a5819310 2927 pthread_mutex_lock(&server.obj_freelist_mutex);
a35ddf12 2928 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2929 !listAddNodeHead(server.objfreelist,o))
2930 zfree(o);
a5819310 2931 pthread_mutex_unlock(&server.obj_freelist_mutex);
7d98e08c 2932 server.vm_stats_swapped_objects--;
a35ddf12 2933 return;
2934 }
996cb5f7 2935 /* Object is in memory, or in the process of being swapped out. */
ed9b544e 2936 if (--(o->refcount) == 0) {
996cb5f7 2937 if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
2938 vmCancelThreadedIOJob(obj);
ed9b544e 2939 switch(o->type) {
2940 case REDIS_STRING: freeStringObject(o); break;
2941 case REDIS_LIST: freeListObject(o); break;
2942 case REDIS_SET: freeSetObject(o); break;
fd8ccf44 2943 case REDIS_ZSET: freeZsetObject(o); break;
ed9b544e 2944 case REDIS_HASH: freeHashObject(o); break;
f83c6cb5 2945 default: redisPanic("Unknown object type"); break;
ed9b544e 2946 }
a5819310 2947 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
ed9b544e 2948 if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
2949 !listAddNodeHead(server.objfreelist,o))
2950 zfree(o);
a5819310 2951 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
ed9b544e 2952 }
2953}
2954
942a3961 2955static robj *lookupKey(redisDb *db, robj *key) {
2956 dictEntry *de = dictFind(db->dict,key);
3a66edc7 2957 if (de) {
55cf8433 2958 robj *key = dictGetEntryKey(de);
2959 robj *val = dictGetEntryVal(de);
3a66edc7 2960
55cf8433 2961 if (server.vm_enabled) {
996cb5f7 2962 if (key->storage == REDIS_VM_MEMORY ||
2963 key->storage == REDIS_VM_SWAPPING)
2964 {
2965 /* If we were swapping the object out, stop it, this key
2966 * was requested. */
2967 if (key->storage == REDIS_VM_SWAPPING)
2968 vmCancelThreadedIOJob(key);
55cf8433 2969 /* Update the access time of the key for the aging algorithm. */
2970 key->vm.atime = server.unixtime;
2971 } else {
d5d55fc3 2972 int notify = (key->storage == REDIS_VM_LOADING);
2973
55cf8433 2974 /* Our value was swapped on disk. Bring it at home. */
f2b8ab34 2975 redisAssert(val == NULL);
55cf8433 2976 val = vmLoadObject(key);
2977 dictGetEntryVal(de) = val;
d5d55fc3 2978
2979 /* Clients blocked by the VM subsystem may be waiting for
2980 * this key... */
2981 if (notify) handleClientsBlockedOnSwappedKey(db,key);
55cf8433 2982 }
2983 }
2984 return val;
3a66edc7 2985 } else {
2986 return NULL;
2987 }
942a3961 2988}
2989
2990static robj *lookupKeyRead(redisDb *db, robj *key) {
2991 expireIfNeeded(db,key);
2992 return lookupKey(db,key);
2993}
2994
2995static robj *lookupKeyWrite(redisDb *db, robj *key) {
2996 deleteIfVolatile(db,key);
2997 return lookupKey(db,key);
2998}
2999
92b27fe9 3000static robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
3001 robj *o = lookupKeyRead(c->db, key);
3002 if (!o) addReply(c,reply);
3003 return o;
3004}
3005
3006static robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
3007 robj *o = lookupKeyWrite(c->db, key);
3008 if (!o) addReply(c,reply);
3009 return o;
3010}
3011
3012static int checkType(redisClient *c, robj *o, int type) {
3013 if (o->type != type) {
3014 addReply(c,shared.wrongtypeerr);
3015 return 1;
3016 }
3017 return 0;
3018}
3019
942a3961 3020static int deleteKey(redisDb *db, robj *key) {
3021 int retval;
3022
3023 /* We need to protect key from destruction: after the first dictDelete()
3024 * it may happen that 'key' is no longer valid if we don't increment
3025 * it's count. This may happen when we get the object reference directly
3026 * from the hash table with dictRandomKey() or dict iterators */
3027 incrRefCount(key);
3028 if (dictSize(db->expires)) dictDelete(db->expires,key);
3029 retval = dictDelete(db->dict,key);
3030 decrRefCount(key);
3031
3032 return retval == DICT_OK;
3033}
3034
724a51b1 3035/* Check if the nul-terminated string 's' can be represented by a long
3036 * (that is, is a number that fits into long without any other space or
3037 * character before or after the digits).
3038 *
3039 * If so, the function returns REDIS_OK and *longval is set to the value
3040 * of the number. Otherwise REDIS_ERR is returned */
f69f2cba 3041static int isStringRepresentableAsLong(sds s, long *longval) {
724a51b1 3042 char buf[32], *endptr;
3043 long value;
3044 int slen;
e0a62c7f 3045
724a51b1 3046 value = strtol(s, &endptr, 10);
3047 if (endptr[0] != '\0') return REDIS_ERR;
3048 slen = snprintf(buf,32,"%ld",value);
3049
3050 /* If the number converted back into a string is not identical
3051 * then it's not possible to encode the string as integer */
f69f2cba 3052 if (sdslen(s) != (unsigned)slen || memcmp(buf,s,slen)) return REDIS_ERR;
724a51b1 3053 if (longval) *longval = value;
3054 return REDIS_OK;
3055}
3056
942a3961 3057/* Try to encode a string object in order to save space */
05df7621 3058static robj *tryObjectEncoding(robj *o) {
942a3961 3059 long value;
942a3961 3060 sds s = o->ptr;
3305306f 3061
942a3961 3062 if (o->encoding != REDIS_ENCODING_RAW)
05df7621 3063 return o; /* Already encoded */
3305306f 3064
05df7621 3065 /* It's not safe to encode shared objects: shared objects can be shared
942a3961 3066 * everywhere in the "object space" of Redis. Encoded objects can only
3067 * appear as "values" (and not, for instance, as keys) */
05df7621 3068 if (o->refcount > 1) return o;
3305306f 3069
942a3961 3070 /* Currently we try to encode only strings */
dfc5e96c 3071 redisAssert(o->type == REDIS_STRING);
94754ccc 3072
724a51b1 3073 /* Check if we can represent this string as a long integer */
05df7621 3074 if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
942a3961 3075
3076 /* Ok, this object can be encoded */
05df7621 3077 if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
3078 decrRefCount(o);
3079 incrRefCount(shared.integers[value]);
3080 return shared.integers[value];
3081 } else {
3082 o->encoding = REDIS_ENCODING_INT;
3083 sdsfree(o->ptr);
3084 o->ptr = (void*) value;
3085 return o;
3086 }
942a3961 3087}
3088
9d65a1bb 3089/* Get a decoded version of an encoded object (returned as a new object).
3090 * If the object is already raw-encoded just increment the ref count. */
3091static robj *getDecodedObject(robj *o) {
942a3961 3092 robj *dec;
e0a62c7f 3093
9d65a1bb 3094 if (o->encoding == REDIS_ENCODING_RAW) {
3095 incrRefCount(o);
3096 return o;
3097 }
942a3961 3098 if (o->type == REDIS_STRING && o->encoding == REDIS_ENCODING_INT) {
3099 char buf[32];
3100
3101 snprintf(buf,32,"%ld",(long)o->ptr);
3102 dec = createStringObject(buf,strlen(buf));
3103 return dec;
3104 } else {
08ee9b57 3105 redisPanic("Unknown encoding type");
942a3961 3106 }
3305306f 3107}
3108
d7f43c08 3109/* Compare two string objects via strcmp() or alike.
3110 * Note that the objects may be integer-encoded. In such a case we
3111 * use snprintf() to get a string representation of the numbers on the stack
1fd9bc8a 3112 * and compare the strings, it's much faster than calling getDecodedObject().
3113 *
3114 * Important note: if objects are not integer encoded, but binary-safe strings,
3115 * sdscmp() from sds.c will apply memcmp() so this function ca be considered
3116 * binary safe. */
724a51b1 3117static int compareStringObjects(robj *a, robj *b) {
dfc5e96c 3118 redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
d7f43c08 3119 char bufa[128], bufb[128], *astr, *bstr;
3120 int bothsds = 1;
724a51b1 3121
e197b441 3122 if (a == b) return 0;
d7f43c08 3123 if (a->encoding != REDIS_ENCODING_RAW) {
3124 snprintf(bufa,sizeof(bufa),"%ld",(long) a->ptr);
3125 astr = bufa;
3126 bothsds = 0;
724a51b1 3127 } else {
d7f43c08 3128 astr = a->ptr;
724a51b1 3129 }
d7f43c08 3130 if (b->encoding != REDIS_ENCODING_RAW) {
3131 snprintf(bufb,sizeof(bufb),"%ld",(long) b->ptr);
3132 bstr = bufb;
3133 bothsds = 0;
3134 } else {
3135 bstr = b->ptr;
3136 }
3137 return bothsds ? sdscmp(astr,bstr) : strcmp(astr,bstr);
724a51b1 3138}
3139
0ea663ea 3140static size_t stringObjectLen(robj *o) {
dfc5e96c 3141 redisAssert(o->type == REDIS_STRING);
0ea663ea 3142 if (o->encoding == REDIS_ENCODING_RAW) {
3143 return sdslen(o->ptr);
3144 } else {
3145 char buf[32];
3146
3147 return snprintf(buf,32,"%ld",(long)o->ptr);
3148 }
3149}
3150
bd79a6bd
PN
3151static int getDoubleFromObject(robj *o, double *target) {
3152 double value;
682c73e8 3153 char *eptr;
bbe025e0 3154
bd79a6bd
PN
3155 if (o == NULL) {
3156 value = 0;
3157 } else {
3158 redisAssert(o->type == REDIS_STRING);
3159 if (o->encoding == REDIS_ENCODING_RAW) {
3160 value = strtod(o->ptr, &eptr);
682c73e8 3161 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3162 } else if (o->encoding == REDIS_ENCODING_INT) {
3163 value = (long)o->ptr;
3164 } else {
3165 redisAssert(1 != 1);
3166 }
3167 }
3168
bd79a6bd
PN
3169 *target = value;
3170 return REDIS_OK;
3171}
bbe025e0 3172
bd79a6bd
PN
3173static int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
3174 double value;
3175 if (getDoubleFromObject(o, &value) != REDIS_OK) {
3176 if (msg != NULL) {
3177 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3178 } else {
3179 addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
3180 }
bbe025e0
AM
3181 return REDIS_ERR;
3182 }
3183
bd79a6bd 3184 *target = value;
bbe025e0
AM
3185 return REDIS_OK;
3186}
3187
bd79a6bd
PN
3188static int getLongLongFromObject(robj *o, long long *target) {
3189 long long value;
682c73e8 3190 char *eptr;
bbe025e0 3191
bd79a6bd
PN
3192 if (o == NULL) {
3193 value = 0;
3194 } else {
3195 redisAssert(o->type == REDIS_STRING);
3196 if (o->encoding == REDIS_ENCODING_RAW) {
3197 value = strtoll(o->ptr, &eptr, 10);
682c73e8 3198 if (eptr[0] != '\0') return REDIS_ERR;
bd79a6bd
PN
3199 } else if (o->encoding == REDIS_ENCODING_INT) {
3200 value = (long)o->ptr;
3201 } else {
3202 redisAssert(1 != 1);
3203 }
3204 }
3205
bd79a6bd
PN
3206 *target = value;
3207 return REDIS_OK;
3208}
bbe025e0 3209
bd79a6bd
PN
3210static int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
3211 long long value;
3212 if (getLongLongFromObject(o, &value) != REDIS_OK) {
3213 if (msg != NULL) {
3214 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3215 } else {
3216 addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
3217 }
bbe025e0
AM
3218 return REDIS_ERR;
3219 }
3220
bd79a6bd 3221 *target = value;
bbe025e0
AM
3222 return REDIS_OK;
3223}
3224
bd79a6bd
PN
3225static int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
3226 long long value;
bbe025e0 3227
bd79a6bd
PN
3228 if (getLongLongFromObjectOrReply(c, o, &value, msg) != REDIS_OK) return REDIS_ERR;
3229 if (value < LONG_MIN || value > LONG_MAX) {
3230 if (msg != NULL) {
3231 addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
3232 } else {
3233 addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
3234 }
bbe025e0
AM
3235 return REDIS_ERR;
3236 }
3237
bd79a6bd 3238 *target = value;
bbe025e0
AM
3239 return REDIS_OK;
3240}
3241
06233c45 3242/*============================ RDB saving/loading =========================== */
ed9b544e 3243
/* Write the one byte 'type' to 'fp'.
 * Returns 0 on success, -1 if the byte could not be written. */
static int rdbSaveType(FILE *fp, unsigned char type) {
    size_t written = fwrite(&type,1,1,fp);

    return (written == 0) ? -1 : 0;
}
3248
/* Write the time 't' to 'fp' as a 32 bit signed integer in host byte
 * order. Returns 0 on success, -1 on write error. */
static int rdbSaveTime(FILE *fp, time_t t) {
    int32_t t32 = (int32_t) t;

    return (fwrite(&t32,4,1,fp) == 0) ? -1 : 0;
}
3254
e3566d4b 3255/* check rdbLoadLen() comments for more info */
f78fd11b 3256static int rdbSaveLen(FILE *fp, uint32_t len) {
3257 unsigned char buf[2];
3258
3259 if (len < (1<<6)) {
3260 /* Save a 6 bit len */
10c43610 3261 buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
f78fd11b 3262 if (fwrite(buf,1,1,fp) == 0) return -1;
3263 } else if (len < (1<<14)) {
3264 /* Save a 14 bit len */
10c43610 3265 buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
f78fd11b 3266 buf[1] = len&0xFF;
17be1a4a 3267 if (fwrite(buf,2,1,fp) == 0) return -1;
f78fd11b 3268 } else {
3269 /* Save a 32 bit len */
10c43610 3270 buf[0] = (REDIS_RDB_32BITLEN<<6);
f78fd11b 3271 if (fwrite(buf,1,1,fp) == 0) return -1;
3272 len = htonl(len);
3273 if (fwrite(&len,4,1,fp) == 0) return -1;
3274 }
3275 return 0;
3276}
3277
e3566d4b 3278/* String objects in the form "2391" "-100" without any space and with a
3279 * range of values that can fit in an 8, 16 or 32 bit signed value can be
3280 * encoded as integers to save space */
b1befe6a 3281static int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
e3566d4b 3282 long long value;
3283 char *endptr, buf[32];
3284
3285 /* Check if it's possible to encode this value as a number */
3286 value = strtoll(s, &endptr, 10);
3287 if (endptr[0] != '\0') return 0;
3288 snprintf(buf,32,"%lld",value);
3289
3290 /* If the number converted back into a string is not identical
3291 * then it's not possible to encode the string as integer */
b1befe6a 3292 if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
e3566d4b 3293
3294 /* Finally check if it fits in our ranges */
3295 if (value >= -(1<<7) && value <= (1<<7)-1) {
3296 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
3297 enc[1] = value&0xFF;
3298 return 2;
3299 } else if (value >= -(1<<15) && value <= (1<<15)-1) {
3300 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
3301 enc[1] = value&0xFF;
3302 enc[2] = (value>>8)&0xFF;
3303 return 3;
3304 } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
3305 enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
3306 enc[1] = value&0xFF;
3307 enc[2] = (value>>8)&0xFF;
3308 enc[3] = (value>>16)&0xFF;
3309 enc[4] = (value>>24)&0xFF;
3310 return 5;
3311 } else {
3312 return 0;
3313 }
3314}
3315
b1befe6a 3316static int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
3317 size_t comprlen, outlen;
774e3047 3318 unsigned char byte;
3319 void *out;
3320
3321 /* We require at least four bytes compression for this to be worth it */
b1befe6a 3322 if (len <= 4) return 0;
3323 outlen = len-4;
3a2694c4 3324 if ((out = zmalloc(outlen+1)) == NULL) return 0;
b1befe6a 3325 comprlen = lzf_compress(s, len, out, outlen);
774e3047 3326 if (comprlen == 0) {
88e85998 3327 zfree(out);
774e3047 3328 return 0;
3329 }
3330 /* Data compressed! Let's save it on disk */
3331 byte = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
3332 if (fwrite(&byte,1,1,fp) == 0) goto writeerr;
3333 if (rdbSaveLen(fp,comprlen) == -1) goto writeerr;
b1befe6a 3334 if (rdbSaveLen(fp,len) == -1) goto writeerr;
774e3047 3335 if (fwrite(out,comprlen,1,fp) == 0) goto writeerr;
88e85998 3336 zfree(out);
774e3047 3337 return comprlen;
3338
3339writeerr:
88e85998 3340 zfree(out);
774e3047 3341 return -1;
3342}
3343
e3566d4b 3344/* Save a string objet as [len][data] on disk. If the object is a string
3345 * representation of an integer value we try to safe it in a special form */
b1befe6a 3346static int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
e3566d4b 3347 int enclen;
10c43610 3348
774e3047 3349 /* Try integer encoding */
e3566d4b 3350 if (len <= 11) {
3351 unsigned char buf[5];
b1befe6a 3352 if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
e3566d4b 3353 if (fwrite(buf,enclen,1,fp) == 0) return -1;
3354 return 0;
3355 }
3356 }
774e3047 3357
3358 /* Try LZF compression - under 20 bytes it's unable to compress even
88e85998 3359 * aaaaaaaaaaaaaaaaaa so skip it */
121f70cf 3360 if (server.rdbcompression && len > 20) {
774e3047 3361 int retval;
3362
b1befe6a 3363 retval = rdbSaveLzfStringObject(fp,s,len);
774e3047 3364 if (retval == -1) return -1;
3365 if (retval > 0) return 0;
3366 /* retval == 0 means data can't be compressed, save the old way */
3367 }
3368
3369 /* Store verbatim */
10c43610 3370 if (rdbSaveLen(fp,len) == -1) return -1;
b1befe6a 3371 if (len && fwrite(s,len,1,fp) == 0) return -1;
10c43610 3372 return 0;
3373}
3374
942a3961 3375/* Like rdbSaveStringObjectRaw() but handle encoded objects */
3376static int rdbSaveStringObject(FILE *fp, robj *obj) {
3377 int retval;
942a3961 3378
f2d9f50f 3379 /* Avoid incr/decr ref count business when possible.
3380 * This plays well with copy-on-write given that we are probably
3381 * in a child process (BGSAVE). Also this makes sure key objects
3382 * of swapped objects are not incRefCount-ed (an assert does not allow
3383 * this in order to avoid bugs) */
3384 if (obj->encoding != REDIS_ENCODING_RAW) {
996cb5f7 3385 obj = getDecodedObject(obj);
b1befe6a 3386 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3387 decrRefCount(obj);
3388 } else {
b1befe6a 3389 retval = rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
996cb5f7 3390 }
9d65a1bb 3391 return retval;
942a3961 3392}
3393
/* Save a double value. Doubles are saved as strings prefixed by an unsigned
 * 8 bit integer specifying the length of the representation ("%.17g").
 * This 8 bit integer has special values in order to specify the following
 * conditions, in which case no string follows:
 * 253: not a number
 * 254: + inf
 * 255: - inf
 *
 * Returns 0 on success, -1 on write error. */
static int rdbSaveDoubleValue(FILE *fp, double val) {
    unsigned char out[128];
    int nbytes = 1;

    if (isnan(val)) {
        out[0] = 253;
    } else if (!isfinite(val)) {
        out[0] = (val < 0) ? 255 : 254;
    } else {
        char *repr = (char*)out+1;

        snprintf(repr,sizeof(out)-1,"%.17g",val);
        out[0] = strlen(repr);
        nbytes = out[0]+1;
    }
    return (fwrite(out,nbytes,1,fp) == 0) ? -1 : 0;
}
3420
06233c45 3421/* Save a Redis object. */
3422static int rdbSaveObject(FILE *fp, robj *o) {
3423 if (o->type == REDIS_STRING) {
3424 /* Save a string value */
3425 if (rdbSaveStringObject(fp,o) == -1) return -1;
3426 } else if (o->type == REDIS_LIST) {
3427 /* Save a list value */
3428 list *list = o->ptr;
c7df85a4 3429 listIter li;
06233c45 3430 listNode *ln;
3431
06233c45 3432 if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
c7df85a4 3433 listRewind(list,&li);
3434 while((ln = listNext(&li))) {
06233c45 3435 robj *eleobj = listNodeValue(ln);
3436
3437 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3438 }
3439 } else if (o->type == REDIS_SET) {
3440 /* Save a set value */
3441 dict *set = o->ptr;
3442 dictIterator *di = dictGetIterator(set);
3443 dictEntry *de;
3444
3445 if (rdbSaveLen(fp,dictSize(set)) == -1) return -1;
3446 while((de = dictNext(di)) != NULL) {
3447 robj *eleobj = dictGetEntryKey(de);
3448
3449 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3450 }
3451 dictReleaseIterator(di);
3452 } else if (o->type == REDIS_ZSET) {
3453 /* Save a set value */
3454 zset *zs = o->ptr;
3455 dictIterator *di = dictGetIterator(zs->dict);
3456 dictEntry *de;
3457
3458 if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) return -1;
3459 while((de = dictNext(di)) != NULL) {
3460 robj *eleobj = dictGetEntryKey(de);
3461 double *score = dictGetEntryVal(de);
3462
3463 if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
3464 if (rdbSaveDoubleValue(fp,*score) == -1) return -1;
3465 }
3466 dictReleaseIterator(di);
b1befe6a 3467 } else if (o->type == REDIS_HASH) {
3468 /* Save a hash value */
3469 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3470 unsigned char *p = zipmapRewind(o->ptr);
3471 unsigned int count = zipmapLen(o->ptr);
3472 unsigned char *key, *val;
3473 unsigned int klen, vlen;
3474
3475 if (rdbSaveLen(fp,count) == -1) return -1;
3476 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
3477 if (rdbSaveRawString(fp,key,klen) == -1) return -1;
3478 if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
3479 }
3480 } else {
3481 dictIterator *di = dictGetIterator(o->ptr);
3482 dictEntry *de;
3483
3484 if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) return -1;
3485 while((de = dictNext(di)) != NULL) {
3486 robj *key = dictGetEntryKey(de);
3487 robj *val = dictGetEntryVal(de);
3488
3489 if (rdbSaveStringObject(fp,key) == -1) return -1;
3490 if (rdbSaveStringObject(fp,val) == -1) return -1;
3491 }
3492 dictReleaseIterator(di);
3493 }
06233c45 3494 } else {
f83c6cb5 3495 redisPanic("Unknown object type");
06233c45 3496 }
3497 return 0;
3498}
3499
3500/* Return the length the object will have on disk if saved with
3501 * the rdbSaveObject() function. Currently we use a trick to get
3502 * this length with very little changes to the code. In the future
3503 * we could switch to a faster solution. */
b9bc0eef 3504static off_t rdbSavedObjectLen(robj *o, FILE *fp) {
3505 if (fp == NULL) fp = server.devnull;
06233c45 3506 rewind(fp);
3507 assert(rdbSaveObject(fp,o) != 1);
3508 return ftello(fp);
3509}
3510
06224fec 3511/* Return the number of pages required to save this object in the swap file */
b9bc0eef 3512static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
3513 off_t bytes = rdbSavedObjectLen(o,fp);
e0a62c7f 3514
06224fec 3515 return (bytes+(server.vm_page_size-1))/server.vm_page_size;
3516}
3517
ed9b544e 3518/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
f78fd11b 3519static int rdbSave(char *filename) {
ed9b544e 3520 dictIterator *di = NULL;
3521 dictEntry *de;
ed9b544e 3522 FILE *fp;
3523 char tmpfile[256];
3524 int j;
bb32ede5 3525 time_t now = time(NULL);
ed9b544e 3526
2316bb3b 3527 /* Wait for I/O therads to terminate, just in case this is a
3528 * foreground-saving, to avoid seeking the swap file descriptor at the
3529 * same time. */
3530 if (server.vm_enabled)
3531 waitEmptyIOJobsQueue();
3532
a3b21203 3533 snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
ed9b544e 3534 fp = fopen(tmpfile,"w");
3535 if (!fp) {
3536 redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
3537 return REDIS_ERR;
3538 }
f78fd11b 3539 if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
ed9b544e 3540 for (j = 0; j < server.dbnum; j++) {
bb32ede5 3541 redisDb *db = server.db+j;
3542 dict *d = db->dict;
3305306f 3543 if (dictSize(d) == 0) continue;
ed9b544e 3544 di = dictGetIterator(d);
3545 if (!di) {
3546 fclose(fp);
3547 return REDIS_ERR;
3548 }
3549
3550 /* Write the SELECT DB opcode */
f78fd11b 3551 if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
3552 if (rdbSaveLen(fp,j) == -1) goto werr;
ed9b544e 3553
3554 /* Iterate this DB writing every entry */
3555 while((de = dictNext(di)) != NULL) {
3556 robj *key = dictGetEntryKey(de);
3557 robj *o = dictGetEntryVal(de);
bb32ede5 3558 time_t expiretime = getExpire(db,key);
3559
3560 /* Save the expire time */
3561 if (expiretime != -1) {
3562 /* If this key is already expired skip it */
3563 if (expiretime < now) continue;
3564 if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
3565 if (rdbSaveTime(fp,expiretime) == -1) goto werr;
3566 }
7e69548d 3567 /* Save the key and associated value. This requires special
3568 * handling if the value is swapped out. */
996cb5f7 3569 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
3570 key->storage == REDIS_VM_SWAPPING) {
7e69548d 3571 /* Save type, key, value */
3572 if (rdbSaveType(fp,o->type) == -1) goto werr;
3573 if (rdbSaveStringObject(fp,key) == -1) goto werr;
3574 if (rdbSaveObject(fp,o) == -1) goto werr;
3575 } else {
996cb5f7 3576 /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
b9bc0eef 3577 robj *po;
7e69548d 3578 /* Get a preview of the object in memory */
3579 po = vmPreviewObject(key);
7e69548d 3580 /* Save type, key, value */
3581 if (rdbSaveType(fp,key->vtype) == -1) goto werr;
b9bc0eef 3582 if (rdbSaveStringObject(fp,key) == -1) goto werr;
7e69548d 3583 if (rdbSaveObject(fp,po) == -1) goto werr;
3584 /* Remove the loaded object from memory */
3585 decrRefCount(po);
7e69548d 3586 }
ed9b544e 3587 }
3588 dictReleaseIterator(di);
3589 }
3590 /* EOF opcode */
f78fd11b 3591 if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
3592
3593 /* Make sure data will not remain on the OS's output buffers */
ed9b544e 3594 fflush(fp);
3595 fsync(fileno(fp));
3596 fclose(fp);
e0a62c7f 3597
ed9b544e 3598 /* Use RENAME to make sure the DB file is changed atomically only
3599 * if the generate DB file is ok. */
3600 if (rename(tmpfile,filename) == -1) {
325d1eb4 3601 redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
ed9b544e 3602 unlink(tmpfile);
3603 return REDIS_ERR;
3604 }
3605 redisLog(REDIS_NOTICE,"DB saved on disk");
3606 server.dirty = 0;
3607 server.lastsave = time(NULL);
3608 return REDIS_OK;
3609
3610werr:
3611 fclose(fp);
3612 unlink(tmpfile);
3613 redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
3614 if (di) dictReleaseIterator(di);
3615 return REDIS_ERR;
3616}
3617
f78fd11b 3618static int rdbSaveBackground(char *filename) {
ed9b544e 3619 pid_t childpid;
3620
9d65a1bb 3621 if (server.bgsavechildpid != -1) return REDIS_ERR;
054e426d 3622 if (server.vm_enabled) waitEmptyIOJobsQueue();
ed9b544e 3623 if ((childpid = fork()) == 0) {
3624 /* Child */
054e426d 3625 if (server.vm_enabled) vmReopenSwapFile();
ed9b544e 3626 close(server.fd);
f78fd11b 3627 if (rdbSave(filename) == REDIS_OK) {
478c2c6f 3628 _exit(0);
ed9b544e 3629 } else {
478c2c6f 3630 _exit(1);
ed9b544e 3631 }
3632 } else {
3633 /* Parent */
5a7c647e 3634 if (childpid == -1) {
3635 redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
3636 strerror(errno));
3637 return REDIS_ERR;
3638 }
ed9b544e 3639 redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
9f3c422c 3640 server.bgsavechildpid = childpid;
884d4b39 3641 updateDictResizePolicy();
ed9b544e 3642 return REDIS_OK;
3643 }
3644 return REDIS_OK; /* unreached */
3645}
3646
/* Remove the "temp-<pid>.rdb" file that rdbSave() creates when running
 * as process 'childpid'. Errors from unlink() are ignored. */
static void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
3653
/* Read one byte from 'fp' and return it as a value in the 0-255 range,
 * or -1 if the byte could not be read. */
static int rdbLoadType(FILE *fp) {
    unsigned char byte;

    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
}
3659
/* Read the 4 byte host byte order integer written by rdbSaveTime() and
 * return it as a time_t. Returns -1 if the four bytes could not be read. */
static time_t rdbLoadTime(FILE *fp) {
    int32_t stored;

    if (fread(&stored,4,1,fp) != 1) return -1;
    return (time_t) stored;
}
3665
e3566d4b 3666/* Load an encoded length from the DB, see the REDIS_RDB_* defines on the top
3667 * of this file for a description of how this are stored on disk.
3668 *
3669 * isencoded is set to 1 if the readed length is not actually a length but
3670 * an "encoding type", check the above comments for more info */
c78a8ccc 3671static uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
f78fd11b 3672 unsigned char buf[2];
3673 uint32_t len;
c78a8ccc 3674 int type;
f78fd11b 3675
e3566d4b 3676 if (isencoded) *isencoded = 0;
c78a8ccc 3677 if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
3678 type = (buf[0]&0xC0)>>6;
3679 if (type == REDIS_RDB_6BITLEN) {
3680 /* Read a 6 bit len */
3681 return buf[0]&0x3F;
3682 } else if (type == REDIS_RDB_ENCVAL) {
3683 /* Read a 6 bit len encoding type */
3684 if (isencoded) *isencoded = 1;
3685 return buf[0]&0x3F;
3686 } else if (type == REDIS_RDB_14BITLEN) {
3687 /* Read a 14 bit len */
3688 if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
3689 return ((buf[0]&0x3F)<<8)|buf[1];
3690 } else {
3691 /* Read a 32 bit len */
f78fd11b 3692 if (fread(&len,4,1,fp) == 0) return REDIS_RDB_LENERR;
3693 return ntohl(len);
f78fd11b 3694 }
f78fd11b 3695}
3696
e3566d4b 3697static robj *rdbLoadIntegerObject(FILE *fp, int enctype) {
3698 unsigned char enc[4];
3699 long long val;
3700
3701 if (enctype == REDIS_RDB_ENC_INT8) {
3702 if (fread(enc,1,1,fp) == 0) return NULL;
3703 val = (signed char)enc[0];
3704 } else if (enctype == REDIS_RDB_ENC_INT16) {
3705 uint16_t v;
3706 if (fread(enc,2,1,fp) == 0) return NULL;
3707 v = enc[0]|(enc[1]<<8);
3708 val = (int16_t)v;
3709 } else if (enctype == REDIS_RDB_ENC_INT32) {
3710 uint32_t v;
3711 if (fread(enc,4,1,fp) == 0) return NULL;
3712 v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
3713 val = (int32_t)v;
3714 } else {
3715 val = 0; /* anti-warning */
f83c6cb5 3716 redisPanic("Unknown RDB integer encoding type");
e3566d4b 3717 }
3718 return createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",val));
3719}
3720
c78a8ccc 3721static robj *rdbLoadLzfStringObject(FILE*fp) {
88e85998 3722 unsigned int len, clen;
3723 unsigned char *c = NULL;
3724 sds val = NULL;
3725
c78a8ccc 3726 if ((clen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3727 if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
88e85998 3728 if ((c = zmalloc(clen)) == NULL) goto err;
3729 if ((val = sdsnewlen(NULL,len)) == NULL) goto err;
3730 if (fread(c,clen,1,fp) == 0) goto err;
3731 if (lzf_decompress(c,clen,val,len) == 0) goto err;
5109cdff 3732 zfree(c);
88e85998 3733 return createObject(REDIS_STRING,val);
3734err:
3735 zfree(c);
3736 sdsfree(val);
3737 return NULL;
3738}
3739
c78a8ccc 3740static robj *rdbLoadStringObject(FILE*fp) {
e3566d4b 3741 int isencoded;
3742 uint32_t len;
f78fd11b 3743 sds val;
3744
c78a8ccc 3745 len = rdbLoadLen(fp,&isencoded);
e3566d4b 3746 if (isencoded) {
3747 switch(len) {
3748 case REDIS_RDB_ENC_INT8:
3749 case REDIS_RDB_ENC_INT16:
3750 case REDIS_RDB_ENC_INT32:
bdcb92f2 3751 return rdbLoadIntegerObject(fp,len);
88e85998 3752 case REDIS_RDB_ENC_LZF:
bdcb92f2 3753 return rdbLoadLzfStringObject(fp);
e3566d4b 3754 default:
f83c6cb5 3755 redisPanic("Unknown RDB encoding type");
e3566d4b 3756 }
3757 }
3758
f78fd11b 3759 if (len == REDIS_RDB_LENERR) return NULL;
3760 val = sdsnewlen(NULL,len);
3761 if (len && fread(val,len,1,fp) == 0) {
3762 sdsfree(val);
3763 return NULL;
3764 }
bdcb92f2 3765 return createObject(REDIS_STRING,val);
f78fd11b 3766}
3767
a7866db6 3768/* For information about double serialization check rdbSaveDoubleValue() */
3769static int rdbLoadDoubleValue(FILE *fp, double *val) {
3770 char buf[128];
3771 unsigned char len;
3772
3773 if (fread(&len,1,1,fp) == 0) return -1;
3774 switch(len) {
3775 case 255: *val = R_NegInf; return 0;
3776 case 254: *val = R_PosInf; return 0;
3777 case 253: *val = R_Nan; return 0;
3778 default:
3779 if (fread(buf,len,1,fp) == 0) return -1;
231d758e 3780 buf[len] = '\0';
a7866db6 3781 sscanf(buf, "%lg", val);
3782 return 0;
3783 }
3784}
3785
c78a8ccc 3786/* Load a Redis object of the specified type from the specified file.
3787 * On success a newly allocated object is returned, otherwise NULL. */
3788static robj *rdbLoadObject(int type, FILE *fp) {
3789 robj *o;
3790
bcd11906 3791 redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %d)\n",type,ftell(fp));
c78a8ccc 3792 if (type == REDIS_STRING) {
3793 /* Read string value */
3794 if ((o = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3795 o = tryObjectEncoding(o);
c78a8ccc 3796 } else if (type == REDIS_LIST || type == REDIS_SET) {
3797 /* Read list/set value */
3798 uint32_t listlen;
3799
3800 if ((listlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3801 o = (type == REDIS_LIST) ? createListObject() : createSetObject();
3c68de9b 3802 /* It's faster to expand the dict to the right size asap in order
3803 * to avoid rehashing */
3804 if (type == REDIS_SET && listlen > DICT_HT_INITIAL_SIZE)
3805 dictExpand(o->ptr,listlen);
c78a8ccc 3806 /* Load every single element of the list/set */
3807 while(listlen--) {
3808 robj *ele;
3809
3810 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3811 ele = tryObjectEncoding(ele);
c78a8ccc 3812 if (type == REDIS_LIST) {
3813 listAddNodeTail((list*)o->ptr,ele);
3814 } else {
3815 dictAdd((dict*)o->ptr,ele,NULL);
3816 }
3817 }
3818 } else if (type == REDIS_ZSET) {
3819 /* Read list/set value */
ada386b2 3820 size_t zsetlen;
c78a8ccc 3821 zset *zs;
3822
3823 if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3824 o = createZsetObject();
3825 zs = o->ptr;
3826 /* Load every single element of the list/set */
3827 while(zsetlen--) {
3828 robj *ele;
3829 double *score = zmalloc(sizeof(double));
3830
3831 if ((ele = rdbLoadStringObject(fp)) == NULL) return NULL;
05df7621 3832 ele = tryObjectEncoding(ele);
c78a8ccc 3833 if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
3834 dictAdd(zs->dict,ele,score);
3835 zslInsert(zs->zsl,*score,ele);
3836 incrRefCount(ele); /* added to skiplist */
3837 }
ada386b2 3838 } else if (type == REDIS_HASH) {
3839 size_t hashlen;
3840
3841 if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
3842 o = createHashObject();
3843 /* Too many entries? Use an hash table. */
3844 if (hashlen > server.hash_max_zipmap_entries)
3845 convertToRealHash(o);
3846 /* Load every key/value, then set it into the zipmap or hash
3847 * table, as needed. */
3848 while(hashlen--) {
3849 robj *key, *val;
3850
3851 if ((key = rdbLoadStringObject(fp)) == NULL) return NULL;
3852 if ((val = rdbLoadStringObject(fp)) == NULL) return NULL;
3853 /* If we are using a zipmap and there are too big values
3854 * the object is converted to real hash table encoding. */
3855 if (o->encoding != REDIS_ENCODING_HT &&
3856 (sdslen(key->ptr) > server.hash_max_zipmap_value ||
3857 sdslen(val->ptr) > server.hash_max_zipmap_value))
3858 {
3859 convertToRealHash(o);
3860 }
3861
3862 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
3863 unsigned char *zm = o->ptr;
3864
3865 zm = zipmapSet(zm,key->ptr,sdslen(key->ptr),
3866 val->ptr,sdslen(val->ptr),NULL);
3867 o->ptr = zm;
3868 decrRefCount(key);
3869 decrRefCount(val);
3870 } else {
05df7621 3871 key = tryObjectEncoding(key);
3872 val = tryObjectEncoding(val);
ada386b2 3873 dictAdd((dict*)o->ptr,key,val);
ada386b2 3874 }
3875 }
c78a8ccc 3876 } else {
f83c6cb5 3877 redisPanic("Unknown object type");
c78a8ccc 3878 }
3879 return o;
3880}
3881
f78fd11b 3882static int rdbLoad(char *filename) {
ed9b544e 3883 FILE *fp;
f78fd11b 3884 robj *keyobj = NULL;
3885 uint32_t dbid;
bb32ede5 3886 int type, retval, rdbver;
3305306f 3887 dict *d = server.db[0].dict;
bb32ede5 3888 redisDb *db = server.db+0;
f78fd11b 3889 char buf[1024];
bb32ede5 3890 time_t expiretime = -1, now = time(NULL);
b492cf00 3891 long long loadedkeys = 0;
bb32ede5 3892
ed9b544e 3893 fp = fopen(filename,"r");
3894 if (!fp) return REDIS_ERR;
3895 if (fread(buf,9,1,fp) == 0) goto eoferr;
f78fd11b 3896 buf[9] = '\0';
3897 if (memcmp(buf,"REDIS",5) != 0) {
ed9b544e 3898 fclose(fp);
3899 redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
3900 return REDIS_ERR;
3901 }
f78fd11b 3902 rdbver = atoi(buf+5);
c78a8ccc 3903 if (rdbver != 1) {
f78fd11b 3904 fclose(fp);
3905 redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
3906 return REDIS_ERR;
3907 }
ed9b544e 3908 while(1) {
3909 robj *o;
3910
3911 /* Read type. */
f78fd11b 3912 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
bb32ede5 3913 if (type == REDIS_EXPIRETIME) {
3914 if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
3915 /* We read the time so we need to read the object type again */
3916 if ((type = rdbLoadType(fp)) == -1) goto eoferr;
3917 }
ed9b544e 3918 if (type == REDIS_EOF) break;
3919 /* Handle SELECT DB opcode as a special case */
3920 if (type == REDIS_SELECTDB) {
c78a8ccc 3921 if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
e3566d4b 3922 goto eoferr;
ed9b544e 3923 if (dbid >= (unsigned)server.dbnum) {
f78fd11b 3924 redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
ed9b544e 3925 exit(1);
3926 }
bb32ede5 3927 db = server.db+dbid;
3928 d = db->dict;
ed9b544e 3929 continue;
3930 }
3931 /* Read key */
c78a8ccc 3932 if ((keyobj = rdbLoadStringObject(fp)) == NULL) goto eoferr;
3933 /* Read value */
3934 if ((o = rdbLoadObject(type,fp)) == NULL) goto eoferr;
ed9b544e 3935 /* Add the new object in the hash table */
f78fd11b 3936 retval = dictAdd(d,keyobj,o);
ed9b544e 3937 if (retval == DICT_ERR) {
f78fd11b 3938 redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", keyobj->ptr);
ed9b544e 3939 exit(1);
3940 }
bb32ede5 3941 /* Set the expire time if needed */
3942 if (expiretime != -1) {
3943 setExpire(db,keyobj,expiretime);
3944 /* Delete this key if already expired */
3945 if (expiretime < now) deleteKey(db,keyobj);
3946 expiretime = -1;
3947 }
f78fd11b 3948 keyobj = o = NULL;
b492cf00 3949 /* Handle swapping while loading big datasets when VM is on */
3950 loadedkeys++;
3951 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
3952 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 3953 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 3954 }
3955 }
ed9b544e 3956 }
3957 fclose(fp);
3958 return REDIS_OK;
3959
3960eoferr: /* unexpected end of file is handled here with a fatal exit */
e3566d4b 3961 if (keyobj) decrRefCount(keyobj);
f80dff62 3962 redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
ed9b544e 3963 exit(1);
3964 return REDIS_ERR; /* Just to avoid warning */
3965}
3966
3967/*================================== Commands =============================== */
3968
abcb223e 3969static void authCommand(redisClient *c) {
2e77c2ee 3970 if (!server.requirepass || !strcmp(c->argv[1]->ptr, server.requirepass)) {
abcb223e
BH
3971 c->authenticated = 1;
3972 addReply(c,shared.ok);
3973 } else {
3974 c->authenticated = 0;
fa4c0aba 3975 addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
abcb223e
BH
3976 }
3977}
3978
ed9b544e 3979static void pingCommand(redisClient *c) {
3980 addReply(c,shared.pong);
3981}
3982
3983static void echoCommand(redisClient *c) {
dd88747b 3984 addReplyBulk(c,c->argv[1]);
ed9b544e 3985}
3986
3987/*=================================== Strings =============================== */
3988
526d00a5 3989static void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
ed9b544e 3990 int retval;
526d00a5 3991 long seconds;
ed9b544e 3992
526d00a5 3993 if (expire) {
3994 if (getLongFromObjectOrReply(c, expire, &seconds, NULL) != REDIS_OK)
3995 return;
3996 if (seconds <= 0) {
3997 addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
3998 return;
3999 }
4000 }
4001
4002 if (nx) deleteIfVolatile(c->db,key);
4003 retval = dictAdd(c->db->dict,key,val);
ed9b544e 4004 if (retval == DICT_ERR) {
4005 if (!nx) {
1b03836c 4006 /* If the key is about a swapped value, we want a new key object
4007 * to overwrite the old. So we delete the old key in the database.
4008 * This will also make sure that swap pages about the old object
4009 * will be marked as free. */
526d00a5 4010 if (server.vm_enabled && deleteIfSwapped(c->db,key))
4011 incrRefCount(key);
4012 dictReplace(c->db->dict,key,val);
4013 incrRefCount(val);
ed9b544e 4014 } else {
c937aa89 4015 addReply(c,shared.czero);
ed9b544e 4016 return;
4017 }
4018 } else {
526d00a5 4019 incrRefCount(key);
4020 incrRefCount(val);
ed9b544e 4021 }
4022 server.dirty++;
526d00a5 4023 removeExpire(c->db,key);
4024 if (expire) setExpire(c->db,key,time(NULL)+seconds);
c937aa89 4025 addReply(c, nx ? shared.cone : shared.ok);
ed9b544e 4026}
4027
4028static void setCommand(redisClient *c) {
526d00a5 4029 setGenericCommand(c,0,c->argv[1],c->argv[2],NULL);
ed9b544e 4030}
4031
4032static void setnxCommand(redisClient *c) {
526d00a5 4033 setGenericCommand(c,1,c->argv[1],c->argv[2],NULL);
4034}
4035
4036static void setexCommand(redisClient *c) {
4037 setGenericCommand(c,0,c->argv[1],c->argv[3],c->argv[2]);
ed9b544e 4038}
4039
322fc7d8 4040static int getGenericCommand(redisClient *c) {
dd88747b 4041 robj *o;
e0a62c7f 4042
dd88747b 4043 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL)
322fc7d8 4044 return REDIS_OK;
dd88747b 4045
4046 if (o->type != REDIS_STRING) {
4047 addReply(c,shared.wrongtypeerr);
4048 return REDIS_ERR;
ed9b544e 4049 } else {
dd88747b 4050 addReplyBulk(c,o);
4051 return REDIS_OK;
ed9b544e 4052 }
4053}
4054
322fc7d8 4055static void getCommand(redisClient *c) {
4056 getGenericCommand(c);
4057}
4058
f6b141c5 4059static void getsetCommand(redisClient *c) {
322fc7d8 4060 if (getGenericCommand(c) == REDIS_ERR) return;
a431eb74 4061 if (dictAdd(c->db->dict,c->argv[1],c->argv[2]) == DICT_ERR) {
4062 dictReplace(c->db->dict,c->argv[1],c->argv[2]);
4063 } else {
4064 incrRefCount(c->argv[1]);
4065 }
4066 incrRefCount(c->argv[2]);
4067 server.dirty++;
4068 removeExpire(c->db,c->argv[1]);
4069}
4070
70003d28 4071static void mgetCommand(redisClient *c) {
70003d28 4072 int j;
e0a62c7f 4073
c937aa89 4074 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
70003d28 4075 for (j = 1; j < c->argc; j++) {
3305306f 4076 robj *o = lookupKeyRead(c->db,c->argv[j]);
4077 if (o == NULL) {
c937aa89 4078 addReply(c,shared.nullbulk);
70003d28 4079 } else {
70003d28 4080 if (o->type != REDIS_STRING) {
c937aa89 4081 addReply(c,shared.nullbulk);
70003d28 4082 } else {
dd88747b 4083 addReplyBulk(c,o);
70003d28 4084 }
4085 }
4086 }
4087}
4088
6c446631 4089static void msetGenericCommand(redisClient *c, int nx) {
906573e7 4090 int j, busykeys = 0;
6c446631 4091
4092 if ((c->argc % 2) == 0) {
454d4e43 4093 addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
6c446631 4094 return;
4095 }
4096 /* Handle the NX flag. The MSETNX semantic is to return zero and don't
4097 * set nothing at all if at least one already key exists. */
4098 if (nx) {
4099 for (j = 1; j < c->argc; j += 2) {
906573e7 4100 if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
4101 busykeys++;
6c446631 4102 }
4103 }
4104 }
906573e7 4105 if (busykeys) {
4106 addReply(c, shared.czero);
4107 return;
4108 }
6c446631 4109
4110 for (j = 1; j < c->argc; j += 2) {
4111 int retval;
4112
05df7621 4113 c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
6c446631 4114 retval = dictAdd(c->db->dict,c->argv[j],c->argv[j+1]);
4115 if (retval == DICT_ERR) {
4116 dictReplace(c->db->dict,c->argv[j],c->argv[j+1]);
4117 incrRefCount(c->argv[j+1]);
4118 } else {
4119 incrRefCount(c->argv[j]);
4120 incrRefCount(c->argv[j+1]);
4121 }
4122 removeExpire(c->db,c->argv[j]);
4123 }
4124 server.dirty += (c->argc-1)/2;
4125 addReply(c, nx ? shared.cone : shared.ok);
4126}
4127
4128static void msetCommand(redisClient *c) {
4129 msetGenericCommand(c,0);
4130}
4131
4132static void msetnxCommand(redisClient *c) {
4133 msetGenericCommand(c,1);
4134}
4135
d68ed120 4136static void incrDecrCommand(redisClient *c, long long incr) {
ed9b544e 4137 long long value;
4138 int retval;
4139 robj *o;
e0a62c7f 4140
3305306f 4141 o = lookupKeyWrite(c->db,c->argv[1]);
ed9b544e 4142
bd79a6bd 4143 if (getLongLongFromObjectOrReply(c, o, &value, NULL) != REDIS_OK) return;
ed9b544e 4144
4145 value += incr;
4146 o = createObject(REDIS_STRING,sdscatprintf(sdsempty(),"%lld",value));
05df7621 4147 o = tryObjectEncoding(o);
3305306f 4148 retval = dictAdd(c->db->dict,c->argv[1],o);
ed9b544e 4149 if (retval == DICT_ERR) {
3305306f 4150 dictReplace(c->db->dict,c->argv[1],o);
4151 removeExpire(c->db,c->argv[1]);
ed9b544e 4152 } else {
4153 incrRefCount(c->argv[1]);
4154 }
4155 server.dirty++;
c937aa89 4156 addReply(c,shared.colon);
ed9b544e 4157 addReply(c,o);
4158 addReply(c,shared.crlf);
4159}
4160
4161static void incrCommand(redisClient *c) {
a4d1ba9a 4162 incrDecrCommand(c,1);
ed9b544e 4163}
4164
4165static void decrCommand(redisClient *c) {
a4d1ba9a 4166 incrDecrCommand(c,-1);
ed9b544e 4167}
4168
4169static void incrbyCommand(redisClient *c) {
bbe025e0
AM
4170 long long incr;
4171
bd79a6bd 4172 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4173 incrDecrCommand(c,incr);
ed9b544e 4174}
4175
4176static void decrbyCommand(redisClient *c) {
bbe025e0
AM
4177 long long incr;
4178
bd79a6bd 4179 if (getLongLongFromObjectOrReply(c, c->argv[2], &incr, NULL) != REDIS_OK) return;
a4d1ba9a 4180 incrDecrCommand(c,-incr);
ed9b544e 4181}
4182
4b00bebd 4183static void appendCommand(redisClient *c) {
4184 int retval;
4185 size_t totlen;
4186 robj *o;
4187
4188 o = lookupKeyWrite(c->db,c->argv[1]);
4189 if (o == NULL) {
4190 /* Create the key */
4191 retval = dictAdd(c->db->dict,c->argv[1],c->argv[2]);
4192 incrRefCount(c->argv[1]);
4193 incrRefCount(c->argv[2]);
4194 totlen = stringObjectLen(c->argv[2]);
4195 } else {
4196 dictEntry *de;
e0a62c7f 4197
4b00bebd 4198 de = dictFind(c->db->dict,c->argv[1]);
4199 assert(de != NULL);
4200
4201 o = dictGetEntryVal(de);
4202 if (o->type != REDIS_STRING) {
4203 addReply(c,shared.wrongtypeerr);
4204 return;
4205 }
4206 /* If the object is specially encoded or shared we have to make
4207 * a copy */
4208 if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
4209 robj *decoded = getDecodedObject(o);
4210
4211 o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
4212 decrRefCount(decoded);
4213 dictReplace(c->db->dict,c->argv[1],o);
4214 }
4215 /* APPEND! */
4216 if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
4217 o->ptr = sdscatlen(o->ptr,
4218 c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
4219 } else {
4220 o->ptr = sdscatprintf(o->ptr, "%ld",
4221 (unsigned long) c->argv[2]->ptr);
4222 }
4223 totlen = sdslen(o->ptr);
4224 }
4225 server.dirty++;
4226 addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
4227}
4228
39191553 4229static void substrCommand(redisClient *c) {
4230 robj *o;
4231 long start = atoi(c->argv[2]->ptr);
4232 long end = atoi(c->argv[3]->ptr);
dd88747b 4233 size_t rangelen, strlen;
4234 sds range;
39191553 4235
dd88747b 4236 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4237 checkType(c,o,REDIS_STRING)) return;
39191553 4238
dd88747b 4239 o = getDecodedObject(o);
4240 strlen = sdslen(o->ptr);
8fe7fad7 4241
dd88747b 4242 /* convert negative indexes */
4243 if (start < 0) start = strlen+start;
4244 if (end < 0) end = strlen+end;
4245 if (start < 0) start = 0;
4246 if (end < 0) end = 0;
39191553 4247
dd88747b 4248 /* indexes sanity checks */
4249 if (start > end || (size_t)start >= strlen) {
4250 /* Out of range start or start > end result in null reply */
4251 addReply(c,shared.nullbulk);
4252 decrRefCount(o);
4253 return;
39191553 4254 }
dd88747b 4255 if ((size_t)end >= strlen) end = strlen-1;
4256 rangelen = (end-start)+1;
4257
4258 /* Return the result */
4259 addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
4260 range = sdsnewlen((char*)o->ptr+start,rangelen);
4261 addReplySds(c,range);
4262 addReply(c,shared.crlf);
4263 decrRefCount(o);
39191553 4264}
4265
ed9b544e 4266/* ========================= Type agnostic commands ========================= */
4267
4268static void delCommand(redisClient *c) {
5109cdff 4269 int deleted = 0, j;
4270
4271 for (j = 1; j < c->argc; j++) {
4272 if (deleteKey(c->db,c->argv[j])) {
4273 server.dirty++;
4274 deleted++;
4275 }
4276 }
dd88747b 4277 addReplyLong(c,deleted);
ed9b544e 4278}
4279
4280static void existsCommand(redisClient *c) {
3305306f 4281 addReply(c,lookupKeyRead(c->db,c->argv[1]) ? shared.cone : shared.czero);
ed9b544e 4282}
4283
4284static void selectCommand(redisClient *c) {
4285 int id = atoi(c->argv[1]->ptr);
e0a62c7f 4286
ed9b544e 4287 if (selectDb(c,id) == REDIS_ERR) {
774e3047 4288 addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
ed9b544e 4289 } else {
4290 addReply(c,shared.ok);
4291 }
4292}
4293
4294static void randomkeyCommand(redisClient *c) {
4295 dictEntry *de;
dc4be23e 4296 robj *key;
e0a62c7f 4297
3305306f 4298 while(1) {
4299 de = dictGetRandomKey(c->db->dict);
ce7bef07 4300 if (!de || expireIfNeeded(c->db,dictGetEntryKey(de)) == 0) break;
3305306f 4301 }
2b619329 4302
ed9b544e 4303 if (de == NULL) {
dc4be23e 4304 addReply(c,shared.nullbulk);
4305 return;
4306 }
4307
4308 key = dictGetEntryKey(de);
4309 if (server.vm_enabled) {
4310 key = dupStringObject(key);
4311 addReplyBulk(c,key);
4312 decrRefCount(key);
ed9b544e 4313 } else {
dc4be23e 4314 addReplyBulk(c,key);
ed9b544e 4315 }
4316}
4317
4318static void keysCommand(redisClient *c) {
4319 dictIterator *di;
4320 dictEntry *de;
4321 sds pattern = c->argv[1]->ptr;
4322 int plen = sdslen(pattern);
a3f9eec2 4323 unsigned long numkeys = 0;
ed9b544e 4324 robj *lenobj = createObject(REDIS_STRING,NULL);
4325
3305306f 4326 di = dictGetIterator(c->db->dict);
ed9b544e 4327 addReply(c,lenobj);
4328 decrRefCount(lenobj);
4329 while((de = dictNext(di)) != NULL) {
4330 robj *keyobj = dictGetEntryKey(de);
3305306f 4331
ed9b544e 4332 sds key = keyobj->ptr;
4333 if ((pattern[0] == '*' && pattern[1] == '\0') ||
4334 stringmatchlen(pattern,plen,key,sdslen(key),0)) {
3305306f 4335 if (expireIfNeeded(c->db,keyobj) == 0) {
dd88747b 4336 addReplyBulk(c,keyobj);
3305306f 4337 numkeys++;
3305306f 4338 }
ed9b544e 4339 }
4340 }
4341 dictReleaseIterator(di);
a3f9eec2 4342 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",numkeys);
ed9b544e 4343}
4344
4345static void dbsizeCommand(redisClient *c) {
4346 addReplySds(c,
3305306f 4347 sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict)));
ed9b544e 4348}
4349
4350static void lastsaveCommand(redisClient *c) {
4351 addReplySds(c,
c937aa89 4352 sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave));
ed9b544e 4353}
4354
4355static void typeCommand(redisClient *c) {
3305306f 4356 robj *o;
ed9b544e 4357 char *type;
3305306f 4358
4359 o = lookupKeyRead(c->db,c->argv[1]);
4360 if (o == NULL) {
c937aa89 4361 type = "+none";
ed9b544e 4362 } else {
ed9b544e 4363 switch(o->type) {
c937aa89 4364 case REDIS_STRING: type = "+string"; break;
4365 case REDIS_LIST: type = "+list"; break;
4366 case REDIS_SET: type = "+set"; break;
412a8bce 4367 case REDIS_ZSET: type = "+zset"; break;
ada386b2 4368 case REDIS_HASH: type = "+hash"; break;
4369 default: type = "+unknown"; break;
ed9b544e 4370 }
4371 }
4372 addReplySds(c,sdsnew(type));
4373 addReply(c,shared.crlf);
4374}
4375
4376static void saveCommand(redisClient *c) {
9d65a1bb 4377 if (server.bgsavechildpid != -1) {
05557f6d 4378 addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
4379 return;
4380 }
f78fd11b 4381 if (rdbSave(server.dbfilename) == REDIS_OK) {
ed9b544e 4382 addReply(c,shared.ok);
4383 } else {
4384 addReply(c,shared.err);
4385 }
4386}
4387
4388static void bgsaveCommand(redisClient *c) {
9d65a1bb 4389 if (server.bgsavechildpid != -1) {
ed9b544e 4390 addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
4391 return;
4392 }
f78fd11b 4393 if (rdbSaveBackground(server.dbfilename) == REDIS_OK) {
49b99ab4 4394 char *status = "+Background saving started\r\n";
4395 addReplySds(c,sdsnew(status));
ed9b544e 4396 } else {
4397 addReply(c,shared.err);
4398 }
4399}
4400
4401static void shutdownCommand(redisClient *c) {
4402 redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
a3b21203 4403 /* Kill the saving child if there is a background saving in progress.
4404 We want to avoid race conditions, for instance our saving child may
4405 overwrite the synchronous saving did by SHUTDOWN. */
9d65a1bb 4406 if (server.bgsavechildpid != -1) {
9f3c422c 4407 redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
4408 kill(server.bgsavechildpid,SIGKILL);
a3b21203 4409 rdbRemoveTempFile(server.bgsavechildpid);
9f3c422c 4410 }
ac945e2d 4411 if (server.appendonly) {
4412 /* Append only file: fsync() the AOF and exit */
4413 fsync(server.appendfd);
054e426d 4414 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4415 exit(0);
ed9b544e 4416 } else {
ac945e2d 4417 /* Snapshotting. Perform a SYNC SAVE and exit */
4418 if (rdbSave(server.dbfilename) == REDIS_OK) {
4419 if (server.daemonize)
4420 unlink(server.pidfile);
4421 redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
4422 redisLog(REDIS_WARNING,"Server exit now, bye bye...");
054e426d 4423 if (server.vm_enabled) unlink(server.vm_swap_file);
ac945e2d 4424 exit(0);
4425 } else {
dd88747b 4426 /* Ooops.. error saving! The best we can do is to continue
4427 * operating. Note that if there was a background saving process,
4428 * in the next cron() Redis will be notified that the background
4429 * saving aborted, handling special stuff like slaves pending for
4430 * synchronization... */
e0a62c7f 4431 redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
dd88747b 4432 addReplySds(c,
4433 sdsnew("-ERR can't quit, problems saving the DB\r\n"));
ac945e2d 4434 }
ed9b544e 4435 }
4436}
4437
4438static void renameGenericCommand(redisClient *c, int nx) {
ed9b544e 4439 robj *o;
4440
4441 /* To use the same key as src and dst is probably an error */
4442 if (sdscmp(c->argv[1]->ptr,c->argv[2]->ptr) == 0) {
c937aa89 4443 addReply(c,shared.sameobjecterr);
ed9b544e 4444 return;
4445 }
4446
dd88747b 4447 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL)
ed9b544e 4448 return;
dd88747b 4449
ed9b544e 4450 incrRefCount(o);
3305306f 4451 deleteIfVolatile(c->db,c->argv[2]);
4452 if (dictAdd(c->db->dict,c->argv[2],o) == DICT_ERR) {
ed9b544e 4453 if (nx) {
4454 decrRefCount(o);
c937aa89 4455 addReply(c,shared.czero);
ed9b544e 4456 return;
4457 }
3305306f 4458 dictReplace(c->db->dict,c->argv[2],o);
ed9b544e 4459 } else {
4460 incrRefCount(c->argv[2]);
4461 }
3305306f 4462 deleteKey(c->db,c->argv[1]);
ed9b544e 4463 server.dirty++;
c937aa89 4464 addReply(c,nx ? shared.cone : shared.ok);
ed9b544e 4465}
4466
4467static void renameCommand(redisClient *c) {
4468 renameGenericCommand(c,0);
4469}
4470
4471static void renamenxCommand(redisClient *c) {
4472 renameGenericCommand(c,1);
4473}
4474
4475static void moveCommand(redisClient *c) {
3305306f 4476 robj *o;
4477 redisDb *src, *dst;
ed9b544e 4478 int srcid;
4479
4480 /* Obtain source and target DB pointers */
3305306f 4481 src = c->db;
4482 srcid = c->db->id;
ed9b544e 4483 if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
c937aa89 4484 addReply(c,shared.outofrangeerr);
ed9b544e 4485 return;
4486 }
3305306f 4487 dst = c->db;
4488 selectDb(c,srcid); /* Back to the source DB */
ed9b544e 4489
4490 /* If the user is moving using as target the same
4491 * DB as the source DB it is probably an error. */
4492 if (src == dst) {
c937aa89 4493 addReply(c,shared.sameobjecterr);
ed9b544e 4494 return;
4495 }
4496
4497 /* Check if the element exists and get a reference */
3305306f 4498 o = lookupKeyWrite(c->db,c->argv[1]);
4499 if (!o) {
c937aa89 4500 addReply(c,shared.czero);
ed9b544e 4501 return;
4502 }
4503
4504 /* Try to add the element to the target DB */
3305306f 4505 deleteIfVolatile(dst,c->argv[1]);
4506 if (dictAdd(dst->dict,c->argv[1],o) == DICT_ERR) {
c937aa89 4507 addReply(c,shared.czero);
ed9b544e 4508 return;
4509 }
3305306f 4510 incrRefCount(c->argv[1]);
ed9b544e 4511 incrRefCount(o);
4512
4513 /* OK! key moved, free the entry in the source DB */
3305306f 4514 deleteKey(src,c->argv[1]);
ed9b544e 4515 server.dirty++;
c937aa89 4516 addReply(c,shared.cone);
ed9b544e 4517}
4518
4519/* =================================== Lists ================================ */
4520static void pushGenericCommand(redisClient *c, int where) {
4521 robj *lobj;
ed9b544e 4522 list *list;
3305306f 4523
4524 lobj = lookupKeyWrite(c->db,c->argv[1]);
4525 if (lobj == NULL) {
95242ab5 4526 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4527 addReply(c,shared.cone);
95242ab5 4528 return;
4529 }
ed9b544e 4530 lobj = createListObject();
4531 list = lobj->ptr;
4532 if (where == REDIS_HEAD) {
6b47e12e 4533 listAddNodeHead(list,c->argv[2]);
ed9b544e 4534 } else {
6b47e12e 4535 listAddNodeTail(list,c->argv[2]);
ed9b544e 4536 }
3305306f 4537 dictAdd(c->db->dict,c->argv[1],lobj);
ed9b544e 4538 incrRefCount(c->argv[1]);
4539 incrRefCount(c->argv[2]);
4540 } else {
ed9b544e 4541 if (lobj->type != REDIS_LIST) {
4542 addReply(c,shared.wrongtypeerr);
4543 return;
4544 }
95242ab5 4545 if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
520b5a33 4546 addReply(c,shared.cone);
95242ab5 4547 return;
4548 }
ed9b544e 4549 list = lobj->ptr;
4550 if (where == REDIS_HEAD) {
6b47e12e 4551 listAddNodeHead(list,c->argv[2]);
ed9b544e 4552 } else {
6b47e12e 4553 listAddNodeTail(list,c->argv[2]);
ed9b544e 4554 }
4555 incrRefCount(c->argv[2]);
4556 }
4557 server.dirty++;
520b5a33 4558 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",listLength(list)));
ed9b544e 4559}
4560
4561static void lpushCommand(redisClient *c) {
4562 pushGenericCommand(c,REDIS_HEAD);
4563}
4564
4565static void rpushCommand(redisClient *c) {
4566 pushGenericCommand(c,REDIS_TAIL);
4567}
4568
4569static void llenCommand(redisClient *c) {
3305306f 4570 robj *o;
ed9b544e 4571 list *l;
dd88747b 4572
4573 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4574 checkType(c,o,REDIS_LIST)) return;
e0a62c7f 4575
dd88747b 4576 l = o->ptr;
4577 addReplyUlong(c,listLength(l));
ed9b544e 4578}
4579
4580static void lindexCommand(redisClient *c) {
3305306f 4581 robj *o;
ed9b544e 4582 int index = atoi(c->argv[2]->ptr);
dd88747b 4583 list *list;
4584 listNode *ln;
4585
4586 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4587 checkType(c,o,REDIS_LIST)) return;
4588 list = o->ptr;
4589
4590 ln = listIndex(list, index);
4591 if (ln == NULL) {
c937aa89 4592 addReply(c,shared.nullbulk);
ed9b544e 4593 } else {
dd88747b 4594 robj *ele = listNodeValue(ln);
4595 addReplyBulk(c,ele);
ed9b544e 4596 }
4597}
4598
4599static void lsetCommand(redisClient *c) {
3305306f 4600 robj *o;
ed9b544e 4601 int index = atoi(c->argv[2]->ptr);
dd88747b 4602 list *list;
4603 listNode *ln;
4604
4605 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr)) == NULL ||
4606 checkType(c,o,REDIS_LIST)) return;
4607 list = o->ptr;
4608
4609 ln = listIndex(list, index);
4610 if (ln == NULL) {
4611 addReply(c,shared.outofrangeerr);
ed9b544e 4612 } else {
dd88747b 4613 robj *ele = listNodeValue(ln);
ed9b544e 4614
dd88747b 4615 decrRefCount(ele);
4616 listNodeValue(ln) = c->argv[3];
4617 incrRefCount(c->argv[3]);
4618 addReply(c,shared.ok);
4619 server.dirty++;
ed9b544e 4620 }
4621}
4622
4623static void popGenericCommand(redisClient *c, int where) {
3305306f 4624 robj *o;
dd88747b 4625 list *list;
4626 listNode *ln;
3305306f 4627
dd88747b 4628 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4629 checkType(c,o,REDIS_LIST)) return;
4630 list = o->ptr;
ed9b544e 4631
dd88747b 4632 if (where == REDIS_HEAD)
4633 ln = listFirst(list);
4634 else
4635 ln = listLast(list);
ed9b544e 4636
dd88747b 4637 if (ln == NULL) {
4638 addReply(c,shared.nullbulk);
4639 } else {
4640 robj *ele = listNodeValue(ln);
4641 addReplyBulk(c,ele);
4642 listDelNode(list,ln);
3ea27d37 4643 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4644 server.dirty++;
ed9b544e 4645 }
4646}
4647
4648static void lpopCommand(redisClient *c) {
4649 popGenericCommand(c,REDIS_HEAD);
4650}
4651
4652static void rpopCommand(redisClient *c) {
4653 popGenericCommand(c,REDIS_TAIL);
4654}
4655
4656static void lrangeCommand(redisClient *c) {
3305306f 4657 robj *o;
ed9b544e 4658 int start = atoi(c->argv[2]->ptr);
4659 int end = atoi(c->argv[3]->ptr);
dd88747b 4660 int llen;
4661 int rangelen, j;
4662 list *list;
4663 listNode *ln;
4664 robj *ele;
4665
4e27f268 4666 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
4667 || checkType(c,o,REDIS_LIST)) return;
dd88747b 4668 list = o->ptr;
4669 llen = listLength(list);
4670
4671 /* convert negative indexes */
4672 if (start < 0) start = llen+start;
4673 if (end < 0) end = llen+end;
4674 if (start < 0) start = 0;
4675 if (end < 0) end = 0;
4676
4677 /* indexes sanity checks */
4678 if (start > end || start >= llen) {
4679 /* Out of range start or start > end result in empty list */
4680 addReply(c,shared.emptymultibulk);
4681 return;
4682 }
4683 if (end >= llen) end = llen-1;
4684 rangelen = (end-start)+1;
3305306f 4685
dd88747b 4686 /* Return the result in form of a multi-bulk reply */
4687 ln = listIndex(list, start);
4688 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
4689 for (j = 0; j < rangelen; j++) {
4690 ele = listNodeValue(ln);
4691 addReplyBulk(c,ele);
4692 ln = ln->next;
ed9b544e 4693 }
4694}
4695
4696static void ltrimCommand(redisClient *c) {
3305306f 4697 robj *o;
ed9b544e 4698 int start = atoi(c->argv[2]->ptr);
4699 int end = atoi(c->argv[3]->ptr);
dd88747b 4700 int llen;
4701 int j, ltrim, rtrim;
4702 list *list;
4703 listNode *ln;
4704
4705 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.ok)) == NULL ||
4706 checkType(c,o,REDIS_LIST)) return;
4707 list = o->ptr;
4708 llen = listLength(list);
4709
4710 /* convert negative indexes */
4711 if (start < 0) start = llen+start;
4712 if (end < 0) end = llen+end;
4713 if (start < 0) start = 0;
4714 if (end < 0) end = 0;
4715
4716 /* indexes sanity checks */
4717 if (start > end || start >= llen) {
4718 /* Out of range start or start > end result in empty list */
4719 ltrim = llen;
4720 rtrim = 0;
ed9b544e 4721 } else {
dd88747b 4722 if (end >= llen) end = llen-1;
4723 ltrim = start;
4724 rtrim = llen-end-1;
4725 }
ed9b544e 4726
dd88747b 4727 /* Remove list elements to perform the trim */
4728 for (j = 0; j < ltrim; j++) {
4729 ln = listFirst(list);
4730 listDelNode(list,ln);
4731 }
4732 for (j = 0; j < rtrim; j++) {
4733 ln = listLast(list);
4734 listDelNode(list,ln);
ed9b544e 4735 }
3ea27d37 4736 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4737 server.dirty++;
4738 addReply(c,shared.ok);
ed9b544e 4739}
4740
4741static void lremCommand(redisClient *c) {
3305306f 4742 robj *o;
dd88747b 4743 list *list;
4744 listNode *ln, *next;
4745 int toremove = atoi(c->argv[2]->ptr);
4746 int removed = 0;
4747 int fromtail = 0;
a4d1ba9a 4748
dd88747b 4749 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4750 checkType(c,o,REDIS_LIST)) return;
4751 list = o->ptr;
4752
4753 if (toremove < 0) {
4754 toremove = -toremove;
4755 fromtail = 1;
4756 }
4757 ln = fromtail ? list->tail : list->head;
4758 while (ln) {
4759 robj *ele = listNodeValue(ln);
4760
4761 next = fromtail ? ln->prev : ln->next;
4762 if (compareStringObjects(ele,c->argv[3]) == 0) {
4763 listDelNode(list,ln);
4764 server.dirty++;
4765 removed++;
4766 if (toremove && removed == toremove) break;
ed9b544e 4767 }
dd88747b 4768 ln = next;
ed9b544e 4769 }
3ea27d37 4770 if (listLength(list) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4771 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
ed9b544e 4772}
4773
12f9d551 4774/* This is the semantic of this command:
0f5f7e9a 4775 * RPOPLPUSH srclist dstlist:
12f9d551 4776 * IF LLEN(srclist) > 0
4777 * element = RPOP srclist
4778 * LPUSH dstlist element
4779 * RETURN element
4780 * ELSE
4781 * RETURN nil
4782 * END
4783 * END
4784 *
4785 * The idea is to be able to get an element from a list in a reliable way
4786 * since the element is not just returned but pushed against another list
4787 * as well. This command was originally proposed by Ezra Zygmuntowicz.
4788 */
0f5f7e9a 4789static void rpoplpushcommand(redisClient *c) {
12f9d551 4790 robj *sobj;
dd88747b 4791 list *srclist;
4792 listNode *ln;
4793
4794 if ((sobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4795 checkType(c,sobj,REDIS_LIST)) return;
4796 srclist = sobj->ptr;
4797 ln = listLast(srclist);
12f9d551 4798
dd88747b 4799 if (ln == NULL) {
12f9d551 4800 addReply(c,shared.nullbulk);
4801 } else {
dd88747b 4802 robj *dobj = lookupKeyWrite(c->db,c->argv[2]);
4803 robj *ele = listNodeValue(ln);
4804 list *dstlist;
e20fb74f 4805
dd88747b 4806 if (dobj && dobj->type != REDIS_LIST) {
4807 addReply(c,shared.wrongtypeerr);
4808 return;
4809 }
12f9d551 4810
dd88747b 4811 /* Add the element to the target list (unless it's directly
4812 * passed to some BLPOP-ing client */
4813 if (!handleClientsWaitingListPush(c,c->argv[2],ele)) {
4814 if (dobj == NULL) {
4815 /* Create the list if the key does not exist */
4816 dobj = createListObject();
4817 dictAdd(c->db->dict,c->argv[2],dobj);
4818 incrRefCount(c->argv[2]);
12f9d551 4819 }
dd88747b 4820 dstlist = dobj->ptr;
4821 listAddNodeHead(dstlist,ele);
4822 incrRefCount(ele);
12f9d551 4823 }
dd88747b 4824
4825 /* Send the element to the client as reply as well */
4826 addReplyBulk(c,ele);
4827
4828 /* Finally remove the element from the source list */
4829 listDelNode(srclist,ln);
3ea27d37 4830 if (listLength(srclist) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4831 server.dirty++;
12f9d551 4832 }
4833}
4834
ed9b544e 4835/* ==================================== Sets ================================ */
4836
4837static void saddCommand(redisClient *c) {
ed9b544e 4838 robj *set;
4839
3305306f 4840 set = lookupKeyWrite(c->db,c->argv[1]);
4841 if (set == NULL) {
ed9b544e 4842 set = createSetObject();
3305306f 4843 dictAdd(c->db->dict,c->argv[1],set);
ed9b544e 4844 incrRefCount(c->argv[1]);
4845 } else {
ed9b544e 4846 if (set->type != REDIS_SET) {
c937aa89 4847 addReply(c,shared.wrongtypeerr);
ed9b544e 4848 return;
4849 }
4850 }
4851 if (dictAdd(set->ptr,c->argv[2],NULL) == DICT_OK) {
4852 incrRefCount(c->argv[2]);
4853 server.dirty++;
c937aa89 4854 addReply(c,shared.cone);
ed9b544e 4855 } else {
c937aa89 4856 addReply(c,shared.czero);
ed9b544e 4857 }
4858}
4859
4860static void sremCommand(redisClient *c) {
3305306f 4861 robj *set;
ed9b544e 4862
dd88747b 4863 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
4864 checkType(c,set,REDIS_SET)) return;
4865
4866 if (dictDelete(set->ptr,c->argv[2]) == DICT_OK) {
4867 server.dirty++;
4868 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 4869 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4870 addReply(c,shared.cone);
ed9b544e 4871 } else {
dd88747b 4872 addReply(c,shared.czero);
ed9b544e 4873 }
4874}
4875
a4460ef4 4876static void smoveCommand(redisClient *c) {
4877 robj *srcset, *dstset;
4878
4879 srcset = lookupKeyWrite(c->db,c->argv[1]);
4880 dstset = lookupKeyWrite(c->db,c->argv[2]);
4881
4882 /* If the source key does not exist return 0, if it's of the wrong type
4883 * raise an error */
4884 if (srcset == NULL || srcset->type != REDIS_SET) {
4885 addReply(c, srcset ? shared.wrongtypeerr : shared.czero);
4886 return;
4887 }
4888 /* Error if the destination key is not a set as well */
4889 if (dstset && dstset->type != REDIS_SET) {
4890 addReply(c,shared.wrongtypeerr);
4891 return;
4892 }
4893 /* Remove the element from the source set */
4894 if (dictDelete(srcset->ptr,c->argv[3]) == DICT_ERR) {
4895 /* Key not found in the src set! return zero */
4896 addReply(c,shared.czero);
4897 return;
4898 }
3ea27d37 4899 if (dictSize((dict*)srcset->ptr) == 0 && srcset != dstset)
4900 deleteKey(c->db,c->argv[1]);
a4460ef4 4901 server.dirty++;
4902 /* Add the element to the destination set */
4903 if (!dstset) {
4904 dstset = createSetObject();
4905 dictAdd(c->db->dict,c->argv[2],dstset);
4906 incrRefCount(c->argv[2]);
4907 }
4908 if (dictAdd(dstset->ptr,c->argv[3],NULL) == DICT_OK)
4909 incrRefCount(c->argv[3]);
4910 addReply(c,shared.cone);
4911}
4912
ed9b544e 4913static void sismemberCommand(redisClient *c) {
3305306f 4914 robj *set;
ed9b544e 4915
dd88747b 4916 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4917 checkType(c,set,REDIS_SET)) return;
4918
4919 if (dictFind(set->ptr,c->argv[2]))
4920 addReply(c,shared.cone);
4921 else
c937aa89 4922 addReply(c,shared.czero);
ed9b544e 4923}
4924
4925static void scardCommand(redisClient *c) {
3305306f 4926 robj *o;
ed9b544e 4927 dict *s;
dd88747b 4928
4929 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
4930 checkType(c,o,REDIS_SET)) return;
e0a62c7f 4931
dd88747b 4932 s = o->ptr;
4933 addReplyUlong(c,dictSize(s));
ed9b544e 4934}
4935
12fea928 4936static void spopCommand(redisClient *c) {
4937 robj *set;
4938 dictEntry *de;
4939
dd88747b 4940 if ((set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4941 checkType(c,set,REDIS_SET)) return;
4942
4943 de = dictGetRandomKey(set->ptr);
4944 if (de == NULL) {
12fea928 4945 addReply(c,shared.nullbulk);
4946 } else {
dd88747b 4947 robj *ele = dictGetEntryKey(de);
12fea928 4948
dd88747b 4949 addReplyBulk(c,ele);
4950 dictDelete(set->ptr,ele);
4951 if (htNeedsResize(set->ptr)) dictResize(set->ptr);
3ea27d37 4952 if (dictSize((dict*)set->ptr) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 4953 server.dirty++;
12fea928 4954 }
4955}
4956
2abb95a9 4957static void srandmemberCommand(redisClient *c) {
4958 robj *set;
4959 dictEntry *de;
4960
dd88747b 4961 if ((set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
4962 checkType(c,set,REDIS_SET)) return;
4963
4964 de = dictGetRandomKey(set->ptr);
4965 if (de == NULL) {
2abb95a9 4966 addReply(c,shared.nullbulk);
4967 } else {
dd88747b 4968 robj *ele = dictGetEntryKey(de);
2abb95a9 4969
dd88747b 4970 addReplyBulk(c,ele);
2abb95a9 4971 }
4972}
4973
ed9b544e 4974static int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
4975 dict **d1 = (void*) s1, **d2 = (void*) s2;
4976
3305306f 4977 return dictSize(*d1)-dictSize(*d2);
ed9b544e 4978}
4979
682ac724 4980static void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
ed9b544e 4981 dict **dv = zmalloc(sizeof(dict*)*setsnum);
4982 dictIterator *di;
4983 dictEntry *de;
4984 robj *lenobj = NULL, *dstset = NULL;
682ac724 4985 unsigned long j, cardinality = 0;
ed9b544e 4986
ed9b544e 4987 for (j = 0; j < setsnum; j++) {
4988 robj *setobj;
3305306f 4989
4990 setobj = dstkey ?
4991 lookupKeyWrite(c->db,setskeys[j]) :
4992 lookupKeyRead(c->db,setskeys[j]);
4993 if (!setobj) {
ed9b544e 4994 zfree(dv);
5faa6025 4995 if (dstkey) {
fdcaae84 4996 if (deleteKey(c->db,dstkey))
4997 server.dirty++;
0d36ded0 4998 addReply(c,shared.czero);
5faa6025 4999 } else {
4e27f268 5000 addReply(c,shared.emptymultibulk);
5faa6025 5001 }
ed9b544e 5002 return;
5003 }
ed9b544e 5004 if (setobj->type != REDIS_SET) {
5005 zfree(dv);
c937aa89 5006 addReply(c,shared.wrongtypeerr);
ed9b544e 5007 return;
5008 }
5009 dv[j] = setobj->ptr;
5010 }
5011 /* Sort sets from the smallest to largest, this will improve our
5012 * algorithm's performace */
5013 qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
5014
5015 /* The first thing we should output is the total number of elements...
5016 * since this is a multi-bulk write, but at this stage we don't know
5017 * the intersection set size, so we use a trick, append an empty object
5018 * to the output list and save the pointer to later modify it with the
5019 * right length */
5020 if (!dstkey) {
5021 lenobj = createObject(REDIS_STRING,NULL);
5022 addReply(c,lenobj);
5023 decrRefCount(lenobj);
5024 } else {
5025 /* If we have a target key where to store the resulting set
5026 * create this key with an empty set inside */
5027 dstset = createSetObject();
ed9b544e 5028 }
5029
5030 /* Iterate all the elements of the first (smallest) set, and test
5031 * the element against all the other sets, if at least one set does
5032 * not include the element it is discarded */
5033 di = dictGetIterator(dv[0]);
ed9b544e 5034
5035 while((de = dictNext(di)) != NULL) {
5036 robj *ele;
5037
5038 for (j = 1; j < setsnum; j++)
5039 if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
5040 if (j != setsnum)
5041 continue; /* at least one set does not contain the member */
5042 ele = dictGetEntryKey(de);
5043 if (!dstkey) {
dd88747b 5044 addReplyBulk(c,ele);
ed9b544e 5045 cardinality++;
5046 } else {
5047 dictAdd(dstset->ptr,ele,NULL);
5048 incrRefCount(ele);
5049 }
5050 }
5051 dictReleaseIterator(di);
5052
83cdfe18 5053 if (dstkey) {
3ea27d37 5054 /* Store the resulting set into the target, if the intersection
5055 * is not an empty set. */
83cdfe18 5056 deleteKey(c->db,dstkey);
3ea27d37 5057 if (dictSize((dict*)dstset->ptr) > 0) {
5058 dictAdd(c->db->dict,dstkey,dstset);
5059 incrRefCount(dstkey);
d36c4e97 5060 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5061 } else {
5062 decrRefCount(dstset);
d36c4e97 5063 addReply(c,shared.czero);
3ea27d37 5064 }
40d224a9 5065 server.dirty++;
d36c4e97 5066 } else {
5067 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
40d224a9 5068 }
ed9b544e 5069 zfree(dv);
5070}
5071
5072static void sinterCommand(redisClient *c) {
5073 sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
5074}
5075
5076static void sinterstoreCommand(redisClient *c) {
5077 sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
5078}
5079
/* Set operation codes */
#define REDIS_OP_UNION  0
#define REDIS_OP_DIFF   1
#define REDIS_OP_INTER  2
f4f56e1d 5083
5084static void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
40d224a9 5085 dict **dv = zmalloc(sizeof(dict*)*setsnum);
5086 dictIterator *di;
5087 dictEntry *de;
f4f56e1d 5088 robj *dstset = NULL;
40d224a9 5089 int j, cardinality = 0;
5090
40d224a9 5091 for (j = 0; j < setsnum; j++) {
5092 robj *setobj;
5093
5094 setobj = dstkey ?
5095 lookupKeyWrite(c->db,setskeys[j]) :
5096 lookupKeyRead(c->db,setskeys[j]);
5097 if (!setobj) {
5098 dv[j] = NULL;
5099 continue;
5100 }
5101 if (setobj->type != REDIS_SET) {
5102 zfree(dv);
5103 addReply(c,shared.wrongtypeerr);
5104 return;
5105 }
5106 dv[j] = setobj->ptr;
5107 }
5108
5109 /* We need a temp set object to store our union. If the dstkey
5110 * is not NULL (that is, we are inside an SUNIONSTORE operation) then
5111 * this set object will be the resulting object to set into the target key*/
5112 dstset = createSetObject();
5113
40d224a9 5114 /* Iterate all the elements of all the sets, add every element a single
5115 * time to the result set */
5116 for (j = 0; j < setsnum; j++) {
51829ed3 5117 if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
40d224a9 5118 if (!dv[j]) continue; /* non existing keys are like empty sets */
5119
5120 di = dictGetIterator(dv[j]);
40d224a9 5121
5122 while((de = dictNext(di)) != NULL) {
5123 robj *ele;
5124
5125 /* dictAdd will not add the same element multiple times */
5126 ele = dictGetEntryKey(de);
f4f56e1d 5127 if (op == REDIS_OP_UNION || j == 0) {
5128 if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
5129 incrRefCount(ele);
40d224a9 5130 cardinality++;
5131 }
f4f56e1d 5132 } else if (op == REDIS_OP_DIFF) {
5133 if (dictDelete(dstset->ptr,ele) == DICT_OK) {
5134 cardinality--;
5135 }
40d224a9 5136 }
5137 }
5138 dictReleaseIterator(di);
51829ed3 5139
d36c4e97 5140 /* result set is empty? Exit asap. */
5141 if (op == REDIS_OP_DIFF && cardinality == 0) break;
40d224a9 5142 }
5143
f4f56e1d 5144 /* Output the content of the resulting set, if not in STORE mode */
5145 if (!dstkey) {
5146 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
5147 di = dictGetIterator(dstset->ptr);
f4f56e1d 5148 while((de = dictNext(di)) != NULL) {
5149 robj *ele;
5150
5151 ele = dictGetEntryKey(de);
dd88747b 5152 addReplyBulk(c,ele);
f4f56e1d 5153 }
5154 dictReleaseIterator(di);
d36c4e97 5155 decrRefCount(dstset);
83cdfe18
AG
5156 } else {
5157 /* If we have a target key where to store the resulting set
5158 * create this key with the result set inside */
5159 deleteKey(c->db,dstkey);
3ea27d37 5160 if (dictSize((dict*)dstset->ptr) > 0) {
5161 dictAdd(c->db->dict,dstkey,dstset);
5162 incrRefCount(dstkey);
d36c4e97 5163 addReplyLong(c,dictSize((dict*)dstset->ptr));
3ea27d37 5164 } else {
5165 decrRefCount(dstset);
d36c4e97 5166 addReply(c,shared.czero);
3ea27d37 5167 }
40d224a9 5168 server.dirty++;
5169 }
5170 zfree(dv);
5171}
5172
5173static void sunionCommand(redisClient *c) {
f4f56e1d 5174 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
40d224a9 5175}
5176
5177static void sunionstoreCommand(redisClient *c) {
f4f56e1d 5178 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
5179}
5180
5181static void sdiffCommand(redisClient *c) {
5182 sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
5183}
5184
5185static void sdiffstoreCommand(redisClient *c) {
5186 sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
40d224a9 5187}
5188
6b47e12e 5189/* ==================================== ZSets =============================== */
5190
/* ZSETs are ordered sets using two data structures to hold the same elements
 * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
 * data structure.
 *
 * The elements are added to a hash table mapping Redis objects to scores.
 * At the same time the elements are added to a skip list mapping scores
 * to Redis objects (so objects are sorted by scores in this "view"). */
5198
/* This skiplist implementation is almost a C translation of the original
 * algorithm described by William Pugh in "Skip Lists: A Probabilistic
 * Alternative to Balanced Trees", modified in three ways:
 * a) this implementation allows for repeated values.
 * b) the comparison is not just by key (our 'score') but by satellite data.
 * c) there is a back pointer, so it's a doubly linked list with the back
 * pointers being only at "level 1". This allows traversing the list
 * from tail to head, useful for ZREVRANGE. */
5207
5208static zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
5209 zskiplistNode *zn = zmalloc(sizeof(*zn));
5210
5211 zn->forward = zmalloc(sizeof(zskiplistNode*) * level);
2b37892e
PN
5212 if (level > 0)
5213 zn->span = zmalloc(sizeof(unsigned int) * (level - 1));
6b47e12e 5214 zn->score = score;
5215 zn->obj = obj;
5216 return zn;
5217}
5218
5219static zskiplist *zslCreate(void) {
5220 int j;
5221 zskiplist *zsl;
e0a62c7f 5222
6b47e12e 5223 zsl = zmalloc(sizeof(*zsl));
5224 zsl->level = 1;
cc812361 5225 zsl->length = 0;
6b47e12e 5226 zsl->header = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
69d95c3e 5227 for (j = 0; j < ZSKIPLIST_MAXLEVEL; j++) {
6b47e12e 5228 zsl->header->forward[j] = NULL;
94e543b5 5229
5230 /* span has space for ZSKIPLIST_MAXLEVEL-1 elements */
5231 if (j < ZSKIPLIST_MAXLEVEL-1)
5232 zsl->header->span[j] = 0;
69d95c3e 5233 }
e3870fab 5234 zsl->header->backward = NULL;
5235 zsl->tail = NULL;
6b47e12e 5236 return zsl;
5237}
5238
fd8ccf44 5239static void zslFreeNode(zskiplistNode *node) {
5240 decrRefCount(node->obj);
ad807e6f 5241 zfree(node->forward);
69d95c3e 5242 zfree(node->span);
fd8ccf44 5243 zfree(node);
5244}
5245
5246static void zslFree(zskiplist *zsl) {
ad807e6f 5247 zskiplistNode *node = zsl->header->forward[0], *next;
fd8ccf44 5248
ad807e6f 5249 zfree(zsl->header->forward);
69d95c3e 5250 zfree(zsl->header->span);
ad807e6f 5251 zfree(zsl->header);
fd8ccf44 5252 while(node) {
599379dd 5253 next = node->forward[0];
fd8ccf44 5254 zslFreeNode(node);
5255 node = next;
5256 }
ad807e6f 5257 zfree(zsl);
fd8ccf44 5258}
5259
6b47e12e 5260static int zslRandomLevel(void) {
5261 int level = 1;
5262 while ((random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
5263 level += 1;
10c2baa5 5264 return (level<ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
6b47e12e 5265}
5266
5267static void zslInsert(zskiplist *zsl, double score, robj *obj) {
5268 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
2b37892e 5269 unsigned int rank[ZSKIPLIST_MAXLEVEL];
6b47e12e 5270 int i, level;
5271
5272 x = zsl->header;
5273 for (i = zsl->level-1; i >= 0; i--) {
2b37892e
PN
5274 /* store rank that is crossed to reach the insert position */
5275 rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
69d95c3e 5276
9d60e6e4 5277 while (x->forward[i] &&
5278 (x->forward[i]->score < score ||
5279 (x->forward[i]->score == score &&
69d95c3e 5280 compareStringObjects(x->forward[i]->obj,obj) < 0))) {
a50ea45c 5281 rank[i] += i > 0 ? x->span[i-1] : 1;
6b47e12e 5282 x = x->forward[i];
69d95c3e 5283 }
6b47e12e 5284 update[i] = x;
5285 }
6b47e12e 5286 /* we assume the key is not already inside, since we allow duplicated
5287 * scores, and the re-insertion of score and redis object should never
5288 * happpen since the caller of zslInsert() should test in the hash table
5289 * if the element is already inside or not. */
5290 level = zslRandomLevel();
5291 if (level > zsl->level) {
69d95c3e 5292 for (i = zsl->level; i < level; i++) {
2b37892e 5293 rank[i] = 0;
6b47e12e 5294 update[i] = zsl->header;
2b37892e 5295 update[i]->span[i-1] = zsl->length;
69d95c3e 5296 }
6b47e12e 5297 zsl->level = level;
5298 }
5299 x = zslCreateNode(level,score,obj);
5300 for (i = 0; i < level; i++) {
5301 x->forward[i] = update[i]->forward[i];
5302 update[i]->forward[i] = x;
69d95c3e
PN
5303
5304 /* update span covered by update[i] as x is inserted here */
2b37892e
PN
5305 if (i > 0) {
5306 x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
5307 update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
5308 }
6b47e12e 5309 }
69d95c3e
PN
5310
5311 /* increment span for untouched levels */
5312 for (i = level; i < zsl->level; i++) {
2b37892e 5313 update[i]->span[i-1]++;
69d95c3e
PN
5314 }
5315
bb975144 5316 x->backward = (update[0] == zsl->header) ? NULL : update[0];
e3870fab 5317 if (x->forward[0])
5318 x->forward[0]->backward = x;
5319 else
5320 zsl->tail = x;
cc812361 5321 zsl->length++;
6b47e12e 5322}
5323
84105336
PN
5324/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank */
5325void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
5326 int i;
5327 for (i = 0; i < zsl->level; i++) {
5328 if (update[i]->forward[i] == x) {
5329 if (i > 0) {
5330 update[i]->span[i-1] += x->span[i-1] - 1;
5331 }
5332 update[i]->forward[i] = x->forward[i];
5333 } else {
5334 /* invariant: i > 0, because update[0]->forward[0]
5335 * is always equal to x */
5336 update[i]->span[i-1] -= 1;
5337 }
5338 }
5339 if (x->forward[0]) {
5340 x->forward[0]->backward = x->backward;
5341 } else {
5342 zsl->tail = x->backward;
5343 }
5344 while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
5345 zsl->level--;
5346 zsl->length--;
5347}
5348
50c55df5 5349/* Delete an element with matching score/object from the skiplist. */
fd8ccf44 5350static int zslDelete(zskiplist *zsl, double score, robj *obj) {
e197b441 5351 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5352 int i;
5353
5354 x = zsl->header;
5355 for (i = zsl->level-1; i >= 0; i--) {
9d60e6e4 5356 while (x->forward[i] &&
5357 (x->forward[i]->score < score ||
5358 (x->forward[i]->score == score &&
5359 compareStringObjects(x->forward[i]->obj,obj) < 0)))
e197b441 5360 x = x->forward[i];
5361 update[i] = x;
5362 }
5363 /* We may have multiple elements with the same score, what we need
5364 * is to find the element with both the right score and object. */
5365 x = x->forward[0];
50c55df5 5366 if (x && score == x->score && compareStringObjects(x->obj,obj) == 0) {
84105336 5367 zslDeleteNode(zsl, x, update);
9d60e6e4 5368 zslFreeNode(x);
9d60e6e4 5369 return 1;
5370 } else {
5371 return 0; /* not found */
e197b441 5372 }
5373 return 0; /* not found */
fd8ccf44 5374}
5375
1807985b 5376/* Delete all the elements with score between min and max from the skiplist.
5377 * Min and mx are inclusive, so a score >= min || score <= max is deleted.
5378 * Note that this function takes the reference to the hash table view of the
5379 * sorted set, in order to remove the elements from the hash table too. */
f84d3933 5380static unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
1807985b 5381 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5382 unsigned long removed = 0;
5383 int i;
5384
5385 x = zsl->header;
5386 for (i = zsl->level-1; i >= 0; i--) {
5387 while (x->forward[i] && x->forward[i]->score < min)
5388 x = x->forward[i];
5389 update[i] = x;
5390 }
5391 /* We may have multiple elements with the same score, what we need
5392 * is to find the element with both the right score and object. */
5393 x = x->forward[0];
5394 while (x && x->score <= max) {
84105336
PN
5395 zskiplistNode *next = x->forward[0];
5396 zslDeleteNode(zsl, x, update);
1807985b 5397 dictDelete(dict,x->obj);
5398 zslFreeNode(x);
1807985b 5399 removed++;
5400 x = next;
5401 }
5402 return removed; /* not found */
5403}
1807985b 5404
9212eafd 5405/* Delete all the elements with rank between start and end from the skiplist.
2424490f 5406 * Start and end are inclusive. Note that start and end need to be 1-based */
9212eafd
PN
5407static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
5408 zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
5409 unsigned long traversed = 0, removed = 0;
5410 int i;
5411
9212eafd
PN
5412 x = zsl->header;
5413 for (i = zsl->level-1; i >= 0; i--) {
5414 while (x->forward[i] && (traversed + (i > 0 ? x->span[i-1] : 1)) < start) {
5415 traversed += i > 0 ? x->span[i-1] : 1;
5416 x = x->forward[i];
1807985b 5417 }
9212eafd
PN
5418 update[i] = x;
5419 }
5420
5421 traversed++;
5422 x = x->forward[0];
5423 while (x && traversed <= end) {
84105336
PN
5424 zskiplistNode *next = x->forward[0];
5425 zslDeleteNode(zsl, x, update);
1807985b 5426 dictDelete(dict,x->obj);
5427 zslFreeNode(x);
1807985b 5428 removed++;
9212eafd 5429 traversed++;
1807985b 5430 x = next;
5431 }
9212eafd 5432 return removed;
1807985b 5433}
5434
50c55df5 5435/* Find the first node having a score equal or greater than the specified one.
5436 * Returns NULL if there is no match. */
5437static zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
5438 zskiplistNode *x;
5439 int i;
5440
5441 x = zsl->header;
5442 for (i = zsl->level-1; i >= 0; i--) {
5443 while (x->forward[i] && x->forward[i]->score < score)
5444 x = x->forward[i];
5445 }
5446 /* We may have multiple elements with the same score, what we need
5447 * is to find the element with both the right score and object. */
5448 return x->forward[0];
5449}
5450
27b0ccca
PN
5451/* Find the rank for an element by both score and key.
5452 * Returns 0 when the element cannot be found, rank otherwise.
5453 * Note that the rank is 1-based due to the span of zsl->header to the
5454 * first element. */
5455static unsigned long zslGetRank(zskiplist *zsl, double score, robj *o) {
5456 zskiplistNode *x;
5457 unsigned long rank = 0;
5458 int i;
5459
5460 x = zsl->header;
5461 for (i = zsl->level-1; i >= 0; i--) {
5462 while (x->forward[i] &&
5463 (x->forward[i]->score < score ||
5464 (x->forward[i]->score == score &&
5465 compareStringObjects(x->forward[i]->obj,o) <= 0))) {
a50ea45c 5466 rank += i > 0 ? x->span[i-1] : 1;
27b0ccca
PN
5467 x = x->forward[i];
5468 }
5469
5470 /* x might be equal to zsl->header, so test if obj is non-NULL */
5471 if (x->obj && compareStringObjects(x->obj,o) == 0) {
5472 return rank;
5473 }
5474 }
5475 return 0;
5476}
5477
e74825c2
PN
5478/* Finds an element by its rank. The rank argument needs to be 1-based. */
5479zskiplistNode* zslGetElementByRank(zskiplist *zsl, unsigned long rank) {
5480 zskiplistNode *x;
5481 unsigned long traversed = 0;
5482 int i;
5483
5484 x = zsl->header;
5485 for (i = zsl->level-1; i >= 0; i--) {
dd88747b 5486 while (x->forward[i] && (traversed + (i>0 ? x->span[i-1] : 1)) <= rank)
5487 {
a50ea45c 5488 traversed += i > 0 ? x->span[i-1] : 1;
e74825c2
PN
5489 x = x->forward[i];
5490 }
e74825c2
PN
5491 if (traversed == rank) {
5492 return x;
5493 }
5494 }
5495 return NULL;
5496}
5497
fd8ccf44 5498/* The actual Z-commands implementations */
5499
7db723ad 5500/* This generic command implements both ZADD and ZINCRBY.
e2665397 5501 * scoreval is the score if the operation is a ZADD (doincrement == 0) or
7db723ad 5502 * the increment if the operation is a ZINCRBY (doincrement == 1). */
e2665397 5503static void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
fd8ccf44 5504 robj *zsetobj;
5505 zset *zs;
5506 double *score;
5507
e2665397 5508 zsetobj = lookupKeyWrite(c->db,key);
fd8ccf44 5509 if (zsetobj == NULL) {
5510 zsetobj = createZsetObject();
e2665397 5511 dictAdd(c->db->dict,key,zsetobj);
5512 incrRefCount(key);
fd8ccf44 5513 } else {
5514 if (zsetobj->type != REDIS_ZSET) {
5515 addReply(c,shared.wrongtypeerr);
5516 return;
5517 }
5518 }
fd8ccf44 5519 zs = zsetobj->ptr;
e2665397 5520
7db723ad 5521 /* Ok now since we implement both ZADD and ZINCRBY here the code
e2665397 5522 * needs to handle the two different conditions. It's all about setting
5523 * '*score', that is, the new score to set, to the right value. */
5524 score = zmalloc(sizeof(double));
5525 if (doincrement) {
5526 dictEntry *de;
5527
5528 /* Read the old score. If the element was not present starts from 0 */
5529 de = dictFind(zs->dict,ele);
5530 if (de) {
5531 double *oldscore = dictGetEntryVal(de);
5532 *score = *oldscore + scoreval;
5533 } else {
5534 *score = scoreval;
5535 }
5536 } else {
5537 *score = scoreval;
5538 }
5539
5540 /* What follows is a simple remove and re-insert operation that is common
7db723ad 5541 * to both ZADD and ZINCRBY... */
e2665397 5542 if (dictAdd(zs->dict,ele,score) == DICT_OK) {
fd8ccf44 5543 /* case 1: New element */
e2665397 5544 incrRefCount(ele); /* added to hash */
5545 zslInsert(zs->zsl,*score,ele);
5546 incrRefCount(ele); /* added to skiplist */
fd8ccf44 5547 server.dirty++;
e2665397 5548 if (doincrement)
e2665397 5549 addReplyDouble(c,*score);
91d71bfc 5550 else
5551 addReply(c,shared.cone);
fd8ccf44 5552 } else {
5553 dictEntry *de;
5554 double *oldscore;
e0a62c7f 5555
fd8ccf44 5556 /* case 2: Score update operation */
e2665397 5557 de = dictFind(zs->dict,ele);
dfc5e96c 5558 redisAssert(de != NULL);
fd8ccf44 5559 oldscore = dictGetEntryVal(de);
5560 if (*score != *oldscore) {
5561 int deleted;
5562
e2665397 5563 /* Remove and insert the element in the skip list with new score */
5564 deleted = zslDelete(zs->zsl,*oldscore,ele);
dfc5e96c 5565 redisAssert(deleted != 0);
e2665397 5566 zslInsert(zs->zsl,*score,ele);
5567 incrRefCount(ele);
5568 /* Update the score in the hash table */
5569 dictReplace(zs->dict,ele,score);
fd8ccf44 5570 server.dirty++;
2161a965 5571 } else {
5572 zfree(score);
fd8ccf44 5573 }
e2665397 5574 if (doincrement)
5575 addReplyDouble(c,*score);
5576 else
5577 addReply(c,shared.czero);
fd8ccf44 5578 }
5579}
5580
e2665397 5581static void zaddCommand(redisClient *c) {
5582 double scoreval;
5583
bd79a6bd 5584 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5585 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,0);
5586}
5587
7db723ad 5588static void zincrbyCommand(redisClient *c) {
e2665397 5589 double scoreval;
5590
bd79a6bd 5591 if (getDoubleFromObjectOrReply(c, c->argv[2], &scoreval, NULL) != REDIS_OK) return;
e2665397 5592 zaddGenericCommand(c,c->argv[1],c->argv[3],scoreval,1);
5593}
5594
1b7106e7 5595static void zremCommand(redisClient *c) {
5596 robj *zsetobj;
5597 zset *zs;
dd88747b 5598 dictEntry *de;
5599 double *oldscore;
5600 int deleted;
1b7106e7 5601
dd88747b 5602 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5603 checkType(c,zsetobj,REDIS_ZSET)) return;
1b7106e7 5604
dd88747b 5605 zs = zsetobj->ptr;
5606 de = dictFind(zs->dict,c->argv[2]);
5607 if (de == NULL) {
5608 addReply(c,shared.czero);
5609 return;
1b7106e7 5610 }
dd88747b 5611 /* Delete from the skiplist */
5612 oldscore = dictGetEntryVal(de);
5613 deleted = zslDelete(zs->zsl,*oldscore,c->argv[2]);
5614 redisAssert(deleted != 0);
5615
5616 /* Delete from the hash table */
5617 dictDelete(zs->dict,c->argv[2]);
5618 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5619 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5620 server.dirty++;
5621 addReply(c,shared.cone);
1b7106e7 5622}
5623
1807985b 5624static void zremrangebyscoreCommand(redisClient *c) {
bbe025e0
AM
5625 double min;
5626 double max;
dd88747b 5627 long deleted;
1807985b 5628 robj *zsetobj;
5629 zset *zs;
5630
bd79a6bd
PN
5631 if ((getDoubleFromObjectOrReply(c, c->argv[2], &min, NULL) != REDIS_OK) ||
5632 (getDoubleFromObjectOrReply(c, c->argv[3], &max, NULL) != REDIS_OK)) return;
bbe025e0 5633
dd88747b 5634 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5635 checkType(c,zsetobj,REDIS_ZSET)) return;
1807985b 5636
dd88747b 5637 zs = zsetobj->ptr;
5638 deleted = zslDeleteRangeByScore(zs->zsl,min,max,zs->dict);
5639 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5640 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5641 server.dirty += deleted;
5642 addReplyLong(c,deleted);
1807985b 5643}
5644
9212eafd 5645static void zremrangebyrankCommand(redisClient *c) {
bbe025e0
AM
5646 long start;
5647 long end;
dd88747b 5648 int llen;
5649 long deleted;
9212eafd
PN
5650 robj *zsetobj;
5651 zset *zs;
5652
bd79a6bd
PN
5653 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5654 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 5655
dd88747b 5656 if ((zsetobj = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
5657 checkType(c,zsetobj,REDIS_ZSET)) return;
5658 zs = zsetobj->ptr;
5659 llen = zs->zsl->length;
9212eafd 5660
dd88747b 5661 /* convert negative indexes */
5662 if (start < 0) start = llen+start;
5663 if (end < 0) end = llen+end;
5664 if (start < 0) start = 0;
5665 if (end < 0) end = 0;
9212eafd 5666
dd88747b 5667 /* indexes sanity checks */
5668 if (start > end || start >= llen) {
5669 addReply(c,shared.czero);
5670 return;
9212eafd 5671 }
dd88747b 5672 if (end >= llen) end = llen-1;
5673
5674 /* increment start and end because zsl*Rank functions
5675 * use 1-based rank */
5676 deleted = zslDeleteRangeByRank(zs->zsl,start+1,end+1,zs->dict);
5677 if (htNeedsResize(zs->dict)) dictResize(zs->dict);
3ea27d37 5678 if (dictSize(zs->dict) == 0) deleteKey(c->db,c->argv[1]);
dd88747b 5679 server.dirty += deleted;
5680 addReplyLong(c, deleted);
9212eafd
PN
5681}
5682
8f92e768
PN
5683typedef struct {
5684 dict *dict;
5685 double weight;
5686} zsetopsrc;
5687
5688static int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
5689 zsetopsrc *d1 = (void*) s1, *d2 = (void*) s2;
5690 unsigned long size1, size2;
5691 size1 = d1->dict ? dictSize(d1->dict) : 0;
5692 size2 = d2->dict ? dictSize(d2->dict) : 0;
5693 return size1 - size2;
5694}
5695
/* How the scores of an element present in several input sets are combined
 * by zunionInterAggregate(). */
#define REDIS_AGGR_SUM 1    /* add the scores together */
#define REDIS_AGGR_MIN 2    /* keep the smallest score */
#define REDIS_AGGR_MAX 3    /* keep the largest score */
5699
5700inline static void zunionInterAggregate(double *target, double val, int aggregate) {
5701 if (aggregate == REDIS_AGGR_SUM) {
5702 *target = *target + val;
5703 } else if (aggregate == REDIS_AGGR_MIN) {
5704 *target = val < *target ? val : *target;
5705 } else if (aggregate == REDIS_AGGR_MAX) {
5706 *target = val > *target ? val : *target;
5707 } else {
5708 /* safety net */
f83c6cb5 5709 redisPanic("Unknown ZUNION/INTER aggregate type");
d2764cd6
PN
5710 }
5711}
5712
2830ca53 5713static void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
8f92e768 5714 int i, j, zsetnum;
d2764cd6 5715 int aggregate = REDIS_AGGR_SUM;
8f92e768 5716 zsetopsrc *src;
2830ca53
PN
5717 robj *dstobj;
5718 zset *dstzset;
b287c9bb
PN
5719 dictIterator *di;
5720 dictEntry *de;
5721
2830ca53
PN
5722 /* expect zsetnum input keys to be given */
5723 zsetnum = atoi(c->argv[2]->ptr);
5724 if (zsetnum < 1) {
5725 addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNION/ZINTER\r\n"));
5726 return;
b287c9bb 5727 }
2830ca53
PN
5728
5729 /* test if the expected number of keys would overflow */
5730 if (3+zsetnum > c->argc) {
b287c9bb
PN
5731 addReply(c,shared.syntaxerr);
5732 return;
5733 }
5734
2830ca53 5735 /* read keys to be used for input */
b9eed483 5736 src = zmalloc(sizeof(zsetopsrc) * zsetnum);
2830ca53 5737 for (i = 0, j = 3; i < zsetnum; i++, j++) {
b287c9bb
PN
5738 robj *zsetobj = lookupKeyWrite(c->db,c->argv[j]);
5739 if (!zsetobj) {
8f92e768 5740 src[i].dict = NULL;
b287c9bb
PN
5741 } else {
5742 if (zsetobj->type != REDIS_ZSET) {
8f92e768 5743 zfree(src);
b287c9bb
PN
5744 addReply(c,shared.wrongtypeerr);
5745 return;
5746 }
8f92e768 5747 src[i].dict = ((zset*)zsetobj->ptr)->dict;
b287c9bb 5748 }
2830ca53
PN
5749
5750 /* default all weights to 1 */
8f92e768 5751 src[i].weight = 1.0;
b287c9bb
PN
5752 }
5753
2830ca53
PN
5754 /* parse optional extra arguments */
5755 if (j < c->argc) {
d2764cd6 5756 int remaining = c->argc - j;
b287c9bb 5757
2830ca53 5758 while (remaining) {
d2764cd6 5759 if (remaining >= (zsetnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
2830ca53 5760 j++; remaining--;
2830ca53 5761 for (i = 0; i < zsetnum; i++, j++, remaining--) {
bd79a6bd 5762 if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK)
bbe025e0 5763 return;
2830ca53 5764 }
d2764cd6
PN
5765 } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
5766 j++; remaining--;
5767 if (!strcasecmp(c->argv[j]->ptr,"sum")) {
5768 aggregate = REDIS_AGGR_SUM;
5769 } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
5770 aggregate = REDIS_AGGR_MIN;
5771 } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
5772 aggregate = REDIS_AGGR_MAX;
5773 } else {
5774 zfree(src);
5775 addReply(c,shared.syntaxerr);
5776 return;
5777 }
5778 j++; remaining--;
2830ca53 5779 } else {
8f92e768 5780 zfree(src);
2830ca53
PN
5781 addReply(c,shared.syntaxerr);
5782 return;
5783 }
5784 }
5785 }
b287c9bb 5786
d2764cd6
PN
5787 /* sort sets from the smallest to largest, this will improve our
5788 * algorithm's performance */
5789 qsort(src,zsetnum,sizeof(zsetopsrc), qsortCompareZsetopsrcByCardinality);
5790
2830ca53
PN
5791 dstobj = createZsetObject();
5792 dstzset = dstobj->ptr;
5793
5794 if (op == REDIS_OP_INTER) {
8f92e768
PN
5795 /* skip going over all entries if the smallest zset is NULL or empty */
5796 if (src[0].dict && dictSize(src[0].dict) > 0) {
5797 /* precondition: as src[0].dict is non-empty and the zsets are ordered
5798 * from small to large, all src[i > 0].dict are non-empty too */
5799 di = dictGetIterator(src[0].dict);
2830ca53 5800 while((de = dictNext(di)) != NULL) {
d2764cd6
PN
5801 double *score = zmalloc(sizeof(double)), value;
5802 *score = src[0].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5803
d2764cd6
PN
5804 for (j = 1; j < zsetnum; j++) {
5805 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5806 if (other) {
d2764cd6
PN
5807 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5808 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5809 } else {
5810 break;
5811 }
5812 }
b287c9bb 5813
2830ca53 5814 /* skip entry when not present in every source dict */
8f92e768 5815 if (j != zsetnum) {
2830ca53
PN
5816 zfree(score);
5817 } else {
5818 robj *o = dictGetEntryKey(de);
5819 dictAdd(dstzset->dict,o,score);
5820 incrRefCount(o); /* added to dictionary */
5821 zslInsert(dstzset->zsl,*score,o);
5822 incrRefCount(o); /* added to skiplist */
b287c9bb
PN
5823 }
5824 }
2830ca53
PN
5825 dictReleaseIterator(di);
5826 }
5827 } else if (op == REDIS_OP_UNION) {
5828 for (i = 0; i < zsetnum; i++) {
8f92e768 5829 if (!src[i].dict) continue;
2830ca53 5830
8f92e768 5831 di = dictGetIterator(src[i].dict);
2830ca53
PN
5832 while((de = dictNext(di)) != NULL) {
5833 /* skip key when already processed */
5834 if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
5835
d2764cd6
PN
5836 double *score = zmalloc(sizeof(double)), value;
5837 *score = src[i].weight * (*(double*)dictGetEntryVal(de));
2830ca53 5838
d2764cd6
PN
5839 /* because the zsets are sorted by size, its only possible
5840 * for sets at larger indices to hold this entry */
5841 for (j = (i+1); j < zsetnum; j++) {
5842 dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
2830ca53 5843 if (other) {
d2764cd6
PN
5844 value = src[j].weight * (*(double*)dictGetEntryVal(other));
5845 zunionInterAggregate(score, value, aggregate);
2830ca53
PN
5846 }
5847 }
b287c9bb 5848
2830ca53
PN
5849 robj *o = dictGetEntryKey(de);
5850 dictAdd(dstzset->dict,o,score);
5851 incrRefCount(o); /* added to dictionary */
5852 zslInsert(dstzset->zsl,*score,o);
5853 incrRefCount(o); /* added to skiplist */
5854 }
5855 dictReleaseIterator(di);
b287c9bb 5856 }
2830ca53
PN
5857 } else {
5858 /* unknown operator */
5859 redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
b287c9bb
PN
5860 }
5861
5862 deleteKey(c->db,dstkey);
3ea27d37 5863 if (dstzset->zsl->length) {
5864 dictAdd(c->db->dict,dstkey,dstobj);
5865 incrRefCount(dstkey);
5866 addReplyLong(c, dstzset->zsl->length);
5867 server.dirty++;
5868 } else {
8bca8773 5869 decrRefCount(dstobj);
3ea27d37 5870 addReply(c, shared.czero);
5871 }
8f92e768 5872 zfree(src);
b287c9bb
PN
5873}
5874
2830ca53
PN
5875static void zunionCommand(redisClient *c) {
5876 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_UNION);
b287c9bb
PN
5877}
5878
2830ca53
PN
5879static void zinterCommand(redisClient *c) {
5880 zunionInterGenericCommand(c,c->argv[1], REDIS_OP_INTER);
b287c9bb
PN
5881}
5882
e3870fab 5883static void zrangeGenericCommand(redisClient *c, int reverse) {
cc812361 5884 robj *o;
bbe025e0
AM
5885 long start;
5886 long end;
752da584 5887 int withscores = 0;
dd88747b 5888 int llen;
5889 int rangelen, j;
5890 zset *zsetobj;
5891 zskiplist *zsl;
5892 zskiplistNode *ln;
5893 robj *ele;
752da584 5894
bd79a6bd
PN
5895 if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
5896 (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
bbe025e0 5897
752da584 5898 if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
5899 withscores = 1;
5900 } else if (c->argc >= 5) {
5901 addReply(c,shared.syntaxerr);
5902 return;
5903 }
cc812361 5904
4e27f268 5905 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
5906 || checkType(c,o,REDIS_ZSET)) return;
dd88747b 5907 zsetobj = o->ptr;
5908 zsl = zsetobj->zsl;
5909 llen = zsl->length;
cc812361 5910
dd88747b 5911 /* convert negative indexes */
5912 if (start < 0) start = llen+start;
5913 if (end < 0) end = llen+end;
5914 if (start < 0) start = 0;
5915 if (end < 0) end = 0;
cc812361 5916
dd88747b 5917 /* indexes sanity checks */
5918 if (start > end || start >= llen) {
5919 /* Out of range start or start > end result in empty list */
5920 addReply(c,shared.emptymultibulk);
5921 return;
5922 }
5923 if (end >= llen) end = llen-1;
5924 rangelen = (end-start)+1;
cc812361 5925
dd88747b 5926 /* check if starting point is trivial, before searching
5927 * the element in log(N) time */
5928 if (reverse) {
5929 ln = start == 0 ? zsl->tail : zslGetElementByRank(zsl, llen-start);
5930 } else {
5931 ln = start == 0 ?
5932 zsl->header->forward[0] : zslGetElementByRank(zsl, start+1);
5933 }
cc812361 5934
dd88747b 5935 /* Return the result in form of a multi-bulk reply */
5936 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
5937 withscores ? (rangelen*2) : rangelen));
5938 for (j = 0; j < rangelen; j++) {
5939 ele = ln->obj;
5940 addReplyBulk(c,ele);
5941 if (withscores)
5942 addReplyDouble(c,ln->score);
5943 ln = reverse ? ln->backward : ln->forward[0];
cc812361 5944 }
5945}
5946
e3870fab 5947static void zrangeCommand(redisClient *c) {
5948 zrangeGenericCommand(c,0);
5949}
5950
5951static void zrevrangeCommand(redisClient *c) {
5952 zrangeGenericCommand(c,1);
5953}
5954
f44dd428 5955/* This command implements both ZRANGEBYSCORE and ZCOUNT.
5956 * If justcount is non-zero, just the count is returned. */
5957static void genericZrangebyscoreCommand(redisClient *c, int justcount) {
50c55df5 5958 robj *o;
f44dd428 5959 double min, max;
5960 int minex = 0, maxex = 0; /* are min or max exclusive? */
80181f78 5961 int offset = 0, limit = -1;
0500ef27
SH
5962 int withscores = 0;
5963 int badsyntax = 0;
5964
f44dd428 5965 /* Parse the min-max interval. If one of the values is prefixed
5966 * by the "(" character, it's considered "open". For instance
5967 * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
5968 * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
5969 if (((char*)c->argv[2]->ptr)[0] == '(') {
5970 min = strtod((char*)c->argv[2]->ptr+1,NULL);
5971 minex = 1;
5972 } else {
5973 min = strtod(c->argv[2]->ptr,NULL);
5974 }
5975 if (((char*)c->argv[3]->ptr)[0] == '(') {
5976 max = strtod((char*)c->argv[3]->ptr+1,NULL);
5977 maxex = 1;
5978 } else {
5979 max = strtod(c->argv[3]->ptr,NULL);
5980 }
5981
5982 /* Parse "WITHSCORES": note that if the command was called with
5983 * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
5984 * enter the following paths to parse WITHSCORES and LIMIT. */
0500ef27 5985 if (c->argc == 5 || c->argc == 8) {
3a3978b1 5986 if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
5987 withscores = 1;
5988 else
5989 badsyntax = 1;
0500ef27 5990 }
3a3978b1 5991 if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
0500ef27 5992 badsyntax = 1;
0500ef27 5993 if (badsyntax) {
454d4e43 5994 addReplySds(c,
5995 sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
80181f78 5996 return;
0500ef27
SH
5997 }
5998
f44dd428 5999 /* Parse "LIMIT" */
0500ef27 6000 if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
80181f78 6001 addReply(c,shared.syntaxerr);
6002 return;
0500ef27 6003 } else if (c->argc == (7 + withscores)) {
80181f78 6004 offset = atoi(c->argv[5]->ptr);
6005 limit = atoi(c->argv[6]->ptr);
0b13687c 6006 if (offset < 0) offset = 0;
80181f78 6007 }
50c55df5 6008
f44dd428 6009 /* Ok, lookup the key and get the range */
50c55df5 6010 o = lookupKeyRead(c->db,c->argv[1]);
6011 if (o == NULL) {
4e27f268 6012 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6013 } else {
6014 if (o->type != REDIS_ZSET) {
6015 addReply(c,shared.wrongtypeerr);
6016 } else {
6017 zset *zsetobj = o->ptr;
6018 zskiplist *zsl = zsetobj->zsl;
6019 zskiplistNode *ln;
f44dd428 6020 robj *ele, *lenobj = NULL;
6021 unsigned long rangelen = 0;
50c55df5 6022
f44dd428 6023 /* Get the first node with the score >= min, or with
6024 * score > min if 'minex' is true. */
50c55df5 6025 ln = zslFirstWithScore(zsl,min);
f44dd428 6026 while (minex && ln && ln->score == min) ln = ln->forward[0];
6027
50c55df5 6028 if (ln == NULL) {
6029 /* No element matching the speciifed interval */
f44dd428 6030 addReply(c,justcount ? shared.czero : shared.emptymultibulk);
50c55df5 6031 return;
6032 }
6033
6034 /* We don't know in advance how many matching elements there
6035 * are in the list, so we push this object that will represent
6036 * the multi-bulk length in the output buffer, and will "fix"
6037 * it later */
f44dd428 6038 if (!justcount) {
6039 lenobj = createObject(REDIS_STRING,NULL);
6040 addReply(c,lenobj);
6041 decrRefCount(lenobj);
6042 }
50c55df5 6043
f44dd428 6044 while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
80181f78 6045 if (offset) {
6046 offset--;
6047 ln = ln->forward[0];
6048 continue;
6049 }
6050 if (limit == 0) break;
f44dd428 6051 if (!justcount) {
6052 ele = ln->obj;
dd88747b 6053 addReplyBulk(c,ele);
f44dd428 6054 if (withscores)
6055 addReplyDouble(c,ln->score);
6056 }
50c55df5 6057 ln = ln->forward[0];
6058 rangelen++;
80181f78 6059 if (limit > 0) limit--;
50c55df5 6060 }
f44dd428 6061 if (justcount) {
6062 addReplyLong(c,(long)rangelen);
6063 } else {
6064 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
6065 withscores ? (rangelen*2) : rangelen);
6066 }
50c55df5 6067 }
6068 }
6069}
6070
f44dd428 6071static void zrangebyscoreCommand(redisClient *c) {
6072 genericZrangebyscoreCommand(c,0);
6073}
6074
6075static void zcountCommand(redisClient *c) {
6076 genericZrangebyscoreCommand(c,1);
6077}
6078
3c41331e 6079static void zcardCommand(redisClient *c) {
e197b441 6080 robj *o;
6081 zset *zs;
dd88747b 6082
6083 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6084 checkType(c,o,REDIS_ZSET)) return;
6085
6086 zs = o->ptr;
6087 addReplyUlong(c,zs->zsl->length);
e197b441 6088}
6089
6e333bbe 6090static void zscoreCommand(redisClient *c) {
6091 robj *o;
6092 zset *zs;
dd88747b 6093 dictEntry *de;
6094
6095 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6096 checkType(c,o,REDIS_ZSET)) return;
6097
6098 zs = o->ptr;
6099 de = dictFind(zs->dict,c->argv[2]);
6100 if (!de) {
96d8b4ee 6101 addReply(c,shared.nullbulk);
6e333bbe 6102 } else {
dd88747b 6103 double *score = dictGetEntryVal(de);
6e333bbe 6104
dd88747b 6105 addReplyDouble(c,*score);
6e333bbe 6106 }
6107}
6108
798d9e55 6109static void zrankGenericCommand(redisClient *c, int reverse) {
69d95c3e 6110 robj *o;
dd88747b 6111 zset *zs;
6112 zskiplist *zsl;
6113 dictEntry *de;
6114 unsigned long rank;
6115 double *score;
6116
6117 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6118 checkType(c,o,REDIS_ZSET)) return;
6119
6120 zs = o->ptr;
6121 zsl = zs->zsl;
6122 de = dictFind(zs->dict,c->argv[2]);
6123 if (!de) {
69d95c3e
PN
6124 addReply(c,shared.nullbulk);
6125 return;
6126 }
69d95c3e 6127
dd88747b 6128 score = dictGetEntryVal(de);
6129 rank = zslGetRank(zsl, *score, c->argv[2]);
6130 if (rank) {
6131 if (reverse) {
6132 addReplyLong(c, zsl->length - rank);
27b0ccca 6133 } else {
dd88747b 6134 addReplyLong(c, rank-1);
69d95c3e 6135 }
dd88747b 6136 } else {
6137 addReply(c,shared.nullbulk);
978c2c94 6138 }
6139}
6140
798d9e55
PN
6141static void zrankCommand(redisClient *c) {
6142 zrankGenericCommand(c, 0);
6143}
6144
6145static void zrevrankCommand(redisClient *c) {
6146 zrankGenericCommand(c, 1);
6147}
6148
7fb16bac
PN
6149/* ========================= Hashes utility functions ======================= */
/* Bit flags selecting which part of a hash entry is wanted. They are tested
 * with '&' in hashCurrent() and OR-ed together by hgetallCommand(). */
#define REDIS_HASH_KEY 1
#define REDIS_HASH_VALUE 2
978c2c94 6152
7fb16bac
PN
6153/* Check the length of a number of objects to see if we need to convert a
6154 * zipmap to a real hash. Note that we only check string encoded objects
6155 * as their string length can be queried in constant time. */
6156static void hashTryConversion(robj *subject, robj **argv, int start, int end) {
6157 int i;
6158 if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
978c2c94 6159
7fb16bac
PN
6160 for (i = start; i <= end; i++) {
6161 if (argv[i]->encoding == REDIS_ENCODING_RAW &&
6162 sdslen(argv[i]->ptr) > server.hash_max_zipmap_value)
6163 {
6164 convertToRealHash(subject);
978c2c94 6165 return;
6166 }
6167 }
7fb16bac 6168}
bae2c7ec 6169
97224de7
PN
6170/* Encode given objects in-place when the hash uses a dict. */
6171static void hashTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
6172 if (subject->encoding == REDIS_ENCODING_HT) {
3f973463
PN
6173 if (o1) *o1 = tryObjectEncoding(*o1);
6174 if (o2) *o2 = tryObjectEncoding(*o2);
97224de7
PN
6175 }
6176}
6177
7fb16bac 6178/* Get the value from a hash identified by key. Returns either a string
a3f3af86
PN
6179 * object or NULL if the value cannot be found. The refcount of the object
6180 * is always increased by 1 when the value was found. */
7fb16bac
PN
6181static robj *hashGet(robj *o, robj *key) {
6182 robj *value = NULL;
978c2c94 6183 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
7fb16bac
PN
6184 unsigned char *v;
6185 unsigned int vlen;
6186 key = getDecodedObject(key);
6187 if (zipmapGet(o->ptr,key->ptr,sdslen(key->ptr),&v,&vlen)) {
6188 value = createStringObject((char*)v,vlen);
6189 }
6190 decrRefCount(key);
6191 } else {
6192 dictEntry *de = dictFind(o->ptr,key);
6193 if (de != NULL) {
6194 value = dictGetEntryVal(de);
a3f3af86 6195 incrRefCount(value);
7fb16bac
PN
6196 }
6197 }
6198 return value;
6199}
978c2c94 6200
7fb16bac
PN
6201/* Test if the key exists in the given hash. Returns 1 if the key
6202 * exists and 0 when it doesn't. */
6203static int hashExists(robj *o, robj *key) {
6204 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6205 key = getDecodedObject(key);
6206 if (zipmapExists(o->ptr,key->ptr,sdslen(key->ptr))) {
6207 decrRefCount(key);
6208 return 1;
6209 }
6210 decrRefCount(key);
6211 } else {
6212 if (dictFind(o->ptr,key) != NULL) {
6213 return 1;
6214 }
6215 }
6216 return 0;
6217}
bae2c7ec 6218
7fb16bac
PN
6219/* Add an element, discard the old if the key already exists.
6220 * Return 0 on insert and 1 on update. */
feb8d7e6 6221static int hashSet(robj *o, robj *key, robj *value) {
7fb16bac
PN
6222 int update = 0;
6223 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6224 key = getDecodedObject(key);
6225 value = getDecodedObject(value);
6226 o->ptr = zipmapSet(o->ptr,
6227 key->ptr,sdslen(key->ptr),
6228 value->ptr,sdslen(value->ptr), &update);
6229 decrRefCount(key);
6230 decrRefCount(value);
6231
6232 /* Check if the zipmap needs to be upgraded to a real hash table */
6233 if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
bae2c7ec 6234 convertToRealHash(o);
978c2c94 6235 } else {
7fb16bac
PN
6236 if (dictReplace(o->ptr,key,value)) {
6237 /* Insert */
6238 incrRefCount(key);
978c2c94 6239 } else {
7fb16bac 6240 /* Update */
978c2c94 6241 update = 1;
6242 }
7fb16bac 6243 incrRefCount(value);
978c2c94 6244 }
7fb16bac 6245 return update;
978c2c94 6246}
6247
7fb16bac
PN
6248/* Delete an element from a hash.
6249 * Return 1 on deleted and 0 on not found. */
6250static int hashDelete(robj *o, robj *key) {
6251 int deleted = 0;
6252 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
6253 key = getDecodedObject(key);
6254 o->ptr = zipmapDel(o->ptr,key->ptr,sdslen(key->ptr), &deleted);
6255 decrRefCount(key);
6256 } else {
6257 deleted = dictDelete((dict*)o->ptr,key) == DICT_OK;
6258 /* Always check if the dictionary needs a resize after a delete. */
6259 if (deleted && htNeedsResize(o->ptr)) dictResize(o->ptr);
d33278d1 6260 }
7fb16bac
PN
6261 return deleted;
6262}
d33278d1 6263
7fb16bac 6264/* Return the number of elements in a hash. */
c811bb38 6265static unsigned long hashLength(robj *o) {
7fb16bac
PN
6266 return (o->encoding == REDIS_ENCODING_ZIPMAP) ?
6267 zipmapLen((unsigned char*)o->ptr) : dictSize((dict*)o->ptr);
6268}
6269
/* Structure to hold the hash iteration abstraction. Note that iteration over
 * hashes involves both fields and values. Because it is possible that
 * not both are required, store pointers in the iterator to avoid
 * unnecessary memory allocation for fields/values. */
typedef struct {
    int encoding; /* copy of the encoding of the hash being iterated */
    unsigned char *zi; /* zipmap cursor, from zipmapRewind()/zipmapNext() */
    unsigned char *zk, *zv; /* current zipmap key/value, set by zipmapNext() */
    unsigned int zklen, zvlen; /* lengths of zk and zv */

    dictIterator *di; /* dict iterator, used with REDIS_ENCODING_HT */
    dictEntry *de; /* current dict entry, set by dictNext() */
} hashIterator;
6283
c44d3b56
PN
6284static hashIterator *hashInitIterator(robj *subject) {
6285 hashIterator *hi = zmalloc(sizeof(hashIterator));
7fb16bac
PN
6286 hi->encoding = subject->encoding;
6287 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6288 hi->zi = zipmapRewind(subject->ptr);
6289 } else if (hi->encoding == REDIS_ENCODING_HT) {
6290 hi->di = dictGetIterator(subject->ptr);
d33278d1 6291 } else {
7fb16bac 6292 redisAssert(NULL);
d33278d1 6293 }
c44d3b56 6294 return hi;
7fb16bac 6295}
d33278d1 6296
7fb16bac
PN
6297static void hashReleaseIterator(hashIterator *hi) {
6298 if (hi->encoding == REDIS_ENCODING_HT) {
6299 dictReleaseIterator(hi->di);
d33278d1 6300 }
c44d3b56 6301 zfree(hi);
7fb16bac 6302}
d33278d1 6303
7fb16bac
PN
6304/* Move to the next entry in the hash. Return REDIS_OK when the next entry
6305 * could be found and REDIS_ERR when the iterator reaches the end. */
c811bb38 6306static int hashNext(hashIterator *hi) {
7fb16bac
PN
6307 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6308 if ((hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen,
6309 &hi->zv, &hi->zvlen)) == NULL) return REDIS_ERR;
6310 } else {
6311 if ((hi->de = dictNext(hi->di)) == NULL) return REDIS_ERR;
6312 }
6313 return REDIS_OK;
6314}
d33278d1 6315
0c390abc 6316/* Get key or value object at current iteration position.
a3f3af86 6317 * This increases the refcount of the field object by 1. */
c811bb38 6318static robj *hashCurrent(hashIterator *hi, int what) {
7fb16bac
PN
6319 robj *o;
6320 if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
6321 if (what & REDIS_HASH_KEY) {
6322 o = createStringObject((char*)hi->zk,hi->zklen);
6323 } else {
6324 o = createStringObject((char*)hi->zv,hi->zvlen);
d33278d1 6325 }
d33278d1 6326 } else {
7fb16bac
PN
6327 if (what & REDIS_HASH_KEY) {
6328 o = dictGetEntryKey(hi->de);
6329 } else {
6330 o = dictGetEntryVal(hi->de);
d33278d1 6331 }
a3f3af86 6332 incrRefCount(o);
d33278d1 6333 }
7fb16bac 6334 return o;
d33278d1
PN
6335}
6336
7fb16bac
PN
6337static robj *hashLookupWriteOrCreate(redisClient *c, robj *key) {
6338 robj *o = lookupKeyWrite(c->db,key);
01426b05
PN
6339 if (o == NULL) {
6340 o = createHashObject();
7fb16bac
PN
6341 dictAdd(c->db->dict,key,o);
6342 incrRefCount(key);
01426b05
PN
6343 } else {
6344 if (o->type != REDIS_HASH) {
6345 addReply(c,shared.wrongtypeerr);
7fb16bac 6346 return NULL;
01426b05
PN
6347 }
6348 }
7fb16bac
PN
6349 return o;
6350}
01426b05 6351
7fb16bac
PN
6352/* ============================= Hash commands ============================== */
6353static void hsetCommand(redisClient *c) {
6e9e463f 6354 int update;
7fb16bac 6355 robj *o;
bbe025e0 6356
7fb16bac
PN
6357 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6358 hashTryConversion(o,c->argv,2,3);
97224de7 6359 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6360 update = hashSet(o,c->argv[2],c->argv[3]);
6e9e463f 6361 addReply(c, update ? shared.czero : shared.cone);
7fb16bac
PN
6362 server.dirty++;
6363}
01426b05 6364
1f1c7695
PN
6365static void hsetnxCommand(redisClient *c) {
6366 robj *o;
6367 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6368 hashTryConversion(o,c->argv,2,3);
6369
6370 if (hashExists(o, c->argv[2])) {
6371 addReply(c, shared.czero);
01426b05 6372 } else {
97224de7 6373 hashTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
feb8d7e6 6374 hashSet(o,c->argv[2],c->argv[3]);
1f1c7695
PN
6375 addReply(c, shared.cone);
6376 server.dirty++;
6377 }
6378}
01426b05 6379
7fb16bac
PN
6380static void hmsetCommand(redisClient *c) {
6381 int i;
6382 robj *o;
01426b05 6383
7fb16bac
PN
6384 if ((c->argc % 2) == 1) {
6385 addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
6386 return;
6387 }
01426b05 6388
7fb16bac
PN
6389 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6390 hashTryConversion(o,c->argv,2,c->argc-1);
6391 for (i = 2; i < c->argc; i += 2) {
97224de7 6392 hashTryObjectEncoding(o,&c->argv[i], &c->argv[i+1]);
feb8d7e6 6393 hashSet(o,c->argv[i],c->argv[i+1]);
7fb16bac
PN
6394 }
6395 addReply(c, shared.ok);
edc2f63a 6396 server.dirty++;
7fb16bac
PN
6397}
6398
6399static void hincrbyCommand(redisClient *c) {
6400 long long value, incr;
6401 robj *o, *current, *new;
6402
bd79a6bd 6403 if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
7fb16bac
PN
6404 if ((o = hashLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
6405 if ((current = hashGet(o,c->argv[2])) != NULL) {
6406 if (current->encoding == REDIS_ENCODING_RAW)
6407 value = strtoll(current->ptr,NULL,10);
6408 else if (current->encoding == REDIS_ENCODING_INT)
6409 value = (long)current->ptr;
6410 else
6411 redisAssert(1 != 1);
a3f3af86 6412 decrRefCount(current);
7fb16bac
PN
6413 } else {
6414 value = 0;
01426b05
PN
6415 }
6416
7fb16bac 6417 value += incr;
3f973463
PN
6418 new = createStringObjectFromLongLong(value);
6419 hashTryObjectEncoding(o,&c->argv[2],NULL);
feb8d7e6 6420 hashSet(o,c->argv[2],new);
7fb16bac
PN
6421 decrRefCount(new);
6422 addReplyLongLong(c,value);
01426b05 6423 server.dirty++;
01426b05
PN
6424}
6425
978c2c94 6426static void hgetCommand(redisClient *c) {
7fb16bac 6427 robj *o, *value;
dd88747b 6428 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
6429 checkType(c,o,REDIS_HASH)) return;
6430
7fb16bac
PN
6431 if ((value = hashGet(o,c->argv[2])) != NULL) {
6432 addReplyBulk(c,value);
a3f3af86 6433 decrRefCount(value);
dd88747b 6434 } else {
7fb16bac 6435 addReply(c,shared.nullbulk);
69d95c3e 6436 }
69d95c3e
PN
6437}
6438
09aeb579
PN
6439static void hmgetCommand(redisClient *c) {
6440 int i;
7fb16bac
PN
6441 robj *o, *value;
6442 o = lookupKeyRead(c->db,c->argv[1]);
6443 if (o != NULL && o->type != REDIS_HASH) {
6444 addReply(c,shared.wrongtypeerr);
09aeb579
PN
6445 }
6446
7fb16bac
PN
6447 /* Note the check for o != NULL happens inside the loop. This is
6448 * done because objects that cannot be found are considered to be
6449 * an empty hash. The reply should then be a series of NULLs. */
09aeb579 6450 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
7fb16bac
PN
6451 for (i = 2; i < c->argc; i++) {
6452 if (o != NULL && (value = hashGet(o,c->argv[i])) != NULL) {
6453 addReplyBulk(c,value);
a3f3af86 6454 decrRefCount(value);
7fb16bac
PN
6455 } else {
6456 addReply(c,shared.nullbulk);
09aeb579
PN
6457 }
6458 }
6459}
6460
07efaf74 6461static void hdelCommand(redisClient *c) {
dd88747b 6462 robj *o;
dd88747b 6463 if ((o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
6464 checkType(c,o,REDIS_HASH)) return;
07efaf74 6465
7fb16bac
PN
6466 if (hashDelete(o,c->argv[2])) {
6467 if (hashLength(o) == 0) deleteKey(c->db,c->argv[1]);
6468 addReply(c,shared.cone);
6469 server.dirty++;
dd88747b 6470 } else {
7fb16bac 6471 addReply(c,shared.czero);
07efaf74 6472 }
6473}
6474
92b27fe9 6475static void hlenCommand(redisClient *c) {
6476 robj *o;
dd88747b 6477 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
92b27fe9 6478 checkType(c,o,REDIS_HASH)) return;
6479
7fb16bac 6480 addReplyUlong(c,hashLength(o));
92b27fe9 6481}
6482
78409a0f 6483static void genericHgetallCommand(redisClient *c, int flags) {
7fb16bac 6484 robj *o, *lenobj, *obj;
78409a0f 6485 unsigned long count = 0;
c44d3b56 6486 hashIterator *hi;
78409a0f 6487
4e27f268 6488 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
78409a0f 6489 || checkType(c,o,REDIS_HASH)) return;
6490
6491 lenobj = createObject(REDIS_STRING,NULL);
6492 addReply(c,lenobj);
6493 decrRefCount(lenobj);
6494
c44d3b56
PN
6495 hi = hashInitIterator(o);
6496 while (hashNext(hi) != REDIS_ERR) {
7fb16bac 6497 if (flags & REDIS_HASH_KEY) {
c44d3b56 6498 obj = hashCurrent(hi,REDIS_HASH_KEY);
7fb16bac 6499 addReplyBulk(c,obj);
a3f3af86 6500 decrRefCount(obj);
7fb16bac 6501 count++;
78409a0f 6502 }
7fb16bac 6503 if (flags & REDIS_HASH_VALUE) {
c44d3b56 6504 obj = hashCurrent(hi,REDIS_HASH_VALUE);
7fb16bac 6505 addReplyBulk(c,obj);
a3f3af86 6506 decrRefCount(obj);
7fb16bac 6507 count++;
78409a0f 6508 }
78409a0f 6509 }
c44d3b56 6510 hashReleaseIterator(hi);
7fb16bac 6511
78409a0f 6512 lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
6513}
6514
6515static void hkeysCommand(redisClient *c) {
7fb16bac 6516 genericHgetallCommand(c,REDIS_HASH_KEY);
78409a0f 6517}
6518
6519static void hvalsCommand(redisClient *c) {
7fb16bac 6520 genericHgetallCommand(c,REDIS_HASH_VALUE);
78409a0f 6521}
6522
6523static void hgetallCommand(redisClient *c) {
7fb16bac 6524 genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
78409a0f 6525}
6526
a86f14b1 6527static void hexistsCommand(redisClient *c) {
6528 robj *o;
a86f14b1 6529 if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL ||
6530 checkType(c,o,REDIS_HASH)) return;
6531
7fb16bac 6532 addReply(c, hashExists(o,c->argv[2]) ? shared.cone : shared.czero);
a86f14b1 6533}
6534
ada386b2 6535static void convertToRealHash(robj *o) {
6536 unsigned char *key, *val, *p, *zm = o->ptr;
6537 unsigned int klen, vlen;
6538 dict *dict = dictCreate(&hashDictType,NULL);
6539
6540 assert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
6541 p = zipmapRewind(zm);
6542 while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
6543 robj *keyobj, *valobj;
6544
6545 keyobj = createStringObject((char*)key,klen);
6546 valobj = createStringObject((char*)val,vlen);
05df7621 6547 keyobj = tryObjectEncoding(keyobj);
6548 valobj = tryObjectEncoding(valobj);
ada386b2 6549 dictAdd(dict,keyobj,valobj);
6550 }
6551 o->encoding = REDIS_ENCODING_HT;
6552 o->ptr = dict;
6553 zfree(zm);
6554}
6555
6b47e12e 6556/* ========================= Non type-specific commands ==================== */
6557
ed9b544e 6558static void flushdbCommand(redisClient *c) {
ca37e9cd 6559 server.dirty += dictSize(c->db->dict);
3305306f 6560 dictEmpty(c->db->dict);
6561 dictEmpty(c->db->expires);
ed9b544e 6562 addReply(c,shared.ok);
ed9b544e 6563}
6564
6565static void flushallCommand(redisClient *c) {
ca37e9cd 6566 server.dirty += emptyDb();
ed9b544e 6567 addReply(c,shared.ok);
500ece7c 6568 if (server.bgsavechildpid != -1) {
6569 kill(server.bgsavechildpid,SIGKILL);
6570 rdbRemoveTempFile(server.bgsavechildpid);
6571 }
f78fd11b 6572 rdbSave(server.dbfilename);
ca37e9cd 6573 server.dirty++;
ed9b544e 6574}
6575
56906eef 6576static redisSortOperation *createSortOperation(int type, robj *pattern) {
ed9b544e 6577 redisSortOperation *so = zmalloc(sizeof(*so));
ed9b544e 6578 so->type = type;
6579 so->pattern = pattern;
6580 return so;
6581}
6582
6583/* Return the value associated to the key with a name obtained
55017f9d
PN
6584 * substituting the first occurence of '*' in 'pattern' with 'subst'.
6585 * The returned object will always have its refcount increased by 1
6586 * when it is non-NULL. */
56906eef 6587static robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
6d7d1370 6588 char *p, *f;
ed9b544e 6589 sds spat, ssub;
6d7d1370
PN
6590 robj keyobj, fieldobj, *o;
6591 int prefixlen, sublen, postfixlen, fieldlen;
ed9b544e 6592 /* Expoit the internal sds representation to create a sds string allocated on the stack in order to make this function faster */
6593 struct {
f1017b3f 6594 long len;
6595 long free;
ed9b544e 6596 char buf[REDIS_SORTKEY_MAX+1];
6d7d1370 6597 } keyname, fieldname;
ed9b544e 6598
28173a49 6599 /* If the pattern is "#" return the substitution object itself in order
6600 * to implement the "SORT ... GET #" feature. */
6601 spat = pattern->ptr;
6602 if (spat[0] == '#' && spat[1] == '\0') {
55017f9d 6603 incrRefCount(subst);
28173a49 6604 return subst;
6605 }
6606
6607 /* The substitution object may be specially encoded. If so we create
9d65a1bb 6608 * a decoded object on the fly. Otherwise getDecodedObject will just
6609 * increment the ref count, that we'll decrement later. */
6610 subst = getDecodedObject(subst);
942a3961 6611
ed9b544e 6612 ssub = subst->ptr;
6613 if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) return NULL;
6614 p = strchr(spat,'*');
ed5a857a 6615 if (!p) {
6616 decrRefCount(subst);
6617 return NULL;
6618 }
ed9b544e 6619
6d7d1370
PN
6620 /* Find out if we're dealing with a hash dereference. */
6621 if ((f = strstr(p+1, "->")) != NULL) {
6622 fieldlen = sdslen(spat)-(f-spat);
6623 /* this also copies \0 character */
6624 memcpy(fieldname.buf,f+2,fieldlen-1);
6625 fieldname.len = fieldlen-2;
6626 } else {
6627 fieldlen = 0;
6628 }
6629
ed9b544e 6630 prefixlen = p-spat;
6631 sublen = sdslen(ssub);
6d7d1370 6632 postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
ed9b544e 6633 memcpy(keyname.buf,spat,prefixlen);
6634 memcpy(keyname.buf+prefixlen,ssub,sublen);
6635 memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
6636 keyname.buf[prefixlen+sublen+postfixlen] = '\0';
6637 keyname.len = prefixlen+sublen+postfixlen;
942a3961 6638 decrRefCount(subst);
6639
6d7d1370
PN
6640 /* Lookup substituted key */
6641 initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(long)*2));
6642 o = lookupKeyRead(db,&keyobj);
55017f9d
PN
6643 if (o == NULL) return NULL;
6644
6645 if (fieldlen > 0) {
6646 if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
6d7d1370 6647
705dad38
PN
6648 /* Retrieve value from hash by the field name. This operation
6649 * already increases the refcount of the returned object. */
6d7d1370
PN
6650 initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(long)*2));
6651 o = hashGet(o, &fieldobj);
705dad38 6652 } else {
55017f9d 6653 if (o->type != REDIS_STRING) return NULL;
b6f07345 6654
705dad38
PN
6655 /* Every object that this function returns needs to have its refcount
6656 * increased. sortCommand decreases it again. */
6657 incrRefCount(o);
6d7d1370
PN
6658 }
6659
6660 return o;
ed9b544e 6661}
6662
6663/* sortCompare() is used by qsort in sortCommand(). Given that qsort_r with
6664 * the additional parameter is not standard but a BSD-specific we have to
6665 * pass sorting parameters via the global 'server' structure */
6666static int sortCompare(const void *s1, const void *s2) {
6667 const redisSortObject *so1 = s1, *so2 = s2;
6668 int cmp;
6669
6670 if (!server.sort_alpha) {
6671 /* Numeric sorting. Here it's trivial as we precomputed scores */
6672 if (so1->u.score > so2->u.score) {
6673 cmp = 1;
6674 } else if (so1->u.score < so2->u.score) {
6675 cmp = -1;
6676 } else {
6677 cmp = 0;
6678 }
6679 } else {
6680 /* Alphanumeric sorting */
6681 if (server.sort_bypattern) {
6682 if (!so1->u.cmpobj || !so2->u.cmpobj) {
6683 /* At least one compare object is NULL */
6684 if (so1->u.cmpobj == so2->u.cmpobj)
6685 cmp = 0;
6686 else if (so1->u.cmpobj == NULL)
6687 cmp = -1;
6688 else
6689 cmp = 1;
6690 } else {
6691 /* We have both the objects, use strcoll */
6692 cmp = strcoll(so1->u.cmpobj->ptr,so2->u.cmpobj->ptr);
6693 }
6694 } else {
08ee9b57 6695 /* Compare elements directly. */
6696 cmp = compareStringObjects(so1->obj,so2->obj);
ed9b544e 6697 }
6698 }
6699 return server.sort_desc ? -cmp : cmp;
6700}
6701
6702/* The SORT command is the most complex command in Redis. Warning: this code
6703 * is optimized for speed and a bit less for readability */
6704static void sortCommand(redisClient *c) {
ed9b544e 6705 list *operations;
6706 int outputlen = 0;
6707 int desc = 0, alpha = 0;
6708 int limit_start = 0, limit_count = -1, start, end;
6709 int j, dontsort = 0, vectorlen;
6710 int getop = 0; /* GET operation counter */
443c6409 6711 robj *sortval, *sortby = NULL, *storekey = NULL;
ed9b544e 6712 redisSortObject *vector; /* Resulting vector to sort */
6713
6714 /* Lookup the key to sort. It must be of the right types */
3305306f 6715 sortval = lookupKeyRead(c->db,c->argv[1]);
6716 if (sortval == NULL) {
4e27f268 6717 addReply(c,shared.emptymultibulk);
ed9b544e 6718 return;
6719 }
a5eb649b 6720 if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
6721 sortval->type != REDIS_ZSET)
6722 {
c937aa89 6723 addReply(c,shared.wrongtypeerr);
ed9b544e 6724 return;
6725 }
6726
6727 /* Create a list of operations to perform for every sorted element.
6728 * Operations can be GET/DEL/INCR/DECR */
6729 operations = listCreate();
092dac2a 6730 listSetFreeMethod(operations,zfree);
ed9b544e 6731 j = 2;
6732
6733 /* Now we need to protect sortval incrementing its count, in the future
6734 * SORT may have options able to overwrite/delete keys during the sorting
6735 * and the sorted key itself may get destroied */
6736 incrRefCount(sortval);
6737
6738 /* The SORT command has an SQL-alike syntax, parse it */
6739 while(j < c->argc) {
6740 int leftargs = c->argc-j-1;
6741 if (!strcasecmp(c->argv[j]->ptr,"asc")) {
6742 desc = 0;
6743 } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
6744 desc = 1;
6745 } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
6746 alpha = 1;
6747 } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
6748 limit_start = atoi(c->argv[j+1]->ptr);
6749 limit_count = atoi(c->argv[j+2]->ptr);
6750 j+=2;
443c6409 6751 } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
6752 storekey = c->argv[j+1];
6753 j++;
ed9b544e 6754 } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
6755 sortby = c->argv[j+1];
6756 /* If the BY pattern does not contain '*', i.e. it is constant,
6757 * we don't need to sort nor to lookup the weight keys. */
6758 if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
6759 j++;
6760 } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
6761 listAddNodeTail(operations,createSortOperation(
6762 REDIS_SORT_GET,c->argv[j+1]));
6763 getop++;
6764 j++;
ed9b544e 6765 } else {
6766 decrRefCount(sortval);
6767 listRelease(operations);
c937aa89 6768 addReply(c,shared.syntaxerr);
ed9b544e 6769 return;
6770 }
6771 j++;
6772 }
6773
6774 /* Load the sorting vector with all the objects to sort */
a5eb649b 6775 switch(sortval->type) {
6776 case REDIS_LIST: vectorlen = listLength((list*)sortval->ptr); break;
6777 case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
6778 case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
f83c6cb5 6779 default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
a5eb649b 6780 }
ed9b544e 6781 vector = zmalloc(sizeof(redisSortObject)*vectorlen);
ed9b544e 6782 j = 0;
a5eb649b 6783
ed9b544e 6784 if (sortval->type == REDIS_LIST) {
6785 list *list = sortval->ptr;
6208b3a7 6786 listNode *ln;
c7df85a4 6787 listIter li;
6208b3a7 6788
c7df85a4 6789 listRewind(list,&li);
6790 while((ln = listNext(&li))) {
ed9b544e 6791 robj *ele = ln->value;
6792 vector[j].obj = ele;
6793 vector[j].u.score = 0;
6794 vector[j].u.cmpobj = NULL;
ed9b544e 6795 j++;
6796 }
6797 } else {
a5eb649b 6798 dict *set;
ed9b544e 6799 dictIterator *di;
6800 dictEntry *setele;
6801
a5eb649b 6802 if (sortval->type == REDIS_SET) {
6803 set = sortval->ptr;
6804 } else {
6805 zset *zs = sortval->ptr;
6806 set = zs->dict;
6807 }
6808
ed9b544e 6809 di = dictGetIterator(set);
ed9b544e 6810 while((setele = dictNext(di)) != NULL) {
6811 vector[j].obj = dictGetEntryKey(setele);
6812 vector[j].u.score = 0;
6813 vector[j].u.cmpobj = NULL;
6814 j++;
6815 }
6816 dictReleaseIterator(di);
6817 }
dfc5e96c 6818 redisAssert(j == vectorlen);
ed9b544e 6819
6820 /* Now it's time to load the right scores in the sorting vector */
6821 if (dontsort == 0) {
6822 for (j = 0; j < vectorlen; j++) {
6d7d1370 6823 robj *byval;
ed9b544e 6824 if (sortby) {
6d7d1370 6825 /* lookup value to sort by */
3305306f 6826 byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
705dad38 6827 if (!byval) continue;
ed9b544e 6828 } else {
6d7d1370
PN
6829 /* use object itself to sort by */
6830 byval = vector[j].obj;
6831 }
6832
6833 if (alpha) {
08ee9b57 6834 if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
6d7d1370
PN
6835 } else {
6836 if (byval->encoding == REDIS_ENCODING_RAW) {
6837 vector[j].u.score = strtod(byval->ptr,NULL);
16fa22f1 6838 } else if (byval->encoding == REDIS_ENCODING_INT) {
6d7d1370
PN
6839 /* Don't need to decode the object if it's
6840 * integer-encoded (the only encoding supported) so
6841 * far. We can just cast it */
16fa22f1
PN
6842 vector[j].u.score = (long)byval->ptr;
6843 } else {
6844 redisAssert(1 != 1);
942a3961 6845 }
ed9b544e 6846 }
6d7d1370 6847
705dad38
PN
6848 /* when the object was retrieved using lookupKeyByPattern,
6849 * its refcount needs to be decreased. */
6850 if (sortby) {
6851 decrRefCount(byval);
ed9b544e 6852 }
6853 }
6854 }
6855
6856 /* We are ready to sort the vector... perform a bit of sanity check
6857 * on the LIMIT option too. We'll use a partial version of quicksort. */
6858 start = (limit_start < 0) ? 0 : limit_start;
6859 end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
6860 if (start >= vectorlen) {
6861 start = vectorlen-1;
6862 end = vectorlen-2;
6863 }
6864 if (end >= vectorlen) end = vectorlen-1;
6865
6866 if (dontsort == 0) {
6867 server.sort_desc = desc;
6868 server.sort_alpha = alpha;
6869 server.sort_bypattern = sortby ? 1 : 0;
5f5b9840 6870 if (sortby && (start != 0 || end != vectorlen-1))
6871 pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
6872 else
6873 qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
ed9b544e 6874 }
6875
6876 /* Send command output to the output buffer, performing the specified
6877 * GET/DEL/INCR/DECR operations if any. */
6878 outputlen = getop ? getop*(end-start+1) : end-start+1;
443c6409 6879 if (storekey == NULL) {
6880 /* STORE option not specified, sent the sorting result to client */
6881 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
6882 for (j = start; j <= end; j++) {
6883 listNode *ln;
c7df85a4 6884 listIter li;
6885
dd88747b 6886 if (!getop) addReplyBulk(c,vector[j].obj);
c7df85a4 6887 listRewind(operations,&li);
6888 while((ln = listNext(&li))) {
443c6409 6889 redisSortOperation *sop = ln->value;
6890 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6891 vector[j].obj);
6892
6893 if (sop->type == REDIS_SORT_GET) {
55017f9d 6894 if (!val) {
443c6409 6895 addReply(c,shared.nullbulk);
6896 } else {
dd88747b 6897 addReplyBulk(c,val);
55017f9d 6898 decrRefCount(val);
443c6409 6899 }
6900 } else {
dfc5e96c 6901 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
443c6409 6902 }
6903 }
ed9b544e 6904 }
443c6409 6905 } else {
6906 robj *listObject = createListObject();
6907 list *listPtr = (list*) listObject->ptr;
6908
6909 /* STORE option specified, set the sorting result as a List object */
6910 for (j = start; j <= end; j++) {
6911 listNode *ln;
c7df85a4 6912 listIter li;
6913
443c6409 6914 if (!getop) {
6915 listAddNodeTail(listPtr,vector[j].obj);
6916 incrRefCount(vector[j].obj);
6917 }
c7df85a4 6918 listRewind(operations,&li);
6919 while((ln = listNext(&li))) {
443c6409 6920 redisSortOperation *sop = ln->value;
6921 robj *val = lookupKeyByPattern(c->db,sop->pattern,
6922 vector[j].obj);
6923
6924 if (sop->type == REDIS_SORT_GET) {
55017f9d 6925 if (!val) {
443c6409 6926 listAddNodeTail(listPtr,createStringObject("",0));
6927 } else {
55017f9d
PN
6928 /* We should do a incrRefCount on val because it is
6929 * added to the list, but also a decrRefCount because
6930 * it is returned by lookupKeyByPattern. This results
6931 * in doing nothing at all. */
443c6409 6932 listAddNodeTail(listPtr,val);
443c6409 6933 }
ed9b544e 6934 } else {
dfc5e96c 6935 redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
ed9b544e 6936 }
ed9b544e 6937 }
ed9b544e 6938 }
121796f7 6939 if (dictReplace(c->db->dict,storekey,listObject)) {
6940 incrRefCount(storekey);
6941 }
443c6409 6942 /* Note: we add 1 because the DB is dirty anyway since even if the
6943 * SORT result is empty a new key is set and maybe the old content
6944 * replaced. */
6945 server.dirty += 1+outputlen;
6946 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
ed9b544e 6947 }
6948
6949 /* Cleanup */
6950 decrRefCount(sortval);
6951 listRelease(operations);
6952 for (j = 0; j < vectorlen; j++) {
16fa22f1 6953 if (alpha && vector[j].u.cmpobj)
ed9b544e 6954 decrRefCount(vector[j].u.cmpobj);
6955 }
6956 zfree(vector);
6957}
6958
/* Convert an amount of bytes into a human readable string in the form
 * of 100B, 2G, 100M, 4K, and so forth.
 *
 * 's' is the destination buffer: it always receives a nul terminated
 * string. Values below 1024 are printed as an integer number of bytes,
 * larger ones with two decimals and a K/M/G/T suffix (powers of 1024).
 * Everything from one terabyte up is expressed in terabytes, so the buffer
 * is never left unwritten. */
static void bytesToHuman(char *s, unsigned long long n) {
    double d;

    if (n < 1024) {
        /* Bytes */
        sprintf(s,"%lluB",n);
        return;
    } else if (n < (1024*1024)) {
        d = (double)n/(1024);
        sprintf(s,"%.2fK",d);
    } else if (n < (1024LL*1024*1024)) {
        d = (double)n/(1024*1024);
        sprintf(s,"%.2fM",d);
    } else if (n < (1024LL*1024*1024*1024)) {
        d = (double)n/(1024LL*1024*1024);
        sprintf(s,"%.2fG",d);
    } else {
        d = (double)n/(1024LL*1024*1024*1024);
        sprintf(s,"%.2fT",d);
    }
}
6979
/* Create the string returned by the INFO command. This is decoupled
 * from the INFO command itself as we need to report the same information
 * on memory corruption problems. */
6983static sds genRedisInfoString(void) {
ed9b544e 6984 sds info;
6985 time_t uptime = time(NULL)-server.stat_starttime;
c3cb078d 6986 int j;
ec6c7a1d 6987 char hmem[64];
55a8298f 6988
b72f6a4b 6989 bytesToHuman(hmem,zmalloc_used_memory());
ed9b544e 6990 info = sdscatprintf(sdsempty(),
6991 "redis_version:%s\r\n"
f1017b3f 6992 "arch_bits:%s\r\n"
7a932b74 6993 "multiplexing_api:%s\r\n"
0d7170a4 6994 "process_id:%ld\r\n"
682ac724 6995 "uptime_in_seconds:%ld\r\n"
6996 "uptime_in_days:%ld\r\n"
ed9b544e 6997 "connected_clients:%d\r\n"
6998 "connected_slaves:%d\r\n"
f86a74e9 6999 "blocked_clients:%d\r\n"
5fba9f71 7000 "used_memory:%zu\r\n"
ec6c7a1d 7001 "used_memory_human:%s\r\n"
ed9b544e 7002 "changes_since_last_save:%lld\r\n"
be2bb6b0 7003 "bgsave_in_progress:%d\r\n"
682ac724 7004 "last_save_time:%ld\r\n"
b3fad521 7005 "bgrewriteaof_in_progress:%d\r\n"
ed9b544e 7006 "total_connections_received:%lld\r\n"
7007 "total_commands_processed:%lld\r\n"
2a6a2ed1 7008 "expired_keys:%lld\r\n"
55a8298f 7009 "hash_max_zipmap_entries:%ld\r\n"
7010 "hash_max_zipmap_value:%ld\r\n"
ffc6b7f8 7011 "pubsub_channels:%ld\r\n"
7012 "pubsub_patterns:%u\r\n"
7d98e08c 7013 "vm_enabled:%d\r\n"
a0f643ea 7014 "role:%s\r\n"
ed9b544e 7015 ,REDIS_VERSION,
f1017b3f 7016 (sizeof(long) == 8) ? "64" : "32",
7a932b74 7017 aeGetApiName(),
0d7170a4 7018 (long) getpid(),
a0f643ea 7019 uptime,
7020 uptime/(3600*24),
ed9b544e 7021 listLength(server.clients)-listLength(server.slaves),
7022 listLength(server.slaves),
d5d55fc3 7023 server.blpop_blocked_clients,
b72f6a4b 7024 zmalloc_used_memory(),
ec6c7a1d 7025 hmem,
ed9b544e 7026 server.dirty,
9d65a1bb 7027 server.bgsavechildpid != -1,
ed9b544e 7028 server.lastsave,
b3fad521 7029 server.bgrewritechildpid != -1,
ed9b544e 7030 server.stat_numconnections,
7031 server.stat_numcommands,
2a6a2ed1 7032 server.stat_expiredkeys,
55a8298f 7033 server.hash_max_zipmap_entries,
7034 server.hash_max_zipmap_value,
ffc6b7f8 7035 dictSize(server.pubsub_channels),
7036 listLength(server.pubsub_patterns),
7d98e08c 7037 server.vm_enabled != 0,
a0f643ea 7038 server.masterhost == NULL ? "master" : "slave"
ed9b544e 7039 );
a0f643ea 7040 if (server.masterhost) {
7041 info = sdscatprintf(info,
7042 "master_host:%s\r\n"
7043 "master_port:%d\r\n"
7044 "master_link_status:%s\r\n"
7045 "master_last_io_seconds_ago:%d\r\n"
7046 ,server.masterhost,
7047 server.masterport,
7048 (server.replstate == REDIS_REPL_CONNECTED) ?
7049 "up" : "down",
f72b934d 7050 server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1
a0f643ea 7051 );
7052 }
7d98e08c 7053 if (server.vm_enabled) {
1064ef87 7054 lockThreadedIO();
7d98e08c 7055 info = sdscatprintf(info,
7056 "vm_conf_max_memory:%llu\r\n"
7057 "vm_conf_page_size:%llu\r\n"
7058 "vm_conf_pages:%llu\r\n"
7059 "vm_stats_used_pages:%llu\r\n"
7060 "vm_stats_swapped_objects:%llu\r\n"
7061 "vm_stats_swappin_count:%llu\r\n"
7062 "vm_stats_swappout_count:%llu\r\n"
b9bc0eef 7063 "vm_stats_io_newjobs_len:%lu\r\n"
7064 "vm_stats_io_processing_len:%lu\r\n"
7065 "vm_stats_io_processed_len:%lu\r\n"
25fd2cb2 7066 "vm_stats_io_active_threads:%lu\r\n"
d5d55fc3 7067 "vm_stats_blocked_clients:%lu\r\n"
7d98e08c 7068 ,(unsigned long long) server.vm_max_memory,
7069 (unsigned long long) server.vm_page_size,
7070 (unsigned long long) server.vm_pages,
7071 (unsigned long long) server.vm_stats_used_pages,
7072 (unsigned long long) server.vm_stats_swapped_objects,
7073 (unsigned long long) server.vm_stats_swapins,
b9bc0eef 7074 (unsigned long long) server.vm_stats_swapouts,
7075 (unsigned long) listLength(server.io_newjobs),
7076 (unsigned long) listLength(server.io_processing),
7077 (unsigned long) listLength(server.io_processed),
d5d55fc3 7078 (unsigned long) server.io_active_threads,
7079 (unsigned long) server.vm_blocked_clients
7d98e08c 7080 );
1064ef87 7081 unlockThreadedIO();
7d98e08c 7082 }
c3cb078d 7083 for (j = 0; j < server.dbnum; j++) {
7084 long long keys, vkeys;
7085
7086 keys = dictSize(server.db[j].dict);
7087 vkeys = dictSize(server.db[j].expires);
7088 if (keys || vkeys) {
9d65a1bb 7089 info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
c3cb078d 7090 j, keys, vkeys);
7091 }
7092 }
1c85b79f 7093 return info;
7094}
7095
7096static void infoCommand(redisClient *c) {
7097 sds info = genRedisInfoString();
83c6a618 7098 addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
7099 (unsigned long)sdslen(info)));
ed9b544e 7100 addReplySds(c,info);
70003d28 7101 addReply(c,shared.crlf);
ed9b544e 7102}
7103
3305306f 7104static void monitorCommand(redisClient *c) {
7105 /* ignore MONITOR if aleady slave or in monitor mode */
7106 if (c->flags & REDIS_SLAVE) return;
7107
7108 c->flags |= (REDIS_SLAVE|REDIS_MONITOR);
7109 c->slaveseldb = 0;
6b47e12e 7110 listAddNodeTail(server.monitors,c);
3305306f 7111 addReply(c,shared.ok);
7112}
7113
7114/* ================================= Expire ================================= */
7115static int removeExpire(redisDb *db, robj *key) {
7116 if (dictDelete(db->expires,key) == DICT_OK) {
7117 return 1;
7118 } else {
7119 return 0;
7120 }
7121}
7122
7123static int setExpire(redisDb *db, robj *key, time_t when) {
7124 if (dictAdd(db->expires,key,(void*)when) == DICT_ERR) {
7125 return 0;
7126 } else {
7127 incrRefCount(key);
7128 return 1;
7129 }
7130}
7131
bb32ede5 7132/* Return the expire time of the specified key, or -1 if no expire
7133 * is associated with this key (i.e. the key is non volatile) */
7134static time_t getExpire(redisDb *db, robj *key) {
7135 dictEntry *de;
7136
7137 /* No expire? return ASAP */
7138 if (dictSize(db->expires) == 0 ||
7139 (de = dictFind(db->expires,key)) == NULL) return -1;
7140
7141 return (time_t) dictGetEntryVal(de);
7142}
7143
3305306f 7144static int expireIfNeeded(redisDb *db, robj *key) {
7145 time_t when;
7146 dictEntry *de;
7147
7148 /* No expire? return ASAP */
7149 if (dictSize(db->expires) == 0 ||
7150 (de = dictFind(db->expires,key)) == NULL) return 0;
7151
7152 /* Lookup the expire */
7153 when = (time_t) dictGetEntryVal(de);
7154 if (time(NULL) <= when) return 0;
7155
7156 /* Delete the key */
7157 dictDelete(db->expires,key);
2a6a2ed1 7158 server.stat_expiredkeys++;
3305306f 7159 return dictDelete(db->dict,key) == DICT_OK;
7160}
7161
7162static int deleteIfVolatile(redisDb *db, robj *key) {
7163 dictEntry *de;
7164
7165 /* No expire? return ASAP */
7166 if (dictSize(db->expires) == 0 ||
7167 (de = dictFind(db->expires,key)) == NULL) return 0;
7168
7169 /* Delete the key */
0c66a471 7170 server.dirty++;
2a6a2ed1 7171 server.stat_expiredkeys++;
3305306f 7172 dictDelete(db->expires,key);
7173 return dictDelete(db->dict,key) == DICT_OK;
7174}
7175
bbe025e0 7176static void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
3305306f 7177 dictEntry *de;
bbe025e0
AM
7178 time_t seconds;
7179
bd79a6bd 7180 if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
bbe025e0
AM
7181
7182 seconds -= offset;
3305306f 7183
802e8373 7184 de = dictFind(c->db->dict,key);
3305306f 7185 if (de == NULL) {
7186 addReply(c,shared.czero);
7187 return;
7188 }
d4dd6556 7189 if (seconds <= 0) {
43e5ccdf 7190 if (deleteKey(c->db,key)) server.dirty++;
7191 addReply(c, shared.cone);
3305306f 7192 return;
7193 } else {
7194 time_t when = time(NULL)+seconds;
802e8373 7195 if (setExpire(c->db,key,when)) {
3305306f 7196 addReply(c,shared.cone);
77423026 7197 server.dirty++;
7198 } else {
3305306f 7199 addReply(c,shared.czero);
77423026 7200 }
3305306f 7201 return;
7202 }
7203}
7204
802e8373 7205static void expireCommand(redisClient *c) {
bbe025e0 7206 expireGenericCommand(c,c->argv[1],c->argv[2],0);
802e8373 7207}
7208
7209static void expireatCommand(redisClient *c) {
bbe025e0 7210 expireGenericCommand(c,c->argv[1],c->argv[2],time(NULL));
802e8373 7211}
7212
fd88489a 7213static void ttlCommand(redisClient *c) {
7214 time_t expire;
7215 int ttl = -1;
7216
7217 expire = getExpire(c->db,c->argv[1]);
7218 if (expire != -1) {
7219 ttl = (int) (expire-time(NULL));
7220 if (ttl < 0) ttl = -1;
7221 }
7222 addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
7223}
7224
6e469882 7225/* ================================ MULTI/EXEC ============================== */
7226
7227/* Client state initialization for MULTI/EXEC */
7228static void initClientMultiState(redisClient *c) {
7229 c->mstate.commands = NULL;
7230 c->mstate.count = 0;
7231}
7232
7233/* Release all the resources associated with MULTI/EXEC state */
7234static void freeClientMultiState(redisClient *c) {
7235 int j;
7236
7237 for (j = 0; j < c->mstate.count; j++) {
7238 int i;
7239 multiCmd *mc = c->mstate.commands+j;
7240
7241 for (i = 0; i < mc->argc; i++)
7242 decrRefCount(mc->argv[i]);
7243 zfree(mc->argv);
7244 }
7245 zfree(c->mstate.commands);
7246}
7247
7248/* Add a new command into the MULTI commands queue */
7249static void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
7250 multiCmd *mc;
7251 int j;
7252
7253 c->mstate.commands = zrealloc(c->mstate.commands,
7254 sizeof(multiCmd)*(c->mstate.count+1));
7255 mc = c->mstate.commands+c->mstate.count;
7256 mc->cmd = cmd;
7257 mc->argc = c->argc;
7258 mc->argv = zmalloc(sizeof(robj*)*c->argc);
7259 memcpy(mc->argv,c->argv,sizeof(robj*)*c->argc);
7260 for (j = 0; j < c->argc; j++)
7261 incrRefCount(mc->argv[j]);
7262 c->mstate.count++;
7263}
7264
7265static void multiCommand(redisClient *c) {
7266 c->flags |= REDIS_MULTI;
36c548f0 7267 addReply(c,shared.ok);
6e469882 7268}
7269
18b6cb76
DJ
7270static void discardCommand(redisClient *c) {
7271 if (!(c->flags & REDIS_MULTI)) {
7272 addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
7273 return;
7274 }
7275
7276 freeClientMultiState(c);
7277 initClientMultiState(c);
7278 c->flags &= (~REDIS_MULTI);
7279 addReply(c,shared.ok);
7280}
7281
66c8853f 7282/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
7283 * implememntation for more information. */
7284static void execCommandReplicateMulti(redisClient *c) {
7285 struct redisCommand *cmd;
7286 robj *multistring = createStringObject("MULTI",5);
7287
7288 cmd = lookupCommand("multi");
7289 if (server.appendonly)
7290 feedAppendOnlyFile(cmd,c->db->id,&multistring,1);
7291 if (listLength(server.slaves))
7292 replicationFeedSlaves(server.slaves,c->db->id,&multistring,1);
7293 decrRefCount(multistring);
7294}
7295
6e469882 7296static void execCommand(redisClient *c) {
7297 int j;
7298 robj **orig_argv;
7299 int orig_argc;
7300
7301 if (!(c->flags & REDIS_MULTI)) {
7302 addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
7303 return;
7304 }
7305
66c8853f 7306 /* Replicate a MULTI request now that we are sure the block is executed.
7307 * This way we'll deliver the MULTI/..../EXEC block as a whole and
7308 * both the AOF and the replication link will have the same consistency
7309 * and atomicity guarantees. */
7310 execCommandReplicateMulti(c);
7311
7312 /* Exec all the queued commands */
6e469882 7313 orig_argv = c->argv;
7314 orig_argc = c->argc;
7315 addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
7316 for (j = 0; j < c->mstate.count; j++) {
7317 c->argc = c->mstate.commands[j].argc;
7318 c->argv = c->mstate.commands[j].argv;
7319 call(c,c->mstate.commands[j].cmd);
7320 }
7321 c->argv = orig_argv;
7322 c->argc = orig_argc;
7323 freeClientMultiState(c);
7324 initClientMultiState(c);
7325 c->flags &= (~REDIS_MULTI);
66c8853f 7326 /* Make sure the EXEC command is always replicated / AOF, since we
7327 * always send the MULTI command (we can't know beforehand if the
7328 * next operations will contain at least a modification to the DB). */
7329 server.dirty++;
6e469882 7330}
7331
4409877e 7332/* =========================== Blocking Operations ========================= */
7333
/* Currently Redis blocking operations support is limited to list POP ops,
 * so the current implementation is not fully generic, but it is also not
 * completely specific, so it will not require a rewrite to support new
 * kinds of blocking operations in the future.
 *
 * Still it's important to note that list blocking operations can already be
 * used as a notification mechanism in order to implement other blocking
 * operations at application level, so there must be very strong evidence
 * of usefulness and generality before new blocking operations are implemented.
 *
 * This is how the current blocking POP works, we use BLPOP as example:
 * - If the user calls BLPOP and the key exists and contains a non empty list
 *   then LPOP is called instead. So BLPOP is semantically the same as LPOP
 *   if there is no need to block.
 * - If instead BLPOP is called and the key does not exist or the list is
 *   empty we need to block. In order to do so we remove the notification for
 *   new data to read in the client socket (so that we'll not serve new
 *   requests if the blocking request is not served). Also we put the client
 *   in a dictionary (db->blockingkeys) mapping keys to a list of clients
 *   blocking for these keys.
 * - If a PUSH operation against a key with blocked clients waiting is
 *   performed, we serve the first in the list: basically instead of pushing
 *   the new element inside the list we return it to the (first / oldest)
 *   blocking client, unblock the client, and remove it from the list.
 *
 * The above comment and the source code should be enough in order to understand
 * the implementation and modify / fix it later.
 */
7362
7363/* Set a client in blocking mode for the specified key, with the specified
7364 * timeout */
b177fd30 7365static void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
4409877e 7366 dictEntry *de;
7367 list *l;
b177fd30 7368 int j;
4409877e 7369
b177fd30 7370 c->blockingkeys = zmalloc(sizeof(robj*)*numkeys);
7371 c->blockingkeysnum = numkeys;
4409877e 7372 c->blockingto = timeout;
b177fd30 7373 for (j = 0; j < numkeys; j++) {
7374 /* Add the key in the client structure, to map clients -> keys */
7375 c->blockingkeys[j] = keys[j];
7376 incrRefCount(keys[j]);
4409877e 7377
b177fd30 7378 /* And in the other "side", to map keys -> clients */
7379 de = dictFind(c->db->blockingkeys,keys[j]);
7380 if (de == NULL) {
7381 int retval;
7382
7383 /* For every key we take a list of clients blocked for it */
7384 l = listCreate();
7385 retval = dictAdd(c->db->blockingkeys,keys[j],l);
7386 incrRefCount(keys[j]);
7387 assert(retval == DICT_OK);
7388 } else {
7389 l = dictGetEntryVal(de);
7390 }
7391 listAddNodeTail(l,c);
4409877e 7392 }
b177fd30 7393 /* Mark the client as a blocked client */
4409877e 7394 c->flags |= REDIS_BLOCKED;
d5d55fc3 7395 server.blpop_blocked_clients++;
4409877e 7396}
7397
7398/* Unblock a client that's waiting in a blocking operation such as BLPOP */
b0d8747d 7399static void unblockClientWaitingData(redisClient *c) {
4409877e 7400 dictEntry *de;
7401 list *l;
b177fd30 7402 int j;
4409877e 7403
b177fd30 7404 assert(c->blockingkeys != NULL);
7405 /* The client may wait for multiple keys, so unblock it for every key. */
7406 for (j = 0; j < c->blockingkeysnum; j++) {
7407 /* Remove this client from the list of clients waiting for this key. */
7408 de = dictFind(c->db->blockingkeys,c->blockingkeys[j]);
7409 assert(de != NULL);
7410 l = dictGetEntryVal(de);
7411 listDelNode(l,listSearchKey(l,c));
7412 /* If the list is empty we need to remove it to avoid wasting memory */
7413 if (listLength(l) == 0)
7414 dictDelete(c->db->blockingkeys,c->blockingkeys[j]);
7415 decrRefCount(c->blockingkeys[j]);
7416 }
7417 /* Cleanup the client structure */
7418 zfree(c->blockingkeys);
7419 c->blockingkeys = NULL;
4409877e 7420 c->flags &= (~REDIS_BLOCKED);
d5d55fc3 7421 server.blpop_blocked_clients--;
5921aa36 7422 /* We want to process data if there is some command waiting
b0d8747d 7423 * in the input buffer. Note that this is safe even if
7424 * unblockClientWaitingData() gets called from freeClient() because
7425 * freeClient() will be smart enough to call this function
7426 * *after* c->querybuf was set to NULL. */
4409877e 7427 if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
7428}
7429
7430/* This should be called from any function PUSHing into lists.
7431 * 'c' is the "pushing client", 'key' is the key it is pushing data against,
7432 * 'ele' is the element pushed.
7433 *
7434 * If the function returns 0 there was no client waiting for a list push
7435 * against this key.
7436 *
7437 * If the function returns 1 there was a client waiting for a list push
7438 * against this key, the element was passed to this client thus it's not
7439 * needed to actually add it to the list and the caller should return asap. */
7440static int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
7441 struct dictEntry *de;
7442 redisClient *receiver;
7443 list *l;
7444 listNode *ln;
7445
7446 de = dictFind(c->db->blockingkeys,key);
7447 if (de == NULL) return 0;
7448 l = dictGetEntryVal(de);
7449 ln = listFirst(l);
7450 assert(ln != NULL);
7451 receiver = ln->value;
4409877e 7452
b177fd30 7453 addReplySds(receiver,sdsnew("*2\r\n"));
dd88747b 7454 addReplyBulk(receiver,key);
7455 addReplyBulk(receiver,ele);
b0d8747d 7456 unblockClientWaitingData(receiver);
4409877e 7457 return 1;
7458}
7459
7460/* Blocking RPOP/LPOP */
7461static void blockingPopGenericCommand(redisClient *c, int where) {
7462 robj *o;
7463 time_t timeout;
b177fd30 7464 int j;
4409877e 7465
b177fd30 7466 for (j = 1; j < c->argc-1; j++) {
7467 o = lookupKeyWrite(c->db,c->argv[j]);
7468 if (o != NULL) {
7469 if (o->type != REDIS_LIST) {
7470 addReply(c,shared.wrongtypeerr);
4409877e 7471 return;
b177fd30 7472 } else {
7473 list *list = o->ptr;
7474 if (listLength(list) != 0) {
7475 /* If the list contains elements fall back to the usual
7476 * non-blocking POP operation */
7477 robj *argv[2], **orig_argv;
7478 int orig_argc;
e0a62c7f 7479
b177fd30 7480 /* We need to alter the command arguments before to call
7481 * popGenericCommand() as the command takes a single key. */
7482 orig_argv = c->argv;
7483 orig_argc = c->argc;
7484 argv[1] = c->argv[j];
7485 c->argv = argv;
7486 c->argc = 2;
7487
7488 /* Also the return value is different, we need to output
7489 * the multi bulk reply header and the key name. The
7490 * "real" command will add the last element (the value)
7491 * for us. If this souds like an hack to you it's just
7492 * because it is... */
7493 addReplySds(c,sdsnew("*2\r\n"));
dd88747b 7494 addReplyBulk(c,argv[1]);
b177fd30 7495 popGenericCommand(c,where);
7496
7497 /* Fix the client structure with the original stuff */
7498 c->argv = orig_argv;
7499 c->argc = orig_argc;
7500 return;
7501 }
4409877e 7502 }
7503 }
7504 }
7505 /* If the list is empty or the key does not exists we must block */
b177fd30 7506 timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
4409877e 7507 if (timeout > 0) timeout += time(NULL);
b177fd30 7508 blockForKeys(c,c->argv+1,c->argc-2,timeout);
4409877e 7509}
7510
7511static void blpopCommand(redisClient *c) {
7512 blockingPopGenericCommand(c,REDIS_HEAD);
7513}
7514
7515static void brpopCommand(redisClient *c) {
7516 blockingPopGenericCommand(c,REDIS_TAIL);
7517}
7518
ed9b544e 7519/* =============================== Replication ============================= */
7520
a4d1ba9a 7521static int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7522 ssize_t nwritten, ret = size;
7523 time_t start = time(NULL);
7524
7525 timeout++;
7526 while(size) {
7527 if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
7528 nwritten = write(fd,ptr,size);
7529 if (nwritten == -1) return -1;
7530 ptr += nwritten;
7531 size -= nwritten;
7532 }
7533 if ((time(NULL)-start) > timeout) {
7534 errno = ETIMEDOUT;
7535 return -1;
7536 }
7537 }
7538 return ret;
7539}
7540
a4d1ba9a 7541static int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
ed9b544e 7542 ssize_t nread, totread = 0;
7543 time_t start = time(NULL);
7544
7545 timeout++;
7546 while(size) {
7547 if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
7548 nread = read(fd,ptr,size);
7549 if (nread == -1) return -1;
7550 ptr += nread;
7551 size -= nread;
7552 totread += nread;
7553 }
7554 if ((time(NULL)-start) > timeout) {
7555 errno = ETIMEDOUT;
7556 return -1;
7557 }
7558 }
7559 return totread;
7560}
7561
/* Read a line from 'fd', one byte at a time using syncRead() with the
 * given per-byte 'timeout', storing it nul terminated in the buffer 'ptr'
 * of 'size' bytes. The trailing "\n" is not stored, and a "\r" right
 * before it is stripped as well.
 *
 * Returns the number of bytes stored before the newline was found
 * (counting the "\r" if there was one), or -1 if syncRead() failed.
 * If the buffer fills up before a newline is seen, reading stops and the
 * truncated, nul terminated content is returned: at most 'size'-1 bytes
 * plus the terminator are ever written, so the buffer cannot overflow. */
static int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
    ssize_t nread = 0;

    /* Reserve one byte for the nul terminator. */
    size--;
    while(size > 0) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            *ptr = '\0';
            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
            return nread;
        } else {
            *ptr++ = c;
            *ptr = '\0';
            nread++;
            /* Account for the byte just stored. */
            size--;
        }
    }
    return nread;
}
7582
7583static void syncCommand(redisClient *c) {
40d224a9 7584 /* ignore SYNC if aleady slave or in monitor mode */
7585 if (c->flags & REDIS_SLAVE) return;
7586
7587 /* SYNC can't be issued when the server has pending data to send to
7588 * the client about already issued commands. We need a fresh reply
7589 * buffer registering the differences between the BGSAVE and the current
7590 * dataset, so that we can copy to other slaves if needed. */
7591 if (listLength(c->reply) != 0) {
7592 addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
7593 return;
7594 }
7595
7596 redisLog(REDIS_NOTICE,"Slave ask for synchronization");
7597 /* Here we need to check if there is a background saving operation
7598 * in progress, or if it is required to start one */
9d65a1bb 7599 if (server.bgsavechildpid != -1) {
40d224a9 7600 /* Ok a background save is in progress. Let's check if it is a good
7601 * one for replication, i.e. if there is another slave that is
7602 * registering differences since the server forked to save */
7603 redisClient *slave;
7604 listNode *ln;
c7df85a4 7605 listIter li;
40d224a9 7606
c7df85a4 7607 listRewind(server.slaves,&li);
7608 while((ln = listNext(&li))) {
40d224a9 7609 slave = ln->value;
7610 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
40d224a9 7611 }
7612 if (ln) {
7613 /* Perfect, the server is already registering differences for
7614 * another slave. Set the right state, and copy the buffer. */
7615 listRelease(c->reply);
7616 c->reply = listDup(slave->reply);
40d224a9 7617 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7618 redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
7619 } else {
7620 /* No way, we need to wait for the next BGSAVE in order to
7621 * register differences */
7622 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
7623 redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
7624 }
7625 } else {
7626 /* Ok we don't have a BGSAVE in progress, let's start one */
7627 redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
7628 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
7629 redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
7630 addReplySds(c,sdsnew("-ERR Unalbe to perform background save\r\n"));
7631 return;
7632 }
7633 c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7634 }
6208b3a7 7635 c->repldbfd = -1;
40d224a9 7636 c->flags |= REDIS_SLAVE;
7637 c->slaveseldb = 0;
6b47e12e 7638 listAddNodeTail(server.slaves,c);
40d224a9 7639 return;
7640}
7641
6208b3a7 7642static void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
7643 redisClient *slave = privdata;
7644 REDIS_NOTUSED(el);
7645 REDIS_NOTUSED(mask);
7646 char buf[REDIS_IOBUF_LEN];
7647 ssize_t nwritten, buflen;
7648
7649 if (slave->repldboff == 0) {
7650 /* Write the bulk write count before to transfer the DB. In theory here
7651 * we don't know how much room there is in the output buffer of the
7652 * socket, but in pratice SO_SNDLOWAT (the minimum count for output
7653 * operations) will never be smaller than the few bytes we need. */
7654 sds bulkcount;
7655
7656 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
7657 slave->repldbsize);
7658 if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
7659 {
7660 sdsfree(bulkcount);
7661 freeClient(slave);
7662 return;
7663 }
7664 sdsfree(bulkcount);
7665 }
7666 lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
7667 buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
7668 if (buflen <= 0) {
7669 redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
7670 (buflen == 0) ? "premature EOF" : strerror(errno));
7671 freeClient(slave);
7672 return;
7673 }
7674 if ((nwritten = write(fd,buf,buflen)) == -1) {
f870935d 7675 redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
6208b3a7 7676 strerror(errno));
7677 freeClient(slave);
7678 return;
7679 }
7680 slave->repldboff += nwritten;
7681 if (slave->repldboff == slave->repldbsize) {
7682 close(slave->repldbfd);
7683 slave->repldbfd = -1;
7684 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
7685 slave->replstate = REDIS_REPL_ONLINE;
7686 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
266373b2 7687 sendReplyToClient, slave) == AE_ERR) {
6208b3a7 7688 freeClient(slave);
7689 return;
7690 }
7691 addReplySds(slave,sdsempty());
7692 redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
7693 }
7694}
ed9b544e 7695
a3b21203 7696/* This function is called at the end of every backgrond saving.
7697 * The argument bgsaveerr is REDIS_OK if the background saving succeeded
7698 * otherwise REDIS_ERR is passed to the function.
7699 *
7700 * The goal of this function is to handle slaves waiting for a successful
7701 * background saving in order to perform non-blocking synchronization. */
7702static void updateSlavesWaitingBgsave(int bgsaveerr) {
6208b3a7 7703 listNode *ln;
7704 int startbgsave = 0;
c7df85a4 7705 listIter li;
ed9b544e 7706
c7df85a4 7707 listRewind(server.slaves,&li);
7708 while((ln = listNext(&li))) {
6208b3a7 7709 redisClient *slave = ln->value;
ed9b544e 7710
6208b3a7 7711 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
7712 startbgsave = 1;
7713 slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
7714 } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
dde65f3f 7715 struct redis_stat buf;
e0a62c7f 7716
6208b3a7 7717 if (bgsaveerr != REDIS_OK) {
7718 freeClient(slave);
7719 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
7720 continue;
7721 }
7722 if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
dde65f3f 7723 redis_fstat(slave->repldbfd,&buf) == -1) {
6208b3a7 7724 freeClient(slave);
7725 redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
7726 continue;
7727 }
7728 slave->repldboff = 0;
7729 slave->repldbsize = buf.st_size;
7730 slave->replstate = REDIS_REPL_SEND_BULK;
7731 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
266373b2 7732 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
6208b3a7 7733 freeClient(slave);
7734 continue;
7735 }
7736 }
ed9b544e 7737 }
6208b3a7 7738 if (startbgsave) {
7739 if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
c7df85a4 7740 listIter li;
7741
7742 listRewind(server.slaves,&li);
6208b3a7 7743 redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
c7df85a4 7744 while((ln = listNext(&li))) {
6208b3a7 7745 redisClient *slave = ln->value;
ed9b544e 7746
6208b3a7 7747 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START)
7748 freeClient(slave);
7749 }
7750 }
7751 }
ed9b544e 7752}
7753
7754static int syncWithMaster(void) {
d0ccebcf 7755 char buf[1024], tmpfile[256], authcmd[1024];
18e61fa2 7756 long dumpsize;
ed9b544e 7757 int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
8c5abee8 7758 int dfd, maxtries = 5;
ed9b544e 7759
7760 if (fd == -1) {
7761 redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
7762 strerror(errno));
7763 return REDIS_ERR;
7764 }
d0ccebcf 7765
7766 /* AUTH with the master if required. */
7767 if(server.masterauth) {
7768 snprintf(authcmd, 1024, "AUTH %s\r\n", server.masterauth);
7769 if (syncWrite(fd, authcmd, strlen(server.masterauth)+7, 5) == -1) {
7770 close(fd);
7771 redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
7772 strerror(errno));
7773 return REDIS_ERR;
7774 }
7775 /* Read the AUTH result. */
7776 if (syncReadLine(fd,buf,1024,3600) == -1) {
7777 close(fd);
7778 redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
7779 strerror(errno));
7780 return REDIS_ERR;
7781 }
7782 if (buf[0] != '+') {
7783 close(fd);
7784 redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
7785 return REDIS_ERR;
7786 }
7787 }
7788
ed9b544e 7789 /* Issue the SYNC command */
7790 if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
7791 close(fd);
7792 redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
7793 strerror(errno));
7794 return REDIS_ERR;
7795 }
7796 /* Read the bulk write count */
8c4d91fc 7797 if (syncReadLine(fd,buf,1024,3600) == -1) {
ed9b544e 7798 close(fd);
7799 redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
7800 strerror(errno));
7801 return REDIS_ERR;
7802 }
4aa701c1 7803 if (buf[0] != '$') {
7804 close(fd);
7805 redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
7806 return REDIS_ERR;
7807 }
18e61fa2 7808 dumpsize = strtol(buf+1,NULL,10);
7809 redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
ed9b544e 7810 /* Read the bulk write data on a temp file */
8c5abee8 7811 while(maxtries--) {
7812 snprintf(tmpfile,256,
7813 "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
7814 dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
7815 if (dfd != -1) break;
5de9ad7c 7816 sleep(1);
8c5abee8 7817 }
ed9b544e 7818 if (dfd == -1) {
7819 close(fd);
7820 redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
7821 return REDIS_ERR;
7822 }
7823 while(dumpsize) {
7824 int nread, nwritten;
7825
7826 nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
7827 if (nread == -1) {
7828 redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
7829 strerror(errno));
7830 close(fd);
7831 close(dfd);
7832 return REDIS_ERR;
7833 }
7834 nwritten = write(dfd,buf,nread);
7835 if (nwritten == -1) {
7836 redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
7837 close(fd);
7838 close(dfd);
7839 return REDIS_ERR;
7840 }
7841 dumpsize -= nread;
7842 }
7843 close(dfd);
7844 if (rename(tmpfile,server.dbfilename) == -1) {
7845 redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
7846 unlink(tmpfile);
7847 close(fd);
7848 return REDIS_ERR;
7849 }
7850 emptyDb();
f78fd11b 7851 if (rdbLoad(server.dbfilename) != REDIS_OK) {
ed9b544e 7852 redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
7853 close(fd);
7854 return REDIS_ERR;
7855 }
7856 server.master = createClient(fd);
7857 server.master->flags |= REDIS_MASTER;
179b3952 7858 server.master->authenticated = 1;
ed9b544e 7859 server.replstate = REDIS_REPL_CONNECTED;
7860 return REDIS_OK;
7861}
7862
321b0e13 7863static void slaveofCommand(redisClient *c) {
7864 if (!strcasecmp(c->argv[1]->ptr,"no") &&
7865 !strcasecmp(c->argv[2]->ptr,"one")) {
7866 if (server.masterhost) {
7867 sdsfree(server.masterhost);
7868 server.masterhost = NULL;
7869 if (server.master) freeClient(server.master);
7870 server.replstate = REDIS_REPL_NONE;
7871 redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
7872 }
7873 } else {
7874 sdsfree(server.masterhost);
7875 server.masterhost = sdsdup(c->argv[1]->ptr);
7876 server.masterport = atoi(c->argv[2]->ptr);
7877 if (server.master) freeClient(server.master);
7878 server.replstate = REDIS_REPL_CONNECT;
7879 redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
7880 server.masterhost, server.masterport);
7881 }
7882 addReply(c,shared.ok);
7883}
7884
3fd78bcd 7885/* ============================ Maxmemory directive ======================== */
7886
a5819310 7887/* Try to free one object form the pre-allocated objects free list.
7888 * This is useful under low mem conditions as by default we take 1 million
7889 * free objects allocated. On success REDIS_OK is returned, otherwise
7890 * REDIS_ERR. */
7891static int tryFreeOneObjectFromFreelist(void) {
f870935d 7892 robj *o;
7893
a5819310 7894 if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
7895 if (listLength(server.objfreelist)) {
7896 listNode *head = listFirst(server.objfreelist);
7897 o = listNodeValue(head);
7898 listDelNode(server.objfreelist,head);
7899 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7900 zfree(o);
7901 return REDIS_OK;
7902 } else {
7903 if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
7904 return REDIS_ERR;
7905 }
f870935d 7906}
7907
3fd78bcd 7908/* This function gets called when 'maxmemory' is set on the config file to limit
7909 * the max memory used by the server, and we are out of memory.
7910 * This function will try to, in order:
7911 *
7912 * - Free objects from the free list
7913 * - Try to remove keys with an EXPIRE set
7914 *
7915 * It is not possible to free enough memory to reach used-memory < maxmemory
7916 * the server will start refusing commands that will enlarge even more the
7917 * memory usage.
7918 */
7919static void freeMemoryIfNeeded(void) {
7920 while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
a5819310 7921 int j, k, freed = 0;
7922
7923 if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
7924 for (j = 0; j < server.dbnum; j++) {
7925 int minttl = -1;
7926 robj *minkey = NULL;
7927 struct dictEntry *de;
7928
7929 if (dictSize(server.db[j].expires)) {
7930 freed = 1;
7931 /* From a sample of three keys drop the one nearest to
7932 * the natural expire */
7933 for (k = 0; k < 3; k++) {
7934 time_t t;
7935
7936 de = dictGetRandomKey(server.db[j].expires);
7937 t = (time_t) dictGetEntryVal(de);
7938 if (minttl == -1 || t < minttl) {
7939 minkey = dictGetEntryKey(de);
7940 minttl = t;
3fd78bcd 7941 }
3fd78bcd 7942 }
a5819310 7943 deleteKey(server.db+j,minkey);
3fd78bcd 7944 }
3fd78bcd 7945 }
a5819310 7946 if (!freed) return; /* nothing to free... */
3fd78bcd 7947 }
7948}
7949
f80dff62 7950/* ============================== Append Only file ========================== */
7951
7952static void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
7953 sds buf = sdsempty();
7954 int j;
7955 ssize_t nwritten;
7956 time_t now;
7957 robj *tmpargv[3];
7958
7959 /* The DB this command was targetting is not the same as the last command
7960 * we appendend. To issue a SELECT command is needed. */
7961 if (dictid != server.appendseldb) {
7962 char seldb[64];
7963
7964 snprintf(seldb,sizeof(seldb),"%d",dictid);
682ac724 7965 buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
83c6a618 7966 (unsigned long)strlen(seldb),seldb);
f80dff62 7967 server.appendseldb = dictid;
7968 }
7969
7970 /* "Fix" the argv vector if the command is EXPIRE. We want to translate
7971 * EXPIREs into EXPIREATs calls */
7972 if (cmd->proc == expireCommand) {
7973 long when;
7974
7975 tmpargv[0] = createStringObject("EXPIREAT",8);
7976 tmpargv[1] = argv[1];
7977 incrRefCount(argv[1]);
7978 when = time(NULL)+strtol(argv[2]->ptr,NULL,10);
7979 tmpargv[2] = createObject(REDIS_STRING,
7980 sdscatprintf(sdsempty(),"%ld",when));
7981 argv = tmpargv;
7982 }
7983
7984 /* Append the actual command */
7985 buf = sdscatprintf(buf,"*%d\r\n",argc);
7986 for (j = 0; j < argc; j++) {
7987 robj *o = argv[j];
7988
9d65a1bb 7989 o = getDecodedObject(o);
83c6a618 7990 buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(o->ptr));
f80dff62 7991 buf = sdscatlen(buf,o->ptr,sdslen(o->ptr));
7992 buf = sdscatlen(buf,"\r\n",2);
9d65a1bb 7993 decrRefCount(o);
f80dff62 7994 }
7995
7996 /* Free the objects from the modified argv for EXPIREAT */
7997 if (cmd->proc == expireCommand) {
7998 for (j = 0; j < 3; j++)
7999 decrRefCount(argv[j]);
8000 }
8001
8002 /* We want to perform a single write. This should be guaranteed atomic
8003 * at least if the filesystem we are writing is a real physical one.
8004 * While this will save us against the server being killed I don't think
8005 * there is much to do about the whole server stopping for power problems
8006 * or alike */
8007 nwritten = write(server.appendfd,buf,sdslen(buf));
8008 if (nwritten != (signed)sdslen(buf)) {
8009 /* Ooops, we are in troubles. The best thing to do for now is
8010 * to simply exit instead to give the illusion that everything is
8011 * working as expected. */
8012 if (nwritten == -1) {
8013 redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
8014 } else {
8015 redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
8016 }
8017 exit(1);
8018 }
85a83172 8019 /* If a background append only file rewriting is in progress we want to
8020 * accumulate the differences between the child DB and the current one
8021 * in a buffer, so that when the child process will do its work we
8022 * can append the differences to the new append only file. */
8023 if (server.bgrewritechildpid != -1)
8024 server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
8025
8026 sdsfree(buf);
f80dff62 8027 now = time(NULL);
8028 if (server.appendfsync == APPENDFSYNC_ALWAYS ||
8029 (server.appendfsync == APPENDFSYNC_EVERYSEC &&
8030 now-server.lastfsync > 1))
8031 {
8032 fsync(server.appendfd); /* Let's try to get this data on the disk */
8033 server.lastfsync = now;
8034 }
8035}
8036
8037/* In Redis commands are always executed in the context of a client, so in
8038 * order to load the append only file we need to create a fake client. */
8039static struct redisClient *createFakeClient(void) {
8040 struct redisClient *c = zmalloc(sizeof(*c));
8041
8042 selectDb(c,0);
8043 c->fd = -1;
8044 c->querybuf = sdsempty();
8045 c->argc = 0;
8046 c->argv = NULL;
8047 c->flags = 0;
9387d17d 8048 /* We set the fake client as a slave waiting for the synchronization
8049 * so that Redis will not try to send replies to this client. */
8050 c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
f80dff62 8051 c->reply = listCreate();
8052 listSetFreeMethod(c->reply,decrRefCount);
8053 listSetDupMethod(c->reply,dupClientReplyValue);
8054 return c;
8055}
8056
8057static void freeFakeClient(struct redisClient *c) {
8058 sdsfree(c->querybuf);
8059 listRelease(c->reply);
8060 zfree(c);
8061}
8062
8063/* Replay the append log file. On error REDIS_OK is returned. On non fatal
8064 * error (the append only file is zero-length) REDIS_ERR is returned. On
8065 * fatal error an error message is logged and the program exists. */
8066int loadAppendOnlyFile(char *filename) {
8067 struct redisClient *fakeClient;
8068 FILE *fp = fopen(filename,"r");
8069 struct redis_stat sb;
b492cf00 8070 unsigned long long loadedkeys = 0;
f80dff62 8071
8072 if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0)
8073 return REDIS_ERR;
8074
8075 if (fp == NULL) {
8076 redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
8077 exit(1);
8078 }
8079
8080 fakeClient = createFakeClient();
8081 while(1) {
8082 int argc, j;
8083 unsigned long len;
8084 robj **argv;
8085 char buf[128];
8086 sds argsds;
8087 struct redisCommand *cmd;
8088
8089 if (fgets(buf,sizeof(buf),fp) == NULL) {
8090 if (feof(fp))
8091 break;
8092 else
8093 goto readerr;
8094 }
8095 if (buf[0] != '*') goto fmterr;
8096 argc = atoi(buf+1);
8097 argv = zmalloc(sizeof(robj*)*argc);
8098 for (j = 0; j < argc; j++) {
8099 if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
8100 if (buf[0] != '$') goto fmterr;
8101 len = strtol(buf+1,NULL,10);
8102 argsds = sdsnewlen(NULL,len);
0f151ef1 8103 if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
f80dff62 8104 argv[j] = createObject(REDIS_STRING,argsds);
8105 if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
8106 }
8107
8108 /* Command lookup */
8109 cmd = lookupCommand(argv[0]->ptr);
8110 if (!cmd) {
8111 redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
8112 exit(1);
8113 }
bdcb92f2 8114 /* Try object encoding */
f80dff62 8115 if (cmd->flags & REDIS_CMD_BULK)
05df7621 8116 argv[argc-1] = tryObjectEncoding(argv[argc-1]);
f80dff62 8117 /* Run the command in the context of a fake client */
8118 fakeClient->argc = argc;
8119 fakeClient->argv = argv;
8120 cmd->proc(fakeClient);
8121 /* Discard the reply objects list from the fake client */
8122 while(listLength(fakeClient->reply))
8123 listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
8124 /* Clean up, ready for the next command */
8125 for (j = 0; j < argc; j++) decrRefCount(argv[j]);
8126 zfree(argv);
b492cf00 8127 /* Handle swapping while loading big datasets when VM is on */
8128 loadedkeys++;
8129 if (server.vm_enabled && (loadedkeys % 5000) == 0) {
8130 while (zmalloc_used_memory() > server.vm_max_memory) {
a69a0c9c 8131 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
b492cf00 8132 }
8133 }
f80dff62 8134 }
8135 fclose(fp);
8136 freeFakeClient(fakeClient);
8137 return REDIS_OK;
8138
8139readerr:
8140 if (feof(fp)) {
8141 redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
8142 } else {
8143 redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
8144 }
8145 exit(1);
8146fmterr:
8147 redisLog(REDIS_WARNING,"Bad file format reading the append only file");
8148 exit(1);
8149}
8150
9d65a1bb 8151/* Write an object into a file in the bulk format $<count>\r\n<payload>\r\n */
9c8e3cee 8152static int fwriteBulkObject(FILE *fp, robj *obj) {
9d65a1bb 8153 char buf[128];
b9bc0eef 8154 int decrrc = 0;
8155
f2d9f50f 8156 /* Avoid the incr/decr ref count business if possible to help
8157 * copy-on-write (we are often in a child process when this function
8158 * is called).
8159 * Also makes sure that key objects don't get incrRefCount-ed when VM
8160 * is enabled */
8161 if (obj->encoding != REDIS_ENCODING_RAW) {
b9bc0eef 8162 obj = getDecodedObject(obj);
8163 decrrc = 1;
8164 }
9d65a1bb 8165 snprintf(buf,sizeof(buf),"$%ld\r\n",(long)sdslen(obj->ptr));
8166 if (fwrite(buf,strlen(buf),1,fp) == 0) goto err;
e96e4fbf 8167 if (sdslen(obj->ptr) && fwrite(obj->ptr,sdslen(obj->ptr),1,fp) == 0)
8168 goto err;
9d65a1bb 8169 if (fwrite("\r\n",2,1,fp) == 0) goto err;
b9bc0eef 8170 if (decrrc) decrRefCount(obj);
9d65a1bb 8171 return 1;
8172err:
b9bc0eef 8173 if (decrrc) decrRefCount(obj);
9d65a1bb 8174 return 0;
8175}
8176
/* Write the binary-safe string 's' of 'len' bytes into 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
    char buf[128];

    /* 'len' is unsigned long, so the conversion must be %lu: %ld would be a
     * type mismatch and print very large lengths as negative numbers. */
    snprintf(buf,sizeof(buf),"$%lu\r\n",len);
    if (fwrite(buf,strlen(buf),1,fp) == 0) return 0;
    /* A zero-length fwrite() returns 0, so skip the payload when empty. */
    if (len && fwrite(s,len,1,fp) == 0) return 0;
    if (fwrite("\r\n",2,1,fp) == 0) return 0;
    return 1;
}
8188
/* Write the double 'd', formatted with "%.17g", into 'fp' in the bulk
 * format $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkDouble(FILE *fp, double d) {
    char header[128], payload[128];
    size_t payload_len;

    /* The payload buffer already carries the trailing CRLF, which is not
     * part of the advertised bulk length. */
    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
    payload_len = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)payload_len-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,payload_len,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8199
/* Write the long 'l', formatted in decimal, into 'fp' in the bulk format
 * $<count>\r\n<payload>\r\n
 *
 * Returns 1 on success, 0 on write error. */
static int fwriteBulkLong(FILE *fp, long l) {
    char header[128], payload[128];
    size_t payload_len;

    /* The payload buffer already carries the trailing CRLF, which is not
     * part of the advertised bulk length. */
    snprintf(payload,sizeof(payload),"%ld\r\n",l);
    payload_len = strlen(payload);
    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)payload_len-2);

    if (fwrite(header,strlen(header),1,fp) == 0 ||
        fwrite(payload,payload_len,1,fp) == 0)
    {
        return 0;
    }
    return 1;
}
8210
8211/* Write a sequence of commands able to fully rebuild the dataset into
8212 * "filename". Used both by REWRITEAOF and BGREWRITEAOF. */
8213static int rewriteAppendOnlyFile(char *filename) {
8214 dictIterator *di = NULL;
8215 dictEntry *de;
8216 FILE *fp;
8217 char tmpfile[256];
8218 int j;
8219 time_t now = time(NULL);
8220
8221 /* Note that we have to use a different temp name here compared to the
8222 * one used by rewriteAppendOnlyFileBackground() function. */
8223 snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
8224 fp = fopen(tmpfile,"w");
8225 if (!fp) {
8226 redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
8227 return REDIS_ERR;
8228 }
8229 for (j = 0; j < server.dbnum; j++) {
8230 char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
8231 redisDb *db = server.db+j;
8232 dict *d = db->dict;
8233 if (dictSize(d) == 0) continue;
8234 di = dictGetIterator(d);
8235 if (!di) {
8236 fclose(fp);
8237 return REDIS_ERR;
8238 }
8239
8240 /* SELECT the new DB */
8241 if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
85a83172 8242 if (fwriteBulkLong(fp,j) == 0) goto werr;
9d65a1bb 8243
8244 /* Iterate this DB writing every entry */
8245 while((de = dictNext(di)) != NULL) {
e7546c63 8246 robj *key, *o;
8247 time_t expiretime;
8248 int swapped;
8249
8250 key = dictGetEntryKey(de);
b9bc0eef 8251 /* If the value for this key is swapped, load a preview in memory.
8252 * We use a "swapped" flag to remember if we need to free the
8253 * value object instead to just increment the ref count anyway
8254 * in order to avoid copy-on-write of pages if we are forked() */
996cb5f7 8255 if (!server.vm_enabled || key->storage == REDIS_VM_MEMORY ||
8256 key->storage == REDIS_VM_SWAPPING) {
e7546c63 8257 o = dictGetEntryVal(de);
8258 swapped = 0;
8259 } else {
8260 o = vmPreviewObject(key);
e7546c63 8261 swapped = 1;
8262 }
8263 expiretime = getExpire(db,key);
9d65a1bb 8264
8265 /* Save the key and associated value */
9d65a1bb 8266 if (o->type == REDIS_STRING) {
8267 /* Emit a SET command */
8268 char cmd[]="*3\r\n$3\r\nSET\r\n";
8269 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8270 /* Key and value */
9c8e3cee 8271 if (fwriteBulkObject(fp,key) == 0) goto werr;
8272 if (fwriteBulkObject(fp,o) == 0) goto werr;
9d65a1bb 8273 } else if (o->type == REDIS_LIST) {
8274 /* Emit the RPUSHes needed to rebuild the list */
8275 list *list = o->ptr;
8276 listNode *ln;
c7df85a4 8277 listIter li;
9d65a1bb 8278
c7df85a4 8279 listRewind(list,&li);
8280 while((ln = listNext(&li))) {
9d65a1bb 8281 char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
8282 robj *eleobj = listNodeValue(ln);
8283
8284 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8285 if (fwriteBulkObject(fp,key) == 0) goto werr;
8286 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8287 }
8288 } else if (o->type == REDIS_SET) {
8289 /* Emit the SADDs needed to rebuild the set */
8290 dict *set = o->ptr;
8291 dictIterator *di = dictGetIterator(set);
8292 dictEntry *de;
8293
8294 while((de = dictNext(di)) != NULL) {
8295 char cmd[]="*3\r\n$4\r\nSADD\r\n";
8296 robj *eleobj = dictGetEntryKey(de);
8297
8298 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8299 if (fwriteBulkObject(fp,key) == 0) goto werr;
8300 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8301 }
8302 dictReleaseIterator(di);
8303 } else if (o->type == REDIS_ZSET) {
8304 /* Emit the ZADDs needed to rebuild the sorted set */
8305 zset *zs = o->ptr;
8306 dictIterator *di = dictGetIterator(zs->dict);
8307 dictEntry *de;
8308
8309 while((de = dictNext(di)) != NULL) {
8310 char cmd[]="*4\r\n$4\r\nZADD\r\n";
8311 robj *eleobj = dictGetEntryKey(de);
8312 double *score = dictGetEntryVal(de);
8313
8314 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8315 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8316 if (fwriteBulkDouble(fp,*score) == 0) goto werr;
9c8e3cee 8317 if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
9d65a1bb 8318 }
8319 dictReleaseIterator(di);
9c8e3cee 8320 } else if (o->type == REDIS_HASH) {
8321 char cmd[]="*4\r\n$4\r\nHSET\r\n";
8322
8323 /* Emit the HSETs needed to rebuild the hash */
8324 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8325 unsigned char *p = zipmapRewind(o->ptr);
8326 unsigned char *field, *val;
8327 unsigned int flen, vlen;
8328
8329 while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
8330 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8331 if (fwriteBulkObject(fp,key) == 0) goto werr;
8332 if (fwriteBulkString(fp,(char*)field,flen) == -1)
8333 return -1;
8334 if (fwriteBulkString(fp,(char*)val,vlen) == -1)
8335 return -1;
8336 }
8337 } else {
8338 dictIterator *di = dictGetIterator(o->ptr);
8339 dictEntry *de;
8340
8341 while((de = dictNext(di)) != NULL) {
8342 robj *field = dictGetEntryKey(de);
8343 robj *val = dictGetEntryVal(de);
8344
8345 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
8346 if (fwriteBulkObject(fp,key) == 0) goto werr;
8347 if (fwriteBulkObject(fp,field) == -1) return -1;
8348 if (fwriteBulkObject(fp,val) == -1) return -1;
8349 }
8350 dictReleaseIterator(di);
8351 }
9d65a1bb 8352 } else {
f83c6cb5 8353 redisPanic("Unknown object type");
9d65a1bb 8354 }
8355 /* Save the expire time */
8356 if (expiretime != -1) {
e96e4fbf 8357 char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
9d65a1bb 8358 /* If this key is already expired skip it */
8359 if (expiretime < now) continue;
8360 if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
9c8e3cee 8361 if (fwriteBulkObject(fp,key) == 0) goto werr;
9d65a1bb 8362 if (fwriteBulkLong(fp,expiretime) == 0) goto werr;
8363 }
b9bc0eef 8364 if (swapped) decrRefCount(o);
9d65a1bb 8365 }
8366 dictReleaseIterator(di);
8367 }
8368
8369 /* Make sure data will not remain on the OS's output buffers */
8370 fflush(fp);
8371 fsync(fileno(fp));
8372 fclose(fp);
e0a62c7f 8373
9d65a1bb 8374 /* Use RENAME to make sure the DB file is changed atomically only
8375 * if the generate DB file is ok. */
8376 if (rename(tmpfile,filename) == -1) {
8377 redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
8378 unlink(tmpfile);
8379 return REDIS_ERR;
8380 }
8381 redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
8382 return REDIS_OK;
8383
8384werr:
8385 fclose(fp);
8386 unlink(tmpfile);
e96e4fbf 8387 redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
9d65a1bb 8388 if (di) dictReleaseIterator(di);
8389 return REDIS_ERR;
8390}
8391
8392/* This is how rewriting of the append only file in background works:
8393 *
8394 * 1) The user calls BGREWRITEAOF
8395 * 2) Redis calls this function, that forks():
8396 * 2a) the child rewrite the append only file in a temp file.
8397 * 2b) the parent accumulates differences in server.bgrewritebuf.
8398 * 3) When the child finished '2a' exists.
8399 * 4) The parent will trap the exit code, if it's OK, will append the
8400 * data accumulated into server.bgrewritebuf into the temp file, and
8401 * finally will rename(2) the temp file in the actual file name.
8402 * The the new file is reopened as the new append only file. Profit!
8403 */
8404static int rewriteAppendOnlyFileBackground(void) {
8405 pid_t childpid;
8406
8407 if (server.bgrewritechildpid != -1) return REDIS_ERR;
054e426d 8408 if (server.vm_enabled) waitEmptyIOJobsQueue();
9d65a1bb 8409 if ((childpid = fork()) == 0) {
8410 /* Child */
8411 char tmpfile[256];
9d65a1bb 8412
054e426d 8413 if (server.vm_enabled) vmReopenSwapFile();
8414 close(server.fd);
9d65a1bb 8415 snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
8416 if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
478c2c6f 8417 _exit(0);
9d65a1bb 8418 } else {
478c2c6f 8419 _exit(1);
9d65a1bb 8420 }
8421 } else {
8422 /* Parent */
8423 if (childpid == -1) {
8424 redisLog(REDIS_WARNING,
8425 "Can't rewrite append only file in background: fork: %s",
8426 strerror(errno));
8427 return REDIS_ERR;
8428 }
8429 redisLog(REDIS_NOTICE,
8430 "Background append only file rewriting started by pid %d",childpid);
8431 server.bgrewritechildpid = childpid;
884d4b39 8432 updateDictResizePolicy();
85a83172 8433 /* We set appendseldb to -1 in order to force the next call to the
8434 * feedAppendOnlyFile() to issue a SELECT command, so the differences
8435 * accumulated by the parent into server.bgrewritebuf will start
8436 * with a SELECT statement and it will be safe to merge. */
8437 server.appendseldb = -1;
9d65a1bb 8438 return REDIS_OK;
8439 }
8440 return REDIS_OK; /* unreached */
8441}
8442
8443static void bgrewriteaofCommand(redisClient *c) {
8444 if (server.bgrewritechildpid != -1) {
8445 addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
8446 return;
8447 }
8448 if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
49b99ab4 8449 char *status = "+Background append only file rewriting started\r\n";
8450 addReplySds(c,sdsnew(status));
9d65a1bb 8451 } else {
8452 addReply(c,shared.err);
8453 }
8454}
8455
/* Remove the temp file written by the background rewrite child whose pid is
 * 'childpid'. The name must match the one built in
 * rewriteAppendOnlyFileBackground(). */
static void aofRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
    unlink(path);
}
8462
996cb5f7 8463/* Virtual Memory is composed mainly of two subsystems:
8464 * - Blocking Virtual Memory
8465 * - Threaded Virtual Memory I/O
8466 * The two parts are not fully decoupled, but functions are split among two
8467 * different sections of the source code (delimited by comments) in order to
8468 * make more clear what functionality is about the blocking VM and what about
8469 * the threaded (not blocking) VM.
8470 *
8471 * Redis VM design:
8472 *
8473 * Redis VM is a blocking VM (one that blocks reading swapped values from
8474 * disk into memory when a value swapped out is needed in memory) that is made
8475 * unblocking by trying to examine the command argument vector in order to
8476 * load in background values that will likely be needed in order to exec
8477 * the command. The command is executed only once all the relevant keys
8478 * are loaded into memory.
8479 *
8480 * This is basically almost as simple as a blocking VM, but almost as
8481 * parallel as a fully non-blocking VM.
8482 */
8483
8484/* =================== Virtual Memory - Blocking Side ====================== */
054e426d 8485
8486/* substitute the first occurrence of '%p' with the process pid in the
8487 * swap file name. */
8488static void expandVmSwapFilename(void) {
8489 char *p = strstr(server.vm_swap_file,"%p");
8490 sds new;
e0a62c7f 8491
054e426d 8492 if (!p) return;
8493 new = sdsempty();
8494 *p = '\0';
8495 new = sdscat(new,server.vm_swap_file);
8496 new = sdscatprintf(new,"%ld",(long) getpid());
8497 new = sdscat(new,p+2);
8498 zfree(server.vm_swap_file);
8499 server.vm_swap_file = new;
8500}
8501
75680a3c 8502static void vmInit(void) {
8503 off_t totsize;
996cb5f7 8504 int pipefds[2];
bcaa7a4f 8505 size_t stacksize;
75680a3c 8506
4ad37480 8507 if (server.vm_max_threads != 0)
8508 zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
8509
054e426d 8510 expandVmSwapFilename();
8511 redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
6fa987e3 8512 if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
8513 server.vm_fp = fopen(server.vm_swap_file,"w+b");
8514 }
75680a3c 8515 if (server.vm_fp == NULL) {
6fa987e3 8516 redisLog(REDIS_WARNING,
8517 "Impossible to open the swap file: %s. Exiting.",
8518 strerror(errno));
75680a3c 8519 exit(1);
8520 }
8521 server.vm_fd = fileno(server.vm_fp);
8522 server.vm_next_page = 0;
8523 server.vm_near_pages = 0;
7d98e08c 8524 server.vm_stats_used_pages = 0;
8525 server.vm_stats_swapped_objects = 0;
8526 server.vm_stats_swapouts = 0;
8527 server.vm_stats_swapins = 0;
75680a3c 8528 totsize = server.vm_pages*server.vm_page_size;
8529 redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",totsize);
8530 if (ftruncate(server.vm_fd,totsize) == -1) {
8531 redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
8532 strerror(errno));
8533 exit(1);
8534 } else {
8535 redisLog(REDIS_NOTICE,"Swap file allocated with success");
8536 }
7d30035d 8537 server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
f870935d 8538 redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
4ef8de8a 8539 (long long) (server.vm_pages+7)/8, server.vm_pages);
7d30035d 8540 memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
92f8e882 8541
996cb5f7 8542 /* Initialize threaded I/O (used by Virtual Memory) */
8543 server.io_newjobs = listCreate();
8544 server.io_processing = listCreate();
8545 server.io_processed = listCreate();
d5d55fc3 8546 server.io_ready_clients = listCreate();
92f8e882 8547 pthread_mutex_init(&server.io_mutex,NULL);
a5819310 8548 pthread_mutex_init(&server.obj_freelist_mutex,NULL);
8549 pthread_mutex_init(&server.io_swapfile_mutex,NULL);
92f8e882 8550 server.io_active_threads = 0;
996cb5f7 8551 if (pipe(pipefds) == -1) {
8552 redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
8553 ,strerror(errno));
8554 exit(1);
8555 }
8556 server.io_ready_pipe_read = pipefds[0];
8557 server.io_ready_pipe_write = pipefds[1];
8558 redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
bcaa7a4f 8559 /* LZF requires a lot of stack */
8560 pthread_attr_init(&server.io_threads_attr);
8561 pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
8562 while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
8563 pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
b9bc0eef 8564 /* Listen for events in the threaded I/O pipe */
8565 if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
8566 vmThreadedIOCompletedJob, NULL) == AE_ERR)
8567 oom("creating file event");
75680a3c 8568}
8569
06224fec 8570/* Mark the page as used */
8571static void vmMarkPageUsed(off_t page) {
8572 off_t byte = page/8;
8573 int bit = page&7;
970e10bb 8574 redisAssert(vmFreePage(page) == 1);
06224fec 8575 server.vm_bitmap[byte] |= 1<<bit;
8576}
8577
8578/* Mark N contiguous pages as used, with 'page' being the first. */
8579static void vmMarkPagesUsed(off_t page, off_t count) {
8580 off_t j;
8581
8582 for (j = 0; j < count; j++)
7d30035d 8583 vmMarkPageUsed(page+j);
7d98e08c 8584 server.vm_stats_used_pages += count;
7c775e09 8585 redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
8586 (long long)count, (long long)page);
06224fec 8587}
8588
8589/* Mark the page as free */
8590static void vmMarkPageFree(off_t page) {
8591 off_t byte = page/8;
8592 int bit = page&7;
970e10bb 8593 redisAssert(vmFreePage(page) == 0);
06224fec 8594 server.vm_bitmap[byte] &= ~(1<<bit);
8595}
8596
8597/* Mark N contiguous pages as free, with 'page' being the first. */
8598static void vmMarkPagesFree(off_t page, off_t count) {
8599 off_t j;
8600
8601 for (j = 0; j < count; j++)
7d30035d 8602 vmMarkPageFree(page+j);
7d98e08c 8603 server.vm_stats_used_pages -= count;
7c775e09 8604 redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
8605 (long long)count, (long long)page);
06224fec 8606}
8607
8608/* Test if the page is free */
8609static int vmFreePage(off_t page) {
8610 off_t byte = page/8;
8611 int bit = page&7;
7d30035d 8612 return (server.vm_bitmap[byte] & (1<<bit)) == 0;
06224fec 8613}
8614
8615/* Find N contiguous free pages storing the first page of the cluster in *first.
e0a62c7f 8616 * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
3a66edc7 8617 * REDIS_ERR is returned.
06224fec 8618 *
8619 * This function uses a simple algorithm: we try to allocate
8620 * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
8621 * again from the start of the swap file searching for free spaces.
8622 *
8623 * If it looks pretty clear that there are no free pages near our offset
8624 * we try to find less populated places doing a forward jump of
8625 * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
8626 * without hurry, and then we jump again and so forth...
e0a62c7f 8627 *
06224fec 8628 * This function can be improved using a free list to avoid to guess
8629 * too much, since we could collect data about freed pages.
8630 *
8631 * note: I implemented this function just after watching an episode of
8632 * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
8633 */
c7df85a4 8634static int vmFindContiguousPages(off_t *first, off_t n) {
06224fec 8635 off_t base, offset = 0, since_jump = 0, numfree = 0;
8636
8637 if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
8638 server.vm_near_pages = 0;
8639 server.vm_next_page = 0;
8640 }
8641 server.vm_near_pages++; /* Yet another try for pages near to the old ones */
8642 base = server.vm_next_page;
8643
8644 while(offset < server.vm_pages) {
8645 off_t this = base+offset;
8646
8647 /* If we overflow, restart from page zero */
8648 if (this >= server.vm_pages) {
8649 this -= server.vm_pages;
8650 if (this == 0) {
8651 /* Just overflowed, what we found on tail is no longer
8652 * interesting, as it's no longer contiguous. */
8653 numfree = 0;
8654 }
8655 }
8656 if (vmFreePage(this)) {
8657 /* This is a free page */
8658 numfree++;
8659 /* Already got N free pages? Return to the caller, with success */
8660 if (numfree == n) {
7d30035d 8661 *first = this-(n-1);
8662 server.vm_next_page = this+1;
7c775e09 8663 redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
3a66edc7 8664 return REDIS_OK;
06224fec 8665 }
8666 } else {
8667 /* The current one is not a free page */
8668 numfree = 0;
8669 }
8670
8671 /* Fast-forward if the current page is not free and we already
8672 * searched enough near this place. */
8673 since_jump++;
8674 if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
8675 offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
8676 since_jump = 0;
8677 /* Note that even if we rewind after the jump, we are don't need
8678 * to make sure numfree is set to zero as we only jump *if* it
8679 * is set to zero. */
8680 } else {
8681 /* Otherwise just check the next page */
8682 offset++;
8683 }
8684 }
3a66edc7 8685 return REDIS_ERR;
8686}
8687
a5819310 8688/* Write the specified object at the specified page of the swap file */
8689static int vmWriteObjectOnSwap(robj *o, off_t page) {
8690 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8691 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
8692 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8693 redisLog(REDIS_WARNING,
9ebed7cf 8694 "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
a5819310 8695 strerror(errno));
8696 return REDIS_ERR;
8697 }
8698 rdbSaveObject(server.vm_fp,o);
ba76a8f9 8699 fflush(server.vm_fp);
a5819310 8700 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8701 return REDIS_OK;
8702}
8703
3a66edc7 8704/* Swap the 'val' object relative to 'key' into disk. Store all the information
8705 * needed to later retrieve the object into the key object.
8706 * If we can't find enough contiguous empty pages to swap the object on disk
8707 * REDIS_ERR is returned. */
a69a0c9c 8708static int vmSwapObjectBlocking(robj *key, robj *val) {
b9bc0eef 8709 off_t pages = rdbSavedObjectPages(val,NULL);
3a66edc7 8710 off_t page;
8711
8712 assert(key->storage == REDIS_VM_MEMORY);
4ef8de8a 8713 assert(key->refcount == 1);
3a66edc7 8714 if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
a5819310 8715 if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
3a66edc7 8716 key->vm.page = page;
8717 key->vm.usedpages = pages;
8718 key->storage = REDIS_VM_SWAPPED;
d894161b 8719 key->vtype = val->type;
3a66edc7 8720 decrRefCount(val); /* Deallocate the object from memory. */
8721 vmMarkPagesUsed(page,pages);
7d30035d 8722 redisLog(REDIS_DEBUG,"VM: object %s swapped out at %lld (%lld pages)",
8723 (unsigned char*) key->ptr,
8724 (unsigned long long) page, (unsigned long long) pages);
7d98e08c 8725 server.vm_stats_swapped_objects++;
8726 server.vm_stats_swapouts++;
3a66edc7 8727 return REDIS_OK;
8728}
8729
a5819310 8730static robj *vmReadObjectFromSwap(off_t page, int type) {
8731 robj *o;
3a66edc7 8732
a5819310 8733 if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
8734 if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
3a66edc7 8735 redisLog(REDIS_WARNING,
d5d55fc3 8736 "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
3a66edc7 8737 strerror(errno));
478c2c6f 8738 _exit(1);
3a66edc7 8739 }
a5819310 8740 o = rdbLoadObject(type,server.vm_fp);
8741 if (o == NULL) {
d5d55fc3 8742 redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
478c2c6f 8743 _exit(1);
3a66edc7 8744 }
a5819310 8745 if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
8746 return o;
8747}
8748
8749/* Load the value object relative to the 'key' object from swap to memory.
8750 * The newly allocated object is returned.
8751 *
8752 * If preview is true the unserialized object is returned to the caller but
8753 * no changes are made to the key object, nor the pages are marked as freed */
8754static robj *vmGenericLoadObject(robj *key, int preview) {
8755 robj *val;
8756
d5d55fc3 8757 redisAssert(key->storage == REDIS_VM_SWAPPED || key->storage == REDIS_VM_LOADING);
a5819310 8758 val = vmReadObjectFromSwap(key->vm.page,key->vtype);
7e69548d 8759 if (!preview) {
8760 key->storage = REDIS_VM_MEMORY;
8761 key->vm.atime = server.unixtime;
8762 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
8763 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk",
8764 (unsigned char*) key->ptr);
7d98e08c 8765 server.vm_stats_swapped_objects--;
38aba9a1 8766 } else {
8767 redisLog(REDIS_DEBUG, "VM: object %s previewed from disk",
8768 (unsigned char*) key->ptr);
7e69548d 8769 }
7d98e08c 8770 server.vm_stats_swapins++;
3a66edc7 8771 return val;
06224fec 8772}
8773
7e69548d 8774/* Plain object loading, from swap to memory */
8775static robj *vmLoadObject(robj *key) {
996cb5f7 8776 /* If we are loading the object in background, stop it, we
8777 * need to load this object synchronously ASAP. */
8778 if (key->storage == REDIS_VM_LOADING)
8779 vmCancelThreadedIOJob(key);
7e69548d 8780 return vmGenericLoadObject(key,0);
8781}
8782
8783/* Just load the value on disk, without to modify the key.
8784 * This is useful when we want to perform some operation on the value
8785 * without to really bring it from swap to memory, like while saving the
8786 * dataset or rewriting the append only log. */
8787static robj *vmPreviewObject(robj *key) {
8788 return vmGenericLoadObject(key,1);
8789}
8790
4ef8de8a 8791/* How a good candidate is this object for swapping?
8792 * The better candidate it is, the greater the returned value.
8793 *
8794 * Currently we try to perform a fast estimation of the object size in
8795 * memory, and combine it with aging informations.
8796 *
8797 * Basically swappability = idle-time * log(estimated size)
8798 *
8799 * Bigger objects are preferred over smaller objects, but not
8800 * proportionally, this is why we use the logarithm. This algorithm is
8801 * just a first try and will probably be tuned later. */
8802static double computeObjectSwappability(robj *o) {
8803 time_t age = server.unixtime - o->vm.atime;
8804 long asize = 0;
8805 list *l;
8806 dict *d;
8807 struct dictEntry *de;
8808 int z;
8809
8810 if (age <= 0) return 0;
8811 switch(o->type) {
8812 case REDIS_STRING:
8813 if (o->encoding != REDIS_ENCODING_RAW) {
8814 asize = sizeof(*o);
8815 } else {
8816 asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
8817 }
8818 break;
8819 case REDIS_LIST:
8820 l = o->ptr;
8821 listNode *ln = listFirst(l);
8822
8823 asize = sizeof(list);
8824 if (ln) {
8825 robj *ele = ln->value;
8826 long elesize;
8827
8828 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8829 (sizeof(*o)+sdslen(ele->ptr)) :
8830 sizeof(*o);
8831 asize += (sizeof(listNode)+elesize)*listLength(l);
8832 }
8833 break;
8834 case REDIS_SET:
8835 case REDIS_ZSET:
8836 z = (o->type == REDIS_ZSET);
8837 d = z ? ((zset*)o->ptr)->dict : o->ptr;
8838
8839 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8840 if (z) asize += sizeof(zset)-sizeof(dict);
8841 if (dictSize(d)) {
8842 long elesize;
8843 robj *ele;
8844
8845 de = dictGetRandomKey(d);
8846 ele = dictGetEntryKey(de);
8847 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8848 (sizeof(*o)+sdslen(ele->ptr)) :
8849 sizeof(*o);
8850 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8851 if (z) asize += sizeof(zskiplistNode)*dictSize(d);
8852 }
8853 break;
a97b9060 8854 case REDIS_HASH:
8855 if (o->encoding == REDIS_ENCODING_ZIPMAP) {
8856 unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
8857 unsigned int len = zipmapLen((unsigned char*)o->ptr);
8858 unsigned int klen, vlen;
8859 unsigned char *key, *val;
8860
8861 if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
8862 klen = 0;
8863 vlen = 0;
8864 }
8865 asize = len*(klen+vlen+3);
8866 } else if (o->encoding == REDIS_ENCODING_HT) {
8867 d = o->ptr;
8868 asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
8869 if (dictSize(d)) {
8870 long elesize;
8871 robj *ele;
8872
8873 de = dictGetRandomKey(d);
8874 ele = dictGetEntryKey(de);
8875 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8876 (sizeof(*o)+sdslen(ele->ptr)) :
8877 sizeof(*o);
8878 ele = dictGetEntryVal(de);
8879 elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
8880 (sizeof(*o)+sdslen(ele->ptr)) :
8881 sizeof(*o);
8882 asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
8883 }
8884 }
8885 break;
4ef8de8a 8886 }
c8c72447 8887 return (double)age*log(1+asize);
4ef8de8a 8888}
8889
8890/* Try to swap an object that's a good candidate for swapping.
8891 * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
a69a0c9c 8892 * to swap any object at all.
8893 *
8894 * If 'usethreaded' is true, Redis will try to swap the object in background
8895 * using I/O threads. */
8896static int vmSwapOneObject(int usethreads) {
4ef8de8a 8897 int j, i;
8898 struct dictEntry *best = NULL;
8899 double best_swappability = 0;
b9bc0eef 8900 redisDb *best_db = NULL;
4ef8de8a 8901 robj *key, *val;
8902
8903 for (j = 0; j < server.dbnum; j++) {
8904 redisDb *db = server.db+j;
b72f6a4b 8905 /* Why maxtries is set to 100?
8906 * Because this way (usually) we'll find 1 object even if just 1% - 2%
8907 * are swappable objects */
b0d8747d 8908 int maxtries = 100;
4ef8de8a 8909
8910 if (dictSize(db->dict) == 0) continue;
8911 for (i = 0; i < 5; i++) {
8912 dictEntry *de;
8913 double swappability;
8914
e3cadb8a 8915 if (maxtries) maxtries--;
4ef8de8a 8916 de = dictGetRandomKey(db->dict);
8917 key = dictGetEntryKey(de);
8918 val = dictGetEntryVal(de);
1064ef87 8919 /* Only swap objects that are currently in memory.
8920 *
8921 * Also don't swap shared objects if threaded VM is on, as we
8922 * try to ensure that the main thread does not touch the
8923 * object while the I/O thread is using it, but we can't
8924 * control other keys without adding additional mutex. */
8925 if (key->storage != REDIS_VM_MEMORY ||
8926 (server.vm_max_threads != 0 && val->refcount != 1)) {
e3cadb8a 8927 if (maxtries) i--; /* don't count this try */
8928 continue;
8929 }
4ef8de8a 8930 swappability = computeObjectSwappability(val);
8931 if (!best || swappability > best_swappability) {
8932 best = de;
8933 best_swappability = swappability;
b9bc0eef 8934 best_db = db;
4ef8de8a 8935 }
8936 }
8937 }
7c775e09 8938 if (best == NULL) return REDIS_ERR;
4ef8de8a 8939 key = dictGetEntryKey(best);
8940 val = dictGetEntryVal(best);
8941
e3cadb8a 8942 redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
4ef8de8a 8943 key->ptr, best_swappability);
8944
8945 /* Unshare the key if needed */
8946 if (key->refcount > 1) {
8947 robj *newkey = dupStringObject(key);
8948 decrRefCount(key);
8949 key = dictGetEntryKey(best) = newkey;
8950 }
8951 /* Swap it */
a69a0c9c 8952 if (usethreads) {
b9bc0eef 8953 vmSwapObjectThreaded(key,val,best_db);
4ef8de8a 8954 return REDIS_OK;
8955 } else {
a69a0c9c 8956 if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
8957 dictGetEntryVal(best) = NULL;
8958 return REDIS_OK;
8959 } else {
8960 return REDIS_ERR;
8961 }
4ef8de8a 8962 }
8963}
8964
/* Swap out one object synchronously, in the calling thread.
 * Same return value as vmSwapOneObject(). */
static int vmSwapOneObjectBlocking(void) {
    const int usethreads = 0;

    return vmSwapOneObject(usethreads);
}
8968
/* Queue one object for swapping using the I/O threads.
 * Same return value as vmSwapOneObject(). */
static int vmSwapOneObjectThreaded(void) {
    const int usethreads = 1;

    return vmSwapOneObject(usethreads);
}
8972
7e69548d 8973/* Return true if it's safe to swap out objects in a given moment.
8974 * Basically we don't want to swap objects out while there is a BGSAVE
8975 * or a BGAEOREWRITE running in backgroud. */
8976static int vmCanSwapOut(void) {
8977 return (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1);
8978}
8979
1b03836c 8980/* Delete a key if swapped. Returns 1 if the key was found, was swapped
8981 * and was deleted. Otherwise 0 is returned. */
8982static int deleteIfSwapped(redisDb *db, robj *key) {
8983 dictEntry *de;
8984 robj *foundkey;
8985
8986 if ((de = dictFind(db->dict,key)) == NULL) return 0;
8987 foundkey = dictGetEntryKey(de);
8988 if (foundkey->storage == REDIS_VM_MEMORY) return 0;
8989 deleteKey(db,key);
8990 return 1;
8991}
8992
996cb5f7 8993/* =================== Virtual Memory - Threaded I/O ======================= */
8994
b9bc0eef 8995static void freeIOJob(iojob *j) {
d5d55fc3 8996 if ((j->type == REDIS_IOJOB_PREPARE_SWAP ||
8997 j->type == REDIS_IOJOB_DO_SWAP ||
8998 j->type == REDIS_IOJOB_LOAD) && j->val != NULL)
b9bc0eef 8999 decrRefCount(j->val);
78ebe4c8 9000 /* We don't decrRefCount the j->key field as we did't incremented
9001 * the count creating IO Jobs. This is because the key field here is
9002 * just used as an indentifier and if a key is removed the Job should
9003 * never be touched again. */
b9bc0eef 9004 zfree(j);
9005}
9006
996cb5f7 9007/* Every time a thread finished a Job, it writes a byte into the write side
 9008 * of an unix pipe in order to "awake" the main thread, and this function
 9009 * is called. */
/* For every byte read from 'fd' the oldest job is popped from
 * server.io_processed (under the threaded I/O lock) and post-processed
 * according to its type:
 *
 * - REDIS_IOJOB_LOAD: the key is marked as in memory, its swap pages are
 *   freed, the loaded value is stored in the dict entry and the clients
 *   blocked on the key are handled.
 * - REDIS_IOJOB_PREPARE_SWAP: swap pages are reserved and the same job is
 *   queued again as REDIS_IOJOB_DO_SWAP, or the operation is undone if
 *   swapping is not possible.
 * - REDIS_IOJOB_DO_SWAP: the key is marked as swapped, the in-memory value
 *   is released, and more swap jobs may be queued if memory usage is
 *   still over server.vm_max_memory.
 *
 * The function returns after post-processing a number of jobs derived
 * from the length of the processed queue at the first iteration, or when
 * read(2) stops returning one byte. 'el', 'privdata' and 'mask' are
 * unused. */
 9010static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
 9011 int mask)
 9012{
 9013 char buf[1];
b0d8747d 9014 int retval, processed = 0, toprocess = -1, trytoswap = 1;
996cb5f7 9015 REDIS_NOTUSED(el);
 9016 REDIS_NOTUSED(mask);
 9017 REDIS_NOTUSED(privdata);
 9018
 9019 /* For every byte we read in the read side of the pipe, there is one
 9020 * I/O job completed to process. */
 9021 while((retval = read(fd,buf,1)) == 1) {
b9bc0eef 9022 iojob *j;
 9023 listNode *ln;
 9024 robj *key;
 9025 struct dictEntry *de;
 9026
996cb5f7 9027 redisLog(REDIS_DEBUG,"Processing I/O completed job");
b9bc0eef 9028
 9029 /* Get the processed element (the oldest one) */
 9030 lockThreadedIO();
1064ef87 9031 assert(listLength(server.io_processed) != 0);
/* Only on the first iteration (toprocess is still -1): compute how many
 * jobs to post-process in this call, as a percentage
 * (REDIS_MAX_COMPLETED_JOBS_PROCESSED) of the current queue length, with a
 * minimum of one. */
f6c0bba8 9032 if (toprocess == -1) {
 9033 toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
 9034 if (toprocess <= 0) toprocess = 1;
 9035 }
b9bc0eef 9036 ln = listFirst(server.io_processed);
 9037 j = ln->value;
 9038 listDelNode(server.io_processed,ln);
 9039 unlockThreadedIO();
 9040 /* If this job is marked as canceled, just ignore it */
/* Note: the 'continue' below skips the processed++ at the end of the loop,
 * so canceled jobs do not count toward 'toprocess'. */
 9041 if (j->canceled) {
 9042 freeIOJob(j);
 9043 continue;
 9044 }
 9045 /* Post process it in the main thread, as there are things we
 9046 * can do just here to avoid race conditions and/or invasive locks */
6c96ba7d 9047 redisLog(REDIS_DEBUG,"Job %p type: %d, key at %p (%s) refcount: %d\n", (void*) j, j->type, (void*)j->key, (char*)j->key->ptr, j->key->refcount);
b9bc0eef 9048 de = dictFind(j->db->dict,j->key);
 9049 assert(de != NULL);
 9050 key = dictGetEntryKey(de);
 9051 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9052 redisDb *db;
 9053
b9bc0eef 9054 /* Key loaded, bring it at home */
 9055 key->storage = REDIS_VM_MEMORY;
 9056 key->vm.atime = server.unixtime;
 9057 vmMarkPagesFree(key->vm.page,key->vm.usedpages);
 9058 redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
 9059 (unsigned char*) key->ptr);
 9060 server.vm_stats_swapped_objects--;
 9061 server.vm_stats_swapins++;
/* The dict entry takes its own reference to the loaded value, because
 * freeIOJob() below drops the reference held by the job. 'db' is saved
 * first since 'j' is not valid after freeIOJob(). */
d5d55fc3 9062 dictGetEntryVal(de) = j->val;
 9063 incrRefCount(j->val);
 9064 db = j->db;
b9bc0eef 9065 freeIOJob(j);
d5d55fc3 9066 /* Handle clients waiting for this key to be loaded. */
 9067 handleClientsBlockedOnSwappedKey(db,key);
b9bc0eef 9068 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
 9069 /* Now we know the amount of pages required to swap this object.
 9070 * Let's find some space for it, and queue this task again
 9071 * rebranded as REDIS_IOJOB_DO_SWAP. */
054e426d 9072 if (!vmCanSwapOut() ||
 9073 vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
 9074 {
 9075 /* Ooops... no space or we can't swap as there is
 9076 * a fork()ed Redis trying to save stuff on disk. */
b9bc0eef 9077 freeIOJob(j);
054e426d 9078 key->storage = REDIS_VM_MEMORY; /* undo operation */
b9bc0eef 9079 } else {
c7df85a4 9080 /* Note that we need to mark this pages as used now,
 9081 * if the job will be canceled, we'll mark them as freed
 9082 * again. */
 9083 vmMarkPagesUsed(j->page,j->pages);
b9bc0eef 9084 j->type = REDIS_IOJOB_DO_SWAP;
 9085 lockThreadedIO();
 9086 queueIOJob(j);
 9087 unlockThreadedIO();
 9088 }
 9089 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
 9090 robj *val;
 9091
 9092 /* Key swapped. We can finally free some memory. */
/* Debugging aid: dump the job state to stdout right before the assertion
 * below fails. */
6c96ba7d 9093 if (key->storage != REDIS_VM_SWAPPING) {
 9094 printf("key->storage: %d\n",key->storage);
 9095 printf("key->name: %s\n",(char*)key->ptr);
 9096 printf("key->refcount: %d\n",key->refcount);
 9097 printf("val: %p\n",(void*)j->val);
 9098 printf("val->type: %d\n",j->val->type);
 9099 printf("val->ptr: %s\n",(char*)j->val->ptr);
 9100 }
 9101 redisAssert(key->storage == REDIS_VM_SWAPPING);
b9bc0eef 9102 val = dictGetEntryVal(de);
 9103 key->vm.page = j->page;
 9104 key->vm.usedpages = j->pages;
 9105 key->storage = REDIS_VM_SWAPPED;
 9106 key->vtype = j->val->type;
 9107 decrRefCount(val); /* Deallocate the object from memory. */
f11b8647 9108 dictGetEntryVal(de) = NULL;
b9bc0eef 9109 redisLog(REDIS_DEBUG,
 9110 "VM: object %s swapped out at %lld (%lld pages) (threaded)",
 9111 (unsigned char*) key->ptr,
 9112 (unsigned long long) j->page, (unsigned long long) j->pages);
 9113 server.vm_stats_swapped_objects++;
 9114 server.vm_stats_swapouts++;
 9115 freeIOJob(j);
f11b8647 9116 /* Put a few more swap requests in queue if we are still
 9117 * out of memory */
/* 'trytoswap' is cleared the first time vmSwapOneObjectThreaded() fails,
 * so no further attempts are made for the rest of this call. */
b0d8747d 9118 if (trytoswap && vmCanSwapOut() &&
 9119 zmalloc_used_memory() > server.vm_max_memory)
 9120 {
f11b8647 9121 int more = 1;
 9122 while(more) {
 9123 lockThreadedIO();
 9124 more = listLength(server.io_newjobs) <
 9125 (unsigned) server.vm_max_threads;
 9126 unlockThreadedIO();
 9127 /* Don't waste CPU time if swappable objects are rare. */
b0d8747d 9128 if (vmSwapOneObjectThreaded() == REDIS_ERR) {
 9129 trytoswap = 0;
 9130 break;
 9131 }
f11b8647 9132 }
 9133 }
b9bc0eef 9134 }
c953f24b 9135 processed++;
f6c0bba8 9136 if (processed == toprocess) return;
996cb5f7 9137 }
/* The loop ended because read(2) did not return one byte: report anything
 * that is an error other than EAGAIN. */
 9138 if (retval < 0 && errno != EAGAIN) {
 9139 redisLog(REDIS_WARNING,
 9140 "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
 9141 strerror(errno));
 9142 }
 9143}
9144
9145static void lockThreadedIO(void) {
9146 pthread_mutex_lock(&server.io_mutex);
9147}
9148
9149static void unlockThreadedIO(void) {
9150 pthread_mutex_unlock(&server.io_mutex);
9151}
9152
 9153/* Remove the specified object from the threaded I/O queue if still not
 9154 * processed, otherwise make sure to flag it as canceled. */
/* 'o' is the key object of the job to cancel; it must be in the
 * REDIS_VM_LOADING or REDIS_VM_SWAPPING state. Jobs are matched by pointer
 * identity of their key field against 'o'.
 *
 * What happens depends on the queue where the job is found:
 * - io_newjobs: the job is freed and removed from the queue.
 * - io_processing: the lock is released and the whole search is restarted
 *   after a short sleep, until the job moves to another queue.
 * - io_processed: the job is flagged as canceled.
 *
 * Finally the storage state of 'o' is reverted (LOADING -> SWAPPED,
 * SWAPPING -> MEMORY). Not finding any matching job is considered a bug
 * and triggers an assertion. */
 9155static void vmCancelThreadedIOJob(robj *o) {
 9156 list *lists[3] = {
6c96ba7d 9157 server.io_newjobs, /* 0 */
 9158 server.io_processing, /* 1 */
 9159 server.io_processed /* 2 */
996cb5f7 9160 };
 9161 int i;
 9162
 9163 assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
/* Restart point used when the job is found in the processing queue. */
2e111efe 9164again:
996cb5f7 9165 lockThreadedIO();
 9166 /* Search for a matching key in one of the queues */
 9167 for (i = 0; i < 3; i++) {
 9168 listNode *ln;
c7df85a4 9169 listIter li;
996cb5f7 9170
c7df85a4 9171 listRewind(lists[i],&li);
 9172 while ((ln = listNext(&li)) != NULL) {
996cb5f7 9173 iojob *job = ln->value;
 9174
6c96ba7d 9175 if (job->canceled) continue; /* Skip this, already canceled. */
78ebe4c8 9176 if (job->key == o) {
970e10bb 9177 redisLog(REDIS_DEBUG,"*** CANCELED %p (%s) (type %d) (LIST ID %d)\n",
 9178 (void*)job, (char*)o->ptr, job->type, i);
427a2153 9179 /* Mark the pages as free since the swap didn't happened
 9180 * or happened but is now discarded. */
/* Not done for a job in the processing queue (i == 1): in that case the
 * search is restarted below and the pages are handled on a later pass. */
970e10bb 9181 if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
427a2153 9182 vmMarkPagesFree(job->page,job->pages);
 9183 /* Cancel the job. It depends on the list the job is
 9184 * living in. */
996cb5f7 9185 switch(i) {
 9186 case 0: /* io_newjobs */
6c96ba7d 9187 /* If the job was yet not processed the best thing to do
996cb5f7 9188 * is to remove it from the queue at all */
6c96ba7d 9189 freeIOJob(job);
996cb5f7 9190 listDelNode(lists[i],ln);
 9191 break;
 9192 case 1: /* io_processing */
d5d55fc3 9193 /* Oh Shi- the thread is messing with the Job:
 9194 *
 9195 * Probably it's accessing the object if this is a
 9196 * PREPARE_SWAP or DO_SWAP job.
 9197 * If it's a LOAD job it may be reading from disk and
 9198 * if we don't wait for the job to terminate before to
 9199 * cancel it, maybe in a few microseconds data can be
 9200 * corrupted in this pages. So the short story is:
 9201 *
 9202 * Better to wait for the job to move into the
 9203 * next queue (processed)... */
 9204
 9205 /* We try again and again until the job is completed. */
 9206 unlockThreadedIO();
 9207 /* But let's wait some time for the I/O thread
 9208 * to finish with this job. After all this condition
 9209 * should be very rare. */
 9210 usleep(1);
 9211 goto again;
996cb5f7 9212 case 2: /* io_processed */
2e111efe 9213 /* The job was already processed, that's easy...
 9214 * just mark it as canceled so that we'll ignore it
 9215 * when processing completed jobs. */
996cb5f7 9216 job->canceled = 1;
 9217 break;
 9218 }
c7df85a4 9219 /* Finally we have to adjust the storage type of the object
 9220 * in order to "UNDO" the operation. */
996cb5f7 9221 if (o->storage == REDIS_VM_LOADING)
 9222 o->storage = REDIS_VM_SWAPPED;
 9223 else if (o->storage == REDIS_VM_SWAPPING)
 9224 o->storage = REDIS_VM_MEMORY;
 9225 unlockThreadedIO();
 9226 return;
 9227 }
 9228 }
 9229 }
 9230 unlockThreadedIO();
 9231 assert(1 != 1); /* We should never reach this */
 9232}
9233
b9bc0eef 9234static void *IOThreadEntryPoint(void *arg) {
9235 iojob *j;
9236 listNode *ln;
9237 REDIS_NOTUSED(arg);
9238
9239 pthread_detach(pthread_self());
9240 while(1) {
9241 /* Get a new job to process */
9242 lockThreadedIO();
9243 if (listLength(server.io_newjobs) == 0) {
9244 /* No new jobs in queue, exit. */
9ebed7cf 9245 redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
9246 (long) pthread_self());
b9bc0eef 9247 server.io_active_threads--;
9248 unlockThreadedIO();
9249 return NULL;
9250 }
9251 ln = listFirst(server.io_newjobs);
9252 j = ln->value;
9253 listDelNode(server.io_newjobs,ln);
9254 /* Add the job in the processing queue */
9255 j->thread = pthread_self();
9256 listAddNodeTail(server.io_processing,j);
9257 ln = listLast(server.io_processing); /* We use ln later to remove it */
9258 unlockThreadedIO();
9ebed7cf 9259 redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
9260 (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
b9bc0eef 9261
9262 /* Process the Job */
9263 if (j->type == REDIS_IOJOB_LOAD) {
d5d55fc3 9264 j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
b9bc0eef 9265 } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
9266 FILE *fp = fopen("/dev/null","w+");
9267 j->pages = rdbSavedObjectPages(j->val,fp);
9268 fclose(fp);
9269 } else if (j->type == REDIS_IOJOB_DO_SWAP) {
a5819310 9270 if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
9271 j->canceled = 1;
b9bc0eef 9272 }
9273
9274 /* Done: insert the job into the processed queue */
9ebed7cf 9275 redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
9276 (long) pthread_self(), (void*)j, (char*)j->key->ptr);
b9bc0eef 9277 lockThreadedIO();
9278 listDelNode(server.io_processing,ln);
9279 listAddNodeTail(server.io_processed,j);
9280 unlockThreadedIO();
e0a62c7f 9281
b9bc0eef 9282 /* Signal the main thread there is new stuff to process */
9283 assert(write(server.io_ready_pipe_write,"x",1) == 1);
9284 }
9285 return NULL; /* never reached */
9286}
9287
9288static void spawnIOThread(void) {
9289 pthread_t thread;
478c2c6f 9290 sigset_t mask, omask;
a97b9060 9291 int err;
b9bc0eef 9292
478c2c6f 9293 sigemptyset(&mask);
9294 sigaddset(&mask,SIGCHLD);
9295 sigaddset(&mask,SIGHUP);
9296 sigaddset(&mask,SIGPIPE);
9297 pthread_sigmask(SIG_SETMASK, &mask, &omask);
a97b9060 9298 while ((err = pthread_create(&thread,&server.io_threads_attr,IOThreadEntryPoint,NULL)) != 0) {
9299 redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
9300 strerror(err));
9301 usleep(1000000);
9302 }
478c2c6f 9303 pthread_sigmask(SIG_SETMASK, &omask, NULL);
b9bc0eef 9304 server.io_active_threads++;
9305}
9306
4ee9488d 9307/* We need to wait for the last thread to exit before we are able to
9308 * fork() in order to BGSAVE or BGREWRITEAOF. */
054e426d 9309static void waitEmptyIOJobsQueue(void) {
4ee9488d 9310 while(1) {
76b7233a 9311 int io_processed_len;
9312
4ee9488d 9313 lockThreadedIO();
054e426d 9314 if (listLength(server.io_newjobs) == 0 &&
9315 listLength(server.io_processing) == 0 &&
9316 server.io_active_threads == 0)
9317 {
4ee9488d 9318 unlockThreadedIO();
9319 return;
9320 }
76b7233a 9321 /* While waiting for empty jobs queue condition we post-process some
9322 * finshed job, as I/O threads may be hanging trying to write against
9323 * the io_ready_pipe_write FD but there are so much pending jobs that
9324 * it's blocking. */
9325 io_processed_len = listLength(server.io_processed);
4ee9488d 9326 unlockThreadedIO();
76b7233a 9327 if (io_processed_len) {
9328 vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
9329 usleep(1000); /* 1 millisecond */
9330 } else {
9331 usleep(10000); /* 10 milliseconds */
9332 }
4ee9488d 9333 }
9334}
9335
054e426d 9336static void vmReopenSwapFile(void) {
478c2c6f 9337 /* Note: we don't close the old one as we are in the child process
9338 * and don't want to mess at all with the original file object. */
054e426d 9339 server.vm_fp = fopen(server.vm_swap_file,"r+b");
9340 if (server.vm_fp == NULL) {
9341 redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
9342 server.vm_swap_file);
478c2c6f 9343 _exit(1);
054e426d 9344 }
9345 server.vm_fd = fileno(server.vm_fp);
9346}
9347
b9bc0eef 9348/* This function must be called while with threaded IO locked */
9349static void queueIOJob(iojob *j) {
6c96ba7d 9350 redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'\n",
9351 (void*)j, j->type, (char*)j->key->ptr);
b9bc0eef 9352 listAddNodeTail(server.io_newjobs,j);
9353 if (server.io_active_threads < server.vm_max_threads)
9354 spawnIOThread();
9355}
9356
9357static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
9358 iojob *j;
e0a62c7f 9359
b9bc0eef 9360 assert(key->storage == REDIS_VM_MEMORY);
9361 assert(key->refcount == 1);
9362
9363 j = zmalloc(sizeof(*j));
9364 j->type = REDIS_IOJOB_PREPARE_SWAP;
9365 j->db = db;
78ebe4c8 9366 j->key = key;
b9bc0eef 9367 j->val = val;
9368 incrRefCount(val);
9369 j->canceled = 0;
9370 j->thread = (pthread_t) -1;
f11b8647 9371 key->storage = REDIS_VM_SWAPPING;
b9bc0eef 9372
9373 lockThreadedIO();
9374 queueIOJob(j);
9375 unlockThreadedIO();
9376 return REDIS_OK;
9377}
9378
b0d8747d 9379/* ============ Virtual Memory - Blocking clients on missing keys =========== */
9380
d5d55fc3 9381/* This function makes the clinet 'c' waiting for the key 'key' to be loaded.
9382 * If there is not already a job loading the key, it is craeted.
9383 * The key is added to the io_keys list in the client structure, and also
9384 * in the hash table mapping swapped keys to waiting clients, that is,
9385 * server.io_waited_keys. */
9386static int waitForSwappedKey(redisClient *c, robj *key) {
9387 struct dictEntry *de;
9388 robj *o;
9389 list *l;
9390
9391 /* If the key does not exist or is already in RAM we don't need to
9392 * block the client at all. */
9393 de = dictFind(c->db->dict,key);
9394 if (de == NULL) return 0;
9395 o = dictGetEntryKey(de);
9396 if (o->storage == REDIS_VM_MEMORY) {
9397 return 0;
9398 } else if (o->storage == REDIS_VM_SWAPPING) {
9399 /* We were swapping the key, undo it! */
9400 vmCancelThreadedIOJob(o);
9401 return 0;
9402 }
e0a62c7f 9403
d5d55fc3 9404 /* OK: the key is either swapped, or being loaded just now. */
9405
9406 /* Add the key to the list of keys this client is waiting for.
9407 * This maps clients to keys they are waiting for. */
9408 listAddNodeTail(c->io_keys,key);
9409 incrRefCount(key);
9410
9411 /* Add the client to the swapped keys => clients waiting map. */
9412 de = dictFind(c->db->io_keys,key);
9413 if (de == NULL) {
9414 int retval;
9415
9416 /* For every key we take a list of clients blocked for it */
9417 l = listCreate();
9418 retval = dictAdd(c->db->io_keys,key,l);
9419 incrRefCount(key);
9420 assert(retval == DICT_OK);
9421 } else {
9422 l = dictGetEntryVal(de);
9423 }
9424 listAddNodeTail(l,c);
9425
9426 /* Are we already loading the key from disk? If not create a job */
9427 if (o->storage == REDIS_VM_SWAPPED) {
9428 iojob *j;
9429
9430 o->storage = REDIS_VM_LOADING;
9431 j = zmalloc(sizeof(*j));
9432 j->type = REDIS_IOJOB_LOAD;
9433 j->db = c->db;
78ebe4c8 9434 j->key = o;
d5d55fc3 9435 j->key->vtype = o->vtype;
9436 j->page = o->vm.page;
9437 j->val = NULL;
9438 j->canceled = 0;
9439 j->thread = (pthread_t) -1;
9440 lockThreadedIO();
9441 queueIOJob(j);
9442 unlockThreadedIO();
9443 }
9444 return 1;
9445}
9446
76583ea4
PN
9447/* Preload keys needed for the ZUNION and ZINTER commands. */
9448static void zunionInterBlockClientOnSwappedKeys(redisClient *c) {
9449 int i, num;
9450 num = atoi(c->argv[2]->ptr);
9451 for (i = 0; i < num; i++) {
9452 waitForSwappedKey(c,c->argv[3+i]);
9453 }
9454}
9455
b0d8747d 9456/* Is this client attempting to run a command against swapped keys?
d5d55fc3 9457 * If so, block it ASAP, load the keys in background, then resume it.
b0d8747d 9458 *
d5d55fc3 9459 * The important idea about this function is that it can fail! If keys will
9460 * still be swapped when the client is resumed, this key lookups will
9461 * just block loading keys from disk. In practical terms this should only
9462 * happen with SORT BY command or if there is a bug in this function.
9463 *
9464 * Return 1 if the client is marked as blocked, 0 if the client can
9465 * continue as the keys it is going to access appear to be in memory. */
9466static int blockClientOnSwappedKeys(struct redisCommand *cmd, redisClient *c) {
7c775e09 9467 int j, last;
9468
76583ea4
PN
9469 if (cmd->vm_preload_proc != NULL) {
9470 cmd->vm_preload_proc(c);
9471 } else {
9472 if (cmd->vm_firstkey == 0) return 0;
9473 last = cmd->vm_lastkey;
9474 if (last < 0) last = c->argc+last;
9475 for (j = cmd->vm_firstkey; j <= last; j += cmd->vm_keystep)
9476 waitForSwappedKey(c,c->argv[j]);
9477 }
9478
d5d55fc3 9479 /* If the client was blocked for at least one key, mark it as blocked. */
9480 if (listLength(c->io_keys)) {
9481 c->flags |= REDIS_IO_WAIT;
9482 aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
9483 server.vm_blocked_clients++;
9484 return 1;
9485 } else {
9486 return 0;
9487 }
9488}
9489
9490/* Remove the 'key' from the list of blocked keys for a given client.
9491 *
9492 * The function returns 1 when there are no longer blocking keys after
9493 * the current one was removed (and the client can be unblocked). */
9494static int dontWaitForSwappedKey(redisClient *c, robj *key) {
9495 list *l;
9496 listNode *ln;
9497 listIter li;
9498 struct dictEntry *de;
9499
9500 /* Remove the key from the list of keys this client is waiting for. */
9501 listRewind(c->io_keys,&li);
9502 while ((ln = listNext(&li)) != NULL) {
9503 if (compareStringObjects(ln->value,key) == 0) {
9504 listDelNode(c->io_keys,ln);
9505 break;
9506 }
9507 }
9508 assert(ln != NULL);
9509
9510 /* Remove the client form the key => waiting clients map. */
9511 de = dictFind(c->db->io_keys,key);
9512 assert(de != NULL);
9513 l = dictGetEntryVal(de);
9514 ln = listSearchKey(l,c);
9515 assert(ln != NULL);
9516 listDelNode(l,ln);
9517 if (listLength(l) == 0)
9518 dictDelete(c->db->io_keys,key);
9519
9520 return listLength(c->io_keys) == 0;
9521}
9522
9523static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
9524 struct dictEntry *de;
9525 list *l;
9526 listNode *ln;
9527 int len;
9528
9529 de = dictFind(db->io_keys,key);
9530 if (!de) return;
9531
9532 l = dictGetEntryVal(de);
9533 len = listLength(l);
9534 /* Note: we can't use something like while(listLength(l)) as the list
9535 * can be freed by the calling function when we remove the last element. */
9536 while (len--) {
9537 ln = listFirst(l);
9538 redisClient *c = ln->value;
9539
9540 if (dontWaitForSwappedKey(c,key)) {
9541 /* Put the client in the list of clients ready to go as we
9542 * loaded all the keys about it. */
9543 listAddNodeTail(server.io_ready_clients,c);
9544 }
9545 }
b0d8747d 9546}
b0d8747d 9547
500ece7c 9548/* =========================== Remote Configuration ========================= */
9549
9550static void configSetCommand(redisClient *c) {
9551 robj *o = getDecodedObject(c->argv[3]);
9552 if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
9553 zfree(server.dbfilename);
9554 server.dbfilename = zstrdup(o->ptr);
9555 } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
9556 zfree(server.requirepass);
9557 server.requirepass = zstrdup(o->ptr);
9558 } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
9559 zfree(server.masterauth);
9560 server.masterauth = zstrdup(o->ptr);
9561 } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
9562 server.maxmemory = strtoll(o->ptr, NULL, 10);
9563 } else {
9564 addReplySds(c,sdscatprintf(sdsempty(),
9565 "-ERR not supported CONFIG parameter %s\r\n",
9566 (char*)c->argv[2]->ptr));
9567 decrRefCount(o);
9568 return;
9569 }
9570 decrRefCount(o);
9571 addReply(c,shared.ok);
9572}
9573
9574static void configGetCommand(redisClient *c) {
9575 robj *o = getDecodedObject(c->argv[2]);
9576 robj *lenobj = createObject(REDIS_STRING,NULL);
9577 char *pattern = o->ptr;
9578 int matches = 0;
9579
9580 addReply(c,lenobj);
9581 decrRefCount(lenobj);
9582
9583 if (stringmatch(pattern,"dbfilename",0)) {
9584 addReplyBulkCString(c,"dbfilename");
9585 addReplyBulkCString(c,server.dbfilename);
9586 matches++;
9587 }
9588 if (stringmatch(pattern,"requirepass",0)) {
9589 addReplyBulkCString(c,"requirepass");
9590 addReplyBulkCString(c,server.requirepass);
9591 matches++;
9592 }
9593 if (stringmatch(pattern,"masterauth",0)) {
9594 addReplyBulkCString(c,"masterauth");
9595 addReplyBulkCString(c,server.masterauth);
9596 matches++;
9597 }
9598 if (stringmatch(pattern,"maxmemory",0)) {
9599 char buf[128];
9600
9601 snprintf(buf,128,"%llu\n",server.maxmemory);
9602 addReplyBulkCString(c,"maxmemory");
9603 addReplyBulkCString(c,buf);
9604 matches++;
9605 }
9606 decrRefCount(o);
9607 lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
9608}
9609
9610static void configCommand(redisClient *c) {
9611 if (!strcasecmp(c->argv[1]->ptr,"set")) {
9612 if (c->argc != 4) goto badarity;
9613 configSetCommand(c);
9614 } else if (!strcasecmp(c->argv[1]->ptr,"get")) {
9615 if (c->argc != 3) goto badarity;
9616 configGetCommand(c);
9617 } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) {
9618 if (c->argc != 2) goto badarity;
9619 server.stat_numcommands = 0;
9620 server.stat_numconnections = 0;
9621 server.stat_expiredkeys = 0;
9622 server.stat_starttime = time(NULL);
9623 addReply(c,shared.ok);
9624 } else {
9625 addReplySds(c,sdscatprintf(sdsempty(),
9626 "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
9627 }
9628 return;
9629
9630badarity:
9631 addReplySds(c,sdscatprintf(sdsempty(),
9632 "-ERR Wrong number of arguments for CONFIG %s\r\n",
9633 (char*) c->argv[1]->ptr));
9634}
9635
befec3cd 9636/* =========================== Pubsub implementation ======================== */
9637
ffc6b7f8 9638static void freePubsubPattern(void *p) {
9639 pubsubPattern *pat = p;
9640
9641 decrRefCount(pat->pattern);
9642 zfree(pat);
9643}
9644
9645static int listMatchPubsubPattern(void *a, void *b) {
9646 pubsubPattern *pa = a, *pb = b;
9647
9648 return (pa->client == pb->client) &&
9649 (compareStringObjects(pa->pattern,pb->pattern) == 0);
9650}
9651
9652/* Subscribe a client to a channel. Returns 1 if the operation succeeded, or
9653 * 0 if the client was already subscribed to that channel. */
9654static int pubsubSubscribeChannel(redisClient *c, robj *channel) {
befec3cd 9655 struct dictEntry *de;
9656 list *clients = NULL;
9657 int retval = 0;
9658
ffc6b7f8 9659 /* Add the channel to the client -> channels hash table */
9660 if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
befec3cd 9661 retval = 1;
ffc6b7f8 9662 incrRefCount(channel);
9663 /* Add the client to the channel -> list of clients hash table */
9664 de = dictFind(server.pubsub_channels,channel);
befec3cd 9665 if (de == NULL) {
9666 clients = listCreate();
ffc6b7f8 9667 dictAdd(server.pubsub_channels,channel,clients);
9668 incrRefCount(channel);
befec3cd 9669 } else {
9670 clients = dictGetEntryVal(de);
9671 }
9672 listAddNodeTail(clients,c);
9673 }
9674 /* Notify the client */
9675 addReply(c,shared.mbulk3);
9676 addReply(c,shared.subscribebulk);
ffc6b7f8 9677 addReplyBulk(c,channel);
9678 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
befec3cd 9679 return retval;
9680}
9681
ffc6b7f8 9682/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9683 * 0 if the client was not subscribed to the specified channel. */
9684static int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
befec3cd 9685 struct dictEntry *de;
9686 list *clients;
9687 listNode *ln;
9688 int retval = 0;
9689
ffc6b7f8 9690 /* Remove the channel from the client -> channels hash table */
9691 incrRefCount(channel); /* channel may be just a pointer to the same object
201037f5 9692 we have in the hash tables. Protect it... */
ffc6b7f8 9693 if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
befec3cd 9694 retval = 1;
ffc6b7f8 9695 /* Remove the client from the channel -> clients list hash table */
9696 de = dictFind(server.pubsub_channels,channel);
befec3cd 9697 assert(de != NULL);
9698 clients = dictGetEntryVal(de);
9699 ln = listSearchKey(clients,c);
9700 assert(ln != NULL);
9701 listDelNode(clients,ln);
ff767a75 9702 if (listLength(clients) == 0) {
9703 /* Free the list and associated hash entry at all if this was
9704 * the latest client, so that it will be possible to abuse
ffc6b7f8 9705 * Redis PUBSUB creating millions of channels. */
9706 dictDelete(server.pubsub_channels,channel);
ff767a75 9707 }
befec3cd 9708 }
9709 /* Notify the client */
9710 if (notify) {
9711 addReply(c,shared.mbulk3);
9712 addReply(c,shared.unsubscribebulk);
ffc6b7f8 9713 addReplyBulk(c,channel);
9714 addReplyLong(c,dictSize(c->pubsub_channels)+
9715 listLength(c->pubsub_patterns));
9716
9717 }
9718 decrRefCount(channel); /* it is finally safe to release it */
9719 return retval;
9720}
9721
9722/* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the clinet was already subscribed to that pattern. */
9723static int pubsubSubscribePattern(redisClient *c, robj *pattern) {
9724 int retval = 0;
9725
9726 if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
9727 retval = 1;
9728 pubsubPattern *pat;
9729 listAddNodeTail(c->pubsub_patterns,pattern);
9730 incrRefCount(pattern);
9731 pat = zmalloc(sizeof(*pat));
9732 pat->pattern = getDecodedObject(pattern);
9733 pat->client = c;
9734 listAddNodeTail(server.pubsub_patterns,pat);
9735 }
9736 /* Notify the client */
9737 addReply(c,shared.mbulk3);
9738 addReply(c,shared.psubscribebulk);
9739 addReplyBulk(c,pattern);
9740 addReplyLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
9741 return retval;
9742}
9743
9744/* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or
9745 * 0 if the client was not subscribed to the specified channel. */
9746static int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
9747 listNode *ln;
9748 pubsubPattern pat;
9749 int retval = 0;
9750
9751 incrRefCount(pattern); /* Protect the object. May be the same we remove */
9752 if ((ln = listSearchKey(c->pubsub_patterns,pattern)) != NULL) {
9753 retval = 1;
9754 listDelNode(c->pubsub_patterns,ln);
9755 pat.client = c;
9756 pat.pattern = pattern;
9757 ln = listSearchKey(server.pubsub_patterns,&pat);
9758 listDelNode(server.pubsub_patterns,ln);
9759 }
9760 /* Notify the client */
9761 if (notify) {
9762 addReply(c,shared.mbulk3);
9763 addReply(c,shared.punsubscribebulk);
9764 addReplyBulk(c,pattern);
9765 addReplyLong(c,dictSize(c->pubsub_channels)+
9766 listLength(c->pubsub_patterns));
befec3cd 9767 }
ffc6b7f8 9768 decrRefCount(pattern);
befec3cd 9769 return retval;
9770}
9771
ffc6b7f8 9772/* Unsubscribe from all the channels. Return the number of channels the
9773 * client was subscribed from. */
9774static int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
9775 dictIterator *di = dictGetIterator(c->pubsub_channels);
befec3cd 9776 dictEntry *de;
9777 int count = 0;
9778
9779 while((de = dictNext(di)) != NULL) {
ffc6b7f8 9780 robj *channel = dictGetEntryKey(de);
befec3cd 9781
ffc6b7f8 9782 count += pubsubUnsubscribeChannel(c,channel,notify);
befec3cd 9783 }
9784 dictReleaseIterator(di);
9785 return count;
9786}
9787
ffc6b7f8 9788/* Unsubscribe from all the patterns. Return the number of patterns the
9789 * client was subscribed from. */
9790static int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
9791 listNode *ln;
9792 listIter li;
9793 int count = 0;
9794
9795 listRewind(c->pubsub_patterns,&li);
9796 while ((ln = listNext(&li)) != NULL) {
9797 robj *pattern = ln->value;
9798
9799 count += pubsubUnsubscribePattern(c,pattern,notify);
9800 }
9801 return count;
9802}
9803
befec3cd 9804/* Publish a message */
ffc6b7f8 9805static int pubsubPublishMessage(robj *channel, robj *message) {
befec3cd 9806 int receivers = 0;
9807 struct dictEntry *de;
ffc6b7f8 9808 listNode *ln;
9809 listIter li;
befec3cd 9810
ffc6b7f8 9811 /* Send to clients listening for that channel */
9812 de = dictFind(server.pubsub_channels,channel);
befec3cd 9813 if (de) {
9814 list *list = dictGetEntryVal(de);
9815 listNode *ln;
9816 listIter li;
9817
9818 listRewind(list,&li);
9819 while ((ln = listNext(&li)) != NULL) {
9820 redisClient *c = ln->value;
9821
9822 addReply(c,shared.mbulk3);
9823 addReply(c,shared.messagebulk);
ffc6b7f8 9824 addReplyBulk(c,channel);
befec3cd 9825 addReplyBulk(c,message);
9826 receivers++;
9827 }
9828 }
ffc6b7f8 9829 /* Send to clients listening to matching channels */
9830 if (listLength(server.pubsub_patterns)) {
9831 listRewind(server.pubsub_patterns,&li);
9832 channel = getDecodedObject(channel);
9833 while ((ln = listNext(&li)) != NULL) {
9834 pubsubPattern *pat = ln->value;
9835
9836 if (stringmatchlen((char*)pat->pattern->ptr,
9837 sdslen(pat->pattern->ptr),
9838 (char*)channel->ptr,
9839 sdslen(channel->ptr),0)) {
c8d0ea0e 9840 addReply(pat->client,shared.mbulk4);
9841 addReply(pat->client,shared.pmessagebulk);
9842 addReplyBulk(pat->client,pat->pattern);
ffc6b7f8 9843 addReplyBulk(pat->client,channel);
9844 addReplyBulk(pat->client,message);
9845 receivers++;
9846 }
9847 }
9848 decrRefCount(channel);
9849 }
befec3cd 9850 return receivers;
9851}
9852
9853static void subscribeCommand(redisClient *c) {
9854 int j;
9855
9856 for (j = 1; j < c->argc; j++)
ffc6b7f8 9857 pubsubSubscribeChannel(c,c->argv[j]);
befec3cd 9858}
9859
9860static void unsubscribeCommand(redisClient *c) {
9861 if (c->argc == 1) {
ffc6b7f8 9862 pubsubUnsubscribeAllChannels(c,1);
9863 return;
9864 } else {
9865 int j;
9866
9867 for (j = 1; j < c->argc; j++)
9868 pubsubUnsubscribeChannel(c,c->argv[j],1);
9869 }
9870}
9871
9872static void psubscribeCommand(redisClient *c) {
9873 int j;
9874
9875 for (j = 1; j < c->argc; j++)
9876 pubsubSubscribePattern(c,c->argv[j]);
9877}
9878
9879static void punsubscribeCommand(redisClient *c) {
9880 if (c->argc == 1) {
9881 pubsubUnsubscribeAllPatterns(c,1);
befec3cd 9882 return;
9883 } else {
9884 int j;
9885
9886 for (j = 1; j < c->argc; j++)
ffc6b7f8 9887 pubsubUnsubscribePattern(c,c->argv[j],1);
befec3cd 9888 }
9889}
9890
9891static void publishCommand(redisClient *c) {
9892 int receivers = pubsubPublishMessage(c->argv[1],c->argv[2]);
9893 addReplyLong(c,receivers);
9894}
9895
7f957c92 9896/* ================================= Debugging ============================== */
9897
9898static void debugCommand(redisClient *c) {
9899 if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
9900 *((char*)-1) = 'x';
210e29f7 9901 } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
9902 if (rdbSave(server.dbfilename) != REDIS_OK) {
9903 addReply(c,shared.err);
9904 return;
9905 }
9906 emptyDb();
9907 if (rdbLoad(server.dbfilename) != REDIS_OK) {
9908 addReply(c,shared.err);
9909 return;
9910 }
9911 redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
9912 addReply(c,shared.ok);
71c2b467 9913 } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
9914 emptyDb();
9915 if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
9916 addReply(c,shared.err);
9917 return;
9918 }
9919 redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
9920 addReply(c,shared.ok);
333298da 9921 } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
9922 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9923 robj *key, *val;
9924
9925 if (!de) {
9926 addReply(c,shared.nokeyerr);
9927 return;
9928 }
9929 key = dictGetEntryKey(de);
9930 val = dictGetEntryVal(de);
59146ef3 9931 if (!server.vm_enabled || (key->storage == REDIS_VM_MEMORY ||
9932 key->storage == REDIS_VM_SWAPPING)) {
07efaf74 9933 char *strenc;
9934 char buf[128];
9935
9936 if (val->encoding < (sizeof(strencoding)/sizeof(char*))) {
9937 strenc = strencoding[val->encoding];
9938 } else {
9939 snprintf(buf,64,"unknown encoding %d\n", val->encoding);
9940 strenc = buf;
9941 }
ace06542 9942 addReplySds(c,sdscatprintf(sdsempty(),
9943 "+Key at:%p refcount:%d, value at:%p refcount:%d "
07efaf74 9944 "encoding:%s serializedlength:%lld\r\n",
682ac724 9945 (void*)key, key->refcount, (void*)val, val->refcount,
07efaf74 9946 strenc, (long long) rdbSavedObjectLen(val,NULL)));
ace06542 9947 } else {
9948 addReplySds(c,sdscatprintf(sdsempty(),
9949 "+Key at:%p refcount:%d, value swapped at: page %llu "
9950 "using %llu pages\r\n",
9951 (void*)key, key->refcount, (unsigned long long) key->vm.page,
9952 (unsigned long long) key->vm.usedpages));
9953 }
78ebe4c8 9954 } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
9955 lookupKeyRead(c->db,c->argv[2]);
9956 addReply(c,shared.ok);
7d30035d 9957 } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
9958 dictEntry *de = dictFind(c->db->dict,c->argv[2]);
9959 robj *key, *val;
9960
9961 if (!server.vm_enabled) {
9962 addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
9963 return;
9964 }
9965 if (!de) {
9966 addReply(c,shared.nokeyerr);
9967 return;
9968 }
9969 key = dictGetEntryKey(de);
9970 val = dictGetEntryVal(de);
4ef8de8a 9971 /* If the key is shared we want to create a copy */
9972 if (key->refcount > 1) {
9973 robj *newkey = dupStringObject(key);
9974 decrRefCount(key);
9975 key = dictGetEntryKey(de) = newkey;
9976 }
9977 /* Swap it */
7d30035d 9978 if (key->storage != REDIS_VM_MEMORY) {
9979 addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
a69a0c9c 9980 } else if (vmSwapObjectBlocking(key,val) == REDIS_OK) {
7d30035d 9981 dictGetEntryVal(de) = NULL;
9982 addReply(c,shared.ok);
9983 } else {
9984 addReply(c,shared.err);
9985 }
7f957c92 9986 } else {
333298da 9987 addReplySds(c,sdsnew(
bdcb92f2 9988 "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD]\r\n"));
7f957c92 9989 }
9990}
56906eef 9991
6c96ba7d 9992static void _redisAssert(char *estr, char *file, int line) {
dfc5e96c 9993 redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
6c96ba7d 9994 redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true\n",file,line,estr);
dfc5e96c 9995#ifdef HAVE_BACKTRACE
9996 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
9997 *((char*)-1) = 'x';
9998#endif
9999}
10000
c651fd9e 10001static void _redisPanic(char *msg, char *file, int line) {
10002 redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
17772754 10003 redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
c651fd9e 10004#ifdef HAVE_BACKTRACE
10005 redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
10006 *((char*)-1) = 'x';
10007#endif
10008}
10009
bcfc686d 10010/* =================================== Main! ================================ */
56906eef 10011
bcfc686d 10012#ifdef __linux__
/* Read the first line of /proc/sys/vm/overcommit_memory and return it
 * converted with atoi(). Returns -1 if the file cannot be opened or no line
 * can be read from it. */
int linuxOvercommitMemoryValue(void) {
    char line[64];
    int value = -1;
    FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");

    if (fp == NULL) return -1;
    if (fgets(line,sizeof(line),fp) != NULL) value = atoi(line);
    fclose(fp);
    return value;
}
10026
10027void linuxOvercommitMemoryWarning(void) {
10028 if (linuxOvercommitMemoryValue() == 0) {
7ccd2d0a 10029 redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
bcfc686d 10030 }
10031}
10032#endif /* __linux__ */
10033
10034static void daemonize(void) {
10035 int fd;
10036 FILE *fp;
10037
10038 if (fork() != 0) exit(0); /* parent exits */
10039 setsid(); /* create a new session */
10040
10041 /* Every output goes to /dev/null. If Redis is daemonized but
10042 * the 'logfile' is set to 'stdout' in the configuration file
10043 * it will not log at all. */
10044 if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
10045 dup2(fd, STDIN_FILENO);
10046 dup2(fd, STDOUT_FILENO);
10047 dup2(fd, STDERR_FILENO);
10048 if (fd > STDERR_FILENO) close(fd);
10049 }
10050 /* Try to write the pid file */
10051 fp = fopen(server.pidfile,"w");
10052 if (fp) {
10053 fprintf(fp,"%d\n",getpid());
10054 fclose(fp);
56906eef 10055 }
56906eef 10056}
10057
42ab0172
AO
10058static void version() {
10059 printf("Redis server version %s\n", REDIS_VERSION);
10060 exit(0);
10061}
10062
723fb69b
AO
/* Print the command line synopsis on standard error and exit with
 * status 1. */
static void usage() {
    fputs("Usage: ./redis-server [/path/to/redis.conf]\n"
          " ./redis-server - (read config from stdin)\n", stderr);
    exit(1);
}
10068
bcfc686d 10069int main(int argc, char **argv) {
9651a787 10070 time_t start;
10071
bcfc686d 10072 initServerConfig();
10073 if (argc == 2) {
44efe66e 10074 if (strcmp(argv[1], "-v") == 0 ||
10075 strcmp(argv[1], "--version") == 0) version();
10076 if (strcmp(argv[1], "--help") == 0) usage();
bcfc686d 10077 resetServerSaveParams();
10078 loadServerConfig(argv[1]);
723fb69b
AO
10079 } else if ((argc > 2)) {
10080 usage();
bcfc686d 10081 } else {
10082 redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
10083 }
bcfc686d 10084 if (server.daemonize) daemonize();
71c54b21 10085 initServer();
bcfc686d 10086 redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
10087#ifdef __linux__
10088 linuxOvercommitMemoryWarning();
10089#endif
9651a787 10090 start = time(NULL);
bcfc686d 10091 if (server.appendonly) {
10092 if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
9651a787 10093 redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",time(NULL)-start);
bcfc686d 10094 } else {
10095 if (rdbLoad(server.dbfilename) == REDIS_OK)
9651a787 10096 redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",time(NULL)-start);
bcfc686d 10097 }
bcfc686d 10098 redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
d5d55fc3 10099 aeSetBeforeSleepProc(server.el,beforeSleep);
bcfc686d 10100 aeMain(server.el);
10101 aeDeleteEventLoop(server.el);
10102 return 0;
10103}
10104
10105/* ============================= Backtrace support ========================= */
10106
10107#ifdef HAVE_BACKTRACE
10108static char *findFuncName(void *pointer, unsigned long *offset);
10109
56906eef 10110static void *getMcontextEip(ucontext_t *uc) {
10111#if defined(__FreeBSD__)
10112 return (void*) uc->uc_mcontext.mc_eip;
10113#elif defined(__dietlibc__)
10114 return (void*) uc->uc_mcontext.eip;
06db1f50 10115#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
da0a1620 10116 #if __x86_64__
10117 return (void*) uc->uc_mcontext->__ss.__rip;
10118 #else
56906eef 10119 return (void*) uc->uc_mcontext->__ss.__eip;
da0a1620 10120 #endif
06db1f50 10121#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
cb7e07cc 10122 #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
06db1f50 10123 return (void*) uc->uc_mcontext->__ss.__rip;
cbc59b38 10124 #else
10125 return (void*) uc->uc_mcontext->__ss.__eip;
e0a62c7f 10126 #endif
54bac49d 10127#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
c04c9ac9 10128 return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
b91cf5ef 10129#elif defined(__ia64__) /* Linux IA64 */
10130 return (void*) uc->uc_mcontext.sc_ip;
10131#else
10132 return NULL;
56906eef 10133#endif
10134}
10135
10136static void segvHandler(int sig, siginfo_t *info, void *secret) {
10137 void *trace[100];
10138 char **messages = NULL;
10139 int i, trace_size = 0;
10140 unsigned long offset=0;
56906eef 10141 ucontext_t *uc = (ucontext_t*) secret;
1c85b79f 10142 sds infostring;
56906eef 10143 REDIS_NOTUSED(info);
10144
10145 redisLog(REDIS_WARNING,
10146 "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
1c85b79f 10147 infostring = genRedisInfoString();
10148 redisLog(REDIS_WARNING, "%s",infostring);
10149 /* It's not safe to sdsfree() the returned string under memory
10150 * corruption conditions. Let it leak as we are going to abort */
e0a62c7f 10151
56906eef 10152 trace_size = backtrace(trace, 100);
de96dbfe 10153 /* overwrite sigaction with caller's address */
b91cf5ef 10154 if (getMcontextEip(uc) != NULL) {
10155 trace[1] = getMcontextEip(uc);
10156 }
56906eef 10157 messages = backtrace_symbols(trace, trace_size);
fe3bbfbe 10158
d76412d1 10159 for (i=1; i<trace_size; ++i) {
56906eef 10160 char *fn = findFuncName(trace[i], &offset), *p;
10161
10162 p = strchr(messages[i],'+');
10163 if (!fn || (p && ((unsigned long)strtol(p+1,NULL,10)) < offset)) {
10164 redisLog(REDIS_WARNING,"%s", messages[i]);
10165 } else {
10166 redisLog(REDIS_WARNING,"%d redis-server %p %s + %d", i, trace[i], fn, (unsigned int)offset);
10167 }
10168 }
b177fd30 10169 /* free(messages); Don't call free() with possibly corrupted memory. */
478c2c6f 10170 _exit(0);
fe3bbfbe 10171}
56906eef 10172
10173static void setupSigSegvAction(void) {
10174 struct sigaction act;
10175
10176 sigemptyset (&act.sa_mask);
10177 /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
10178 * is used. Otherwise, sa_handler is used */
10179 act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
10180 act.sa_sigaction = segvHandler;
10181 sigaction (SIGSEGV, &act, NULL);
10182 sigaction (SIGBUS, &act, NULL);
12fea928 10183 sigaction (SIGFPE, &act, NULL);
10184 sigaction (SIGILL, &act, NULL);
10185 sigaction (SIGBUS, &act, NULL);
e65fdc78 10186 return;
56906eef 10187}
e65fdc78 10188
bcfc686d 10189#include "staticsymbols.h"
10190/* This function try to convert a pointer into a function name. It's used in
10191 * oreder to provide a backtrace under segmentation fault that's able to
10192 * display functions declared as static (otherwise the backtrace is useless). */
10193static char *findFuncName(void *pointer, unsigned long *offset){
10194 int i, ret = -1;
10195 unsigned long off, minoff = 0;
ed9b544e 10196
bcfc686d 10197 /* Try to match against the Symbol with the smallest offset */
10198 for (i=0; symsTable[i].pointer; i++) {
10199 unsigned long lp = (unsigned long) pointer;
0bc03378 10200
bcfc686d 10201 if (lp != (unsigned long)-1 && lp >= symsTable[i].pointer) {
10202 off=lp-symsTable[i].pointer;
10203 if (ret < 0 || off < minoff) {
10204 minoff=off;
10205 ret=i;
10206 }
10207 }
0bc03378 10208 }
bcfc686d 10209 if (ret == -1) return NULL;
10210 *offset = minoff;
10211 return symsTable[ret].name;
0bc03378 10212}
bcfc686d 10213#else /* HAVE_BACKTRACE */
/* Built without HAVE_BACKTRACE: no crash handler is installed, so this is
 * an empty placeholder that keeps callers free of conditionals. */
static void setupSigSegvAction(void) {
    return;
}
bcfc686d 10216#endif /* HAVE_BACKTRACE */
0bc03378 10217
ed9b544e 10218
ed9b544e 10219
bcfc686d 10220/* The End */
10221
10222
ed9b544e 10223